⛏️ index : gitore.git

author Alexander von Gluck IV <alex@terarocket.io> 2025-11-25 12:48:56.0 -06:00:00
committer Alexander von Gluck IV <alex@terarocket.io> 2025-11-25 12:48:59.0 -06:00:00
commit
ed84221e1ed623c68024bd57550830796447cf5a [patch]
tree
9eddbaad8818e0d6205a8ac07df51c554b984703
parent
49930a9b597c57af9c5b670706079f8ff90b57ad
download
ed84221e1ed623c68024bd57550830796447cf5a.tar.gz

database/index: Refactor indexing process to not be vec-based

* Greatly reduces memory consumption when parsing big git repositories
* Greatly speeds up indexing
* Regression: Causes git commits to enter the database in reverse order.

Diff

 src/database/indexer.rs | 80 ++++++++++++++++++++++++++++++++++++--------------------------------------------
 1 file changed, 36 insertions(+), 44 deletions(-)

diff --git a/src/database/indexer.rs b/src/database/indexer.rs
index 0c0109b..bc3f651 100644
--- a/src/database/indexer.rs
+++ b/src/database/indexer.rs
@@ -13,8 +13,11 @@
    bstr::{BStr, ByteSlice},
    objs::tree::EntryKind,
    refs::Category,
    revision::walk::Sorting,
    traverse::commit::simple::CommitTimeOrder,
    url::Scheme,
};

use itertools::{Either, Itertools};
use rocksdb::WriteBatch;
use time::{OffsetDateTime, UtcOffset};
@@ -256,59 +259,48 @@
        None
    };

    // TODO: stop collecting into a vec
    let revwalk = git_repository
        .rev_walk([commit.id().detach()])
        .all()?
        .collect::<Vec<_>>()
        .into_iter()
        .rev();

    let mut hasher = Xxh3::new();

    let tree_len = commit_tree.len()?;
    let mut seen = false;
    let mut i = 0;
    for revs in &revwalk.chunks(250) {
        let mut batch = WriteBatch::default();

        for rev in revs {
            let rev = rev?;

            if let (false, Some(latest_indexed)) = (seen, &latest_indexed) {
                if rev.id.as_bytes() == latest_indexed.get().hash.as_slice() {
                    seen = true;
                }

                continue;
            }

            seen = true;

            if ((i + 1) % 10_000) == 0 {
    let sorting = Sorting::ByCommitTime(CommitTimeOrder::OldestFirst);
    let revwalk_iter = git_repository.rev_walk([commit.id()]).sorting(sorting).all()?;
    for revwalk in revwalk_iter {
        for revs in &revwalk.into_iter().chunks(250) {
            if ((i + 1) % 5_000) == 0 {
                info!("{}: {} commits ingested", reference.name().shorten(), i + 1);
            }
            let mut batch = WriteBatch::default();
            for rev in revs {
                if let (false, Some(latest_indexed)) = (seen, &latest_indexed) {
                    if rev.id.as_bytes() == latest_indexed.get().hash.as_slice() {
                        seen = true;
                    }
                    continue;
                }

            let commit = rev.object()?;
            let oid = commit.id;
            let commit = commit.decode()?;
            let author = commit.author();
            let committer = commit.committer();

            let tree = git_repository.find_tree(commit.tree())?;
            let tree_id = index_tree(&db, &mut batch, &tree, &mut hasher, submodules)?;

            Commit::new(oid, &commit, author, committer, tree_id)?.insert(
                &commit_tree,
                tree_len + i,
                &mut batch,
            )?;

            i += 1;
                seen = true;
                let commit = rev.object()?;
                let oid = commit.id;
                let commit = commit.decode()?;
                let author = commit.author();
                let committer = commit.committer();

                let tree = git_repository.find_tree(commit.tree())?;
                let tree_id = index_tree(&db, &mut batch, &tree, &mut hasher, submodules)?;

                Commit::new(oid, &commit, author, committer, tree_id)?.insert(
                    &commit_tree,
                    tree_len + i,
                    &mut batch,
                )?;

                i += 1;
            }
            commit_tree.update_counter(tree_len + i, &mut batch)?;
            db.write_without_wal(batch)?;
        }

        commit_tree.update_counter(tree_len + i, &mut batch)?;
        db.write_without_wal(batch)?;
    }

    if !seen && !force_reindex {