From ed84221e1ed623c68024bd57550830796447cf5a Mon Sep 17 00:00:00 2001 From: Alexander von Gluck IV Date: Tue, 25 Nov 2025 12:48:56 -0600 Subject: [PATCH] database/index: Refactor indexing process to not be vec-based * Greatly reduces the memory consumption parsing big git repositories * Greatly speeds up indexing * Regression: Causes git commits to enter database in reverse order. --- src/database/indexer.rs | 80 ++++++++++++++++++++++++++++++++++++-------------------------------------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/src/database/indexer.rs b/src/database/indexer.rs index 0c0109b..bc3f651 100644 --- a/src/database/indexer.rs +++ b/src/database/indexer.rs @@ -13,8 +13,11 @@ bstr::{BStr, ByteSlice}, objs::tree::EntryKind, refs::Category, + revision::walk::Sorting, + traverse::commit::simple::CommitTimeOrder, url::Scheme, }; + use itertools::{Either, Itertools}; use rocksdb::WriteBatch; use time::{OffsetDateTime, UtcOffset}; @@ -256,59 +259,48 @@ None }; - // TODO: stop collecting into a vec - let revwalk = git_repository - .rev_walk([commit.id().detach()]) - .all()? - .collect::>() - .into_iter() - .rev(); - let mut hasher = Xxh3::new(); - let tree_len = commit_tree.len()?; let mut seen = false; let mut i = 0; - for revs in &revwalk.chunks(250) { - let mut batch = WriteBatch::default(); - - for rev in revs { - let rev = rev?; - - if let (false, Some(latest_indexed)) = (seen, &latest_indexed) { - if rev.id.as_bytes() == latest_indexed.get().hash.as_slice() { - seen = true; - } - - continue; - } - seen = true; - - if ((i + 1) % 10_000) == 0 { + let sorting = Sorting::ByCommitTime(CommitTimeOrder::OldestFirst); + let revwalk_iter = git_repository.rev_walk([commit.id()]).sorting(sorting).all()?; + for revwalk in revwalk_iter { + for revs in &revwalk.into_iter().chunks(250) { + if ((i + 1) % 5_000) == 0 { info!("{}: {} commits ingested", reference.name().shorten(), i + 1); } + let mut batch = WriteBatch::default(); + for rev in revs { + if let (false, Some(latest_indexed)) = (seen, &latest_indexed) { + if rev.id.as_bytes() == latest_indexed.get().hash.as_slice() { + seen = true; + } + continue; + } - let commit = rev.object()?; - let oid = commit.id; - let commit = commit.decode()?; - let author = commit.author(); - let committer = commit.committer(); - - let tree = git_repository.find_tree(commit.tree())?; - let tree_id = index_tree(&db, &mut batch, &tree, &mut hasher, submodules)?; - - Commit::new(oid, &commit, author, committer, tree_id)?.insert( - &commit_tree, - tree_len + i, - &mut batch, - )?; - - i += 1; + seen = true; + let commit = rev.object()?; + let oid = commit.id; + let commit = commit.decode()?; + let author = commit.author(); + let committer = commit.committer(); + + let tree = git_repository.find_tree(commit.tree())?; + let tree_id = index_tree(&db, &mut batch, &tree, &mut hasher, submodules)?; + + Commit::new(oid, &commit, author, committer, tree_id)?.insert( + &commit_tree, + tree_len + i, + &mut batch, + )?; + + i += 1; + } + commit_tree.update_counter(tree_len + i, &mut batch)?; + db.write_without_wal(batch)?; } - - commit_tree.update_counter(tree_len + i, &mut batch)?; - db.write_without_wal(batch)?; } if !seen && !force_reindex { -- gitore 0.2.3