database/index: Refactor indexing process to not be vec-based
* Greatly reduces memory consumption when parsing big git repositories
* Greatly speeds up indexing
* Regression: Causes git commits to enter the database in reverse order.
Diff
src/database/indexer.rs | 80 ++++++++++++++++++++++++++++++++++++--------------------------------------------
1 file changed, 36 insertions(+), 44 deletions(-)
@@ -13,8 +13,11 @@
bstr::{BStr, ByteSlice},
objs::tree::EntryKind,
refs::Category,
revision::walk::Sorting,
traverse::commit::simple::CommitTimeOrder,
url::Scheme,
};
use itertools::{Either, Itertools};
use rocksdb::WriteBatch;
use time::{OffsetDateTime, UtcOffset};
@@ -256,59 +259,48 @@
None
};
let revwalk = git_repository
.rev_walk([commit.id().detach()])
.all()?
.collect::<Vec<_>>()
.into_iter()
.rev();
let mut hasher = Xxh3::new();
let tree_len = commit_tree.len()?;
let mut seen = false;
let mut i = 0;
for revs in &revwalk.chunks(250) {
let mut batch = WriteBatch::default();
for rev in revs {
let rev = rev?;
if let (false, Some(latest_indexed)) = (seen, &latest_indexed) {
if rev.id.as_bytes() == latest_indexed.get().hash.as_slice() {
seen = true;
}
continue;
}
seen = true;
if ((i + 1) % 10_000) == 0 {
let sorting = Sorting::ByCommitTime(CommitTimeOrder::OldestFirst);
let revwalk_iter = git_repository.rev_walk([commit.id()]).sorting(sorting).all()?;
for revwalk in revwalk_iter {
for revs in &revwalk.into_iter().chunks(250) {
if ((i + 1) % 5_000) == 0 {
info!("{}: {} commits ingested", reference.name().shorten(), i + 1);
}
let mut batch = WriteBatch::default();
for rev in revs {
if let (false, Some(latest_indexed)) = (seen, &latest_indexed) {
if rev.id.as_bytes() == latest_indexed.get().hash.as_slice() {
seen = true;
}
continue;
}
let commit = rev.object()?;
let oid = commit.id;
let commit = commit.decode()?;
let author = commit.author();
let committer = commit.committer();
let tree = git_repository.find_tree(commit.tree())?;
let tree_id = index_tree(&db, &mut batch, &tree, &mut hasher, submodules)?;
Commit::new(oid, &commit, author, committer, tree_id)?.insert(
&commit_tree,
tree_len + i,
&mut batch,
)?;
i += 1;
seen = true;
let commit = rev.object()?;
let oid = commit.id;
let commit = commit.decode()?;
let author = commit.author();
let committer = commit.committer();
let tree = git_repository.find_tree(commit.tree())?;
let tree_id = index_tree(&db, &mut batch, &tree, &mut hasher, submodules)?;
Commit::new(oid, &commit, author, committer, tree_id)?.insert(
&commit_tree,
tree_len + i,
&mut batch,
)?;
i += 1;
}
commit_tree.update_counter(tree_len + i, &mut batch)?;
db.write_without_wal(batch)?;
}
commit_tree.update_counter(tree_len + i, &mut batch)?;
db.write_without_wal(batch)?;
}
if !seen && !force_reindex {