Skip to content

Commit 85e36cb

Browse files
committed
gix-commitgraph: implement support for bloom caches
The `git commit-graph write` command also supports writing a separate section in the cache file that contains information about the paths changed between a commit and its first parent. This information can be used to significantly speed up some traversal operations, such as `git log -- <PATH>` and `git blame`. This commit teaches the gix-commitgraph crate in gitoxide how to parse and access this information. We've only implemented support for reading v2 of this cache, because v1 is deprecated in Git as it can return bad results in some corner cases. The implementation is 100% compatible with Git itself; it uses the exact same version of murmur3 that Git is using, including the hash seeds.
1 parent 15c835a commit 85e36cb

14 files changed

Lines changed: 539 additions & 17 deletions

File tree

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

gix-commitgraph/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ nonempty = "0.12.0"
3232
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
3333

3434
document-features = { version = "0.2.0", optional = true }
35+
murmur3 = "0.5.2"
3536

3637
[dev-dependencies]
3738
gix-testtools = { path = "../tests/tools" }

gix-commitgraph/src/access.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::{file, file::Commit, File, Graph, Position};
1+
use crate::{file, file::Commit, BloomFilterSettings, File, Graph, Position};
22

33
/// Access
44
impl Graph {
@@ -52,6 +52,11 @@ impl Graph {
5252
pub fn num_commits(&self) -> u32 {
5353
self.files.iter().map(File::num_commits).sum()
5454
}
55+
56+
/// Return changed-path Bloom filter settings used by the top-most compatible graph layer, if available.
57+
pub fn bloom_filter_settings(&self) -> Option<BloomFilterSettings> {
58+
self.files.iter().rev().find_map(File::bloom_filter_settings)
59+
}
5560
}
5661

5762
/// Access fundamentals

gix-commitgraph/src/bloom.rs

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
//! Query support for changed-path Bloom filters stored in commit-graph files.
2+
3+
use std::io::Cursor;
4+
5+
use bstr::BStr;
6+
7+
use crate::{file, from_be_u32, BloomFilterSettings, File, Graph, Position};
8+
9+
/// murmur3 seed used to compute the first hash (`h0`) of a [`BloomKey`].
const SEED0: u32 = 0x293a_e76f;
10+
/// murmur3 seed used to compute the second hash (`h1`) of a [`BloomKey`].
const SEED1: u32 = 0x7e64_6e2c;
11+
/// Number of filter bits stored per byte of filter data; filters are addressed bit by bit.
const BITS_PER_WORD: u64 = 8;
12+
13+
/// Precomputed hash positions for a path using Bloom filter settings.
///
/// The key stores two murmur3 hashes of the path; the bit probed in round `i` is derived
/// from `h0 + i * h1` (wrapping), for `i` in `0..num_hashes`.
14+
#[derive(Clone, Debug, Eq, PartialEq)]
15+
pub struct BloomKey {
16+
// murmur3 hash of the path, seeded with `SEED0`.
h0: u32,
17+
// murmur3 hash of the path, seeded with `SEED1`.
h1: u32,
18+
// Number of probe rounds, copied from the Bloom filter settings the key was built with.
num_hashes: u32,
19+
}
20+
21+
impl BloomKey {
22+
/// Build a key for `path`.
23+
///
24+
/// `path` must use `/` as separator, matching Git's changed-path Bloom filter expectations.
25+
pub fn from_path(path: &BStr, settings: BloomFilterSettings) -> Self {
26+
Self::from_bytes(path.as_ref(), settings)
27+
}
28+
29+
/// Build keys for `path` and each directory prefix.
30+
///
31+
/// For `a/b/c`, this yields keys for `a/b/c`, `a/b`, and `a`.
32+
/// `path` must use `/` as separator.
33+
pub fn from_path_with_prefixes(path: &BStr, settings: BloomFilterSettings) -> Vec<Self> {
34+
let bytes = path.as_ref();
35+
let mut out = vec![Self::from_bytes(bytes, settings)];
36+
37+
let mut idx = bytes.len();
38+
while idx > 0 {
39+
idx -= 1;
40+
if bytes[idx] == b'/' {
41+
out.push(Self::from_bytes(&bytes[..idx], settings));
42+
}
43+
}
44+
out
45+
}
46+
47+
fn from_bytes(path: &[u8], settings: BloomFilterSettings) -> Self {
48+
Self {
49+
h0: murmur3_v2(SEED0, path),
50+
h1: murmur3_v2(SEED1, path),
51+
num_hashes: settings.num_hashes,
52+
}
53+
}
54+
55+
/// Query whether this key may be contained in `filter_data`.
56+
///
57+
/// Returns `None` if the filter is unusable (empty data), `Some(false)` on a definite miss,
58+
/// and `Some(true)` on a possible hit.
59+
pub fn contains(&self, filter_data: &[u8]) -> Option<bool> {
60+
let modulo = (filter_data.len() as u64) * BITS_PER_WORD;
61+
if modulo == 0 {
62+
return None;
63+
}
64+
65+
for i in 0..self.num_hashes {
66+
let hash = self.h0.wrapping_add(i.wrapping_mul(self.h1));
67+
let bit_pos = u64::from(hash) % modulo;
68+
let byte_pos = (bit_pos / BITS_PER_WORD) as usize;
69+
let mask = 1u8 << (bit_pos % BITS_PER_WORD);
70+
if filter_data[byte_pos] & mask == 0 {
71+
return Some(false);
72+
}
73+
}
74+
Some(true)
75+
}
76+
}
77+
78+
impl File {
79+
/// Query if `path` may be present in the changed-path Bloom filter for commit `pos`.
80+
///
81+
/// Checks the full path and every directory prefix against the filter,
82+
/// matching Git's `bloom_filter_contains_vec()` behavior for reduced false positives.
83+
pub fn maybe_contains_path(&self, pos: file::Position, path: &BStr) -> Option<bool> {
84+
let (data, settings) = self.bloom_filter_at(pos)?;
85+
let keys = BloomKey::from_path_with_prefixes(path, settings);
86+
for key in &keys {
87+
match key.contains(data) {
88+
Some(false) => return Some(false),
89+
None => return None,
90+
Some(true) => {}
91+
}
92+
}
93+
Some(true)
94+
}
95+
96+
/// Query if all `keys` may be present in the changed-path Bloom filter for commit `pos`.
97+
///
98+
/// This corresponds to Git's `bloom_filter_contains_vec()` behavior.
99+
pub fn maybe_contains_all_keys(&self, pos: file::Position, keys: &[BloomKey]) -> Option<bool> {
100+
let (data, _settings) = self.bloom_filter_at(pos)?;
101+
if keys.iter().all(|key| key.contains(data) == Some(true)) {
102+
Some(true)
103+
} else {
104+
Some(false)
105+
}
106+
}
107+
108+
fn bloom_filter_at(&self, pos: file::Position) -> Option<(&[u8], BloomFilterSettings)> {
109+
let settings = self.bloom_filter_settings?;
110+
let index_offset = self.bloom_filter_index_offset?;
111+
let data_offset = self.bloom_filter_data_offset?;
112+
if pos.0 >= self.num_commits() {
113+
return None;
114+
}
115+
116+
let lex = pos.0 as usize;
117+
let end = from_be_u32(&self.data[index_offset + lex * 4..][..4]);
118+
let start = if lex == 0 {
119+
0
120+
} else {
121+
from_be_u32(&self.data[index_offset + (lex - 1) * 4..][..4])
122+
};
123+
let start = start as usize;
124+
let end = end as usize;
125+
if start > end || end > self.bloom_filter_data_len {
126+
return None;
127+
}
128+
let start = data_offset.checked_add(start)?;
129+
let end = data_offset.checked_add(end)?;
130+
self.data.get(start..end).map(|data| (data, settings))
131+
}
132+
}
133+
134+
impl Graph {
135+
/// Query by commit id if `path` may be present in changed-path Bloom filters.
136+
pub fn maybe_contains_path_by_id(&self, id: impl AsRef<gix_hash::oid>, path: &BStr) -> Option<bool> {
137+
let pos = self.lookup(id)?;
138+
self.maybe_contains_path(pos, path)
139+
}
140+
141+
/// Query by graph position if `path` may be present in changed-path Bloom filters.
142+
pub fn maybe_contains_path(&self, pos: Position, path: &BStr) -> Option<bool> {
143+
self.commit_at(pos).maybe_contains_path(path)
144+
}
145+
}
146+
147+
pub(crate) fn murmur3_v2(seed: u32, data: &[u8]) -> u32 {
148+
let mut reader = Cursor::new(data);
149+
murmur3::murmur3_32(&mut reader, seed).expect("reading from memory does not fail")
150+
}
151+
#[cfg(test)]
mod tests {
    use super::{murmur3_v2, BloomKey};
    use crate::BloomFilterSettings;
    use bstr::BStr;

    /// Hashes with seed 0 must reproduce the known values for a couple of inputs.
    #[test]
    fn murmur3_known_vectors_match_git_and_reference_values() {
        let vectors: [(&[u8], u32); 3] = [
            (b"", 0x0000_0000),
            (b"Hello world!", 0x627b_0c2c),
            (b"The quick brown fox jumps over the lazy dog", 0x2e4f_f723),
        ];
        for &(input, expected) in &vectors {
            assert_eq!(murmur3_v2(0, input), expected);
        }
    }

    /// The hash sequence derived from the key of the empty path must match the known values.
    #[test]
    fn bloom_key_for_empty_path_matches_git_vector() {
        let settings = BloomFilterSettings {
            hash_version: 2,
            num_hashes: 7,
            bits_per_entry: 10,
        };
        let key = BloomKey::from_path(BStr::new(b""), settings);

        let mut actual = Vec::new();
        for round in 0..key.num_hashes {
            actual.push(key.h0.wrapping_add(round.wrapping_mul(key.h1)));
        }

        let expected: [u32; 7] = [
            0x5615_800c,
            0x5b96_6560,
            0x6117_4ab4,
            0x6698_3008,
            0x6c19_155c,
            0x7199_fab0,
            0x771a_e004,
        ];
        assert_eq!(actual, expected);
    }
}

gix-commitgraph/src/file/access.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use std::{
55

66
use crate::{
77
file::{self, commit::Commit, COMMIT_DATA_ENTRY_SIZE_SANS_HASH},
8-
File,
8+
BloomFilterSettings, File,
99
};
1010

1111
/// Access
@@ -107,6 +107,11 @@ impl File {
107107
pub fn path(&self) -> &Path {
108108
&self.path
109109
}
110+
111+
/// Return changed-path Bloom filter settings if this file has a usable Bloom index and data pair.
///
/// This hands out a copy of the settings stored with this file, which are `None`
/// if there are none or after they have been cleared.
112+
pub fn bloom_filter_settings(&self) -> Option<BloomFilterSettings> {
113+
self.bloom_filter_settings
114+
}
110115
}
111116

112117
impl File {
@@ -131,6 +136,13 @@ impl File {
131136
pub(crate) fn extra_edges_data(&self) -> Option<&[u8]> {
132137
Some(&self.data[self.extra_edges_list_range.clone()?])
133138
}
139+
140+
pub(crate) fn clear_bloom_filters(&mut self) {
141+
self.bloom_filter_data_len = 0;
142+
self.bloom_filter_data_offset = None;
143+
self.bloom_filter_index_offset = None;
144+
self.bloom_filter_settings = None;
145+
}
134146
}
135147

136148
impl Debug for File {

gix-commitgraph/src/file/commit.rs

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
//! Low-level operations on individual commits.
22
use crate::{
3+
bloom::BloomKey,
34
file::{self, EXTENDED_EDGES_MASK, LAST_EXTENDED_EDGE_MASK, NO_PARENT},
4-
File, Position,
5+
from_be_u32, File, Position,
56
};
7+
use bstr::BStr;
68
use gix_error::{message, Message};
79
use std::{
810
fmt::{Debug, Formatter},
@@ -22,24 +24,19 @@ pub struct Commit<'a> {
2224
root_tree_id: &'a gix_hash::oid,
2325
}
2426

25-
#[inline]
26-
fn read_u32(b: &[u8]) -> u32 {
27-
u32::from_be_bytes(b.try_into().unwrap())
28-
}
29-
3027
impl<'a> Commit<'a> {
3128
pub(crate) fn new(file: &'a File, pos: file::Position) -> Self {
3229
let bytes = file.commit_data_bytes(pos);
3330
Commit {
3431
file,
3532
pos,
3633
root_tree_id: gix_hash::oid::from_bytes_unchecked(&bytes[..file.hash_len]),
37-
parent1: ParentEdge::from_raw(read_u32(&bytes[file.hash_len..][..4])),
38-
parent2: ParentEdge::from_raw(read_u32(&bytes[file.hash_len + 4..][..4])),
34+
parent1: ParentEdge::from_raw(from_be_u32(&bytes[file.hash_len..][..4])),
35+
parent2: ParentEdge::from_raw(from_be_u32(&bytes[file.hash_len + 4..][..4])),
3936
// TODO: Add support for corrected commit date offset overflow.
4037
// See https://github.com/git/git/commit/e8b63005c48696a26f976f5f9b0ccaf1983e439d and
4138
// https://github.com/git/git/commit/f90fca638e99a031dce8e3aca72427b2f9b4bb38 for more details and hints at a test.
42-
generation: read_u32(&bytes[file.hash_len + 8..][..4]) >> 2,
39+
generation: from_be_u32(&bytes[file.hash_len + 8..][..4]) >> 2,
4340
commit_timestamp: u64::from_be_bytes(bytes[file.hash_len + 8..][..8].try_into().unwrap())
4441
& 0x0003_ffff_ffff,
4542
}
@@ -90,6 +87,16 @@ impl<'a> Commit<'a> {
9087
pub fn root_tree_id(&self) -> &gix_hash::oid {
9188
self.root_tree_id
9289
}
90+
91+
/// Query if `path` may be present in this commit's changed-path Bloom filter.
92+
pub fn maybe_contains_path(&self, path: &BStr) -> Option<bool> {
93+
self.file.maybe_contains_path(self.pos, path)
94+
}
95+
96+
/// Query if all `keys` may be present in this commit's changed-path Bloom filter.
97+
pub fn maybe_contains_all_keys(&self, keys: &[BloomKey]) -> Option<bool> {
98+
self.file.maybe_contains_all_keys(self.pos, keys)
99+
}
93100
}
94101

95102
impl Debug for Commit<'_> {
@@ -176,7 +183,7 @@ impl Iterator for Parents<'_> {
176183
},
177184
ParentIteratorState::Extra(mut chunks) => {
178185
if let Some(chunk) = chunks.next() {
179-
let extra_edge = read_u32(chunk);
186+
let extra_edge = from_be_u32(chunk);
180187
match ExtraEdge::from_raw(extra_edge) {
181188
ExtraEdge::Internal(pos) => {
182189
self.state = ParentIteratorState::Extra(chunks);

0 commit comments

Comments
 (0)