Skip to content

Commit c525ce9

Browse files
szeder authored and gitster committed
commit-graph: check all leading directories in changed path Bloom filters
The file 'dir/subdir/file' can only be modified if its leading directories 'dir' and 'dir/subdir' are modified as well.

So when checking modified path Bloom filters looking for commits modifying a path with multiple path components, check not only the full path in the Bloom filters, but all its leading directories as well. Take care to check these paths in "deepest first" order, because it's the full path that is least likely to be modified, and the Bloom filter queries can short circuit sooner.

This can significantly reduce the average false positive rate, by about an order of magnitude or three(!), and can further speed up pathspec-limited revision walks. The table below compares the average false positive rate and runtime of

  git rev-list HEAD -- "$path"

before and after this change for 5000+ randomly* selected paths from each repository:

                    Average false           Average        Average
                    positive rate           runtime        runtime
                  before     after     before     after   difference
  ------------------------------------------------------------------
  git             3.220%   0.7853%    0.0558s   0.0387s    -30.6%
  linux           2.453%   0.0296%    0.1046s   0.0766s    -26.8%
  tensorflow      2.536%   0.6977%    0.0594s   0.0420s    -29.2%

*Path selection was done with the following pipeline:

  git ls-tree -r --name-only HEAD | sort -R | head -n 5000

The improvements in runtime are much smaller than the improvements in average false positive rate, as we are clearly reaching diminishing returns here. However, all these timings depend on access to tree objects being reasonably fast (warm caches).
If we had a partial clone and the tree objects had to be fetched from a promisor remote, e.g.:

  $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
  $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
        commit-graph write --reachable
  $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
  $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
        rev-list HEAD -- "$path"

then checking all leading path components can reduce the runtime from over an hour to a few seconds (and this is with the clone and the promisor on the same machine).

This adjusts the tracing values in t4216-log-bloom.sh, which provides a concrete way to notice the improvement.

Helped-by: Taylor Blau <[email protected]>
Helped-by: René Scharfe <[email protected]>
Signed-off-by: SZEDER Gábor <[email protected]>
Signed-off-by: Derrick Stolee <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent f3c2a36 commit c525ce9

File tree

3 files changed

+42
-12
lines changed

3 files changed

+42
-12
lines changed

revision.c

Lines changed: 37 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -668,9 +668,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
668668
{
669669
struct pathspec_item *pi;
670670
char *path_alloc = NULL;
671-
const char *path;
671+
const char *path, *p;
672672
int last_index;
673-
int len;
673+
size_t len;
674+
int path_component_nr = 1;
674675

675676
if (!revs->commits)
676677
return;
@@ -707,8 +708,33 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
707708
return;
708709
}
709710

710-
revs->bloom_key = xmalloc(sizeof(struct bloom_key));
711-
fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
711+
p = path;
712+
while (*p) {
713+
/*
714+
* At this point, the path is normalized to use Unix-style
715+
* path separators. This is required due to how the
716+
* changed-path Bloom filters store the paths.
717+
*/
718+
if (*p == '/')
719+
path_component_nr++;
720+
p++;
721+
}
722+
723+
revs->bloom_keys_nr = path_component_nr;
724+
ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
725+
726+
fill_bloom_key(path, len, &revs->bloom_keys[0],
727+
revs->bloom_filter_settings);
728+
path_component_nr = 1;
729+
730+
p = path + len - 1;
731+
while (p > path) {
732+
if (*p == '/')
733+
fill_bloom_key(path, p - path,
734+
&revs->bloom_keys[path_component_nr++],
735+
revs->bloom_filter_settings);
736+
p--;
737+
}
712738

713739
if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
714740
atexit(trace2_bloom_filter_statistics_atexit);
@@ -722,7 +748,7 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
722748
struct commit *commit)
723749
{
724750
struct bloom_filter *filter;
725-
int result;
751+
int result = 1, j;
726752

727753
if (!revs->repo->objects->commit_graph)
728754
return -1;
@@ -737,9 +763,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
737763
return -1;
738764
}
739765

740-
result = bloom_filter_contains(filter,
741-
revs->bloom_key,
742-
revs->bloom_filter_settings);
766+
for (j = 0; result && j < revs->bloom_keys_nr; j++) {
767+
result = bloom_filter_contains(filter,
768+
&revs->bloom_keys[j],
769+
revs->bloom_filter_settings);
770+
}
743771

744772
if (result)
745773
count_bloom_filter_maybe++;
@@ -779,7 +807,7 @@ static int rev_compare_tree(struct rev_info *revs,
779807
return REV_TREE_SAME;
780808
}
781809

782-
if (revs->bloom_key && !nth_parent) {
810+
if (revs->bloom_keys_nr && !nth_parent) {
783811
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
784812

785813
if (bloom_ret == 0)

revision.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -295,8 +295,10 @@ struct rev_info {
295295
struct topo_walk_info *topo_walk_info;
296296

297297
/* Commit graph bloom filter fields */
298-
/* The bloom filter key for the pathspec */
299-
struct bloom_key *bloom_key;
298+
/* The bloom filter key(s) for the pathspec */
299+
struct bloom_key *bloom_keys;
300+
int bloom_keys_nr;
301+
300302
/*
301303
* The bloom filter settings used to generate the key.
302304
* This is loaded from the commit-graph being used.

t/t4216-log-bloom.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -146,7 +146,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
146146

147147
test_bloom_filters_used_when_some_filters_are_missing () {
148148
log_args=$1
149-
bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":8,\"definitely_not\":6"
149+
bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":6,\"definitely_not\":8"
150150
setup "$log_args" &&
151151
grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
152152
test_cmp log_wo_bloom log_w_bloom

0 commit comments

Comments
 (0)