Skip to content

Commit 8dfa4d6

Browse files
committed
Merge branch 'en/batch-prefetch' into seen
In a lazy clone, "git cherry" and "git grep" often fetch necessary blob objects one by one from promisor remotes. It has been corrected to collect necessary object names and fetch them in bulk to gain reasonable performance. * en/batch-prefetch: grep: prefetch necessary blobs builtin/log: prefetch necessary blobs for `git cherry` patch-ids.h: add missing trailing parenthesis in documentation comment
2 parents 7120fe3 + e775693 commit 8dfa4d6

5 files changed

Lines changed: 321 additions & 1 deletion

File tree

builtin/grep.c

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@
2828
#include "object-file.h"
2929
#include "object-name.h"
3030
#include "odb.h"
31+
#include "oid-array.h"
32+
#include "oidset.h"
3133
#include "packfile.h"
3234
#include "pager.h"
3335
#include "path.h"
36+
#include "promisor-remote.h"
3437
#include "read-cache-ll.h"
3538
#include "write-or-die.h"
3639

@@ -692,6 +695,143 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
692695
return hit;
693696
}
694697

698+
static void collect_blob_oids_for_tree(struct repository *repo,
699+
const struct pathspec *pathspec,
700+
struct tree_desc *tree,
701+
struct strbuf *base,
702+
int tn_len,
703+
struct oidset *blob_oids)
704+
{
705+
struct name_entry entry;
706+
int old_baselen = base->len;
707+
struct strbuf name = STRBUF_INIT;
708+
enum interesting match = entry_not_interesting;
709+
710+
while (tree_entry(tree, &entry)) {
711+
if (match != all_entries_interesting) {
712+
strbuf_addstr(&name, base->buf + tn_len);
713+
match = tree_entry_interesting(repo->index,
714+
&entry, &name,
715+
pathspec);
716+
strbuf_reset(&name);
717+
718+
if (match == all_entries_not_interesting)
719+
break;
720+
if (match == entry_not_interesting)
721+
continue;
722+
}
723+
724+
strbuf_add(base, entry.path, tree_entry_len(&entry));
725+
726+
if (S_ISREG(entry.mode)) {
727+
oidset_insert(blob_oids, &entry.oid);
728+
} else if (S_ISDIR(entry.mode)) {
729+
enum object_type type;
730+
struct tree_desc sub_tree;
731+
void *data;
732+
unsigned long size;
733+
734+
data = odb_read_object(repo->objects, &entry.oid,
735+
&type, &size);
736+
if (!data)
737+
die(_("unable to read tree (%s)"),
738+
oid_to_hex(&entry.oid));
739+
740+
strbuf_addch(base, '/');
741+
init_tree_desc(&sub_tree, &entry.oid, data, size);
742+
collect_blob_oids_for_tree(repo, pathspec, &sub_tree,
743+
base, tn_len, blob_oids);
744+
free(data);
745+
}
746+
/*
747+
* ...no else clause for S_ISGITLINK: submodules have their
748+
* own promisor configuration and would need separate fetches
749+
* anyway.
750+
*/
751+
752+
strbuf_setlen(base, old_baselen);
753+
}
754+
755+
strbuf_release(&name);
756+
}
757+
758+
static void collect_blob_oids_for_treeish(struct grep_opt *opt,
759+
const struct pathspec *pathspec,
760+
const struct object_id *tree_ish_oid,
761+
const char *name,
762+
struct oidset *blob_oids)
763+
{
764+
struct tree_desc tree;
765+
void *data;
766+
unsigned long size;
767+
struct strbuf base = STRBUF_INIT;
768+
int len;
769+
770+
data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
771+
OBJ_TREE, &size, NULL);
772+
773+
if (!data)
774+
return;
775+
776+
len = name ? strlen(name) : 0;
777+
if (len) {
778+
strbuf_add(&base, name, len);
779+
strbuf_addch(&base, ':');
780+
}
781+
init_tree_desc(&tree, tree_ish_oid, data, size);
782+
783+
collect_blob_oids_for_tree(opt->repo, pathspec, &tree,
784+
&base, base.len, blob_oids);
785+
786+
strbuf_release(&base);
787+
free(data);
788+
}
789+
790+
static void prefetch_grep_blobs(struct grep_opt *opt,
791+
const struct pathspec *pathspec,
792+
const struct object_array *list)
793+
{
794+
struct oidset blob_oids = OIDSET_INIT;
795+
796+
/* Exit if we're not in a partial clone */
797+
if (!repo_has_promisor_remote(opt->repo))
798+
return;
799+
800+
/* For each tree, gather the blobs in it */
801+
for (int i = 0; i < list->nr; i++) {
802+
struct object *real_obj;
803+
804+
obj_read_lock();
805+
real_obj = deref_tag(opt->repo, list->objects[i].item,
806+
NULL, 0);
807+
obj_read_unlock();
808+
809+
if (real_obj &&
810+
(real_obj->type == OBJ_COMMIT ||
811+
real_obj->type == OBJ_TREE))
812+
collect_blob_oids_for_treeish(opt, pathspec,
813+
&real_obj->oid,
814+
list->objects[i].name,
815+
&blob_oids);
816+
}
817+
818+
/* Prefetch the blobs we found */
819+
if (oidset_size(&blob_oids)) {
820+
struct oid_array to_fetch = OID_ARRAY_INIT;
821+
struct oidset_iter iter;
822+
const struct object_id *oid;
823+
824+
oidset_iter_init(&blob_oids, &iter);
825+
while ((oid = oidset_iter_next(&iter)))
826+
oid_array_append(&to_fetch, oid);
827+
828+
promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);
829+
830+
oid_array_clear(&to_fetch);
831+
}
832+
oidset_clear(&blob_oids);
833+
}
834+
695835
static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
696836
struct object *obj, const char *name, const char *path)
697837
{
@@ -732,6 +872,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
732872
int hit = 0;
733873
const unsigned int nr = list->nr;
734874

875+
prefetch_grep_blobs(opt, pathspec, list);
876+
735877
for (i = 0; i < nr; i++) {
736878
struct object *real_obj;
737879

builtin/log.c

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@
2121
#include "color.h"
2222
#include "commit.h"
2323
#include "diff.h"
24+
#include "diffcore.h"
2425
#include "diff-merges.h"
2526
#include "revision.h"
2627
#include "log-tree.h"
2728
#include "oid-array.h"
29+
#include "oidset.h"
2830
#include "tag.h"
2931
#include "reflog-walk.h"
3032
#include "patch-ids.h"
@@ -43,9 +45,11 @@
4345
#include "utf8.h"
4446

4547
#include "commit-reach.h"
48+
#include "promisor-remote.h"
4649
#include "range-diff.h"
4750
#include "tmp-objdir.h"
4851
#include "tree.h"
52+
#include "userdiff.h"
4953
#include "write-or-die.h"
5054

5155
#define MAIL_DEFAULT_WRAP 72
@@ -2602,6 +2606,125 @@ static void print_commit(char sign, struct commit *commit, int verbose,
26022606
}
26032607
}
26042608

2609+
/*
2610+
* Enumerate blob OIDs from a single commit's diff, inserting them into blobs.
2611+
* Skips files whose userdiff driver explicitly declares binary status
2612+
* (drv->binary > 0), since patch-ID uses oid_to_hex() for those and
2613+
* never reads blob content. Use userdiff_find_by_path() since
2614+
* diff_filespec_load_driver() is static in diff.c.
2615+
*
2616+
* Clean up with diff_queue_clear() (from diffcore.h).
2617+
*/
2618+
static void collect_diff_blob_oids(struct commit *commit,
2619+
struct diff_options *opts,
2620+
struct oidset *blobs)
2621+
{
2622+
struct diff_queue_struct *q;
2623+
2624+
/*
2625+
* Merge commits are filtered out by patch_id_defined() in patch-ids.c,
2626+
* so we'll never be called with one.
2627+
*/
2628+
assert(!commit->parents || !commit->parents->next);
2629+
2630+
if (commit->parents)
2631+
diff_tree_oid(&commit->parents->item->object.oid,
2632+
&commit->object.oid, "", opts);
2633+
else
2634+
diff_root_tree_oid(&commit->object.oid, "", opts);
2635+
diffcore_std(opts);
2636+
2637+
q = &diff_queued_diff;
2638+
for (int i = 0; i < q->nr; i++) {
2639+
struct diff_filepair *p = q->queue[i];
2640+
struct userdiff_driver *drv;
2641+
2642+
/* Skip binary files */
2643+
drv = userdiff_find_by_path(opts->repo->index, p->one->path);
2644+
if (drv && drv->binary > 0)
2645+
continue;
2646+
2647+
if (DIFF_FILE_VALID(p->one))
2648+
oidset_insert(blobs, &p->one->oid);
2649+
if (DIFF_FILE_VALID(p->two))
2650+
oidset_insert(blobs, &p->two->oid);
2651+
}
2652+
diff_queue_clear(q);
2653+
}
2654+
2655+
static int always_match(const void *cmp_data UNUSED,
2656+
const struct hashmap_entry *entry1 UNUSED,
2657+
const struct hashmap_entry *entry2 UNUSED,
2658+
const void *keydata UNUSED)
2659+
{
2660+
return 0;
2661+
}
2662+
2663+
/*
2664+
* Prefetch blobs for git cherry in partial clones.
2665+
*
2666+
* Called between the revision walk (which builds the head-side
2667+
* commit list) and the has_commit_patch_id() comparison loop.
2668+
*
2669+
* Uses a cmpfn-swap trick to avoid reading blobs: temporarily
2670+
* replaces the hashmap's comparison function with a trivial
2671+
* always-match function, so hashmap_get()/hashmap_get_next() match
2672+
* any entry with the same oidhash bucket. These are the set of oids
2673+
* that would trigger patch_id_neq() during normal lookup and cause
2674+
* blobs to be read on demand, and we want to prefetch them all at
2675+
* once instead.
2676+
*/
2677+
static void prefetch_cherry_blobs(struct repository *repo,
2678+
struct commit_list *list,
2679+
struct patch_ids *ids)
2680+
{
2681+
struct oidset blobs = OIDSET_INIT;
2682+
hashmap_cmp_fn original_cmpfn;
2683+
2684+
/* Exit if we're not in a partial clone */
2685+
if (!repo_has_promisor_remote(repo))
2686+
return;
2687+
2688+
/* Save original cmpfn, replace with always_match */
2689+
original_cmpfn = ids->patches.cmpfn;
2690+
ids->patches.cmpfn = always_match;
2691+
2692+
/* Find header-only collisions, gather blobs from those commits */
2693+
for (struct commit_list *l = list; l; l = l->next) {
2694+
struct commit *c = l->item;
2695+
bool match_found = false;
2696+
for (struct patch_id *cur = patch_id_iter_first(c, ids);
2697+
cur;
2698+
cur = patch_id_iter_next(cur, ids)) {
2699+
match_found = true;
2700+
collect_diff_blob_oids(cur->commit, &ids->diffopts,
2701+
&blobs);
2702+
}
2703+
if (match_found)
2704+
collect_diff_blob_oids(c, &ids->diffopts, &blobs);
2705+
}
2706+
2707+
/* Restore original cmpfn */
2708+
ids->patches.cmpfn = original_cmpfn;
2709+
2710+
/* If we have any blobs to fetch, fetch them */
2711+
if (oidset_size(&blobs)) {
2712+
struct oid_array to_fetch = OID_ARRAY_INIT;
2713+
struct oidset_iter iter;
2714+
const struct object_id *oid;
2715+
2716+
oidset_iter_init(&blobs, &iter);
2717+
while ((oid = oidset_iter_next(&iter)))
2718+
oid_array_append(&to_fetch, oid);
2719+
2720+
promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);
2721+
2722+
oid_array_clear(&to_fetch);
2723+
}
2724+
2725+
oidset_clear(&blobs);
2726+
}
2727+
26052728
int cmd_cherry(int argc,
26062729
const char **argv,
26072730
const char *prefix,
@@ -2673,6 +2796,8 @@ int cmd_cherry(int argc,
26732796
commit_list_insert(commit, &list);
26742797
}
26752798

2799+
prefetch_cherry_blobs(the_repository, list, &ids);
2800+
26762801
for (struct commit_list *l = list; l; l = l->next) {
26772802
char sign = '+';
26782803

patch-ids.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ int has_commit_patch_id(struct commit *commit, struct patch_ids *);
3737
* struct patch_id *cur;
3838
* for (cur = patch_id_iter_first(commit, ids);
3939
* cur;
40-
* cur = patch_id_iter_next(cur, ids) {
40+
* cur = patch_id_iter_next(cur, ids)) {
4141
* ... look at cur->commit
4242
* }
4343
*/

t/t3500-cherry.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,22 @@ test_expect_success 'cherry ignores whitespace' '
7878
test_cmp expect actual
7979
'
8080

81+
# Reuse the expect file from the previous test, in a partial clone
82+
test_expect_success 'cherry in partial clone does bulk prefetch' '
83+
test_config uploadpack.allowfilter 1 &&
84+
test_config uploadpack.allowanysha1inwant 1 &&
85+
test_when_finished "rm -rf copy" &&
86+
87+
git clone --bare --filter=blob:none file://"$(pwd)" copy &&
88+
(
89+
cd copy &&
90+
GIT_TRACE2_EVENT="$(pwd)/trace.output" git cherry upstream-with-space feature-without-space >actual &&
91+
test_cmp ../expect actual &&
92+
93+
grep "child_start.*fetch.negotiationAlgorithm" trace.output >fetches &&
94+
test_line_count = 1 fetches &&
95+
test_trace2_data promisor fetch_count 4 <trace.output
96+
)
97+
'
98+
8199
test_done

t/t7810-grep.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1929,4 +1929,39 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
19291929
test_cmp expected actual
19301930
'
19311931

1932+
test_expect_success 'grep of revision in partial clone does bulk prefetch' '
1933+
test_when_finished "rm -rf grep-partial-src grep-partial" &&
1934+
1935+
git init grep-partial-src &&
1936+
(
1937+
cd grep-partial-src &&
1938+
git config uploadpack.allowfilter 1 &&
1939+
git config uploadpack.allowanysha1inwant 1 &&
1940+
echo "needle in haystack" >searchme &&
1941+
echo "no match here" >other &&
1942+
mkdir subdir &&
1943+
echo "needle again" >subdir/deep &&
1944+
git add . &&
1945+
git commit -m "initial"
1946+
) &&
1947+
1948+
git clone --no-checkout --filter=blob:none \
1949+
"file://$(pwd)/grep-partial-src" grep-partial &&
1950+
1951+
# All blobs should be missing after a blobless clone.
1952+
git -C grep-partial rev-list --quiet --objects \
1953+
--missing=print HEAD >missing &&
1954+
test_line_count = 3 missing &&
1955+
1956+
# grep HEAD should batch-prefetch all blobs in one request.
1957+
GIT_TRACE2_EVENT="$(pwd)/grep-trace" \
1958+
git -C grep-partial grep -c "needle" HEAD >result &&
1959+
1960+
# Should find matches in two files.
1961+
test_line_count = 2 result &&
1962+
1963+
# Should have prefetched all 3 objects at once
1964+
test_trace2_data promisor fetch_count 3 <grep-trace
1965+
'
1966+
19321967
test_done

0 commit comments

Comments
 (0)