Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions builtin/grep.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@
#include "object-file.h"
#include "object-name.h"
#include "odb.h"
#include "oid-array.h"
#include "oidset.h"
#include "packfile.h"
#include "pager.h"
#include "path.h"
#include "promisor-remote.h"
#include "read-cache-ll.h"
#include "write-or-die.h"

Expand Down Expand Up @@ -692,6 +695,143 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
return hit;
}

static void collect_blob_oids_for_tree(struct repository *repo,
const struct pathspec *pathspec,
struct tree_desc *tree,
struct strbuf *base,
int tn_len,
struct oidset *blob_oids)
{
struct name_entry entry;
int old_baselen = base->len;
struct strbuf name = STRBUF_INIT;
enum interesting match = entry_not_interesting;

while (tree_entry(tree, &entry)) {
if (match != all_entries_interesting) {
strbuf_addstr(&name, base->buf + tn_len);
match = tree_entry_interesting(repo->index,
&entry, &name,
pathspec);
strbuf_reset(&name);

if (match == all_entries_not_interesting)
break;
if (match == entry_not_interesting)
continue;
}

strbuf_add(base, entry.path, tree_entry_len(&entry));

if (S_ISREG(entry.mode)) {
oidset_insert(blob_oids, &entry.oid);
} else if (S_ISDIR(entry.mode)) {
enum object_type type;
struct tree_desc sub_tree;
void *data;
unsigned long size;

data = odb_read_object(repo->objects, &entry.oid,
&type, &size);
if (!data)
die(_("unable to read tree (%s)"),
oid_to_hex(&entry.oid));

strbuf_addch(base, '/');
init_tree_desc(&sub_tree, &entry.oid, data, size);
collect_blob_oids_for_tree(repo, pathspec, &sub_tree,
base, tn_len, blob_oids);
free(data);
}
/*
* ...no else clause for S_ISGITLINK: submodules have their
* own promisor configuration and would need separate fetches
* anyway.
*/

strbuf_setlen(base, old_baselen);
}

strbuf_release(&name);
}

static void collect_blob_oids_for_treeish(struct grep_opt *opt,
const struct pathspec *pathspec,
const struct object_id *tree_ish_oid,
const char *name,
struct oidset *blob_oids)
{
struct tree_desc tree;
void *data;
unsigned long size;
struct strbuf base = STRBUF_INIT;
int len;

data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
OBJ_TREE, &size, NULL);

if (!data)
return;

len = name ? strlen(name) : 0;
if (len) {
strbuf_add(&base, name, len);
strbuf_addch(&base, ':');
}
init_tree_desc(&tree, tree_ish_oid, data, size);

collect_blob_oids_for_tree(opt->repo, pathspec, &tree,
&base, base.len, blob_oids);

strbuf_release(&base);
free(data);
}

/*
 * Request, in a single promisor fetch, the blobs that grepping the
 * objects in `list` would otherwise read one at a time.
 *
 * Does nothing unless the repository has a promisor remote.  Each list
 * entry is dereferenced through tags; commits and trees contribute the
 * blobs selected by `pathspec`, every other object type is ignored.
 *
 * NOTE(review): the collected OIDs are not checked for local presence
 * before being requested -- confirm that promisor_remote_get_direct()
 * does not re-download objects that are already available.
 */
static void prefetch_grep_blobs(struct grep_opt *opt,
				const struct pathspec *pathspec,
				const struct object_array *list)
{
	struct oidset blob_oids = OIDSET_INIT;

	/* Exit if we're not in a partial clone */
	if (!repo_has_promisor_remote(opt->repo))
		return;

	/*
	 * For each tree, gather the blobs in it.  The index is unsigned
	 * to match list->nr, as the loop in grep_objects() does.
	 */
	for (unsigned int i = 0; i < list->nr; i++) {
		struct object *real_obj;

		obj_read_lock();
		real_obj = deref_tag(opt->repo, list->objects[i].item,
				     NULL, 0);
		obj_read_unlock();

		if (!real_obj)
			continue;
		if (real_obj->type != OBJ_COMMIT &&
		    real_obj->type != OBJ_TREE)
			continue;

		collect_blob_oids_for_treeish(opt, pathspec,
					      &real_obj->oid,
					      list->objects[i].name,
					      &blob_oids);
	}

	/* Prefetch the blobs we found */
	if (oidset_size(&blob_oids)) {
		struct oid_array to_fetch = OID_ARRAY_INIT;
		struct oidset_iter iter;
		const struct object_id *oid;

		/* The fetch API takes a flat array, so copy the set out. */
		oidset_iter_init(&blob_oids, &iter);
		while ((oid = oidset_iter_next(&iter)))
			oid_array_append(&to_fetch, oid);

		promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);

		oid_array_clear(&to_fetch);
	}
	oidset_clear(&blob_oids);
}

static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
struct object *obj, const char *name, const char *path)
{
Expand Down Expand Up @@ -732,6 +872,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
int hit = 0;
const unsigned int nr = list->nr;

prefetch_grep_blobs(opt, pathspec, list);

for (i = 0; i < nr; i++) {
struct object *real_obj;

Expand Down
125 changes: 125 additions & 0 deletions builtin/log.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@
#include "color.h"
Comment thread
newren marked this conversation as resolved.
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Phillip Wood wrote on the Git mailing list (how to reply to this email):

Hi Elijah

On 18/04/2026 01:32, Elijah Newren via GitGitGadget wrote:
> From: Elijah Newren <newren@gmail.com>
>
> In partial clones, `git cherry` fetches necessary blobs on-demand one
> at a time, which can be very slow.  We would like to prefetch all
> necessary blobs upfront.  To do so, we need to be able to first figure
> out which blobs are needed.

"git rebase" without "--reapply-cherry-picks" suffers from this problem as well as it does the equivalent of "git log --cherry-pick". Is there any way to share prefetch_cherry_blobs() with the cherry-pick detection in revision.c?

Thanks

Phillip

> `git cherry` does its work in a two-phase approach: first computing
> header-only IDs (based on file paths and modes), then falling back to
> full content-based IDs only when header-only IDs collide -- or, more
> accurately, whenever the oidhash() of the header-only object_ids
> collide.
>
> patch-ids.c handles this by creating an ids->patches hashmap that has
> all the data we need, but the problem is that any attempt to query the
> hashmap will invoke the patch_id_neq() function on any colliding objects,
> which causes the on-demand fetching.
>
> Insert a new prefetch_cherry_blobs() function before checking for
> collisions.  Use a temporary replacement on the ids->patches.cmpfn
> in order to enumerate the blobs that would be needed without yet
> fetching them, and then fetch them all at once, then restore the old
> ids->patches.cmpfn.
>
> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
>   builtin/log.c     | 125 ++++++++++++++++++++++++++++++++++++++++++++++
>   t/t3500-cherry.sh |  18 +++++++
>   2 files changed, 143 insertions(+)
> > diff --git a/builtin/log.c b/builtin/log.c
> index 8c0939dd42..df19876be6 100644
> --- a/builtin/log.c
> +++ b/builtin/log.c
> @@ -21,10 +21,12 @@
>   #include "color.h"
>   #include "commit.h"
>   #include "diff.h"
> +#include "diffcore.h"
>   #include "diff-merges.h"
>   #include "revision.h"
>   #include "log-tree.h"
>   #include "oid-array.h"
> +#include "oidset.h"
>   #include "tag.h"
>   #include "reflog-walk.h"
>   #include "patch-ids.h"
> @@ -43,9 +45,11 @@
>   #include "utf8.h"
>   >   #include "commit-reach.h"
> +#include "promisor-remote.h"
>   #include "range-diff.h"
>   #include "tmp-objdir.h"
>   #include "tree.h"
> +#include "userdiff.h"
>   #include "write-or-die.h"
>   >   #define MAIL_DEFAULT_WRAP 72
> @@ -2602,6 +2606,125 @@ static void print_commit(char sign, struct commit *commit, int verbose,
>   	}
>   }
>   > +/*
> + * Enumerate blob OIDs from a single commit's diff, inserting them into blobs.
> + * Skips files whose userdiff driver explicitly declares binary status
> + * (drv->binary > 0), since patch-ID uses oid_to_hex() for those and
> + * never reads blob content.  Use userdiff_find_by_path() since
> + * diff_filespec_load_driver() is static in diff.c.
> + *
> + * Clean up with diff_queue_clear() (from diffcore.h).
> + */
> +static void collect_diff_blob_oids(struct commit *commit,
> +				   struct diff_options *opts,
> +				   struct oidset *blobs)
> +{
> +	struct diff_queue_struct *q;
> +
> +	/*
> +	 * Merge commits are filtered out by patch_id_defined() in patch-ids.c,
> +	 * so we'll never be called with one.
> +	 */
> +	assert(!commit->parents || !commit->parents->next);
> +
> +	if (commit->parents)
> +		diff_tree_oid(&commit->parents->item->object.oid,
> +			      &commit->object.oid, "", opts);
> +	else
> +		diff_root_tree_oid(&commit->object.oid, "", opts);
> +	diffcore_std(opts);
> +
> +	q = &diff_queued_diff;
> +	for (int i = 0; i < q->nr; i++) {
> +		struct diff_filepair *p = q->queue[i];
> +		struct userdiff_driver *drv;
> +
> +		/* Skip binary files */
> +		drv = userdiff_find_by_path(opts->repo->index, p->one->path);
> +		if (drv && drv->binary > 0)
> +			continue;
> +
> +		if (DIFF_FILE_VALID(p->one))
> +			oidset_insert(blobs, &p->one->oid);
> +		if (DIFF_FILE_VALID(p->two))
> +			oidset_insert(blobs, &p->two->oid);
> +	}
> +	diff_queue_clear(q);
> +}
> +
> +static int always_match(const void *cmp_data UNUSED,
> +			const struct hashmap_entry *entry1 UNUSED,
> +			const struct hashmap_entry *entry2 UNUSED,
> +			const void *keydata UNUSED)
> +{
> +	return 0;
> +}
> +
> +/*
> + * Prefetch blobs for git cherry in partial clones.
> + *
> + * Called between the revision walk (which builds the head-side
> + * commit list) and the has_commit_patch_id() comparison loop.
> + *
> + * Uses a cmpfn-swap trick to avoid reading blobs: temporarily
> + * replaces the hashmap's comparison function with a trivial
> + * always-match function, so hashmap_get()/hashmap_get_next() match
> + * any entry with the same oidhash bucket.  These are the set of oids
> + * that would trigger patch_id_neq() during normal lookup and cause
> + * blobs to be read on demand, and we want to prefetch them all at
> + * once instead.
> + */
> +static void prefetch_cherry_blobs(struct repository *repo,
> +				  struct commit_list *list,
> +				  struct patch_ids *ids)
> +{
> +	struct oidset blobs = OIDSET_INIT;
> +	hashmap_cmp_fn original_cmpfn;
> +
> +	/* Exit if we're not in a partial clone */
> +	if (!repo_has_promisor_remote(repo))
> +		return;
> +
> +	/* Save original cmpfn, replace with always_match */
> +	original_cmpfn = ids->patches.cmpfn;
> +	ids->patches.cmpfn = always_match;
> +
> +	/* Find header-only collisions, gather blobs from those commits */
> +	for (struct commit_list *l = list; l; l = l->next) {
> +		struct commit *c = l->item;
> +		bool match_found = false;
> +		for (struct patch_id *cur = patch_id_iter_first(c, ids);
> +		     cur;
> +		     cur = patch_id_iter_next(cur, ids)) {
> +			match_found = true;
> +			collect_diff_blob_oids(cur->commit, &ids->diffopts,
> +					       &blobs);
> +		}
> +		if (match_found)
> +			collect_diff_blob_oids(c, &ids->diffopts, &blobs);
> +	}
> +
> +	/* Restore original cmpfn */
> +	ids->patches.cmpfn = original_cmpfn;
> +
> +	/* If we have any blobs to fetch, fetch them */
> +	if (oidset_size(&blobs)) {
> +		struct oid_array to_fetch = OID_ARRAY_INIT;
> +		struct oidset_iter iter;
> +		const struct object_id *oid;
> +
> +		oidset_iter_init(&blobs, &iter);
> +		while ((oid = oidset_iter_next(&iter)))
> +			oid_array_append(&to_fetch, oid);
> +
> +		promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);
> +
> +		oid_array_clear(&to_fetch);
> +	}
> +
> +	oidset_clear(&blobs);
> +}
> +
>   int cmd_cherry(int argc,
>   	       const char **argv,
>   	       const char *prefix,
> @@ -2673,6 +2796,8 @@ int cmd_cherry(int argc,
>   		commit_list_insert(commit, &list);
>   	}
>   > +	prefetch_cherry_blobs(the_repository, list, &ids);
> +
>   	for (struct commit_list *l = list; l; l = l->next) {
>   		char sign = '+';
>   > diff --git a/t/t3500-cherry.sh b/t/t3500-cherry.sh
> index 78c3eac54b..17507d9a28 100755
> --- a/t/t3500-cherry.sh
> +++ b/t/t3500-cherry.sh
> @@ -78,4 +78,22 @@ test_expect_success 'cherry ignores whitespace' '
>   	test_cmp expect actual
>   '
>   > +# Reuse the expect file from the previous test, in a partial clone
> +test_expect_success 'cherry in partial clone does bulk prefetch' '
> +	test_config uploadpack.allowfilter 1 &&
> +	test_config uploadpack.allowanysha1inwant 1 &&
> +	test_when_finished "rm -rf copy" &&
> +
> +	git clone --bare --filter=blob:none file://"$(pwd)" copy &&
> +	(
> +		cd copy &&
> +		GIT_TRACE2_EVENT="$(pwd)/trace.output" git cherry upstream-with-space feature-without-space >actual &&
> +		test_cmp ../expect actual &&
> +
> +		grep "child_start.*fetch.negotiationAlgorithm" trace.output >fetches &&
> +		test_line_count = 1 fetches &&
> +		test_trace2_data promisor fetch_count 4 <trace.output
> +	)
> +'
> +
>   test_done

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Elijah Newren wrote on the Git mailing list (how to reply to this email):

Hi Phillip,

On Sun, Apr 19, 2026 at 7:04 AM Phillip Wood <phillip.wood123@gmail.com> wrote:
>
> Hi Elijah
>
> On 18/04/2026 01:32, Elijah Newren via GitGitGadget wrote:
> > From: Elijah Newren <newren@gmail.com>
> >
> > In partial clones, `git cherry` fetches necessary blobs on-demand one
> > at a time, which can be very slow.  We would like to prefetch all
> > necessary blobs upfront.  To do so, we need to be able to first figure
> > out which blobs are needed.
>
> "git rebase" without "--reapply-cherry-picks" suffers from this problem
> as well as it does the equivalent of "git log --cherry-pick". Is there
> any way to share prefetch_cherry_blobs() with the cherry-pick detection
> in revision.c?

Yes, you're right; git rebase without --reapply-cherry-picks and git
log --cherry-pick both go through cherry_pick_list() in revision.c,
which has exactly the same shape as the patch-ids loop in
cmd_cherry(): build a hashmap of one side via add_commit_patch_id(),
then look up the other side via patch_id_iter_first(). The on-demand
blob fetches come from the same patch_id_neq() callback.

After poking around, I think the approximate scope of the fix would
be: Move collect_diff_blob_oids(), always_match(), and
prefetch_cherry_blobs() from builtin/log.c to patch-ids.c and expose
the last one in patch-ids.h. In cherry_pick_list(), between the
add_commit_patch_id loop and the comparison loop, build a temporary
list of just the lookup-side commits (filtering by
SYMMETRIC_LEFT/BOUNDARY as the existing loop already does) and call
prefetch_cherry_blobs() on it.

That said, I'd rather leave this out of the current series. The bigger
picture is that I have reservations about expanding partial-clone
support further into this area. git cherry, git log --cherry-pick, and
the default cherry-pick detection in git rebase all exist to answer
"has this patch already landed upstream?" -- a question that, in
repositories large enough to need partial clones, I feel is rarely
worth the cost of computing patch-ids across arbitrary amounts of
history. The honest guidance I would probably give for users on a
large repo is "pass --reapply-cherry-picks (with rebase) and skip this
entirely" or to narrow the range under consideration.  The omission of
a --no-reapply-cherry-picks option in git-replay wasn't a lack of
effort or oversight, but a deliberate choice where I'd rather hold off
(possibly indefinitely) on implementing it.  So I'm a bit reluctant to
make the performance hazard less visible without also asking whether
we should even be doing that piece of the operation.

I only implemented the git cherry fix because of a specific customer
situation where the operation was already baked into tooling, and
prefetching at least makes the worst case tolerable. I don't want to
hold myself to doing the same for the cherry_pick_list() path, but I'm
fairly confident the code here can be re-used for those other cases
and I'd help review a patch from anyone who wants to carry it forward.

Anyway, you are making the right connection, it's just that my
personal answer is to let some other interested individual do it.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Phillip Wood wrote on the Git mailing list (how to reply to this email):

Hi Elijah

On 21/04/2026 22:28, Elijah Newren wrote:
> On Sun, Apr 19, 2026 at 7:04 AM Phillip Wood <phillip.wood123@gmail.com> wrote:
>> On 18/04/2026 01:32, Elijah Newren via GitGitGadget wrote:
>>> From: Elijah Newren <newren@gmail.com>
>>>
>>> In partial clones, `git cherry` fetches necessary blobs on-demand one
>>> at a time, which can be very slow.  We would like to prefetch all
>>> necessary blobs upfront.  To do so, we need to be able to first figure
>>> out which blobs are needed.
>>
>> "git rebase" without "--reapply-cherry-picks" suffers from this problem
>> as well as it does the equivalent of "git log --cherry-pick". Is there
>> any way to share prefetch_cherry_blobs() with the cherry-pick detection
>> in revision.c?
>
> Yes, you're right; git rebase without --reapply-cherry-picks and git
> log --cherry-pick both go through cherry_pick_list() in revision.c,
> which has exactly the same shape as the patch-ids loop in
> cmd_cherry(): build a hashmap of one side via add_commit_patch_id(),
> then look up the other side via patch_id_iter_first(). The on-demand
> blob fetches come from the same patch_id_neq() callback.
>
> After poking around, I think the approximate scope of the fix would
> be: Move collect_diff_blob_oids(), always_match(), and
> prefetch_cherry_blobs() from builtin/log.c to patch-ids.c and expose
> the last one in patch-ids.h. In cherry_pick_list(), between the
> add_commit_patch_id loop and the comparison loop, build a temporary
> list of just the lookup-side commits (filtering by
> SYMMETRIC_LEFT/BOUNDARY as the existing loop already does) and call
> prefetch_cherry_blobs() on it.

Thanks for taking a look

> That said, I'd rather leave this out of the current series. The bigger
> picture is that I have reservations about expanding partial-clone
> support further into this area. git cherry, git log --cherry-pick, and
> the default cherry-pick detection in git rebase all exist to answer
> "has this patch already landed upstream?" -- a question that, in
> repositories large enough to need partial clones, I feel is rarely
> worth the cost of computing patch-ids across arbitrary amounts of
> history. The honest guidance I would probably give for users on a
> large repo is "pass --reapply-cherry-picks (with rebase) and skip this
> entirely" or to narrow the range under consideration.

"--reapply-cherry-picks --empty=drop" is certainly more efficient. When we're computing patch ids do we do it for every upstream commit or just the ones that modify the set of paths that are modified in the branch we're rebasing?

It is a shame that we don't have a config setting for "--reapply-cherry-picks" as it is easy to forget to pass that option. Unfortunately it is not supported by the apply backend which makes such a setting potentially confusing.

>  The omission of
> a --no-reapply-cherry-picks option in git-replay wasn't a lack of
> effort or oversight, but a deliberate choice where I'd rather hold off
> (possibly indefinitely) on implementing it.  So I'm a bit reluctant to
> make the performance hazard less visible without also asking whether
> we should even be doing that piece of the operation.
>
> I only implemented the git cherry fix because of a specific customer
> situation where the operation was already baked into tooling, and
> prefetching at least makes the worst case tolerable.

I'm a bit surprised customers aren't complaining about tools that use "git rebase" being slow.

> I don't want to
> hold myself to doing the same for the cherry_pick_list() path, but I'm
> fairly confident the code here can be re-used for those other cases
> and I'd help review a patch from anyone who wants to carry it forward.
>
> Anyway, you are making the right connection, it's just that my
> personal answer is to let some other interested individual do it.

Fair enough

Thanks

Phillip

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Elijah Newren wrote on the Git mailing list (how to reply to this email):

Hi Phillip,

On Thu, Apr 23, 2026 at 8:15 AM Phillip Wood <phillip.wood123@gmail.com> wrote:
> On 21/04/2026 22:28, Elijah Newren wrote:
> > On Sun, Apr 19, 2026 at 7:04 AM Phillip Wood <phillip.wood123@gmail.com> wrote:
> >> On 18/04/2026 01:32, Elijah Newren via GitGitGadget wrote:

> "--reapply-cherry-picks --empty=drop" is certainly more efficient. When
> we're computing patch ids do we do it for every upstream commit or just
> the ones that modify the set of paths that are modified in the branch
> we're rebasing?

You are correct that the patch id computations won't look at file
contents of commits unless they modify the same set of files as one of
the commits in our topic branch, but in order to determine the set of
commits which modify the same paths as commits in the branch we're
rebasing, we have to walk the upstream commits and do a tree-diff for
every one of them.  Yes, commits and trees tend to be much smaller
than blobs, but the number of trees/commits we have to look at may be
far larger than the number of blobs.  The biggest repositories are
constantly pushing so many commits that they are at a size where even
a merge-base operation can start to feel expensive.

> It is a shame that we don't have a config setting for
> "--reapply-cherry-picks" as it is easy to forget to pass that option.
> Unfortunately it is not supported by the apply backend which makes such
> a setting potentially confusing.

Indeed.

> >  The omission of
> > a --no-reapply-cherry-picks option in git-replay wasn't a lack of
> > effort or oversight, but a deliberate choice where I'd rather hold off
> > (possibly indefinitely) on implementing it.  So I'm a bit reluctant to
> > make the performance hazard less visible without also asking whether
> > we should even be doing that piece of the operation.
> >
> > I only implemented the git cherry fix because of a specific customer
> > situation where the operation was already baked into tooling, and
> > prefetching at least makes the worst case tolerable.
>
> I'm a bit surprised customers aren't complaining about tools that use
> "git rebase" being slow.

Are you sure they aren't complaining?

The merging parts of a rebase operation do have batch prefetching
already (up to 3 batches per commit; done that way to minimize the
number of objects downloaded because sometimes 2 or more of those
batches can be skipped entirely and trying to combine them into a
single batch would only be doable by downloading far more than
needed).  But, as you're alluding to, the --no-reapply-cherry-picks
part does not.

I'll note that GitHub tends to focus far more on the server side; it's
just that in this particular case with a special customer, they had me
dig a little closer to their client side operations.  In their case,
they were using git-replay rather than git-rebase, so they'd have no
reason to complain about rebase.  git-replay shares the same batch
prefetching for merge operations that rebase has, and doesn't have a
--no-reapply-cherry-picks behavior that can even be selected.
Honestly, I think the main reason this customer was also using
git-cherry was because I didn't get the drop-commits-that-become-empty
logic in the early versions of git-replay.  You added that to
git-replay (thanks again!), but after they had already built their
tooling.  This is only a guess on my part; they may have other reasons
for actively wanting git-cherry, but I think it might be worthwhile
for me to ask them if they can upgrade git versions (to get your fixes
for empty commits in replay) and then drop the calls to git-cherry.
However, I didn't want it to sound like I was pushing them to change
their workflows at my convenience, and hence this patch so that things
can be fast even if they keep the git-cherry in there.

> > I don't want to
> > hold myself to doing the same for the cherry_pick_list() path, but I'm
> > fairly confident the code here can be re-used for those other cases
> > and I'd help review a patch from anyone who wants to carry it forward.
> >
> > Anyway, you are making the right connection, it's just that my
> > personal answer is to let some other interested individual do it.
>
> Fair enough

Thanks for taking a look and asking interesting questions.

Elijah

#include "commit.h"
#include "diff.h"
#include "diffcore.h"
#include "diff-merges.h"
#include "revision.h"
#include "log-tree.h"
#include "oid-array.h"
#include "oidset.h"
#include "tag.h"
#include "reflog-walk.h"
#include "patch-ids.h"
Expand All @@ -43,9 +45,11 @@
#include "utf8.h"

#include "commit-reach.h"
#include "promisor-remote.h"
#include "range-diff.h"
#include "tmp-objdir.h"
#include "tree.h"
#include "userdiff.h"
#include "write-or-die.h"

#define MAIL_DEFAULT_WRAP 72
Expand Down Expand Up @@ -2602,6 +2606,125 @@ static void print_commit(char sign, struct commit *commit, int verbose,
}
}

/*
* Enumerate blob OIDs from a single commit's diff, inserting them into blobs.
* Skips files whose userdiff driver explicitly declares binary status
* (drv->binary > 0), since patch-ID uses oid_to_hex() for those and
* never reads blob content. Use userdiff_find_by_path() since
* diff_filespec_load_driver() is static in diff.c.
*
* Clean up with diff_queue_clear() (from diffcore.h).
*/
static void collect_diff_blob_oids(struct commit *commit,
struct diff_options *opts,
struct oidset *blobs)
{
struct diff_queue_struct *q;

/*
* Merge commits are filtered out by patch_id_defined() in patch-ids.c,
* so we'll never be called with one.
*/
assert(!commit->parents || !commit->parents->next);

if (commit->parents)
diff_tree_oid(&commit->parents->item->object.oid,
&commit->object.oid, "", opts);
else
diff_root_tree_oid(&commit->object.oid, "", opts);
diffcore_std(opts);

q = &diff_queued_diff;
for (int i = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
struct userdiff_driver *drv;

/* Skip binary files */
drv = userdiff_find_by_path(opts->repo->index, p->one->path);
if (drv && drv->binary > 0)
continue;

if (DIFF_FILE_VALID(p->one))
oidset_insert(blobs, &p->one->oid);
if (DIFF_FILE_VALID(p->two))
oidset_insert(blobs, &p->two->oid);
}
diff_queue_clear(q);
}

/*
 * hashmap comparison callback that ignores all of its arguments and
 * unconditionally reports a match (returns 0), so that a lookup yields
 * every entry sharing the key's hash.
 */
static int always_match(const void *cmp_data UNUSED,
			const struct hashmap_entry *entry1 UNUSED,
			const struct hashmap_entry *entry2 UNUSED,
			const void *keydata UNUSED)
{
	enum { ENTRIES_MATCH = 0 };

	return ENTRIES_MATCH;
}

/*
* Prefetch blobs for git cherry in partial clones.
*
* Called between the revision walk (which builds the head-side
* commit list) and the has_commit_patch_id() comparison loop.
*
* Uses a cmpfn-swap trick to avoid reading blobs: temporarily
* replaces the hashmap's comparison function with a trivial
* always-match function, so hashmap_get()/hashmap_get_next() match
* any entry with the same oidhash bucket. These are the set of oids
* that would trigger patch_id_neq() during normal lookup and cause
* blobs to be read on demand, and we want to prefetch them all at
* once instead.
*/
static void prefetch_cherry_blobs(struct repository *repo,
struct commit_list *list,
struct patch_ids *ids)
{
struct oidset blobs = OIDSET_INIT;
hashmap_cmp_fn original_cmpfn;

/* Exit if we're not in a partial clone */
if (!repo_has_promisor_remote(repo))
return;

/* Save original cmpfn, replace with always_match */
original_cmpfn = ids->patches.cmpfn;
ids->patches.cmpfn = always_match;

/* Find header-only collisions, gather blobs from those commits */
for (struct commit_list *l = list; l; l = l->next) {
struct commit *c = l->item;
bool match_found = false;
for (struct patch_id *cur = patch_id_iter_first(c, ids);
cur;
cur = patch_id_iter_next(cur, ids)) {
match_found = true;
collect_diff_blob_oids(cur->commit, &ids->diffopts,
&blobs);
}
if (match_found)
collect_diff_blob_oids(c, &ids->diffopts, &blobs);
}

/* Restore original cmpfn */
ids->patches.cmpfn = original_cmpfn;

/* If we have any blobs to fetch, fetch them */
if (oidset_size(&blobs)) {
struct oid_array to_fetch = OID_ARRAY_INIT;
struct oidset_iter iter;
const struct object_id *oid;

oidset_iter_init(&blobs, &iter);
while ((oid = oidset_iter_next(&iter)))
oid_array_append(&to_fetch, oid);

promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);

oid_array_clear(&to_fetch);
}

oidset_clear(&blobs);
}

int cmd_cherry(int argc,
const char **argv,
const char *prefix,
Expand Down Expand Up @@ -2673,6 +2796,8 @@ int cmd_cherry(int argc,
commit_list_insert(commit, &list);
}

prefetch_cherry_blobs(the_repository, list, &ids);

for (struct commit_list *l = list; l; l = l->next) {
char sign = '+';

Expand Down
2 changes: 1 addition & 1 deletion patch-ids.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ int has_commit_patch_id(struct commit *commit, struct patch_ids *);
* struct patch_id *cur;
* for (cur = patch_id_iter_first(commit, ids);
* cur;
* cur = patch_id_iter_next(cur, ids) {
* cur = patch_id_iter_next(cur, ids)) {
* ... look at cur->commit
* }
*/
Expand Down
18 changes: 18 additions & 0 deletions t/t3500-cherry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,22 @@ test_expect_success 'cherry ignores whitespace' '
test_cmp expect actual
'

# Reuse the expect file from the previous test, in a partial clone.
#
# The current repository is cloned as a bare --filter=blob:none partial
# clone, and "git cherry" is run inside it with trace2 event output
# enabled.  The test then checks that:
#   - the output equals the expect file of the previous test,
#   - exactly one trace line matches the fetch child_start pattern,
#   - the traced promisor fetch_count is 4.
test_expect_success 'cherry in partial clone does bulk prefetch' '
test_config uploadpack.allowfilter 1 &&
test_config uploadpack.allowanysha1inwant 1 &&
test_when_finished "rm -rf copy" &&

git clone --bare --filter=blob:none file://"$(pwd)" copy &&
(
cd copy &&
GIT_TRACE2_EVENT="$(pwd)/trace.output" git cherry upstream-with-space feature-without-space >actual &&
test_cmp ../expect actual &&

grep "child_start.*fetch.negotiationAlgorithm" trace.output >fetches &&
test_line_count = 1 fetches &&
test_trace2_data promisor fetch_count 4 <trace.output
)
'

test_done
35 changes: 35 additions & 0 deletions t/t7810-grep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1929,4 +1929,39 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
test_cmp expected actual
'

# Build a small source repository with three files (two at the top level,
# one in a subdirectory), two of which contain "needle", and clone it with
# --no-checkout --filter=blob:none.  After confirming that three objects
# are reported missing, run "git grep -c" on HEAD with trace2 event output
# enabled and check that two files match and that the traced promisor
# fetch_count is 3.
test_expect_success 'grep of revision in partial clone does bulk prefetch' '
test_when_finished "rm -rf grep-partial-src grep-partial" &&

git init grep-partial-src &&
(
cd grep-partial-src &&
git config uploadpack.allowfilter 1 &&
git config uploadpack.allowanysha1inwant 1 &&
echo "needle in haystack" >searchme &&
echo "no match here" >other &&
mkdir subdir &&
echo "needle again" >subdir/deep &&
git add . &&
git commit -m "initial"
) &&

git clone --no-checkout --filter=blob:none \
"file://$(pwd)/grep-partial-src" grep-partial &&

# All blobs should be missing after a blobless clone.
git -C grep-partial rev-list --quiet --objects \
--missing=print HEAD >missing &&
test_line_count = 3 missing &&

# grep HEAD should batch-prefetch all blobs in one request.
GIT_TRACE2_EVENT="$(pwd)/grep-trace" \
git -C grep-partial grep -c "needle" HEAD >result &&

# Should find matches in two files.
test_line_count = 2 result &&

# Should have prefetched all 3 objects at once
test_trace2_data promisor fetch_count 3 <grep-trace
'

test_done
Loading