Skip to content

Commit 8e2964d

Browse files
committed
Merge branch 'ps/object-counting'
The logic to count objects has been cleaned up. * ps/object-counting: odb: introduce generic object counting odb/source: introduce generic object counting object-file: generalize counting objects object-file: extract logic to approximate object count packfile: extract logic to count number of objects odb: stop including "odb/source.h"
2 parents 105a22c + 6801ffd commit 8e2964d

17 files changed

+301
-118
lines changed

builtin/gc.c

Lines changed: 15 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -467,37 +467,19 @@ static int rerere_gc_condition(struct gc_config *cfg UNUSED)
467467
static int too_many_loose_objects(int limit)
468468
{
469469
/*
470-
* Quickly check if a "gc" is needed, by estimating how
471-
* many loose objects there are. Because SHA-1 is evenly
472-
* distributed, we can check only one and get a reasonable
473-
* estimate.
470+
* This is weird, but stems from legacy behaviour: the GC auto
471+
* threshold was always essentially interpreted as if it was rounded up
472+
* to the next multiple of 256, so we retain this behaviour for now.
474473
*/
475-
DIR *dir;
476-
struct dirent *ent;
477-
int auto_threshold;
478-
int num_loose = 0;
479-
int needed = 0;
480-
const unsigned hexsz_loose = the_hash_algo->hexsz - 2;
481-
char *path;
482-
483-
path = repo_git_path(the_repository, "objects/17");
484-
dir = opendir(path);
485-
free(path);
486-
if (!dir)
474+
int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
475+
unsigned long loose_count;
476+
477+
if (odb_source_loose_count_objects(the_repository->objects->sources,
478+
ODB_COUNT_OBJECTS_APPROXIMATE,
479+
&loose_count) < 0)
487480
return 0;
488481

489-
auto_threshold = DIV_ROUND_UP(limit, 256);
490-
while ((ent = readdir(dir)) != NULL) {
491-
if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose ||
492-
ent->d_name[hexsz_loose] != '\0')
493-
continue;
494-
if (++num_loose > auto_threshold) {
495-
needed = 1;
496-
break;
497-
}
498-
}
499-
closedir(dir);
500-
return needed;
482+
return loose_count > auto_threshold;
501483
}
502484

503485
static struct packed_git *find_base_packs(struct string_list *packs,
@@ -592,9 +574,13 @@ static uint64_t total_ram(void)
592574
static uint64_t estimate_repack_memory(struct gc_config *cfg,
593575
struct packed_git *pack)
594576
{
595-
unsigned long nr_objects = repo_approximate_object_count(the_repository);
577+
unsigned long nr_objects;
596578
size_t os_cache, heap;
597579

580+
if (odb_count_objects(the_repository->objects,
581+
ODB_COUNT_OBJECTS_APPROXIMATE, &nr_objects) < 0)
582+
return 0;
583+
598584
if (!pack || !nr_objects)
599585
return 0;
600586

builtin/multi-pack-index.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "strbuf.h"
1010
#include "trace2.h"
1111
#include "odb.h"
12+
#include "odb/source.h"
1213
#include "replace-object.h"
1314
#include "repository.h"
1415

builtin/submodule--helper.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "object-file.h"
3030
#include "object-name.h"
3131
#include "odb.h"
32+
#include "odb/source.h"
3233
#include "advice.h"
3334
#include "branch.h"
3435
#include "list-objects-filter-options.h"

commit-graph.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2607,7 +2607,8 @@ int write_commit_graph(struct odb_source *source,
26072607
replace = ctx.opts->split_flags & COMMIT_GRAPH_SPLIT_REPLACE;
26082608
}
26092609

2610-
ctx.approx_nr_objects = repo_approximate_object_count(r);
2610+
if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &ctx.approx_nr_objects) < 0)
2611+
ctx.approx_nr_objects = 0;
26112612

26122613
if (ctx.append && g) {
26132614
for (i = 0; i < g->num_commits; i++) {

object-file.c

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1868,6 +1868,64 @@ int odb_source_loose_for_each_object(struct odb_source *source,
18681868
NULL, NULL, &data);
18691869
}
18701870

1871+
/*
 * Object-iteration callback used for exact loose-object counting: treats
 * `payload` as a pointer to an `unsigned long` counter and increments it once
 * per invocation. The object ID and object info are ignored. Always returns 0.
 */
static int count_loose_object(const struct object_id *oid UNUSED,
1872+
struct object_info *oi UNUSED,
1873+
void *payload)
1874+
{
1875+
unsigned long *count = payload;
1876+
(*count)++;
1877+
return 0;
1878+
}
1879+
1880+
/*
 * Count the loose objects of `source` and store the result in `*out`.
 *
 * With ODB_COUNT_OBJECTS_APPROXIMATE set in `flags`, only the "17" shard
 * directory below `source->path` is scanned and the number of matching entries
 * is multiplied by 256. Without it, the count is obtained by iterating via
 * odb_source_loose_for_each_object().
 *
 * Returns 0 on success and a negative value on failure.
 */
int odb_source_loose_count_objects(struct odb_source *source,
1881+
enum odb_count_objects_flags flags,
1882+
unsigned long *out)
1883+
{
1884+
/*
 * Expected length of a loose object's file name: the hex hash length
 * minus 2 (presumably because the first two hex digits name the shard
 * directory -- confirm against the loose object layout).
 */
const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
1885+
char *path = NULL;
1886+
DIR *dir = NULL;
1887+
int ret;
1888+
1889+
if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) {
1890+
unsigned long count = 0;
1891+
struct dirent *ent;
1892+
1893+
/* Sample a single shard directory, "17", below the source path. */
path = xstrfmt("%s/17", source->path);
1894+
1895+
dir = opendir(path);
1896+
if (!dir) {
1897+
/* A missing shard directory is not an error: report zero objects. */
if (errno == ENOENT) {
1898+
*out = 0;
1899+
ret = 0;
1900+
goto out;
1901+
}
1902+
1903+
/* Any other failure is reported; `*out` is left unassigned. */
ret = error_errno("cannot open object shard '%s'", path);
1904+
goto out;
1905+
}
1906+
1907+
while ((ent = readdir(dir)) != NULL) {
1908+
/* Only count names made up of exactly `hexsz` lowercase hex digits. */
if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
1909+
ent->d_name[hexsz] != '\0')
1910+
continue;
1911+
count++;
1912+
}
1913+
1914+
/* Extrapolate from the one sampled shard by a factor of 256. */
*out = count * 256;
1915+
ret = 0;
1916+
} else {
1917+
/*
 * Exact count: start from zero and let count_loose_object() bump
 * `*out` each time the iteration invokes it.
 */
*out = 0;
1918+
ret = odb_source_loose_for_each_object(source, NULL, count_loose_object,
1919+
out, 0);
1920+
}
1921+
1922+
out:
1923+
/* Shared cleanup: `dir` and `path` are still NULL on the exact-count path. */
if (dir)
1924+
closedir(dir);
1925+
free(path);
1926+
return ret;
1927+
}
1928+
18711929
static int append_loose_object(const struct object_id *oid,
18721930
const char *path UNUSED,
18731931
void *data)

object-file.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,20 @@ int odb_source_loose_for_each_object(struct odb_source *source,
139139
void *cb_data,
140140
unsigned flags);
141141

142+
/*
143+
* Count the number of loose objects in this source, either exactly or, if
* ODB_COUNT_OBJECTS_APPROXIMATE is set in `flags`, approximately.
144+
*
145+
* The object count is approximated by opening a single sharding directory for
146+
* loose objects and scanning its contents. The result is then extrapolated by
147+
* 256. This should generally work as a reasonable estimate given that the
148+
* object hash is supposed to be indistinguishable from random.
149+
*
150+
* Returns 0 on success, a negative error code otherwise.
151+
*/
152+
int odb_source_loose_count_objects(struct odb_source *source,
153+
enum odb_count_objects_flags flags,
154+
unsigned long *out);
155+
142156
/**
143157
* format_object_header() is a thin wrapper around xsnprintf() that
144158
* writes the initial "<type> <obj-len>" part of the loose object

object-name.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -837,7 +837,11 @@ int repo_find_unique_abbrev_r(struct repository *r, char *hex,
837837
const unsigned hexsz = algo->hexsz;
838838

839839
if (len < 0) {
840-
unsigned long count = repo_approximate_object_count(r);
840+
unsigned long count;
841+
842+
if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &count) < 0)
843+
count = 0;
844+
841845
/*
842846
* Add one because the MSB only tells us the highest bit set,
843847
* not including the value of all the _other_ bits (so "15"

odb.c

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,41 @@ int odb_for_each_object(struct object_database *odb,
917917
return 0;
918918
}
919919

920+
/*
 * Count the objects across all sources of `odb` and store the total in `*out`.
 *
 * The total is cached in the object database together with the flags it was
 * computed with, so a later call with the same flags returns the cached value
 * without querying the sources again.
 *
 * Returns 0 on success. If counting any source fails, its negative return
 * value is passed through and neither `*out` nor the cache is updated.
 */
int odb_count_objects(struct object_database *odb,
921+
enum odb_count_objects_flags flags,
922+
unsigned long *out)
923+
{
924+
struct odb_source *source;
925+
unsigned long count = 0;
926+
int ret;
927+
928+
/* Serve the cached total if it is valid and was computed with the same flags. */
if (odb->object_count_valid && odb->object_count_flags == flags) {
929+
*out = odb->object_count;
930+
return 0;
931+
}
932+
933+
odb_prepare_alternates(odb);
934+
/* Sum the per-source counts, bailing out on the first source that fails. */
for (source = odb->sources; source; source = source->next) {
935+
unsigned long c;
936+
937+
ret = odb_source_count_objects(source, flags, &c);
938+
if (ret < 0)
939+
goto out;
940+
941+
count += c;
942+
}
943+
944+
/* Remember the result along with the flags it was computed with. */
odb->object_count = count;
945+
odb->object_count_valid = 1;
946+
odb->object_count_flags = flags;
947+
948+
*out = count;
949+
ret = 0;
950+
951+
out:
952+
return ret;
953+
}
954+
920955
void odb_assert_oid_type(struct object_database *odb,
921956
const struct object_id *oid, enum object_type expect)
922957
{
@@ -1030,7 +1065,7 @@ void odb_reprepare(struct object_database *o)
10301065
for (source = o->sources; source; source = source->next)
10311066
odb_source_reprepare(source);
10321067

1033-
o->approximate_object_count_valid = 0;
1068+
o->object_count_valid = 0;
10341069

10351070
obj_read_unlock();
10361071
}

odb.h

Lines changed: 74 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
#include "hashmap.h"
55
#include "object.h"
6-
#include "odb/source.h"
76
#include "oidset.h"
87
#include "oidmap.h"
98
#include "string-list.h"
@@ -12,6 +11,7 @@
1211
struct oidmap;
1312
struct oidtree;
1413
struct strbuf;
14+
struct strvec;
1515
struct repository;
1616
struct multi_pack_index;
1717

@@ -110,10 +110,11 @@ struct object_database {
110110
/*
111111
* A fast, rough count of the number of objects in the repository.
112112
* These fields are not meant for direct access. Use
113-
* repo_approximate_object_count() instead.
113+
* odb_count_objects() instead.
114114
*/
115-
unsigned long approximate_object_count;
116-
unsigned approximate_object_count_valid : 1;
115+
unsigned long object_count;
116+
unsigned object_count_flags;
117+
unsigned object_count_valid : 1;
117118

118119
/*
119120
* Submodule source paths that will be added as additional sources to
@@ -339,6 +340,42 @@ struct object_info {
339340
*/
340341
#define OBJECT_INFO_INIT { 0 }
341342

343+
/*
 * Flags that can be passed to `odb_read_object_info_extended()`. Each basic
 * flag occupies its own bit, so flags can be combined with bitwise OR.
 */
344+
enum object_info_flags {
345+
/* Invoke lookup_replace_object() on the given hash. */
346+
OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
347+
348+
/* Do not reprepare object sources when the first lookup has failed. */
349+
OBJECT_INFO_QUICK = (1 << 1),
350+
351+
/*
352+
* Do not attempt to fetch the object if missing (even if fetch_is_missing is
353+
* nonzero).
354+
*/
355+
OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
356+
357+
/* Die if object corruption (not just an object being missing) was detected. */
358+
OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
359+
360+
/*
361+
* We have already tried reading the object, but it couldn't be found
362+
* via any of the attached sources, and are now doing a second read.
363+
* This second read asks the individual sources to also evaluate
364+
* whether any on-disk state may have changed that may have caused the
365+
* object to appear.
366+
*
367+
* This flag is for internal use only. The second read only occurs
368+
* when `OBJECT_INFO_QUICK` was not passed.
369+
*/
370+
OBJECT_INFO_SECOND_READ = (1 << 4),
371+
372+
/*
373+
* This is meant for bulk prefetching of missing blobs in a partial
374+
* clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
375+
*/
376+
OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
377+
};
378+
342379
/*
343380
* Read object info from the object database and populate the `object_info`
344381
* structure. Returns 0 on success, a negative error code otherwise.
@@ -432,6 +469,18 @@ enum odb_for_each_object_flags {
432469
ODB_FOR_EACH_OBJECT_SKIP_ON_DISK_KEPT_PACKS = (1<<4),
433470
};
434471

472+
/*
473+
* A callback function that can be used to iterate through objects. If given,
474+
* the optional `oi` parameter will be populated the same as if you would call
475+
* `odb_read_object_info()`.
476+
*
477+
* Returning a non-zero error code will cause iteration to abort. The error
478+
* code will be propagated.
479+
*/
480+
typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
481+
struct object_info *oi,
482+
void *cb_data);
483+
435484
/*
436485
* Iterate through all objects contained in the object database. Note that
437486
* objects may be iterated over multiple times in case they are either stored
@@ -452,6 +501,27 @@ int odb_for_each_object(struct object_database *odb,
452501
void *cb_data,
453502
unsigned flags);
454503

504+
/*
 * Flags that control how objects are counted. They are accepted by
 * odb_count_objects() and odb_source_loose_count_objects().
 */
enum odb_count_objects_flags {
505+
/*
506+
* Instead of providing an accurate count, allow the number of objects
507+
* to be approximated. Details of how this approximation works are
508+
* subject to the specific source's implementation.
509+
*/
510+
ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0),
511+
};
512+
513+
/*
514+
* Count the number of objects in the given object database. This object count
515+
* may double-count objects that are stored in multiple backends, or which are
516+
* stored multiple times in a single backend.
517+
*
518+
* Returns 0 on success, a negative error code otherwise. The number of objects
519+
* will be assigned to the `out` pointer on success.
520+
*/
521+
int odb_count_objects(struct object_database *odb,
522+
enum odb_count_objects_flags flags,
523+
unsigned long *out);
524+
455525
enum {
456526
/*
457527
* By default, `odb_write_object()` does not actually write anything

0 commit comments

Comments
 (0)