Skip to content

Commit c85bcb7

Browse files
committed
midx: implement --max-objects-per-layer
Signed-off-by: Vicent Marti <vmg@strn.cat>
1 parent 1674d48 commit c85bcb7

5 files changed

Lines changed: 478 additions & 2 deletions

File tree

Documentation/git-multi-pack-index.adoc

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ SYNOPSIS
1212
'git multi-pack-index' [<options>] write [--preferred-pack=<pack>]
1313
[--[no-]bitmap] [--[no-]incremental] [--[no-]stdin-packs]
1414
[--refs-snapshot=<path>] [--[no-]write-chain-file]
15-
[--base=<checksum>]
15+
[--base=<checksum>] [--max-objects-per-layer=<n>]
1616
'git multi-pack-index' [<options>] compact [--[no-]incremental]
1717
[--[no-]bitmap] [--base=<checksum>] [--[no-]write-chain-file]
1818
<from> <to>
@@ -99,6 +99,18 @@ marker).
9999
The special value `none` indicates that the new layer
100100
should have no base (i.e., it becomes a root layer).
101101
Requires `--no-write-chain-file`.
102+
103+
--max-objects-per-layer=<n>::
104+
Write the new MIDX as a chain of incremental layers,
105+
where each layer covers at most `<n>` objects counted
106+
across the layer's packs. Implies `--incremental`.
107+
Packs are partitioned largest first, and a single pack
108+
whose object count exceeds `<n>` is written into a layer
109+
of its own. This bounds peak memory when bootstrapping a
110+
MIDX over a large set of packs, for example after restoring
111+
a repository from a snapshot. This option is incompatible
112+
with `--base` and `--no-write-chain-file`, which target
113+
single-layer writes.
102114
--
103115

104116
compact::
@@ -186,6 +198,12 @@ $ git multi-pack-index write --preferred-pack=<pack> --bitmap
186198
$ git multi-pack-index --object-dir <alt> write
187199
-----------------------------------------------
188200
201+
* Write a MIDX chain in bounded-memory batches, with a bitmap for each layer.
202+
+
203+
-------------------------------------------------------------
204+
$ git multi-pack-index write --bitmap --max-objects-per-layer=1000000
205+
-------------------------------------------------------------
206+
189207
* Verify the MIDX file for the packfiles in the current `.git` directory.
190208
+
191209
-----------------------------------------------

builtin/multi-pack-index.c

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
N_("git multi-pack-index [<options>] write [--preferred-pack=<pack>]\n" \
1818
" [--[no-]bitmap] [--[no-]incremental] [--[no-]stdin-packs]\n" \
1919
" [--refs-snapshot=<path>] [--[no-]write-chain-file]\n" \
20-
" [--base=<checksum>]")
20+
" [--base=<checksum>] [--max-objects-per-layer=<n>]")
2121

2222
#define BUILTIN_MIDX_COMPACT_USAGE \
2323
N_("git multi-pack-index [<options>] compact [--[no-]incremental]\n" \
@@ -68,6 +68,7 @@ static struct opts_multi_pack_index {
6868
const char *incremental_base;
6969
char *refs_snapshot;
7070
unsigned long batch_size;
71+
unsigned long max_objects_per_layer;
7172
unsigned flags;
7273
int stdin_packs;
7374
} opts;
@@ -165,6 +166,9 @@ static int cmd_multi_pack_index_write(int argc, const char **argv,
165166
N_("write multi-pack index containing only given indexes")),
166167
OPT_FILENAME(0, "refs-snapshot", &opts.refs_snapshot,
167168
N_("refs snapshot for selecting bitmap commits")),
169+
OPT_UNSIGNED(0, "max-objects-per-layer",
170+
&opts.max_objects_per_layer,
171+
N_("write incremental MIDX layers with at most this many objects each")),
168172
OPT_END(),
169173
};
170174
struct odb_source *source;
@@ -195,6 +199,25 @@ static int cmd_multi_pack_index_write(int argc, const char **argv,
195199
options);
196200
}
197201

202+
if (opts.max_objects_per_layer) {
203+
if (opts.incremental_base) {
204+
error(_("cannot use --max-objects-per-layer with --base"));
205+
usage_with_options(builtin_multi_pack_index_write_usage,
206+
options);
207+
}
208+
if (opts.flags & MIDX_WRITE_NO_CHAIN) {
209+
error(_("cannot use --max-objects-per-layer with --no-write-chain-file"));
210+
usage_with_options(builtin_multi_pack_index_write_usage,
211+
options);
212+
}
213+
if (opts.max_objects_per_layer > UINT32_MAX) {
214+
error(_("--max-objects-per-layer is too large"));
215+
usage_with_options(builtin_multi_pack_index_write_usage,
216+
options);
217+
}
218+
opts.flags |= MIDX_WRITE_INCREMENTAL;
219+
}
220+
198221
if (opts.incremental_base &&
199222
!(opts.flags & MIDX_WRITE_NO_CHAIN)) {
200223
error(_("cannot use --base without --no-write-chain-file"));
@@ -206,6 +229,31 @@ static int cmd_multi_pack_index_write(int argc, const char **argv,
206229

207230
FREE_AND_NULL(options);
208231

232+
if (opts.max_objects_per_layer) {
233+
if (opts.stdin_packs) {
234+
struct string_list packs = STRING_LIST_INIT_DUP;
235+
236+
read_packs_from_stdin(&packs);
237+
238+
ret = write_midx_file_batched(source, &packs,
239+
opts.preferred_pack,
240+
opts.refs_snapshot,
241+
(uint32_t)opts.max_objects_per_layer,
242+
opts.flags);
243+
244+
string_list_clear(&packs, 0);
245+
} else {
246+
ret = write_midx_file_batched(source, NULL,
247+
opts.preferred_pack,
248+
opts.refs_snapshot,
249+
(uint32_t)opts.max_objects_per_layer,
250+
opts.flags);
251+
}
252+
253+
free(opts.refs_snapshot);
254+
return ret;
255+
}
256+
209257
if (opts.stdin_packs) {
210258
struct string_list packs = STRING_LIST_INIT_DUP;
211259

midx-write.c

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "hex.h"
55
#include "lockfile.h"
66
#include "packfile.h"
7+
#include "pack.h"
78
#include "object-file.h"
89
#include "hash-lookup.h"
910
#include "midx.h"
@@ -1251,6 +1252,136 @@ struct write_midx_opts {
12511252
unsigned flags;
12521253
};
12531254

1255+
/*
 * One pack that is a candidate for inclusion in a batched MIDX write.
 */
struct batched_pack {
	/* Heap-allocated copy of the pack's ".idx" file name. */
	char *idx_name;

	/* Object count as read from the pack index's fanout table. */
	uint32_t num_objects;
};
1259+
1260+
/*
 * State threaded through the pack-directory walk that gathers candidate
 * packs for a batched MIDX write.
 */
struct batched_pack_collection {
	/* When non-NULL, packs already contained in this MIDX are skipped. */
	struct multi_pack_index *base_midx;

	/* When non-NULL, only index names present in this list are kept. */
	struct string_list *packs_to_include;

	/* Growable array of collected packs: `nr` used, `alloc` allocated. */
	struct batched_pack *packs;
	size_t nr;
	size_t alloc;

	/* Set to -1 on the first failure; later callbacks then do nothing. */
	int ret;
};
1268+
1269+
static int read_pack_idx_num_objects(const char *idx_path,
1270+
uint32_t *num_objects)
1271+
{
1272+
unsigned char header[8];
1273+
unsigned char fanout[4];
1274+
off_t fanout_offset = 255 * sizeof(uint32_t);
1275+
int fd = git_open(idx_path);
1276+
int ret = 0;
1277+
1278+
if (fd < 0)
1279+
return error_errno(_("could not open pack-index '%s'"), idx_path);
1280+
1281+
if (read_in_full(fd, header, sizeof(header)) != sizeof(header)) {
1282+
ret = error_errno(_("could not read pack-index header '%s'"),
1283+
idx_path);
1284+
goto cleanup;
1285+
}
1286+
1287+
if (get_be32(header) == PACK_IDX_SIGNATURE) {
1288+
uint32_t version = get_be32(header + sizeof(uint32_t));
1289+
if (version != 2) {
1290+
ret = error(_("pack-index '%s' is version %"PRIu32
1291+
" and is not supported"),
1292+
idx_path, version);
1293+
goto cleanup;
1294+
}
1295+
fanout_offset += sizeof(header);
1296+
}
1297+
1298+
if (pread_in_full(fd, fanout, sizeof(fanout), fanout_offset) !=
1299+
sizeof(fanout)) {
1300+
ret = error_errno(_("could not read pack-index fanout '%s'"),
1301+
idx_path);
1302+
goto cleanup;
1303+
}
1304+
1305+
*num_objects = get_be32(fanout);
1306+
1307+
cleanup:
1308+
close(fd);
1309+
return ret;
1310+
}
1311+
1312+
static void collect_candidate_pack(const char *full_path,
1313+
size_t full_path_len UNUSED,
1314+
const char *file_name,
1315+
void *data)
1316+
{
1317+
struct batched_pack_collection *collection = data;
1318+
uint32_t num_objects;
1319+
1320+
if (collection->ret)
1321+
return;
1322+
if (!ends_with(file_name, ".idx"))
1323+
return;
1324+
if (collection->base_midx &&
1325+
midx_contains_pack(collection->base_midx, file_name))
1326+
return;
1327+
if (collection->packs_to_include &&
1328+
!string_list_has_string(collection->packs_to_include, file_name))
1329+
return;
1330+
1331+
if (read_pack_idx_num_objects(full_path, &num_objects) < 0) {
1332+
collection->ret = -1;
1333+
return;
1334+
}
1335+
1336+
ALLOC_GROW(collection->packs, collection->nr + 1, collection->alloc);
1337+
collection->packs[collection->nr].idx_name = xstrdup(file_name);
1338+
collection->packs[collection->nr].num_objects = num_objects;
1339+
collection->nr++;
1340+
}
1341+
1342+
static void clear_batched_packs(struct batched_pack *packs, size_t packs_nr)
1343+
{
1344+
for (size_t i = 0; i < packs_nr; i++)
1345+
free(packs[i].idx_name);
1346+
free(packs);
1347+
}
1348+
1349+
static int collect_candidate_packs(struct odb_source *source,
1350+
struct multi_pack_index *base_midx,
1351+
struct string_list *packs_to_include,
1352+
struct batched_pack **packs,
1353+
size_t *packs_nr)
1354+
{
1355+
struct batched_pack_collection collection = {
1356+
.base_midx = base_midx,
1357+
.packs_to_include = packs_to_include,
1358+
};
1359+
1360+
for_each_file_in_pack_dir(source->path, collect_candidate_pack,
1361+
&collection);
1362+
1363+
if (collection.ret) {
1364+
clear_batched_packs(collection.packs, collection.nr);
1365+
return -1;
1366+
}
1367+
1368+
*packs = collection.packs;
1369+
*packs_nr = collection.nr;
1370+
return 0;
1371+
}
1372+
1373+
static int batched_pack_cmp_objects_desc(const void *va, const void *vb)
1374+
{
1375+
const struct batched_pack *a = va;
1376+
const struct batched_pack *b = vb;
1377+
1378+
if (a->num_objects > b->num_objects)
1379+
return -1;
1380+
if (a->num_objects < b->num_objects)
1381+
return 1;
1382+
return strcmp(a->idx_name, b->idx_name);
1383+
}
1384+
12541385
static int write_midx_internal(struct write_midx_opts *opts)
12551386
{
12561387
struct repository *r = opts->source->odb->repo;
@@ -1881,6 +2012,98 @@ int write_midx_file_only(struct odb_source *source,
18812012
return write_midx_internal(&opts);
18822013
}
18832014

2015+
int write_midx_file_batched(struct odb_source *source,
2016+
struct string_list *packs_to_include,
2017+
const char *preferred_pack_name,
2018+
const char *refs_snapshot,
2019+
uint32_t max_objects_per_layer,
2020+
unsigned flags)
2021+
{
2022+
struct repository *r = source->odb->repo;
2023+
struct batched_pack *candidates = NULL;
2024+
size_t candidates_nr = 0;
2025+
size_t i = 0;
2026+
int result = 0;
2027+
2028+
if (!max_objects_per_layer)
2029+
return error(_("--max-objects-per-layer must be greater than zero"));
2030+
if (flags & MIDX_WRITE_COMPACT)
2031+
return error(_("--max-objects-per-layer is incompatible with compaction"));
2032+
if (flags & MIDX_WRITE_NO_CHAIN)
2033+
return error(_("--max-objects-per-layer is incompatible with --no-write-chain-file"));
2034+
2035+
flags |= MIDX_WRITE_INCREMENTAL;
2036+
2037+
odb_reprepare(r->objects);
2038+
if (collect_candidate_packs(source, get_multi_pack_index(source),
2039+
packs_to_include, &candidates,
2040+
&candidates_nr) < 0)
2041+
return -1;
2042+
if (!candidates_nr)
2043+
goto cleanup;
2044+
2045+
QSORT(candidates, candidates_nr, batched_pack_cmp_objects_desc);
2046+
2047+
while (i < candidates_nr) {
2048+
struct string_list batch = STRING_LIST_INIT_DUP;
2049+
uint64_t batch_objects = 0;
2050+
int batch_has_preferred_pack = 0;
2051+
struct write_midx_opts opts;
2052+
2053+
do {
2054+
struct batched_pack *candidate = &candidates[i];
2055+
2056+
string_list_append(&batch, candidate->idx_name);
2057+
if (preferred_pack_name &&
2058+
!cmp_idx_or_pack_name(preferred_pack_name,
2059+
candidate->idx_name))
2060+
batch_has_preferred_pack = 1;
2061+
batch_objects += candidate->num_objects;
2062+
i++;
2063+
} while (i < candidates_nr &&
2064+
batch_objects + candidates[i].num_objects <=
2065+
max_objects_per_layer);
2066+
2067+
string_list_sort(&batch);
2068+
2069+
memset(&opts, 0, sizeof(opts));
2070+
opts.source = source;
2071+
opts.packs_to_include = &batch;
2072+
opts.preferred_pack_name = batch_has_preferred_pack ?
2073+
preferred_pack_name : NULL;
2074+
opts.refs_snapshot = refs_snapshot;
2075+
opts.flags = flags;
2076+
2077+
trace2_region_enter("midx", "write_midx_batched_step", r);
2078+
trace2_data_intmax("midx", r, "batch:packs",
2079+
(intmax_t)batch.nr);
2080+
trace2_data_intmax("midx", r, "batch:objects",
2081+
(intmax_t)batch_objects);
2082+
2083+
if (write_midx_internal(&opts) < 0)
2084+
result = -1;
2085+
2086+
string_list_clear(&batch, 0);
2087+
2088+
/*
2089+
* Reload the object database so the next in-process write sees
2090+
* the MIDX layer that the previous iteration just linked into
2091+
* the chain file.
2092+
*/
2093+
odb_close(r->objects);
2094+
odb_reprepare(r->objects);
2095+
2096+
trace2_region_leave("midx", "write_midx_batched_step", r);
2097+
2098+
if (result)
2099+
break;
2100+
}
2101+
2102+
cleanup:
2103+
clear_batched_packs(candidates, candidates_nr);
2104+
return result;
2105+
}
2106+
18842107
int write_midx_file_compact(struct odb_source *source,
18852108
struct multi_pack_index *from,
18862109
struct multi_pack_index *to,

midx.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,12 @@ int write_midx_file_only(struct odb_source *source,
138138
const char *refs_snapshot,
139139
const char *incremental_base,
140140
unsigned flags);
141+
int write_midx_file_batched(struct odb_source *source,
142+
struct string_list *packs_to_include,
143+
const char *preferred_pack_name,
144+
const char *refs_snapshot,
145+
uint32_t max_objects_per_layer,
146+
unsigned flags);
141147
int write_midx_file_compact(struct odb_source *source,
142148
struct multi_pack_index *from,
143149
struct multi_pack_index *to,

0 commit comments

Comments
 (0)