Skip to content

Commit e1ba5ae

Browse files
committed
test-tool: add a helper to synthesize large packfiles
To test Git's behavior with very large pack files, we need a way to generate such files quickly. A naive approach using only readily-available Git commands would take over 10 hours for a 4GB pack file, which is prohibitive. Side-stepping Git's machinery and actual zlib compression by writing uncompressed content with the appropriate zlib header makes things much faster. The fastest method using this approach generates many small, unreachable blob objects and takes about 1.5 minutes for 4GB. However, this cannot be used because we need to test git clone, which requires a reachable commit history. Generating many reachable commits with small, uncompressed blobs takes about 4 minutes for 4GB. But this approach 1) does not reproduce the issues we want to fix (which require individual objects larger than 4GB) and 2) is comparatively slow because of the many SHA-1 calculations. The approach taken here generates a single large blob (filled with NUL bytes), along with the trees and commits needed to make it reachable. This takes about 2.5 minutes for 4.5GB, which is the fastest option that produces a valid, clonable repository with an object large enough to trigger the bugs we want to test. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
1 parent e89b0b6 commit e1ba5ae

5 files changed

Lines changed: 299 additions & 0 deletions

File tree

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -875,6 +875,7 @@ TEST_BUILTINS_OBJS += test-submodule-config.o
875875
TEST_BUILTINS_OBJS += test-submodule-nested-repo-config.o
876876
TEST_BUILTINS_OBJS += test-submodule.o
877877
TEST_BUILTINS_OBJS += test-subprocess.o
878+
TEST_BUILTINS_OBJS += test-synthesize.o
878879
TEST_BUILTINS_OBJS += test-trace2.o
879880
TEST_BUILTINS_OBJS += test-truncate.o
880881
TEST_BUILTINS_OBJS += test-userdiff.o

t/helper/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ test_tool_sources = [
7070
'test-submodule-nested-repo-config.c',
7171
'test-submodule.c',
7272
'test-subprocess.c',
73+
'test-synthesize.c',
7374
'test-tool.c',
7475
'test-trace2.c',
7576
'test-truncate.c',

t/helper/test-synthesize.c

Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
#include "test-tool.h"
2+
#include "git-compat-util.h"
3+
#include "hex.h"
4+
#include "object.h"
5+
#include "strbuf.h"
6+
#include "parse-options.h"
7+
#include "hash.h"
8+
#include "git-zlib.h"
9+
10+
/*
11+
* Write a pack object header for the given type and size.
12+
* Returns the number of bytes written to the buffer.
13+
*/
14+
static size_t write_pack_object_header(unsigned char *buf, enum object_type type, size_t size)
15+
{
16+
unsigned char *p = buf;
17+
*p = (type << 4) | (size & 0xf);
18+
size >>= 4;
19+
while (size) {
20+
*p++ |= 0x80;
21+
*p = size & 0x7f;
22+
size >>= 7;
23+
}
24+
p++;
25+
return p - buf;
26+
}
27+
28+
/*
29+
* Write data as an uncompressed zlib stream.
30+
* For data larger than 64KB, writes multiple uncompressed blocks.
31+
* If data is NULL, writes zeros.
32+
* Updates the pack checksum context.
33+
*/
34+
static void write_uncompressed_zlib(FILE *f, struct git_hash_ctx *pack_ctx,
35+
const void *data, size_t len,
36+
const struct git_hash_algo *algo)
37+
{
38+
unsigned char zlib_header[2] = { 0x78, 0x01 }; /* CMF, FLG */
39+
unsigned char block_header[5];
40+
static unsigned char zeros[0xffff];
41+
const unsigned char *p = data;
42+
size_t remaining = len;
43+
uint32_t adler = 1L; /* adler32 initial value */
44+
unsigned char adler_buf[4];
45+
46+
/* Write zlib header */
47+
fwrite(zlib_header, 1, 2, f);
48+
algo->update_fn(pack_ctx, zlib_header, 2);
49+
50+
/* Write uncompressed blocks (max 64KB each) */
51+
do {
52+
size_t block_len = remaining > 0xffff ? 0xffff : remaining;
53+
int is_final = (block_len == remaining);
54+
const unsigned char *block_data = data ? p : zeros;
55+
56+
block_header[0] = is_final ? 0x01 : 0x00;
57+
block_header[1] = block_len & 0xff;
58+
block_header[2] = (block_len >> 8) & 0xff;
59+
block_header[3] = block_header[1] ^ 0xff;
60+
block_header[4] = block_header[2] ^ 0xff;
61+
62+
fwrite(block_header, 1, 5, f);
63+
algo->update_fn(pack_ctx, block_header, 5);
64+
65+
if (block_len) {
66+
fwrite(block_data, 1, block_len, f);
67+
algo->update_fn(pack_ctx, block_data, block_len);
68+
adler = adler32(adler, block_data, block_len);
69+
}
70+
71+
if (data)
72+
p += block_len;
73+
remaining -= block_len;
74+
} while (remaining > 0);
75+
76+
/* Write adler32 checksum */
77+
put_be32(adler_buf, adler);
78+
fwrite(adler_buf, 1, 4, f);
79+
algo->update_fn(pack_ctx, adler_buf, 4);
80+
}
81+
82+
/*
83+
* Write an uncompressed object to the pack file.
84+
* Updates the pack checksum context.
85+
*/
86+
static void write_pack_object(FILE *f, struct git_hash_ctx *pack_ctx,
87+
enum object_type type, const void *data, size_t len,
88+
const struct git_hash_algo *algo)
89+
{
90+
unsigned char header[32];
91+
size_t header_len;
92+
93+
/* Write pack object header */
94+
header_len = write_pack_object_header(header, type, len);
95+
fwrite(header, 1, header_len, f);
96+
algo->update_fn(pack_ctx, header, header_len);
97+
98+
/* Write the data as uncompressed zlib */
99+
write_uncompressed_zlib(f, pack_ctx, data, len, algo);
100+
}
101+
102+
/*
103+
* Compute the object ID for a given object.
104+
*/
105+
static void hash_object(struct object_id *oid, enum object_type type,
106+
const void *data, size_t len,
107+
const struct git_hash_algo *algo)
108+
{
109+
struct git_hash_ctx ctx;
110+
char header[32];
111+
int header_len;
112+
113+
header_len = xsnprintf(header, sizeof(header), "%s %"PRIuMAX,
114+
type_name(type), (uintmax_t)len) + 1;
115+
116+
algo->init_fn(&ctx);
117+
algo->update_fn(&ctx, header, header_len);
118+
algo->update_fn(&ctx, data, len);
119+
algo->final_oid_fn(oid, &ctx);
120+
}
121+
122+
/*
123+
* Compute the object ID for a large object filled with zeros.
124+
*/
125+
static void hash_large_zero_object(struct object_id *oid, enum object_type type,
126+
size_t len, const struct git_hash_algo *algo)
127+
{
128+
struct git_hash_ctx ctx;
129+
char header[32];
130+
int header_len;
131+
static unsigned char zeros[0xffff];
132+
size_t remaining;
133+
134+
header_len = xsnprintf(header, sizeof(header), "%s %"PRIuMAX,
135+
type_name(type), (uintmax_t)len) + 1;
136+
137+
algo->init_fn(&ctx);
138+
algo->update_fn(&ctx, header, header_len);
139+
for (remaining = len; remaining; ) {
140+
size_t chunk = remaining > sizeof(zeros) ? sizeof(zeros) : remaining;
141+
algo->update_fn(&ctx, zeros, chunk);
142+
remaining -= chunk;
143+
}
144+
algo->final_oid_fn(oid, &ctx);
145+
}
146+
147+
/*
148+
* Write a large object (zeros) to the pack file, streaming to avoid
149+
* allocating the entire object in memory.
150+
*/
151+
static void write_large_pack_object(FILE *f, struct git_hash_ctx *pack_ctx,
152+
enum object_type type, size_t len,
153+
struct object_id *oid,
154+
const struct git_hash_algo *algo)
155+
{
156+
unsigned char header[32];
157+
size_t header_len;
158+
159+
/* Compute the object ID */
160+
hash_large_zero_object(oid, type, len, algo);
161+
162+
/* Write pack object header */
163+
header_len = write_pack_object_header(header, type, len);
164+
fwrite(header, 1, header_len, f);
165+
algo->update_fn(pack_ctx, header, header_len);
166+
167+
/* Write the zeros as uncompressed zlib (NULL means zeros) */
168+
write_uncompressed_zlib(f, pack_ctx, NULL, len, algo);
169+
}
170+
171+
/*
172+
* Generate a pack file with a single large (>4GB) reachable object.
173+
*
174+
* Creates:
175+
* 1. A large blob (all NUL bytes)
176+
* 2. A tree containing that blob as "file"
177+
* 3. A commit using that tree
178+
* 4. The empty tree
179+
* 5. A child commit using the empty tree
180+
*
181+
* This is useful for testing that Git can handle objects larger than 4GB.
182+
*/
183+
static int generate_pack_with_large_object(const char *path, size_t blob_size,
184+
const struct git_hash_algo *algo)
185+
{
186+
FILE *f = fopen_for_writing(path);
187+
struct git_hash_ctx pack_ctx;
188+
char header[1024];
189+
struct object_id blob_oid, tree_oid, commit_oid, empty_tree_oid, final_commit_oid;
190+
struct strbuf buf = STRBUF_INIT;
191+
size_t object_count = 5; /* large blob, tree, commit, empty tree, final commit */
192+
193+
algo->init_fn(&pack_ctx);
194+
195+
/* Write pack header */
196+
memcpy(header, "PACK", 4);
197+
put_be32(header + 4, 2);
198+
put_be32(header + 8, object_count);
199+
fwrite(header, 1, 12, f);
200+
algo->update_fn(&pack_ctx, header, 12);
201+
202+
/* 1. Write the large blob */
203+
write_large_pack_object(f, &pack_ctx, OBJ_BLOB, blob_size, &blob_oid, algo);
204+
205+
/* 2. Write tree containing the blob as "file" */
206+
strbuf_addf(&buf, "100644 file%c", '\0');
207+
strbuf_add(&buf, blob_oid.hash, algo->rawsz);
208+
hash_object(&tree_oid, OBJ_TREE, buf.buf, buf.len, algo);
209+
write_pack_object(f, &pack_ctx, OBJ_TREE, buf.buf, buf.len, algo);
210+
211+
/* 3. Write commit using that tree */
212+
strbuf_reset(&buf);
213+
strbuf_addf(&buf,
214+
"tree %s\n"
215+
"author A U Thor <author@example.com> 1234567890 +0000\n"
216+
"committer C O Mitter <committer@example.com> 1234567890 +0000\n"
217+
"\n"
218+
"Large blob commit\n",
219+
oid_to_hex(&tree_oid));
220+
hash_object(&commit_oid, OBJ_COMMIT, buf.buf, buf.len, algo);
221+
write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, algo);
222+
223+
/* 4. Write the empty tree */
224+
hash_object(&empty_tree_oid, OBJ_TREE, "", 0, algo);
225+
write_pack_object(f, &pack_ctx, OBJ_TREE, "", 0, algo);
226+
227+
/* 5. Write final commit using empty tree, with previous commit as parent */
228+
strbuf_reset(&buf);
229+
strbuf_addf(&buf,
230+
"tree %s\n"
231+
"parent %s\n"
232+
"author A U Thor <author@example.com> 1234567890 +0000\n"
233+
"committer C O Mitter <committer@example.com> 1234567890 +0000\n"
234+
"\n"
235+
"Empty tree commit\n",
236+
oid_to_hex(&empty_tree_oid),
237+
oid_to_hex(&commit_oid));
238+
hash_object(&final_commit_oid, OBJ_COMMIT, buf.buf, buf.len, algo);
239+
write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, algo);
240+
241+
/* Write pack trailer (checksum) */
242+
algo->final_fn((unsigned char *)header, &pack_ctx);
243+
fwrite(header, 1, algo->rawsz, f);
244+
245+
fclose(f);
246+
247+
strbuf_release(&buf);
248+
249+
/* Print the final commit OID so caller can set up refs */
250+
printf("%s\n", oid_to_hex(&final_commit_oid));
251+
252+
return 0;
253+
}
254+
255+
static int cmd__synthesize__pack(int argc, const char **argv,
256+
const char *prefix UNUSED,
257+
struct repository *repo UNUSED)
258+
{
259+
const struct git_hash_algo *algo = hash_algos + GIT_HASH_SHA1;
260+
size_t count;
261+
const char *path;
262+
const char * const usage[] = {
263+
"test-tool synthesize pack <count> <filename>",
264+
NULL
265+
};
266+
struct option options[] = {
267+
OPT_END()
268+
};
269+
270+
argc = parse_options(argc, argv, NULL, options, usage,
271+
PARSE_OPT_KEEP_ARGV0);
272+
if (argc != 3)
273+
usage_with_options(usage, options);
274+
275+
count = strtoumax(argv[1], NULL, 10);
276+
path = argv[2];
277+
278+
return !!generate_pack_with_large_object(path, count, algo);
279+
}
280+
281+
int cmd__synthesize(int argc, const char **argv)
282+
{
283+
const char *prefix = NULL;
284+
char const * const synthesize_usage[] = {
285+
"test-tool synthesize pack <options>",
286+
NULL,
287+
};
288+
parse_opt_subcommand_fn *fn = NULL;
289+
struct option options[] = {
290+
OPT_SUBCOMMAND("pack", &fn, cmd__synthesize__pack),
291+
OPT_END()
292+
};
293+
argc = parse_options(argc, argv, prefix, options, synthesize_usage, 0);
294+
return !!fn(argc, argv, prefix, NULL);
295+
}

t/helper/test-tool.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ static struct test_cmd cmds[] = {
8383
{ "submodule-config", cmd__submodule_config },
8484
{ "submodule-nested-repo-config", cmd__submodule_nested_repo_config },
8585
{ "subprocess", cmd__subprocess },
86+
{ "synthesize", cmd__synthesize },
8687
{ "trace2", cmd__trace2 },
8788
{ "truncate", cmd__truncate },
8889
{ "userdiff", cmd__userdiff },

t/helper/test-tool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ int cmd__submodule(int argc, const char **argv);
7676
int cmd__submodule_config(int argc, const char **argv);
7777
int cmd__submodule_nested_repo_config(int argc, const char **argv);
7878
int cmd__subprocess(int argc, const char **argv);
79+
int cmd__synthesize(int argc, const char **argv);
7980
int cmd__trace2(int argc, const char **argv);
8081
int cmd__truncate(int argc, const char **argv);
8182
int cmd__userdiff(int argc, const char **argv);

0 commit comments

Comments
 (0)