Skip to content

Commit b247256

Browse files
jltobler
authored and gitster committed
object-file: avoid fd seekback by checking object size upfront
In certain scenarios, Git handles writing blobs that exceed "core.bigFileThreshold" differently by streaming the object directly into a packfile. When there is an active ODB transaction, these blobs are streamed to the same packfile instead of using a separate packfile for each. If "pack.packSizeLimit" is configured and streaming another object causes the packfile to exceed the configured limit, the packfile is truncated back to the previous object and the object write is restarted in a new packfile. This works fine, but requires the fd being read from to save a checkpoint so it becomes possible to rewind the input source via seeking back to a known offset at the beginning. In a subsequent commit, blob streaming is converted to use `struct odb_write_stream` as a more generic input source instead of an fd which doesn't provide a mechanism for rewinding. For this use case though, rewinding the fd is not strictly necessary because the inflated size of the object is known and can be used to approximate whether writing the object would cause the packfile to exceed the configured limit prior to writing anything. These blobs written to the packfile are never deltified thus the size difference between what is written versus the inflated size is due to zlib compression. While this does prevent packfiles from being filled to the potential maximum is some cases, it should be good enough and still prevents the packfile from exceeding any configured limit. Use the inflated blob size to determine whether writing an object to a packfile will exceed the configured "pack.packSizeLimit". Signed-off-by: Justin Tobler <jltobler@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 43eadce commit b247256

File tree

1 file changed

+25
-61
lines changed

1 file changed

+25
-61
lines changed

object-file.c

Lines changed: 25 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,29 +1447,17 @@ static int hash_blob_stream(struct odb_write_stream *stream,
14471447

14481448
/*
14491449
* Read the contents from fd for size bytes, streaming it to the
1450-
* packfile in state while updating the hash in ctx. Signal a failure
1451-
* by returning a negative value when the resulting pack would exceed
1452-
* the pack size limit and this is not the first object in the pack,
1453-
* so that the caller can discard what we wrote from the current pack
1454-
* by truncating it and opening a new one. The caller will then call
1455-
* us again after rewinding the input fd.
1456-
*
1457-
* The already_hashed_to pointer is kept untouched by the caller to
1458-
* make sure we do not hash the same byte when we are called
1459-
* again. This way, the caller does not have to checkpoint its hash
1460-
* status before calling us just in case we ask it to call us again
1461-
* with a new pack.
1450+
* packfile in state while updating the hash in ctx.
14621451
*/
1463-
static int stream_blob_to_pack(struct transaction_packfile *state,
1464-
struct git_hash_ctx *ctx, off_t *already_hashed_to,
1465-
int fd, size_t size, const char *path)
1452+
static void stream_blob_to_pack(struct transaction_packfile *state,
1453+
struct git_hash_ctx *ctx, int fd, size_t size,
1454+
const char *path)
14661455
{
14671456
git_zstream s;
14681457
unsigned char ibuf[16384];
14691458
unsigned char obuf[16384];
14701459
unsigned hdrlen;
14711460
int status = Z_OK;
1472-
off_t offset = 0;
14731461

14741462
git_deflate_init(&s, pack_compression_level);
14751463

@@ -1486,15 +1474,9 @@ static int stream_blob_to_pack(struct transaction_packfile *state,
14861474
if ((size_t)read_result != rsize)
14871475
die("failed to read %u bytes from '%s'",
14881476
(unsigned)rsize, path);
1489-
offset += rsize;
1490-
if (*already_hashed_to < offset) {
1491-
size_t hsize = offset - *already_hashed_to;
1492-
if (rsize < hsize)
1493-
hsize = rsize;
1494-
if (hsize)
1495-
git_hash_update(ctx, ibuf, hsize);
1496-
*already_hashed_to = offset;
1497-
}
1477+
1478+
git_hash_update(ctx, ibuf, rsize);
1479+
14981480
s.next_in = ibuf;
14991481
s.avail_in = rsize;
15001482
size -= rsize;
@@ -1505,14 +1487,6 @@ static int stream_blob_to_pack(struct transaction_packfile *state,
15051487
if (!s.avail_out || status == Z_STREAM_END) {
15061488
size_t written = s.next_out - obuf;
15071489

1508-
/* would we bust the size limit? */
1509-
if (state->nr_written &&
1510-
pack_size_limit_cfg &&
1511-
pack_size_limit_cfg < state->offset + written) {
1512-
git_deflate_abort(&s);
1513-
return -1;
1514-
}
1515-
15161490
hashwrite(state->f, obuf, written);
15171491
state->offset += written;
15181492
s.next_out = obuf;
@@ -1529,7 +1503,6 @@ static int stream_blob_to_pack(struct transaction_packfile *state,
15291503
}
15301504
}
15311505
git_deflate_end(&s);
1532-
return 0;
15331506
}
15341507

15351508
static void flush_packfile_transaction(struct odb_transaction_files *transaction)
@@ -1605,48 +1578,39 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac
16051578
size_t size, const char *path)
16061579
{
16071580
struct transaction_packfile *state = &transaction->packfile;
1608-
off_t seekback, already_hashed_to;
16091581
struct git_hash_ctx ctx;
16101582
unsigned char obuf[16384];
16111583
unsigned header_len;
16121584
struct hashfile_checkpoint checkpoint;
16131585
struct pack_idx_entry *idx;
16141586

1615-
seekback = lseek(fd, 0, SEEK_CUR);
1616-
if (seekback == (off_t)-1)
1617-
return error("cannot find the current offset");
1618-
16191587
header_len = format_object_header((char *)obuf, sizeof(obuf),
16201588
OBJ_BLOB, size);
16211589
transaction->base.source->odb->repo->hash_algo->init_fn(&ctx);
16221590
git_hash_update(&ctx, obuf, header_len);
16231591

1592+
/*
1593+
* If writing another object to the packfile could result in it
1594+
* exceeding the configured size limit, flush the current packfile
1595+
* transaction.
1596+
*
1597+
* Note that this uses the inflated object size as an approximation.
1598+
* Blob objects written in this manner are not delta-compressed, so
1599+
* the difference between the inflated and on-disk size is limited
1600+
* to zlib compression and is sufficient for this check.
1601+
*/
1602+
if (state->nr_written && pack_size_limit_cfg &&
1603+
pack_size_limit_cfg < state->offset + size)
1604+
flush_packfile_transaction(transaction);
1605+
16241606
CALLOC_ARRAY(idx, 1);
16251607
prepare_packfile_transaction(transaction);
16261608
hashfile_checkpoint_init(state->f, &checkpoint);
16271609

1628-
already_hashed_to = 0;
1629-
1630-
while (1) {
1631-
prepare_packfile_transaction(transaction);
1632-
hashfile_checkpoint(state->f, &checkpoint);
1633-
idx->offset = state->offset;
1634-
crc32_begin(state->f);
1635-
1636-
if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
1637-
fd, size, path))
1638-
break;
1639-
/*
1640-
* Writing this object to the current pack will make
1641-
* it too big; we need to truncate it, start a new
1642-
* pack, and write into it.
1643-
*/
1644-
hashfile_truncate(state->f, &checkpoint);
1645-
state->offset = checkpoint.offset;
1646-
flush_packfile_transaction(transaction);
1647-
if (lseek(fd, seekback, SEEK_SET) == (off_t)-1)
1648-
return error("cannot seek back");
1649-
}
1610+
hashfile_checkpoint(state->f, &checkpoint);
1611+
idx->offset = state->offset;
1612+
crc32_begin(state->f);
1613+
stream_blob_to_pack(state, &ctx, fd, size, path);
16501614
git_hash_final_oid(result_oid, &ctx);
16511615

16521616
idx->crc32 = crc32_end(state->f);

0 commit comments

Comments
 (0)