Skip to content

Commit a77f908

Browse files
[Hub] Dedupe file entries by xet hash within a shard (#2134)
## Summary - Track xet file hashes already written into the current shard's file-info section and skip duplicates so the shard's file list behaves like a set rather than a list. - The dedup set is scoped to the entire `uploadShards` call so it persists across shard flushes — a duplicate in a later shard is also skipped. - File events are still yielded to callers for every input path, so consumers (e.g. `commit.ts`) keep getting per-path progress and `path -> xet hash` mapping for duplicates. Today the only dedup that happens is sha256-based and lives in `createXorbs` — if a file has no sha256 (or only one of the duplicates does), both entries currently get written into the shard's file list. This change handles that case. ## Test plan - [x] `npx vitest run src/utils/uploadShards.spec.ts` — added a case that feeds three paths through `uploadShards` with the mocked `createXorbs` yielding the same xet hash, and asserts: each path still gets a `file` event, but only one file entry ends up in the resulting shard. <!-- CURSOR_SUMMARY --> --- > [!NOTE] > **Medium Risk** > Changes shard serialization behavior by skipping duplicate file entries based on Xet hash, which could impact upload correctness and downstream consumers if the dedupe scope/ordering assumptions are wrong. > > **Overview** > Makes `uploadShards` treat the shard file-info section like a set by tracking seen Xet file hashes and **skipping writing duplicate file entries** while still yielding `file` events for every input path. > > Adds a new unit test that feeds multiple paths producing the same Xet hash and asserts only one file entry is stored in the uploaded shard, while all paths still receive `file` events. > > <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit 24884a1. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup> <!-- /CURSOR_SUMMARY --> Co-authored-by: Eliott C. <coyotte508@gmail.com>
1 parent ac45210 commit a77f908

2 files changed

Lines changed: 66 additions & 0 deletions

File tree

packages/hub/src/utils/uploadShards.spec.ts

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,17 @@ function toSource(sha256?: string): AsyncGenerator<{ content: Blob; path: string
9191
})();
9292
}
9393

94+
function toMultiSource(paths: string[]): AsyncGenerator<{ content: Blob; path: string; sha256?: string }> {
95+
return (async function* () {
96+
for (const path of paths) {
97+
yield {
98+
content: new Blob(["content"]),
99+
path,
100+
};
101+
}
102+
})();
103+
}
104+
94105
describe("uploadShards", () => {
95106
it("omits metadata flag and metadata section when sha256 is missing", async () => {
96107
const uploadedShards: Uint8Array[] = [];
@@ -167,4 +178,53 @@ describe("uploadShards", () => {
167178
fileEntryLength: FILE_ENTRY_BASE_SIZE + REPRESENTATION_ENTRY_SIZE + VERIFICATION_ENTRY_SIZE + METADATA_ENTRY_SIZE,
168179
});
169180
});
181+
182+
it("dedupes file entries with the same xet hash within a shard", async () => {
183+
const uploadedShards: Uint8Array[] = [];
184+
const fetchMock: typeof fetch = vi.fn(async (input, init) => {
185+
const url = String(input);
186+
187+
if (url.endsWith("/v1/shards")) {
188+
if (!(init?.body instanceof Uint8Array)) {
189+
throw new Error("Expected Uint8Array shard body");
190+
}
191+
uploadedShards.push(new Uint8Array(init.body));
192+
}
193+
194+
return new Response(null, { status: 200 });
195+
});
196+
197+
const fileEvents: Array<{ path: string; xetHash: string }> = [];
198+
for await (const event of uploadShards(toMultiSource(["a.bin", "b.bin", "c.bin"]), {
199+
accessToken: "test-token",
200+
hubUrl: "https://hub.local",
201+
fetch: fetchMock,
202+
repo: { type: "model", name: "user/repo" },
203+
rev: "main",
204+
xetParams: {
205+
casUrl: "https://cas.local",
206+
accessToken: "cas-token",
207+
expiresAt: new Date(Date.now() + 600_000),
208+
refreshWriteTokenUrl: "https://hub.local/xet-write-token",
209+
},
210+
})) {
211+
if (event.event === "file") {
212+
fileEvents.push({ path: event.path, xetHash: event.xetHash });
213+
}
214+
}
215+
216+
// Each path still gets its file event yielded so callers can map path -> hash.
217+
expect(fileEvents).toEqual([
218+
{ path: "a.bin", xetHash: "3".repeat(64) },
219+
{ path: "b.bin", xetHash: "3".repeat(64) },
220+
{ path: "c.bin", xetHash: "3".repeat(64) },
221+
]);
222+
223+
// But only one file entry is written into the shard.
224+
expect(uploadedShards).toHaveLength(1);
225+
expect(readFileEntryInfo(uploadedShards[0])).toEqual({
226+
flags: MDB_FILE_FLAG_WITH_VERIFICATION,
227+
fileEntryLength: FILE_ENTRY_BASE_SIZE + REPRESENTATION_ENTRY_SIZE + VERIFICATION_ENTRY_SIZE,
228+
});
229+
});
170230
});

packages/hub/src/utils/uploadShards.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ export async function* uploadShards(
8787
| { event: "fileProgress"; path: string; progress: number }
8888
> {
8989
const xorbHashes: Array<string> = [];
90+
const seenFileXetHashes = new Set<string>();
9091

9192
const fileInfoSection = new Uint8Array(Math.floor(SHARD_MAX_SIZE - SHARD_HEADER_SIZE - SHARD_FOOTER_SIZE) * 0.25);
9293
const xorbInfoSection = new Uint8Array(Math.floor(SHARD_MAX_SIZE - SHARD_HEADER_SIZE - SHARD_FOOTER_SIZE) * 0.75);
@@ -172,6 +173,11 @@ export async function* uploadShards(
172173
dedupRatio: output.dedupRatio,
173174
}; // Maybe wait until shard is uploaded before yielding.
174175

176+
if (seenFileXetHashes.has(output.hash)) {
177+
break;
178+
}
179+
seenFileXetHashes.add(output.hash);
180+
175181
// Calculate space needed for this file entry
176182
const fileHeaderSize = HASH_LENGTH + 4 + 4 + 8; // hash + flags + rep length + reserved
177183
const representationSize = output.representation.length * (HASH_LENGTH + 4 + 4 + 4 + 4); // per rep: xorb hash + flags + length + offset + endOffset

0 commit comments

Comments
 (0)