docs

bmiddha · bmiddha · commit aa4839b57621 · 2025-09-26T16:59:18.000-07:00
diff --git a/apps/zipsync/README.md b/apps/zipsync/README.md
@@ -1,22 +1,48 @@
 # @rushstack/zipsync
 
-zipsync is a tool to pack and unpack zip archives. It is designed as a single-purpose tool to pack and unpack build cache entries.
+zipsync is a focused tool for packing and unpacking build cache entries using a constrained subset of the ZIP format for high performance. It optimizes the common scenario where most files already exist in the target location and are unchanged.
 
-## Implementation
+## Goals & Rationale
 
-### Unpack
+- **Optimize partial unpack**: Most builds reuse the majority of previously produced outputs. Skipping rewrites preserves filesystem and page cache state.
+- **Only write when needed**: Files that already exist and are unchanged are not rewritten, which means fewer syscalls.
+- **Integrated cleanup**: Removes the need for a separate `rm -rf` pass; extra files and empty directories are pruned automatically.
+- **ZIP subset**: Archives remain valid ZIP files, so they stay compatible with malware scanners and other standard ZIP tooling.
+- **Fast inspection**: The central directory can be enumerated without inflating the entire archive (unlike tar+gzip).
 
-- Read the zip central directory record at the end of the zip file and enumerate zip entries
-- Parse the zipsync metadata file in the archive. This contains the SHA-1 hashes of the files
-- Enumerate the target directories, cleanup any files or folders that aren't in the archive
-- If a file exists with matching size + SHA‑1, skip writing; else unpack it
+## How It Works
 
-### Pack
+### Pack Flow
 
-- Enumerate the target directories.
-- For each file compute a SHA-1 hash for the zipsync metadata file, and the CRC32 (required by zip format), then compress it if needed. Write the headers and file contents to the zip archive.
-- Write the metadata file to the zip archive and the zip central directory record.
+```
+for each file F
+  write LocalFileHeader(F)
+  stream chunks:
+    read -> hash + crc + maybe compress -> write
+  finalize compressor
+  write DataDescriptor(F)
+add metadata entry (same pattern)
+write central directory records
+```
 
-## Constraints
+### Unpack Flow
 
-Though archives created by zipsync can be used by other zip compatible programs, the opposite is not the case. zipsync only implements a subset of zip features to achieve greater performance.
+```
+load archive -> parse central dir -> read metadata
+scan filesystem & delete extraneous entries
+for each entry (except metadata):
+  if unchanged (sha1 matches) => skip
+  else extract (decompress if needed)
+```
+
+## Why ZIP (vs tar + gzip)
+
+Pros for this scenario:
+
+- Central directory enables cheap listing without decompressing entire payload.
+- Widely understood / tooling-friendly (system explorers, scanners, CI tooling).
+- Per-file compression keeps selective unpack simple (no need to inflate all bytes).
+
+Trade-offs:
+
+- Tar+gzip can exploit cross-file redundancy for better compressed size in datasets with many similar files.
diff --git a/apps/zipsync/src/pack.ts b/apps/zipsync/src/pack.ts
@@ -33,20 +33,31 @@ import {
   METADATA_FILENAME
 } from './zipSyncUtils';
 
+/**
+ * File extensions for which additional DEFLATE/ZSTD compression is unlikely to help.
+ * Used by the 'auto' compression heuristic to avoid wasting CPU on data that is already
+ * compressed (images, media, existing archives, fonts, etc.).
+ */
 const LIKELY_COMPRESSED_EXTENSION_REGEX: RegExp =
   /\.(?:zip|gz|tgz|bz2|xz|7z|rar|jpg|jpeg|png|gif|webp|avif|mp4|m4v|mov|mkv|webm|mp3|ogg|aac|flac|pdf|woff|woff2)$/;
 
+/**
+ * Map zip compression method code -> incremental zlib mode label
+ */
 const zlibPackModes: Record<ZipMetaCompressionMethod, IncrementalZlibMode | undefined> = {
   [ZSTD_COMPRESSION]: 'zstd-compress',
   [DEFLATE_COMPRESSION]: 'deflate',
   [STORE_COMPRESSION]: undefined
 } as const;
 
+/**
+ * Public-facing CLI option -> actual zip method used for a file we decide to compress ('auto' maps to DEFLATE here; whether to compress at all is decided separately).
+ */
 const zipSyncCompressionOptions: Record<ZipSyncOptionCompression, ZipMetaCompressionMethod> = {
   store: STORE_COMPRESSION,
   deflate: DEFLATE_COMPRESSION,
   zstd: ZSTD_COMPRESSION,
-  auto: DEFLATE_COMPRESSION // 'auto' is handled specially in the code
+  auto: DEFLATE_COMPRESSION
 } as const;
 
 /**
@@ -82,6 +93,18 @@ export interface IZipSyncPackResult {
   metadata: IMetadata;
 }
 
+/**
+ * Create a zipsync archive by enumerating target directories, then streaming each file into the
+ * output zip using the local file header + (optional compressed data) + data descriptor pattern.
+ *
+ * Performance characteristics:
+ *  - Single pass per file (no read-then-compress-then-write buffering). CRC32 + SHA-1 are computed
+ *    while streaming so the metadata JSON can later be used for selective unpack.
+ *  - Data descriptor usage (bit 3) allows writing headers before we know sizes or CRC32.
+ *  - A single timestamp (captured once) is applied to all entries for determinism.
+ *  - Metadata entry is added as a normal zip entry at the end (before central directory) so legacy
+ *    tools can still list/extract it, while zipsync can quickly parse file hashes.
+ */
 export function pack({
   archivePath,
   targetDirectories: rawTargetDirectories,
@@ -95,7 +118,7 @@ export function pack({
 
   markStart('pack.total');
   terminal.writeDebugLine('Starting pack');
-  // Pass 1: enumerate
+  // Pass 1: enumerate files with a queue to avoid deep recursion
   markStart('pack.enumerate');
 
   const filePaths: string[] = [];
@@ -140,7 +163,7 @@ export function pack({
   terminal.writeLine(`Found ${filePaths.length} files to pack (enumerated)`);
   markEnd('pack.enumerate');
 
-  // Pass 2: read + hash + compress
+  // Pass 2: stream each file: write local header -> read chunks -> hash + (maybe) compress -> write data -> write data descriptor.
   markStart('pack.prepareEntries');
   const bufferSize: number = 1 << 25; // 32 MiB
   const inputBuffer: Buffer<ArrayBuffer> = Buffer.allocUnsafeSlow(bufferSize);
@@ -150,6 +173,9 @@ export function pack({
   using zipFile: IDisposableFileHandle = getDisposableFileHandle(archivePath, 'w');
   let currentOffset: number = 0;
   // Use this function to do any write to the zip file, so that we can track the current offset.
+  /**
+   * Write a raw chunk to the archive file descriptor, updating current offset.
+   */
   function writeChunkToZip(chunk: Uint8Array, lengthBytes: number = chunk.byteLength): void {
     let offset: number = 0;
     while (lengthBytes > 0 && offset < chunk.byteLength) {
@@ -162,19 +188,35 @@ export function pack({
     }
     currentOffset += offset;
   }
+  /** Convenience wrapper for writing multiple buffers sequentially. */
   function writeChunksToZip(chunks: Uint8Array[]): void {
     for (const chunk of chunks) {
       writeChunkToZip(chunk);
     }
   }
 
   const dosDateTimeNow: { time: number; date: number } = dosDateTime(new Date());
+  /**
+   * Stream a single file into the archive.
+   * Steps:
+   *  1. Decide compression (based on user choice + heuristic).
+   *  2. Emit local file header (sizes/CRC zeroed because we use a data descriptor).
+   *  3. Read file in 32 MiB chunks: update SHA-1 + CRC32; optionally feed compressor or write raw.
+   *  4. Flush compressor (if any) and write trailing data descriptor containing sizes + CRC.
+   *  5. Return populated entry metadata for later central directory + JSON metadata.
+   */
   function writeFileEntry(relativePath: string): IFileEntry {
+    /**
+     * Basic heuristic: skip re-compressing file types that are already compressed.
+     */
     function isLikelyAlreadyCompressed(filename: string): boolean {
       return LIKELY_COMPRESSED_EXTENSION_REGEX.test(filename.toLowerCase());
     }
     const fullPath: string = path.join(baseDir, relativePath);
 
+    /**
+     * Read file in large fixed-size buffer; invoke callback for each filled chunk.
+     */
     const readInputInChunks: (onChunk: (bytesInInputBuffer: number) => void) => void = (
       onChunk: (bytesInInputBuffer: number) => void
     ): void => {
@@ -231,6 +273,9 @@ export function pack({
     let uncompressedSize: number = 0;
     let compressedSize: number = 0;
 
+    /**
+     * Compressor instance (deflate or zstd) created only if needed.
+     */
     using incrementalZlib: IIncrementalZlib | undefined = shouldCompress
       ? createIncrementalZlib(
           outputBuffer,
@@ -270,6 +315,7 @@ export function pack({
     entry.crc32 = crc32;
     entry.sha1Hash = sha1Hash;
 
+    // Trailing data descriptor now that final CRC/sizes are known.
     writeChunkToZip(writeDataDescriptor(entry));
 
     terminal.writeVerboseLine(
@@ -284,6 +330,7 @@ export function pack({
   }
 
   const entries: IFileEntry[] = [];
+  // Emit all file entries in enumeration order.
   for (const relativePath of filePaths) {
     entries.push(writeFileEntry(relativePath));
   }
@@ -293,6 +340,7 @@ export function pack({
 
   markStart('pack.metadata.build');
   const metadata: IMetadata = { version: METADATA_VERSION, files: {} };
+  // Build metadata map used for selective unpack (size + SHA‑1 per file).
   for (const entry of entries) {
     metadata.files[entry.filename] = { size: entry.size, sha1Hash: entry.sha1Hash };
   }
@@ -306,6 +354,7 @@ export function pack({
   let metadataCompressionMethod: ZipMetaCompressionMethod = zipSyncCompressionOptions.store;
   let metadataData: Buffer = metadataBuffer;
   let metadataCompressedSize: number = metadataBuffer.length;
+  // Compress metadata (deflate) iff user allowed compression and it helps (>64 bytes & smaller result).
   if (compression !== 'store' && metadataBuffer.length > 64) {
     const compressed: Buffer = zlib.deflateRawSync(metadataBuffer, { level: 9 });
     if (compressed.length < metadataBuffer.length) {
@@ -348,6 +397,7 @@ export function pack({
 
   markStart('pack.write.centralDirectory');
   const centralDirOffset: number = currentOffset;
+  // Emit central directory records.
   for (const entry of entries) {
     writeChunksToZip(writeCentralDirectoryHeader(entry));
   }
diff --git a/apps/zipsync/src/unpack.ts b/apps/zipsync/src/unpack.ts
@@ -63,6 +63,9 @@ export interface IZipSyncUnpackResult {
   otherEntriesDeleted: number;
 }
 
+/**
+ * Unpack a zipsync archive into the provided target directories.
+ */
 export function unpack({
   archivePath,
   targetDirectories: rawTargetDirectories,
@@ -76,11 +79,13 @@ export function unpack({
   markStart('unpack.total');
   terminal.writeDebugLine('Starting unpackZip');
 
+  // Read entire archive into memory (build cache entries are expected to be relatively small/medium).
   markStart('unpack.read.archive');
   const zipBuffer: Buffer = fs.readFileSync(archivePath);
   terminal.writeDebugLine(`Archive size=${zipBuffer.length} bytes`);
   markEnd('unpack.read.archive');
 
+  // Locate & parse central directory so we have random-access metadata for all entries.
   markStart('unpack.parse.centralDirectory');
   const zipTree: LookupByPath<boolean> = new LookupByPath();
   const endOfCentralDir: IEndOfCentralDirectory = findEndOfCentralDirectory(zipBuffer);
@@ -151,6 +156,7 @@ export function unpack({
 
   terminal.writeLine(`Found ${entries.length} files in archive`);
 
+  // Ensure root target directories exist (they may be empty initially for cache misses).
   for (const targetDirectory of targetDirectories) {
     fs.mkdirSync(targetDirectory, { recursive: true });
     terminal.writeDebugLine(`Ensured target directory: ${targetDirectory}`);
@@ -165,6 +171,7 @@ export function unpack({
 
   const dirsToCleanup: string[] = [];
 
+  // Phase: scan filesystem to delete entries not present in archive and record empty dirs for later removal.
   markStart('unpack.scan.existing');
   const queue: IDirQueueItem[] = targetDirectories.map((dir) => ({
     dir,
@@ -218,6 +225,7 @@ export function unpack({
     }
   }
 
+  // Try to delete now-empty directories (created in previous builds but not in this archive).
   for (const dir of dirsToCleanup) {
     // Try to remove the directory. If it is not empty, this will throw and we can ignore the error.
     if (rmdirSync(dir)) {
@@ -233,6 +241,10 @@ export function unpack({
 
   const bufferSize: number = 1 << 25; // 32 MiB
   const outputBuffer: Buffer<ArrayBuffer> = Buffer.allocUnsafeSlow(bufferSize);
+  /**
+   * Stream-decompress (or copy) an individual file from the archive into place.
+   * We allocate a single large output buffer reused for all inflation operations to limit GC.
+   */
   function extractFileFromZip(targetPath: string, entry: ICentralDirectoryHeaderParseResult): void {
     terminal.writeDebugLine(`Extracting file: ${entry.filename}`);
     const fileZipBuffer: Buffer = getFileFromZip(zipBuffer, entry);
@@ -275,6 +287,10 @@ export function unpack({
     }
   }
 
+  /**
+   * Decide whether a file needs extraction by comparing existing file SHA‑1 vs metadata.
+   * If file is missing or hash differs we extract; otherwise we skip to preserve existing inode/data.
+   */
   function shouldExtract(targetPath: string, entry: ICentralDirectoryHeaderParseResult): boolean {
     if (metadata) {
       const metadataFile: { size: number; sha1Hash: string } | undefined = metadata.files[entry.filename];
@@ -300,6 +316,7 @@ export function unpack({
 
   const dirsCreated: Set<string> = new Set<string>();
 
+  // Iterate all entries excluding metadata; create parent dirs lazily; selective extraction.
   for (const entry of entries) {
     if (entry.filename === METADATA_FILENAME) {
       continue;
diff --git a/apps/zipsync/src/zipUtils.ts b/apps/zipsync/src/zipUtils.ts