Skip to content

Commit 6600b33

Browse files
committed
CLDSRV-902: calculate final FULL_OBJECT (crc combine) and COMPOSITE checksum
1 parent 3f7ce59 commit 6600b33

4 files changed

Lines changed: 608 additions & 0 deletions

File tree

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
// Combine two right-shift CRCs (zlib's gf2_matrix_* trick) without using BigInt
2+
// inside the hot loops. Each GF(2) operator matrix is stored as a Uint32Array
3+
// of `2 * dim` words, where row n is packed as [lo32, hi32]. For 32-bit CRCs
4+
// the high halves stay zero and the per-row loop exits early; for the 64-bit
5+
// CRC (crc64nvme) the pair-of-u32s representation lets every XOR/shift stay on
6+
// 32-bit ints.
7+
//
8+
// References:
9+
// zlib crc32_combine (canonical C implementation):
10+
// https://github.com/madler/zlib/blob/master/crc32.c
11+
// Mark Adler, "How does CRC32 work?" — derivation of the matrix trick:
12+
// https://stackoverflow.com/a/23126768
13+
// AWS S3 multipart upload full-object checksums:
14+
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html
15+
16+
/**
 * Multiply a GF(2) operator matrix by a vector: XOR together the entries of
 * `mat` selected by the set bits of the vector.
 *
 * @param {Uint32Array} mat - operator packed as [lo32, hi32] word pairs, one
 *   pair per vector bit
 * @param {number} vecLo - low 32 bits of the vector
 * @param {number} vecHi - high 32 bits of the vector
 * @returns {number[]} [lo32, hi32] of the product, each as an unsigned 32-bit
 *   integer
 */
function gf2MatrixTimes(mat, vecLo, vecHi) {
    let accLo = 0;
    let accHi = 0;
    let restLo = vecLo;
    let restHi = vecHi;
    // Consume the vector one bit at a time, lowest bit first, and stop as
    // soon as no set bits remain (a 32-bit vector never reaches the upper
    // entries).
    for (let row = 0; (restLo | restHi) !== 0; row += 1) {
        if ((restLo & 1) !== 0) {
            accLo ^= mat[2 * row];
            accHi ^= mat[2 * row + 1];
        }
        // 64-bit logical right shift by one, carrying the low bit of the
        // high half into the top bit of the low half.
        restLo = (restLo >>> 1) | ((restHi & 1) << 31);
        restHi >>>= 1;
    }
    // `>>> 0` converts the signed 32-bit accumulators back to unsigned.
    return [accLo >>> 0, accHi >>> 0];
}
33+
34+
/**
 * Square a GF(2) operator: store `mat * mat` into `square`.
 *
 * @param {Uint32Array} square - destination buffer, overwritten in place
 * @param {Uint32Array} mat - operator to square, packed as [lo32, hi32] pairs
 * @param {number} dim - number of [lo32, hi32] pairs to process
 * @returns {undefined}
 */
function gf2MatrixSquare(square, mat, dim) {
    // Writing into the caller-provided buffer is deliberate: the callers own
    // `square` and pass it in so the result lands directly in their storage.
    /* eslint-disable no-param-reassign */
    for (let row = 0; row < dim; row += 1) {
        const loIdx = 2 * row;
        const hiIdx = loIdx + 1;
        // Applying the operator to its own entry yields the squared entry.
        [square[loIdx], square[hiIdx]] = gf2MatrixTimes(mat, mat[loIdx], mat[hiIdx]);
    }
    /* eslint-enable no-param-reassign */
}
46+
47+
// Per (polyReversed, dim), a lazily-grown chain of zero-byte operators.
// state.byteOps[j] is the GF(2) operator that advances a CRC across 2^j zero
// bytes (i.e. M^(8 * 2^j)). Building this chain is the dominant cost of
// combineCrcPair and depends only on the polynomial and the CRC width, so it
// is cached across calls. Entries are keyed by the string `${dim}:${poly}`.
const chainCache = new Map();

/**
 * Return the cached operator chain for a (polynomial, width) pair, creating
 * and caching it on first use.
 *
 * The cache key includes `dim` as well as `polyReversed`: the operator
 * buffers are sized by `dim`, so a chain built for one width must never be
 * handed back for the same polynomial at another width.
 *
 * @param {bigint} polyReversed - bit-reversed polynomial
 * @param {number} dim - CRC width in bits (32 or 64)
 * @returns {{dim: number, byteOps: Uint32Array[]}} chain state; `byteOps[0]`
 *   is the one-zero-byte operator, later entries are appended on demand by
 *   ensureChainLen
 */
function getOrInitChain(polyReversed, dim) {
    const cacheKey = `${dim}:${polyReversed}`;
    const cached = chainCache.get(cacheKey);
    if (cached !== undefined) {
        return cached;
    }

    // M^1: one-zero-bit operator. Entry 0 is the polynomial; entry k>0 is
    // 1 << (k - 1), which is what right-shifting a state with bit k set
    // produces.
    const m1 = new Uint32Array(2 * dim);
    m1[0] = Number(polyReversed & 0xffffffffn);
    m1[1] = Number((polyReversed >> 32n) & 0xffffffffn);
    for (let k = 1; k < dim; k += 1) {
        const bit = k - 1;
        if (bit < 32) {
            // `>>> 0` keeps 1 << 31 unsigned.
            m1[2 * k] = (1 << bit) >>> 0;
        } else {
            m1[2 * k + 1] = (1 << (bit - 32)) >>> 0;
        }
    }

    // Three squarings take the 1-bit operator to the 8-bit (one byte) one.
    const m2 = new Uint32Array(2 * dim);
    gf2MatrixSquare(m2, m1, dim);
    const m4 = new Uint32Array(2 * dim);
    gf2MatrixSquare(m4, m2, dim);
    const m8 = new Uint32Array(2 * dim); // operator for 1 zero byte
    gf2MatrixSquare(m8, m4, dim);

    const state = { dim, byteOps: [m8] };
    chainCache.set(cacheKey, state);
    return state;
}
84+
85+
/**
 * Grow `state.byteOps` until index `j` exists. Each new entry is the square
 * of the previous one, so entry k covers twice as many zero bytes as entry
 * k - 1.
 *
 * @param {{dim: number, byteOps: Uint32Array[]}} state - chain state, extended
 *   in place
 * @param {number} j - highest index that must be populated
 * @returns {undefined}
 */
function ensureChainLen(state, j) {
    const { byteOps, dim } = state;
    for (let k = byteOps.length; k <= j; k += 1) {
        const previous = byteOps[k - 1];
        const doubled = new Uint32Array(previous.length);
        gf2MatrixSquare(doubled, previous, dim);
        byteOps.push(doubled);
    }
}
93+
94+
/**
 * Combine two CRCs of adjacent byte chunks.
 *
 *   combineCrcPair(crc(a), crc(b), len(b), polyReversed, dim) === crc(a ‖ b)
 *
 * crc(a) is advanced across len(b) zero bytes using the cached GF(2)
 * operators, then XORed with crc(b). Works for any right-shift CRC of width
 * `dim` (32 or 64) given its bit-reversed polynomial. The squaring chain for
 * the polynomial is cached across calls, so the per-call cost is just
 * popcount(len2) operator applications plus the BigInt boundary conversions.
 *
 * @param {bigint} crc1 - CRC of the first chunk
 * @param {bigint} crc2 - CRC of the second chunk
 * @param {bigint} len2 - byte length of the second chunk (must be >= 0)
 * @param {bigint} polyReversed - bit-reversed polynomial
 * @param {number} dim - CRC width in bits (32 or 64)
 * @returns {bigint} CRC of the concatenated chunk, masked to `dim` bits
 * @throws {RangeError} if `len2` is negative
 */
function combineCrcPair(crc1, crc2, len2, polyReversed, dim) {
    // `>>=` on a negative BigInt is an arithmetic shift that settles at -1n
    // and never reaches 0n, so the bit walk below would loop forever while
    // growing the cached operator chain. Reject negative lengths up front.
    if (len2 < 0n) {
        throw new RangeError(`len2 must be a non-negative byte length, got ${len2}`);
    }

    const mask = (1n << BigInt(dim)) - 1n;
    if (len2 === 0n) {
        // An empty second chunk leaves the first CRC unchanged.
        return crc1 & mask;
    }

    const state = getOrInitChain(polyReversed, dim);

    // Split the running CRC into two unsigned 32-bit halves so the operator
    // applications stay on plain numbers.
    let cLo = Number(crc1 & 0xffffffffn);
    let cHi = Number((crc1 >> 32n) & 0xffffffffn);

    // Walk the bits of len2: set bit j means "advance across 2^j zero
    // bytes", which is exactly what the cached operator byteOps[j] does.
    let remaining = len2;
    for (let j = 0; remaining !== 0n; j += 1) {
        if ((remaining & 1n) === 1n) {
            ensureChainLen(state, j);
            [cLo, cHi] = gf2MatrixTimes(state.byteOps[j], cLo, cHi);
        }
        remaining >>= 1n;
    }

    // Fold in the second chunk's CRC.
    const c2Lo = Number(crc2 & 0xffffffffn);
    const c2Hi = Number((crc2 >> 32n) & 0xffffffffn);
    cLo = (cLo ^ c2Lo) >>> 0;
    cHi = (cHi ^ c2Hi) >>> 0;

    return ((BigInt(cHi) << 32n) | BigInt(cLo)) & mask;
}
144+
145+
/**
 * Decode a base64 string and interpret the decoded bytes as one big-endian
 * unsigned integer.
 *
 * @param {string} b64 - base64-encoded bytes
 * @returns {bigint} integer value of the decoded bytes
 */
function base64ToBigInt(b64) {
    const hexDigits = Buffer.from(b64, 'base64').toString('hex');
    return BigInt(`0x${hexDigits}`);
}
149+
150+
/**
 * Encode an integer as base64 of its big-endian bytes, left-padded with
 * zeros to at least `dim` bits.
 *
 * @param {bigint} value - integer to encode
 * @param {number} dim - minimum width in bits (4 bits per hex digit)
 * @returns {string} base64 encoding of the zero-padded big-endian bytes
 */
function bigIntToBase64(value, dim) {
    const hexWidth = dim / 4;
    const hexDigits = value.toString(16).padStart(hexWidth, '0');
    const bytes = Buffer.from(hexDigits, 'hex');
    return bytes.toString('base64');
}
154+
155+
/**
 * Combine N per-part CRCs into the full-object CRC, base64-encoded.
 *
 * Parts are folded left to right with combineCrcPair. An empty `parts` array
 * yields the all-zero value of width `dim` — the result for zero bytes of
 * input, consistent with combineCrcPair treating a zero-length chunk as the
 * identity — instead of failing on `parts[0]`.
 *
 * @param {Array<{value: string, length: number}>} parts - per-part data in
 *   part order; `value` is the base64-encoded per-part CRC, `length` is the
 *   byte length of that part
 * @param {bigint} polyReversed - bit-reversed polynomial
 * @param {number} dim - CRC width in bits (32 or 64)
 * @returns {string} base64-encoded combined CRC
 */
function combinePartCrcs(parts, polyReversed, dim) {
    if (parts.length === 0) {
        // dim / 8 zero bytes: the encoded zero CRC for this width.
        return Buffer.alloc(dim / 8).toString('base64');
    }

    let combined = base64ToBigInt(parts[0].value);
    for (let i = 1; i < parts.length; i += 1) {
        const { value, length } = parts[i];
        combined = combineCrcPair(combined, base64ToBigInt(value), BigInt(length), polyReversed, dim);
    }
    return bigIntToBase64(combined, dim);
}
172+
173+
module.exports = { combinePartCrcs, combineCrcPair };

lib/api/apiUtils/integrity/validateChecksums.js

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ const { Crc32c } = require('@aws-crypto/crc32c');
44
const { CrtCrc64Nvme } = require('@aws-sdk/crc64-nvme-crt');
55
const { errors: ArsenalErrors, errorInstances } = require('arsenal');
66
const { config } = require('../../../Config');
7+
const { combinePartCrcs } = require('./crcCombine');
78

89
const defaultChecksumData = Object.freeze({ algorithm: 'crc64nvme', isTrailer: false, expected: undefined });
910

@@ -489,6 +490,85 @@ function getChecksumDataFromMPUHeaders(headers) {
489490
return { algorithm: algo, type: defaultChecksumType[algo], isDefault: false };
490491
}
491492

493+
// =============================================================================
494+
// MPU final-object checksum computation
495+
// =============================================================================
496+
//
497+
// CompleteMultipartUpload composes a final-object checksum from the per-part
498+
// checksums recorded at UploadPart time. AWS defines two modes:
499+
//
500+
// COMPOSITE : finalChecksum = base64(algo(decode(c1) || ... || decode(cN)))
501+
// + "-N" suffix, where N is the number of parts.
502+
// Supported on CRC32, CRC32C, SHA1, SHA256.
503+
//
504+
// FULL_OBJECT : finalChecksum is the CRC of the entire object's bytes,
505+
// reconstructed by combining the per-part CRCs via CRC
506+
// linearization. CRC-only: CRC32, CRC32C,
507+
// CRC64NVME.
508+
509+
// Per-algorithm parameters for FULL_OBJECT combination: the bit-reversed
// polynomial used by the right-shift CRC implementations that the
// @aws-crypto/* and @aws-sdk/crc64-nvme-crt packages produce, and the CRC
// width in bits.
const FULL_OBJECT_POLYS = Object.freeze({
    crc32: {
        polyReversed: 0xedb88320n,
        dim: 32,
    },
    crc32c: {
        polyReversed: 0x82f63b78n,
        dim: 32,
    },
    crc64nvme: {
        polyReversed: 0x9a6c9329ac4bc9b5n,
        dim: 64,
    },
});
516+
517+
// Algorithms accepted for COMPOSITE MPU checksums. All of them have a
// synchronous digest, and together they are the full set AWS allows for
// COMPOSITE. crc64nvme is left out because (a) AWS does not allow COMPOSITE
// for CRC64NVME and (b) its CRT-backed digest is async.
const COMPOSITE_ALGOS = new Set([
    'crc32',
    'crc32c',
    'sha1',
    'sha256',
]);
521+
522+
/**
 * Compute the COMPOSITE final-object checksum for a CompleteMultipartUpload.
 *
 *   final = base64(algo(decode(c1) || decode(c2) || ... || decode(cN))) + "-N"
 *
 * Only algorithms listed in COMPOSITE_ALGOS (crc32, crc32c, sha1, sha256) are
 * accepted; anything else, including crc64nvme, yields an MPUAlgoNotSupported
 * error result rather than an exception.
 *
 * @param {string} algorithm - lowercase algorithm name
 * @param {string[]} partChecksumsBase64 - per-part checksums in part order,
 *   each base64-encoded (the format stored on MPU part metadata)
 * @returns {{ checksum: string, error: null }
 *   | { checksum: null, error: { code: string, details: object } }}
 */
function computeCompositeMPUChecksum(algorithm, partChecksumsBase64) {
    const supported = COMPOSITE_ALGOS.has(algorithm);
    if (!supported) {
        const error = { code: ChecksumError.MPUAlgoNotSupported, details: { algorithm } };
        return { checksum: null, error };
    }

    // Decode each part checksum to raw bytes and digest their concatenation.
    const rawParts = partChecksumsBase64.map(b64 => Buffer.from(b64, 'base64'));
    // NOTE(review): the digest is interpolated as-is, so it is presumably
    // already a base64 string — confirm against `algorithms`.
    const digest = algorithms[algorithm].digest(Buffer.concat(rawParts));
    const partCount = partChecksumsBase64.length;
    return { checksum: `${digest}-${partCount}`, error: null };
}
548+
549+
/**
 * Compute the FULL_OBJECT final-object checksum for a CompleteMultipartUpload.
 *
 * Returns the CRC of the assembled object's bytes, derived purely from the
 * per-part CRCs and part lengths via CRC linearization (combinePartCrcs).
 *
 * Supported algorithms are the own keys of FULL_OBJECT_POLYS: crc32, crc32c,
 * crc64nvme. Any other name yields an MPUAlgoNotSupported error result.
 *
 * @param {string} algorithm - lowercase algorithm name
 * @param {Array<{value: string, length: number}>} parts - per-part data in
 *   part order; `value` is the base64-encoded per-part CRC, `length` is the
 *   byte length of that part
 * @returns {{ checksum: string, error: null }
 *   | { checksum: null, error: { code: string, details: object } }}
 */
function computeFullObjectMPUChecksum(algorithm, parts) {
    // Own-property check: a plain `FULL_OBJECT_POLYS[algorithm]` lookup would
    // also match inherited names such as 'constructor' or 'toString', which
    // are truthy and would reach combinePartCrcs with undefined parameters.
    const isSupported = Object.prototype.hasOwnProperty.call(FULL_OBJECT_POLYS, algorithm);
    if (!isSupported) {
        return { checksum: null, error: { code: ChecksumError.MPUAlgoNotSupported, details: { algorithm } } };
    }
    const { polyReversed, dim } = FULL_OBJECT_POLYS[algorithm];
    return { checksum: combinePartCrcs(parts, polyReversed, dim), error: null };
}
571+
492572
module.exports = {
493573
ChecksumError,
494574
defaultChecksumData,
@@ -499,4 +579,6 @@ module.exports = {
499579
algorithms,
500580
checksumedMethods,
501581
getChecksumDataFromMPUHeaders,
582+
computeCompositeMPUChecksum,
583+
computeFullObjectMPUChecksum,
502584
};

0 commit comments

Comments
 (0)