Skip to content

Commit 3d74bc9

Browse files
committed
CLDSRV-902: calculate final FULL_OBJECT (crc combine) and COMPOSITE checksum
1 parent 3f7ce59 commit 3d74bc9

4 files changed

Lines changed: 620 additions & 0 deletions

File tree

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
'use strict';
2+
3+
// Combine two right-shift CRCs (zlib's gf2_matrix_* trick) without using BigInt
4+
// inside the hot loops. Each GF(2) operator matrix is stored as a Uint32Array
5+
// of `2 * dim` words, where row n is packed as [lo32, hi32]. For 32-bit CRCs
6+
// the high halves stay zero and the per-row loop exits early; for the 64-bit
7+
// CRC (crc64nvme) the pair-of-u32s representation lets every XOR/shift stay on
8+
// 32-bit ints.
9+
//
10+
// References:
11+
// zlib crc32_combine (canonical C implementation):
12+
// https://github.com/madler/zlib/blob/master/crc32.c
13+
// Mark Adler, "How does CRC32 work?" — derivation of the matrix trick:
14+
// https://stackoverflow.com/a/23126768
15+
// AWS S3 multipart upload full-object checksums:
16+
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html
17+
18+
/**
 * Multiply a packed GF(2) operator matrix by a bit vector.
 *
 * `mat` is read as consecutive [lo32, hi32] word pairs, one pair per row.
 * For every set bit i of the vector (vecLo holds bits 0-31, vecHi bits
 * 32-63) the pair for row i is XOR-ed into the result.
 *
 * @param {Uint32Array} mat - operator matrix, two words per row
 * @param {number} vecLo - low 32 bits of the vector
 * @param {number} vecHi - high 32 bits of the vector
 * @returns {number[]} [lo32, hi32] of the product, each as an unsigned
 *   32-bit value
 */
function gf2MatrixTimes(mat, vecLo, vecHi) {
    let accLo = 0;
    let accHi = 0;
    let restLo = vecLo;
    let restHi = vecHi;
    // `word` is the offset of the current row's lo half (rows are 2 words).
    // The loop stops as soon as no set bits remain in the vector.
    for (let word = 0; (restLo | restHi) !== 0; word += 2) {
        if ((restLo & 1) !== 0) {
            accLo ^= mat[word];
            accHi ^= mat[word + 1];
        }
        // 64-bit logical right shift by one, carried across both halves.
        restLo = (restLo >>> 1) | ((restHi & 1) << 31);
        restHi >>>= 1;
    }
    return [accLo >>> 0, accHi >>> 0];
}
35+
36+
/**
 * Fill `square` with the operator `mat` applied to itself.
 *
 * For each of the `dim` rows, the output row is the product of `mat` with
 * the matching row of `mat`. `square` is written in place: the callers own
 * that scratch buffer and pass it in so no allocation happens per squaring.
 *
 * @param {Uint32Array} square - destination, two words per row
 * @param {Uint32Array} mat - operator to square, two words per row
 * @param {number} dim - number of rows to compute
 * @returns {undefined}
 */
function gf2MatrixSquare(square, mat, dim) {
    let row = 0;
    while (row < dim) {
        const at = 2 * row;
        const [lo, hi] = gf2MatrixTimes(mat, mat[at], mat[at + 1]);
        /* eslint-disable no-param-reassign */
        square[at] = lo;
        square[at + 1] = hi;
        /* eslint-enable no-param-reassign */
        row += 1;
    }
}
48+
49+
// Per (polyReversed, dim), a lazily-grown chain of zero-byte operators.
// state.byteOps[j] is the GF(2) operator for prepending 2^j zero bytes
// (i.e. M^(8 * 2^j)). Building this chain is the dominant cost of combineCrcPair
// and depends only on the polynomial and width, so we cache it across calls.
// Keys are `${dim}:${polyReversed}` strings so that the same polynomial value
// requested at two different widths never shares a chain.
const chainCache = new Map();

/**
 * Return the cached operator chain for a CRC, creating it on first use.
 *
 * A new chain holds a single operator, the one for one zero byte (M^8),
 * obtained by squaring the one-zero-bit operator three times.
 *
 * @param {bigint} polyReversed - bit-reversed polynomial
 * @param {number} dim - CRC width in bits (32 or 64)
 * @returns {{ dim: number, byteOps: Uint32Array[] }} shared, mutable chain
 *   state; `byteOps` is extended in place by ensureChainLen
 */
function getOrInitChain(polyReversed, dim) {
    const cacheKey = `${dim}:${polyReversed}`;
    const cached = chainCache.get(cacheKey);
    if (cached !== undefined) {
        return cached;
    }

    // M^1: one-zero-bit operator. Column 0 is the polynomial; column k>0 is
    // 1 << (k - 1) — what right-shifting a state with bit k set produces.
    const m1 = new Uint32Array(2 * dim);
    m1[0] = Number(polyReversed & 0xffffffffn);
    m1[1] = Number((polyReversed >> 32n) & 0xffffffffn);
    for (let k = 1; k < dim; k += 1) {
        const bit = k - 1;
        if (bit < 32) {
            m1[2 * k] = (1 << bit) >>> 0;
        } else {
            m1[2 * k + 1] = (1 << (bit - 32)) >>> 0;
        }
    }

    // Square three times: M^1 -> M^2 -> M^4 -> M^8 (operator for 1 zero byte).
    let op = m1;
    for (let step = 0; step < 3; step += 1) {
        const squared = new Uint32Array(2 * dim);
        gf2MatrixSquare(squared, op, dim);
        op = squared;
    }

    const state = { dim, byteOps: [op] };
    chainCache.set(cacheKey, state);
    return state;
}
86+
87+
/**
 * Grow `state.byteOps` in place until index `j` exists.
 *
 * Every appended operator is the square of the one before it, written into
 * a freshly allocated buffer of the same length.
 *
 * @param {{ dim: number, byteOps: Uint32Array[] }} state - chain state
 * @param {number} j - highest index that must be present afterwards
 * @returns {undefined}
 */
function ensureChainLen(state, j) {
    const { byteOps } = state;
    for (let have = byteOps.length; have <= j; have += 1) {
        const last = byteOps[have - 1];
        const doubled = new Uint32Array(last.length);
        gf2MatrixSquare(doubled, last, state.dim);
        byteOps.push(doubled);
    }
}
95+
96+
/**
 * Combine two CRCs of adjacent byte chunks.
 *
 *   combineCrcPair(crc(a), crc(b), len(b), polyReversed, dim) === crc(a ‖ b)
 *
 * Works for any right-shift CRC of width `dim` (32 or 64) given its
 * bit-reversed polynomial. The squaring chain for `polyReversed` is cached
 * across calls, so the per-call cost is just popcount(len2) cheap operator
 * applications plus the BigInt boundary conversions.
 *
 * @param {bigint} crc1 - CRC of the first chunk
 * @param {bigint} crc2 - CRC of the second chunk
 * @param {bigint} len2 - byte length of the second chunk; must be >= 0
 * @param {bigint} polyReversed - bit-reversed polynomial
 * @param {number} dim - CRC width in bits (32 or 64)
 * @returns {bigint} CRC of the concatenated chunk, masked to `dim` bits
 * @throws {RangeError} if `len2` is negative
 */
function combineCrcPair(crc1, crc2, len2, polyReversed, dim) {
    // A negative BigInt never reaches 0n under `>>= 1n` (it settles on -1n),
    // so without this guard the loop below would spin forever while growing
    // the cached operator chain.
    if (len2 < 0n) {
        throw new RangeError(`len2 must be a non-negative length, got ${len2}`);
    }

    const mask = (1n << BigInt(dim)) - 1n;
    if (len2 === 0n) {
        // Nothing was appended: the CRC of the first chunk is the answer.
        return crc1 & mask;
    }

    const state = getOrInitChain(polyReversed, dim);

    // Split crc1 into 32-bit halves so the operator loop stays on plain ints.
    let cLo = Number(crc1 & 0xffffffffn);
    let cHi = Number((crc1 >> 32n) & 0xffffffffn);

    // Walk the bits of len2 (each bit represents a power-of-two number of
    // zero bytes to prepend); apply the cached operator for every set bit.
    let n = len2;
    let j = 0;
    while (n !== 0n) {
        if ((n & 1n) === 1n) {
            ensureChainLen(state, j);
            const r = gf2MatrixTimes(state.byteOps[j], cLo, cHi);
            cLo = r[0];
            cHi = r[1];
        }
        n >>= 1n;
        j += 1;
    }

    // Fold in the second chunk's CRC.
    const c2Lo = Number(crc2 & 0xffffffffn);
    const c2Hi = Number((crc2 >> 32n) & 0xffffffffn);
    cLo = (cLo ^ c2Lo) >>> 0;
    cHi = (cHi ^ c2Hi) >>> 0;

    return ((BigInt(cHi) << 32n) | BigInt(cLo)) & mask;
}
146+
147+
/**
 * Decode a base64 string and read the resulting bytes as one big-endian
 * unsigned integer.
 *
 * @param {string} b64 - base64-encoded bytes
 * @returns {bigint} integer value of the decoded bytes (0n when there are
 *   no bytes)
 */
function base64ToBigInt(b64) {
    return Buffer.from(b64, 'base64')
        .reduce((acc, byte) => (acc << 8n) | BigInt(byte), 0n);
}
155+
156+
/**
 * Encode the low `dim` bits of an integer as `dim / 8` big-endian bytes,
 * base64-encoded. Bits above `dim` are dropped.
 *
 * @param {bigint} value - integer to encode
 * @param {number} dim - width in bits (32 or 64)
 * @returns {string} base64 text of the big-endian bytes
 */
function bigIntToBase64(value, dim) {
    const nBytes = dim / 8;
    const out = Buffer.alloc(nBytes);
    let rest = value;
    let pos = nBytes;
    // Fill from the last byte backwards, peeling off the low byte each time.
    while (pos > 0) {
        pos -= 1;
        out[pos] = Number(rest & 0xffn);
        rest >>= 8n;
    }
    return out.toString('base64');
}
166+
167+
/**
 * Combine N per-part CRCs into the full-object CRC, base64-encoded.
 *
 * Parts are folded left to right: the running CRC is combined with each
 * following part's CRC and byte length.
 *
 * @param {Array<{value: string, length: number}>} parts - per-part data in
 *   part order; `value` is the base64-encoded per-part CRC, `length` is the
 *   byte length of that part
 * @param {bigint} polyReversed - bit-reversed polynomial
 * @param {number} dim - CRC width in bits (32 or 64)
 * @returns {string} base64-encoded combined CRC; for an empty `parts` list
 *   this is `dim / 8` zero bytes
 */
function combinePartCrcs(parts, polyReversed, dim) {
    if (parts.length === 0) {
        // An all-zero CRC is the identity of the pairwise combine (combining
        // 0 with any CRC yields that CRC), so it is the natural result for
        // zero parts — rather than a TypeError from reading parts[0].value.
        return Buffer.alloc(dim / 8).toString('base64');
    }

    let combined = base64ToBigInt(parts[0].value);
    for (const part of parts.slice(1)) {
        combined = combineCrcPair(
            combined,
            base64ToBigInt(part.value),
            BigInt(part.length),
            polyReversed,
            dim,
        );
    }
    return bigIntToBase64(combined, dim);
}
184+
185+
module.exports = { combinePartCrcs, combineCrcPair };

lib/api/apiUtils/integrity/validateChecksums.js

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ const { Crc32c } = require('@aws-crypto/crc32c');
44
const { CrtCrc64Nvme } = require('@aws-sdk/crc64-nvme-crt');
55
const { errors: ArsenalErrors, errorInstances } = require('arsenal');
66
const { config } = require('../../../Config');
7+
const { combinePartCrcs } = require('./crcCombine');
78

89
const defaultChecksumData = Object.freeze({ algorithm: 'crc64nvme', isTrailer: false, expected: undefined });
910

@@ -489,6 +490,85 @@ function getChecksumDataFromMPUHeaders(headers) {
489490
return { algorithm: algo, type: defaultChecksumType[algo], isDefault: false };
490491
}
491492

493+
// =============================================================================
494+
// MPU final-object checksum computation
495+
// =============================================================================
496+
//
497+
// CompleteMultipartUpload composes a final-object checksum from the per-part
498+
// checksums recorded at UploadPart time. AWS defines two modes:
499+
//
500+
// COMPOSITE : finalChecksum = base64(algo(decode(c1) || ... || decode(cN)))
501+
// + "-N" suffix, where N is the number of parts.
502+
// Supported on CRC32, CRC32C, SHA1, SHA256.
503+
//
504+
// FULL_OBJECT : finalChecksum is the CRC of the entire object's bytes,
505+
// reconstructed by combining the per-part CRCs via CRC
506+
// linearization. CRC-only: CRC32, CRC32C,
507+
// CRC64NVME.
508+
509+
// Bit-reversed polynomials used by the right-shift CRC implementations that
510+
// the @aws-crypto/* and @aws-sdk/crc64-nvme-crt packages produce.
511+
// Keyed by lowercase algorithm name; each entry carries the bit-reversed
// polynomial and the CRC width in bits. Object.freeze is shallow, so every
// entry is frozen as well to keep these shared parameters immutable.
const FULL_OBJECT_POLYS = Object.freeze({
    crc32: Object.freeze({ polyReversed: 0xedb88320n, dim: 32 }),
    crc32c: Object.freeze({ polyReversed: 0x82f63b78n, dim: 32 }),
    crc64nvme: Object.freeze({ polyReversed: 0x9a6c9329ac4bc9b5n, dim: 64 }),
});
516+
517+
// Algorithms whose digest is synchronous, which is the full set AWS allows
518+
// for COMPOSITE MPUs. crc64nvme is excluded because (a) AWS does not allow
519+
// COMPOSITE for CRC64NVME and (b) its CRT-backed digest is async.
520+
const COMPOSITE_ALGOS = new Set([
    'crc32',
    'crc32c',
    'sha1',
    'sha256',
]);
521+
522+
/**
 * Build the COMPOSITE final-object checksum for a CompleteMultipartUpload.
 *
 *   final = base64(algo(decode(c1) || decode(c2) || ... || decode(cN))) + "-N"
 *
 * Each per-part checksum is base64-decoded, the raw bytes are concatenated
 * in part order, and the algorithm's digest of that buffer is suffixed with
 * the part count. Only algorithms listed in COMPOSITE_ALGOS are accepted
 * (crc32, crc32c, sha1, sha256); anything else — including crc64nvme, which
 * AWS does not allow for COMPOSITE — yields an MPUAlgoNotSupported error.
 *
 * @param {string} algorithm - lowercase algorithm name
 * @param {string[]} partChecksumsBase64 - per-part checksums in part order,
 *   each base64-encoded (the format stored on MPU part metadata)
 * @returns {{ checksum: string, error: null }
 *   | { checksum: null, error: { code: string, details: object } }}
 */
function computeCompositeMPUChecksum(algorithm, partChecksumsBase64) {
    if (COMPOSITE_ALGOS.has(algorithm)) {
        const rawChecksums = partChecksumsBase64.map(b64 => Buffer.from(b64, 'base64'));
        const digest = algorithms[algorithm].digest(Buffer.concat(rawChecksums));
        const partCount = partChecksumsBase64.length;
        return { checksum: `${digest}-${partCount}`, error: null };
    }

    const error = { code: ChecksumError.MPUAlgoNotSupported, details: { algorithm } };
    return { checksum: null, error };
}
548+
549+
/**
 * Compute the FULL_OBJECT final-object checksum for a CompleteMultipartUpload.
 *
 * Returns the CRC of the assembled object's bytes, derived purely from the
 * per-part CRCs and part lengths via CRC linearization.
 *
 * Supported algorithms: crc32, crc32c, crc64nvme. Any other name — including
 * names that only exist on Object.prototype such as "constructor" — yields
 * an MPUAlgoNotSupported error.
 *
 * @param {string} algorithm - lowercase algorithm name
 * @param {Array<{value: string, length: number}>} parts - per-part data in
 *   part order; `value` is the base64-encoded per-part CRC, `length` is the
 *   byte length of that part
 * @returns {{ checksum: string, error: null }
 *   | { checksum: null, error: { code: string, details: object } }}
 */
function computeFullObjectMPUChecksum(algorithm, parts) {
    // Own-property check: a plain `FULL_OBJECT_POLYS[algorithm]` lookup also
    // finds inherited members (e.g. "toString"), which are truthy and would
    // slip past a `!params` guard with undefined polynomial and width.
    const isSupported = Object.prototype.hasOwnProperty.call(FULL_OBJECT_POLYS, algorithm);
    if (!isSupported) {
        return { checksum: null, error: { code: ChecksumError.MPUAlgoNotSupported, details: { algorithm } } };
    }
    const { polyReversed, dim } = FULL_OBJECT_POLYS[algorithm];
    return { checksum: combinePartCrcs(parts, polyReversed, dim), error: null };
}
571+
492572
module.exports = {
493573
ChecksumError,
494574
defaultChecksumData,
@@ -499,4 +579,6 @@ module.exports = {
499579
algorithms,
500580
checksumedMethods,
501581
getChecksumDataFromMPUHeaders,
582+
computeCompositeMPUChecksum,
583+
computeFullObjectMPUChecksum,
502584
};

0 commit comments

Comments
 (0)