Skip to content

Commit 9ad4f21

Browse files
authored
feat(storage): enable default CRC32C checksum validation for object downloads (#9210)
1 parent c8a5181 commit 9ad4f21

5 files changed

Lines changed: 686 additions & 9 deletions

File tree

Storage/src/Connection/Rest.php

Lines changed: 118 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
use Google\Cloud\Core\Upload\ResumableUploader;
2828
use Google\Cloud\Core\Upload\StreamableUploader;
2929
use Google\Cloud\Core\UriTrait;
30+
use Google\Cloud\Storage\HashValidatingStream;
3031
use Google\Cloud\Storage\StorageClient;
3132
use GuzzleHttp\Exception\RequestException;
3233
use GuzzleHttp\Psr7\MimeType;
@@ -331,6 +332,7 @@ public function downloadObject(array $args = [])
331332
$requestedBytes = $this->getRequestedBytes($args);
332333
$resultStream = Utils::streamFor(null);
333334
$transcodedObj = false;
335+
$hashHeader = null;
334336

335337
$args['retryStrategy'] ??= $this->retryStrategy;
336338

@@ -339,12 +341,17 @@ public function downloadObject(array $args = [])
339341
$invocationId = Uuid::uuid4()->toString();
340342
$requestOptions['retryHeaders'] = self::getRetryHeaders($invocationId, 1);
341343
$requestOptions['restRetryFunction'] = $this->getRestRetryFunction('objects', 'get', $args);
342-
// We try to deduce if the object is a transcoded object when we receive the headers.
343-
$requestOptions['restOptions']['on_headers'] = function ($response) use (&$transcodedObj) {
344+
// We try to deduce if the object is a transcoded object
345+
// and capture the X-Goog-Hash when we receive the headers.
346+
$requestOptions['restOptions']['on_headers'] = function ($response) use (&$transcodedObj, &$hashHeader) {
344347
$header = $response->getHeader(self::TRANSCODED_OBJ_HEADER_KEY);
345348
if (is_array($header) && in_array(self::TRANSCODED_OBJ_HEADER_VAL, $header)) {
346349
$transcodedObj = true;
347350
}
351+
$hash = $response->getHeaderLine('X-Goog-Hash');
352+
if ($hash) {
353+
$hashHeader = $hash;
354+
}
348355
};
349356
$attempt = null;
350357
$requestOptions['restRetryListener'] = function (
@@ -383,30 +390,111 @@ public function downloadObject(array $args = [])
383390
}
384391
};
385392

386-
$fetchedStream = $this->requestWrapper->send(
393+
$response = $this->requestWrapper->send(
387394
$request,
388395
$requestOptions
389-
)->getBody();
396+
);
397+
$fetchedStream = $response->getBody();
390398

391399
// If no retry attempt was made, then we can return the stream as is.
392400
// This is important in the case where downloadObject is called to open
393401
// the file but not to read from it yet.
394402
if ($attempt === null) {
395-
return $fetchedStream;
403+
return $this->maybeWrapWithHashValidatingStream(
404+
$fetchedStream,
405+
$args,
406+
$response,
407+
$hashHeader,
408+
$transcodedObj
409+
);
396410
}
397411

398412
// If our object is a transcoded object, then Range headers are not honoured.
399413
// That means even if we had a partial download available, the final obj
400414
// that was fetched will contain the complete object. So, we don't need to copy
401415
// the partial stream, we can just return the stream we fetched.
402416
if ($transcodedObj) {
403-
return $fetchedStream;
417+
return $this->maybeWrapWithHashValidatingStream(
418+
$fetchedStream,
419+
$args,
420+
$response,
421+
$hashHeader,
422+
$transcodedObj
423+
);
404424
}
405425

406426
Utils::copyToStream($fetchedStream, $resultStream);
407427

408428
$resultStream->seek(0);
409-
return $resultStream;
429+
return $this->maybeWrapWithHashValidatingStream(
430+
$resultStream,
431+
$args,
432+
$response,
433+
$hashHeader,
434+
$transcodedObj
435+
);
436+
}
437+
438+
/**
 * Wrap the download stream in a HashValidatingStream if validation is enabled.
 *
 * The stream is returned unwrapped when any of the following holds:
 *  - `$args['validate']` is `false` or `'none'`;
 *  - the caller requested a sub-range of the object;
 *  - the object is a transcoded object;
 *  - no `X-Goog-Hash` header is available;
 *  - the header carries no checksum usable for the requested mode.
 *
 * @param StreamInterface $stream The download body to (possibly) wrap.
 * @param array $args Download arguments. `$args['validate']` selects the
 *        mode: `false`/`'none'` disables validation, `'md5'` validates MD5
 *        only, and `'crc32'`, `'crc32c'` or `true` prefer CRC32C and fall
 *        back to MD5. Defaults to `'crc32'` when absent.
 * @param ResponseInterface $response Response the stream came from; used
 *        as a fallback source for the headers below.
 * @param string|null $hashHeader `X-Goog-Hash` header line captured while
 *        the headers were received, if any.
 * @param bool $transcodedObj Whether the object was already identified as
 *        transcoded while the headers were received.
 * @return StreamInterface Either `$stream` itself or a HashValidatingStream
 *         decorating it.
 */
private function maybeWrapWithHashValidatingStream(
    StreamInterface $stream,
    array $args,
    ResponseInterface $response,
    $hashHeader = null,
    $transcodedObj = false
) {
    $validate = $args['validate'] ?? 'crc32';
    if ($validate === false || $validate === 'none') {
        return $stream;
    }

    // Skip validation if the user requested a subrange of the object.
    $requestedBytes = $this->getRequestedBytes($args);
    if ($requestedBytes['startByte'] > 0 || $requestedBytes['endByte'] !== '') {
        return $stream;
    }

    // Skip validation if the object is a transcoded object (served
    // decompressed, stored compressed). The header's mere presence is not
    // enough: like the on_headers callbacks, only treat the object as
    // transcoded when the header carries the transcoded marker value.
    // Otherwise a response that sends the header with any other value would
    // silently bypass checksum validation.
    $storedEncodings = $response->getHeader(self::TRANSCODED_OBJ_HEADER_KEY);
    if ($transcodedObj || in_array(self::TRANSCODED_OBJ_HEADER_VAL, $storedEncodings, true)) {
        return $stream;
    }

    // Prefer the value captured by on_headers; fall back to the response.
    $hashHeader = $hashHeader ?: $response->getHeaderLine('X-Goog-Hash');
    if (!$hashHeader) {
        return $stream;
    }

    // The header line is a comma separated list of `algo=base64digest`
    // pairs. Split on the first `=` only, because base64 padding also
    // uses `=`.
    $hashes = [];
    foreach (explode(',', $hashHeader) as $part) {
        $kv = explode('=', trim($part), 2);
        if (count($kv) === 2) {
            $hashes[$kv[0]] = $kv[1];
        }
    }

    $options = [];
    if ($validate === 'md5') {
        if (isset($hashes['md5'])) {
            $options['expectedMd5'] = $hashes['md5'];
        }
    } elseif (in_array($validate, ['crc32', 'crc32c', true], true)) {
        // CRC32C is preferred, but only usable when the hash extension
        // on this platform provides the algorithm; otherwise use MD5.
        if (isset($hashes['crc32c']) && in_array('crc32c', hash_algos(), true)) {
            $options['expectedCrc32c'] = $hashes['crc32c'];
        } elseif (isset($hashes['md5'])) {
            $options['expectedMd5'] = $hashes['md5'];
        }
    }

    // Nothing to validate against: hand back the raw stream.
    if ($options === []) {
        return $stream;
    }

    return new HashValidatingStream($stream, $options);
}
411499

412500
/**
@@ -418,13 +506,34 @@ public function downloadObject(array $args = [])
418506
*/
419507
public function downloadObjectAsync(array $args = [])
{
    [$request, $requestOptions] = $this->buildDownloadObjectParams($args);

    // State filled in by the header callback and read once the promise
    // resolves; both closures share it by reference.
    $isTranscoded = false;
    $capturedHash = null;

    // We try to deduce if the object is a transcoded object
    // and capture the X-Goog-Hash when we receive the headers.
    $captureHeaders = function ($headersResponse) use (&$isTranscoded, &$capturedHash) {
        $storedEncodings = $headersResponse->getHeader(self::TRANSCODED_OBJ_HEADER_KEY);
        if (is_array($storedEncodings) && in_array(self::TRANSCODED_OBJ_HEADER_VAL, $storedEncodings)) {
            $isTranscoded = true;
        }

        $hashLine = $headersResponse->getHeaderLine('X-Goog-Hash');
        if ($hashLine) {
            $capturedHash = $hashLine;
        }
    };
    $requestOptions['restOptions']['on_headers'] = $captureHeaders;

    // Resolve to the response body, wrapped for checksum validation when
    // applicable.
    $toValidatedBody = function (ResponseInterface $response) use ($args, &$capturedHash, &$isTranscoded) {
        return $this->maybeWrapWithHashValidatingStream(
            $response->getBody(),
            $args,
            $response,
            $capturedHash,
            $isTranscoded
        );
    };

    return $this->requestWrapper
        ->sendAsync($request, $requestOptions)
        ->then($toValidatedBody);
}
430539

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
<?php
2+
/**
3+
* Copyright 2026 Google LLC
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
namespace Google\Cloud\Storage;
19+
20+
use GuzzleHttp\Psr7\StreamDecoratorTrait;
21+
use Psr\Http\Message\StreamInterface;
22+
use RuntimeException;
23+
use UnexpectedValueException;
24+
25+
/**
 * Stream decorator that feeds every chunk read from the wrapped stream into
 * CRC32C and/or MD5 hash contexts and compares the resulting digests with the
 * expected values once the wrapped stream reports end-of-file.
 */
class HashValidatingStream implements StreamInterface
{
    use StreamDecoratorTrait;

    private $stream;
    private $expectedCrc32c;
    private $expectedMd5;
    private $crc32cContext;
    private $md5Context;
    private $crc32cEnabled = false;
    private $md5Enabled = false;

    /**
     * @param StreamInterface $stream The stream being decorated.
     * @param array $options {
     *     Configuration options.
     *
     *     @type string $expectedCrc32c Base64-encoded expected CRC32C checksum.
     *     @type string $expectedMd5 Base64-encoded expected MD5 checksum.
     * }
     * @throws RuntimeException If CRC32C validation is requested but the
     *         algorithm is not available on this platform.
     */
    public function __construct(StreamInterface $stream, array $options = [])
    {
        $this->stream = $stream;
        $this->expectedCrc32c = $options['expectedCrc32c'] ?? null;
        $this->expectedMd5 = $options['expectedMd5'] ?? null;

        $wantsCrc32c = $this->expectedCrc32c !== null;
        if ($wantsCrc32c && !in_array('crc32c', hash_algos())) {
            throw new RuntimeException('CRC32C hashing algorithm is not supported on this platform.');
        }

        if ($wantsCrc32c) {
            $this->crc32cContext = hash_init('crc32c');
            $this->crc32cEnabled = true;
        }

        if ($this->expectedMd5 !== null) {
            $this->md5Context = hash_init('md5');
            $this->md5Enabled = true;
        }
    }

    /**
     * Always reports the stream as non-seekable: the hashes are accumulated
     * in read order, on the fly.
     *
     * @return bool
     */
    public function isSeekable(): bool
    {
        return false;
    }

    /**
     * Seeking is never supported on a validating stream.
     *
     * @param int $offset
     * @param int $whence
     * @throws RuntimeException Always.
     */
    public function seek($offset, $whence = SEEK_SET): void
    {
        throw new RuntimeException('Seeking is not supported on a validating stream.');
    }

    /**
     * Read a chunk from the wrapped stream, hash it, and validate the
     * checksums as soon as the wrapped stream reports end-of-file.
     *
     * @param int $length
     * @return string
     * @throws UnexpectedValueException If checksum validation fails.
     */
    public function read($length): string
    {
        $chunk = $this->stream->read($length);
        $this->updateHashes($chunk);

        if (!$this->stream->eof()) {
            return $chunk;
        }

        $this->validate();

        return $chunk;
    }

    /**
     * Read everything that remains in the wrapped stream, hash it and
     * validate the checksums.
     *
     * @return string
     * @throws UnexpectedValueException If checksum validation fails.
     */
    public function getContents(): string
    {
        $remaining = $this->stream->getContents();
        $this->updateHashes($remaining);
        $this->validate();

        return $remaining;
    }

    /**
     * Feed a chunk of data into every hash context that is still enabled.
     */
    private function updateHashes(string $data)
    {
        if ($data !== '') {
            if ($this->crc32cEnabled) {
                hash_update($this->crc32cContext, $data);
            }

            if ($this->md5Enabled) {
                hash_update($this->md5Context, $data);
            }
        }
    }

    /**
     * Finalize the enabled hash contexts and compare them with the expected
     * checksums. Each check is switched off once its context has been
     * finalized, so it runs at most once.
     *
     * @throws UnexpectedValueException If checksum validation fails.
     */
    private function validate()
    {
        if ($this->crc32cEnabled) {
            $actualCrc32c = $this->finalizeDigest($this->crc32cContext);
            $this->crc32cEnabled = false; // Prevent double validation
            if ($actualCrc32c !== $this->expectedCrc32c) {
                throw new UnexpectedValueException(sprintf(
                    'CRC32C checksum mismatch. Expected: %s, Calculated: %s',
                    $this->expectedCrc32c,
                    $actualCrc32c
                ));
            }
        }

        if ($this->md5Enabled) {
            $actualMd5 = $this->finalizeDigest($this->md5Context);
            $this->md5Enabled = false; // Prevent double validation
            if ($actualMd5 !== $this->expectedMd5) {
                throw new UnexpectedValueException(sprintf(
                    'MD5 checksum mismatch. Expected: %s, Calculated: %s',
                    $this->expectedMd5,
                    $actualMd5
                ));
            }
        }
    }

    /**
     * Finalize a hash context and return its raw digest base64-encoded.
     */
    private function finalizeDigest($context)
    {
        return base64_encode(hash_final($context, true));
    }
}

0 commit comments

Comments
 (0)