Skip to content

Commit e622087

Browse files
fix(decoder): auto-detect gzip magic bytes for responses without Content-Encoding header
Co-Authored-By: syed.khadeer@airbyte.io <cloud-support@airbyte.io>
1 parent cd7e369 commit e622087

2 files changed

Lines changed: 22 additions & 12 deletions

File tree

airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,8 @@
1111
from io import BufferedIOBase, TextIOWrapper
1212
from typing import Any, List, Optional
1313

14+
GZIP_MAGIC_BYTES = b"\x1f\x8b"
15+
1416
import orjson
1517
import requests
1618

@@ -35,15 +37,22 @@ def parse(self, data: BufferedIOBase) -> PARSER_OUTPUT_TYPE:
3537
"""
3638
Decompress gzipped bytes and pass decompressed data to the inner parser.
3739
38-
IMPORTANT:
39-
- If the data is not gzipped, reset the pointer and pass the data to the inner parser as is.
40-
41-
Note:
42-
- The data is not decoded by default.
40+
Auto-detects gzip content by checking for magic bytes (1f 8b) at the start of the data.
41+
If the data is not gzip-compressed, it is passed directly to the inner parser as-is.
42+
This handles APIs that return gzip-compressed bodies without setting the Content-Encoding header.
4343
"""
44+
header = data.read(2)
45+
if not header:
46+
return
4447

45-
with gzip.GzipFile(fileobj=data, mode="rb") as gzipobj:
46-
yield from self.inner_parser.parse(gzipobj)
48+
remaining = data.read()
49+
full_data = io.BytesIO(header + remaining)
50+
51+
if header == GZIP_MAGIC_BYTES:
52+
with gzip.GzipFile(fileobj=full_data, mode="rb") as gzipobj:
53+
yield from self.inner_parser.parse(gzipobj)
54+
else:
55+
yield from self.inner_parser.parse(full_data)
4756

4857

4958
@dataclass

airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2634,15 +2634,16 @@ def create_gzip_decoder(
26342634
gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser
26352635

26362636
if self._emit_connector_builder_messages:
2637-
# This is very surprising but if the response is not streamed,
2638-
# CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw,
2639-
# which uses urllib3 directly and does not uncompress the data.
2640-
return CompositeRawDecoder(gzip_parser.inner_parser, False)
2637+
# When not streaming, CompositeRawDecoder uses response.content which the requests
2638+
# library auto-decompresses when Content-Encoding is set. However, some APIs return
2639+
# gzip data without Content-Encoding headers. Using gzip_parser (which auto-detects
2640+
# gzip magic bytes) ensures decompression works in both cases.
2641+
return CompositeRawDecoder(gzip_parser, False)
26412642

26422643
return CompositeRawDecoder.by_headers(
26432644
[({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)],
26442645
stream_response=True,
2645-
fallback_parser=gzip_parser.inner_parser,
2646+
fallback_parser=gzip_parser,
26462647
)
26472648

26482649
@staticmethod

0 commit comments

Comments
 (0)