From 5245884207357da2827146936a64ab45040b425e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:19:37 +0000 Subject: [PATCH 1/8] fix(cdk): support start_date format without microseconds in file-based connectors Co-Authored-By: Daryna Ishchenko --- .../file_based/file_based_stream_reader.py | 10 ++++- .../test_file_based_stream_reader.py | 43 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 7443dccd6..1d3224dbb 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -98,6 +98,14 @@ def get_matching_files( """ ... + @staticmethod + def _parse_start_date(start_date_str: str) -> datetime: + """Parse a start_date string, supporting both with and without microseconds.""" + try: + return datetime.strptime(start_date_str, AbstractFileBasedStreamReader.DATE_TIME_FORMAT) + except ValueError: + return datetime.strptime(start_date_str, "%Y-%m-%dT%H:%M:%SZ") + def filter_files_by_globs_and_start_date( self, files: List[RemoteFile], globs: List[str] ) -> Iterable[RemoteFile]: @@ -105,7 +113,7 @@ def filter_files_by_globs_and_start_date( Utility method for filtering files based on globs. """ start_date = ( - datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT) + self._parse_start_date(self.config.start_date) if self.config and self.config.start_date else None ) diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index 13fa1025c..d1d7dae2b 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -401,6 +401,13 @@ def documentation_url(cls) -> AnyUrl: set(), id="all_csvs_modified_before_start_date", ), + pytest.param( + ["**/*.csv"], + {"start_date": "2023-06-01T03:54:07Z", "streams": []}, + {"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, + set(), + id="all_csvs_start_date_without_microseconds", + ), pytest.param( ["**/*.csv"], {"start_date": "2023-06-05T03:54:07.000Z", "streams": []}, @@ -494,6 +501,42 @@ def test_preserve_sub_directories_scenarios( assert file_paths[AbstractFileBasedStreamReader.FILE_FOLDER] == path.dirname(source_file_path) +@pytest.mark.parametrize( + "start_date_str, expected", + [ + pytest.param( + "2025-01-01T00:00:00.000000Z", + datetime(2025, 1, 1, 0, 0, 0), + id="with_microseconds_zero", + ), + pytest.param( + "2025-06-15T12:30:45.123456Z", + datetime(2025, 6, 15, 12, 30, 45, 123456), + id="with_microseconds_nonzero", + ), + pytest.param( + "2025-01-01T00:00:00Z", + datetime(2025, 1, 1, 0, 0, 0), + id="without_microseconds", + ), + pytest.param( + "2025-12-31T23:59:59Z", + datetime(2025, 12, 31, 23, 59, 59), + id="without_microseconds_end_of_day", + ), + ], +) +def test_parse_start_date(start_date_str: str, expected: datetime) -> None: + reader = TestStreamReader() + assert reader._parse_start_date(start_date_str) == expected + + +def test_parse_start_date_invalid_raises() -> None: + reader = TestStreamReader() + with pytest.raises(ValueError): + reader._parse_start_date("not-a-date") + + def test_upload_with_file_transfer_reader(): stream_reader = TestStreamReaderWithDefaultUpload() From 3506e0531698c330a7e1f6be510336ab96516edb Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:25:21 +0000 Subject: [PATCH 2/8] refactor: remove @staticmethod from _parse_start_date, use self.DATE_TIME_FORMAT Co-Authored-By: Daryna Ishchenko --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 1d3224dbb..1aed4177a 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -98,11 +98,10 @@ def get_matching_files( """ ... - @staticmethod - def _parse_start_date(start_date_str: str) -> datetime: + def _parse_start_date(self, start_date_str: str) -> datetime: """Parse a start_date string, supporting both with and without microseconds.""" try: - return datetime.strptime(start_date_str, AbstractFileBasedStreamReader.DATE_TIME_FORMAT) + return datetime.strptime(start_date_str, self.DATE_TIME_FORMAT) except ValueError: return datetime.strptime(start_date_str, "%Y-%m-%dT%H:%M:%SZ") From 35919a3a6038b741a97df9ee952e5dd9a507b439 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:29:35 +0000 Subject: [PATCH 3/8] docs: extend _parse_start_date docstring to explain AbstractFileBasedSpec pattern_descriptor Co-Authored-By: Daryna Ishchenko --- .../sources/file_based/file_based_stream_reader.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 1aed4177a..334af723f 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -99,7 +99,14 @@ def get_matching_files( ... def _parse_start_date(self, start_date_str: str) -> datetime: - """Parse a start_date string, supporting both with and without microseconds.""" + """Parse a start_date string, supporting both with and without microseconds. + + AbstractFileBasedSpec accepts start_date in multiple formats as described by its + pattern_descriptor: "YYYY-MM-DD, YYYY-MM-DDTHH:mm:ssZ, or YYYY-MM-DDTHH:mm:ss.SSSSSSZ". + The primary format (self.DATE_TIME_FORMAT) includes microseconds, but the spec also + allows the shorter "YYYY-MM-DDTHH:mm:ssZ" variant. This method tries the primary + format first and falls back to the shorter format without microseconds. + """ try: return datetime.strptime(start_date_str, self.DATE_TIME_FORMAT) except ValueError: From b02e4ad3a6559a6d3ac659b6b4fe2a4a64d833fb Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:35:36 +0000 Subject: [PATCH 4/8] docs: add note that fallback format is only for user-provided start_date, not cursor values Co-Authored-By: Daryna Ishchenko --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 334af723f..023833e1d 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -106,6 +106,10 @@ def _parse_start_date(self, start_date_str: str) -> datetime: The primary format (self.DATE_TIME_FORMAT) includes microseconds, but the spec also allows the shorter "YYYY-MM-DDTHH:mm:ssZ" variant. This method tries the primary format first and falls back to the shorter format without microseconds. + + Note: this fallback is only relevant for start_date values provided by the user in the + connector configuration. Cursor values persisted in connector state are always formatted + using the default DATE_TIME_FORMAT (with microseconds). """ try: return datetime.strptime(start_date_str, self.DATE_TIME_FORMAT) From f476281c136e2662b8e6bd6f883ebbac7bb39d66 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 11 Mar 2026 10:53:52 +0000 Subject: [PATCH 5/8] fix: add ab_datetime_parse fallback in _parse_start_date for additional format support Co-Authored-By: Daryna Ishchenko --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 6 +++++- .../sources/file_based/test_file_based_stream_reader.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 023833e1d..4e6fbb0d4 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -24,6 +24,7 @@ from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError from airbyte_cdk.sources.file_based.file_record_data import FileRecordData from airbyte_cdk.sources.file_based.remote_file import RemoteFile, UploadableRemoteFile +from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse class FileReadMode(Enum): @@ -114,7 +115,10 @@ def _parse_start_date(self, start_date_str: str) -> datetime: try: return datetime.strptime(start_date_str, self.DATE_TIME_FORMAT) except ValueError: - return datetime.strptime(start_date_str, "%Y-%m-%dT%H:%M:%SZ") + try: + return datetime.strptime(start_date_str, "%Y-%m-%dT%H:%M:%SZ") + except ValueError: + return ab_datetime_parse(start_date_str).replace(tzinfo=None) def filter_files_by_globs_and_start_date( self, files: List[RemoteFile], globs: List[str] diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index d1d7dae2b..64d0238be 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -524,6 +524,11 @@ def test_preserve_sub_directories_scenarios( datetime(2025, 12, 31, 23, 59, 59), id="without_microseconds_end_of_day", ), + pytest.param( + "2025-01-01", + datetime(2025, 1, 1, 0, 0, 0), + id="date_only_ab_datetime_parse_fallback", + ), ], ) def test_parse_start_date(start_date_str: str, expected: datetime) -> None: From 15e17a580aac43c889a4f7a502e26791f19e6cb7 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:11:10 +0000 Subject: [PATCH 6/8] fix: convert timezone to UTC before stripping in ab_datetime_parse fallback Co-Authored-By: Daryna Ishchenko --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 6 ++++-- .../sources/file_based/test_file_based_stream_reader.py | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 4e6fbb0d4..2e0df36bc 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -5,7 +5,7 @@ import logging import time from abc import ABC, abstractmethod -from datetime import datetime +from datetime import datetime, timezone from enum import Enum from io import IOBase from os import makedirs, path @@ -118,7 +118,9 @@ def _parse_start_date(self, start_date_str: str) -> datetime: try: return datetime.strptime(start_date_str, "%Y-%m-%dT%H:%M:%SZ") except ValueError: - return ab_datetime_parse(start_date_str).replace(tzinfo=None) + return ( + ab_datetime_parse(start_date_str).astimezone(timezone.utc).replace(tzinfo=None) + ) def filter_files_by_globs_and_start_date( self, files: List[RemoteFile], globs: List[str] diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index 64d0238be..5ffae52cb 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -529,6 +529,11 @@ def test_preserve_sub_directories_scenarios( datetime(2025, 1, 1, 0, 0, 0), id="date_only_ab_datetime_parse_fallback", ), + pytest.param( + "2025-01-01T00:00:00+05:30", + datetime(2024, 12, 31, 18, 30, 0), + id="with_timezone_offset_converted_to_utc", + ), ], ) def test_parse_start_date(start_date_str: str, expected: datetime) -> None: From 09bdff2ef60c9ada5865c2b4587d65b231af1c20 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:14:20 +0000 Subject: [PATCH 7/8] docs: add comment explaining why astimezone(utc) is needed before stripping tzinfo Co-Authored-By: Daryna Ishchenko --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 2e0df36bc..f39d2860c 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -118,6 +118,10 @@ def _parse_start_date(self, start_date_str: str) -> datetime: try: return datetime.strptime(start_date_str, "%Y-%m-%dT%H:%M:%SZ") except ValueError: + # ab_datetime_parse may return a timezone-aware datetime (e.g. for inputs + # like "2025-01-01T00:00:00+05:30"). We convert to UTC first so the offset + # is applied correctly, then strip tzinfo to produce a naive UTC datetime + # compatible with RemoteFile.last_modified comparisons. return ( ab_datetime_parse(start_date_str).astimezone(timezone.utc).replace(tzinfo=None) ) From 9bcab015b0ede94f36e020a80fb20935e1aeb9bc Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:28:10 +0000 Subject: [PATCH 8/8] test: add regression test for overridden DATE_TIME_FORMAT in _parse_start_date Co-Authored-By: Daryna Ishchenko --- .../file_based/test_file_based_stream_reader.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index 5ffae52cb..a8b8e7752 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -541,6 +541,17 @@ def test_parse_start_date(start_date_str: str, expected: datetime) -> None: assert reader._parse_start_date(start_date_str) == expected +def test_parse_start_date_respects_overridden_date_time_format() -> None: + """Verify that subclasses overriding DATE_TIME_FORMAT are honored by _parse_start_date.""" + + class CustomFormatReader(TestStreamReader): + DATE_TIME_FORMAT = "custom:%Y/%m/%d %H:%M:%S" + + reader = CustomFormatReader() + + assert reader._parse_start_date("custom:2025/01/01 00:00:00") == datetime(2025, 1, 1, 0, 0, 0) + + def test_parse_start_date_invalid_raises() -> None: reader = TestStreamReader() with pytest.raises(ValueError):