Skip to content

Commit 6876663

Browse files
fix(cdk): support start_date format without microseconds in file-based connectors (#945)
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 3e65ad5 commit 6876663

File tree

2 files changed

+94
-2
lines changed

2 files changed

+94
-2
lines changed

airbyte_cdk/sources/file_based/file_based_stream_reader.py

Lines changed: 30 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import logging
66
import time
77
from abc import ABC, abstractmethod
8-
from datetime import datetime
8+
from datetime import datetime, timezone
99
from enum import Enum
1010
from io import IOBase
1111
from os import makedirs, path
@@ -24,6 +24,7 @@
2424
from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
2525
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
2626
from airbyte_cdk.sources.file_based.remote_file import RemoteFile, UploadableRemoteFile
27+
from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse
2728

2829

2930
class FileReadMode(Enum):
@@ -98,14 +99,41 @@ def get_matching_files(
9899
"""
99100
...
100101

102+
def _parse_start_date(self, start_date_str: str) -> datetime:
    """Turn a configured ``start_date`` string into a naive datetime.

    AbstractFileBasedSpec's pattern_descriptor advertises three accepted
    shapes: "YYYY-MM-DD, YYYY-MM-DDTHH:mm:ssZ, or YYYY-MM-DDTHH:mm:ss.SSSSSSZ".
    Parsing is attempted in this order:

    1. ``self.DATE_TIME_FORMAT`` -- the primary format (with microseconds),
       which subclasses may override.
    2. ``"%Y-%m-%dT%H:%M:%SZ"`` -- the shorter variant without microseconds.
    3. ``ab_datetime_parse`` -- a last resort for anything else (for example
       a date-only value or a value carrying an explicit UTC offset).

    The relaxed parsing only matters for user-supplied ``start_date`` values
    from the connector configuration; cursor values persisted in connector
    state are always written with the default DATE_TIME_FORMAT.

    Args:
        start_date_str: The raw ``start_date`` value to parse.

    Returns:
        A timezone-naive ``datetime``. Values handled by the
        ``ab_datetime_parse`` fallback are converted to UTC before the
        tzinfo is dropped.

    Raises:
        ValueError: If none of the strategies can parse the string
            (raised by the ``ab_datetime_parse`` fallback).
    """
    strptime_formats = (self.DATE_TIME_FORMAT, "%Y-%m-%dT%H:%M:%SZ")
    for candidate_format in strptime_formats:
        try:
            return datetime.strptime(start_date_str, candidate_format)
        except ValueError:
            # This format does not match; move on to the next candidate.
            continue

    # ab_datetime_parse may hand back a timezone-aware datetime (e.g. for
    # "2025-01-01T00:00:00+05:30"). Shift it to UTC first so the offset is
    # honoured, then drop tzinfo so the result is a naive UTC datetime that
    # can be compared with RemoteFile.last_modified.
    aware_value = ab_datetime_parse(start_date_str)
    utc_value = aware_value.astimezone(timezone.utc)
    return utc_value.replace(tzinfo=None)
128+
101129
def filter_files_by_globs_and_start_date(
102130
self, files: List[RemoteFile], globs: List[str]
103131
) -> Iterable[RemoteFile]:
104132
"""
105133
Utility method for filtering files based on globs.
106134
"""
107135
start_date = (
108-
datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
136+
self._parse_start_date(self.config.start_date)
109137
if self.config and self.config.start_date
110138
else None
111139
)

unit_tests/sources/file_based/test_file_based_stream_reader.py

Lines changed: 64 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -401,6 +401,13 @@ def documentation_url(cls) -> AnyUrl:
401401
set(),
402402
id="all_csvs_modified_before_start_date",
403403
),
404+
pytest.param(
405+
["**/*.csv"],
406+
{"start_date": "2023-06-01T03:54:07Z", "streams": []},
407+
{"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"},
408+
set(),
409+
id="all_csvs_start_date_without_microseconds",
410+
),
404411
pytest.param(
405412
["**/*.csv"],
406413
{"start_date": "2023-06-05T03:54:07.000Z", "streams": []},
@@ -494,6 +501,63 @@ def test_preserve_sub_directories_scenarios(
494501
assert file_paths[AbstractFileBasedStreamReader.FILE_FOLDER] == path.dirname(source_file_path)
495502

496503

504+
@pytest.mark.parametrize(
    ("start_date_str", "expected"),
    [
        ("2025-01-01T00:00:00.000000Z", datetime(2025, 1, 1, 0, 0, 0)),
        ("2025-06-15T12:30:45.123456Z", datetime(2025, 6, 15, 12, 30, 45, 123456)),
        ("2025-01-01T00:00:00Z", datetime(2025, 1, 1, 0, 0, 0)),
        ("2025-12-31T23:59:59Z", datetime(2025, 12, 31, 23, 59, 59)),
        ("2025-01-01", datetime(2025, 1, 1, 0, 0, 0)),
        ("2025-01-01T00:00:00+05:30", datetime(2024, 12, 31, 18, 30, 0)),
    ],
    ids=[
        "with_microseconds_zero",
        "with_microseconds_nonzero",
        "without_microseconds",
        "without_microseconds_end_of_day",
        "date_only_ab_datetime_parse_fallback",
        "with_timezone_offset_converted_to_utc",
    ],
)
def test_parse_start_date(start_date_str: str, expected: datetime) -> None:
    """Check that each supported start_date shape parses to the expected naive datetime."""
    parsed = TestStreamReader()._parse_start_date(start_date_str)
    assert parsed == expected
542+
543+
544+
def test_parse_start_date_respects_overridden_date_time_format() -> None:
    """A DATE_TIME_FORMAT overridden on a subclass must be used by _parse_start_date."""

    class CustomFormatReader(TestStreamReader):
        # Deliberately unlike the default format, so only the override can match.
        DATE_TIME_FORMAT = "custom:%Y/%m/%d %H:%M:%S"

    parsed = CustomFormatReader()._parse_start_date("custom:2025/01/01 00:00:00")
    expected = datetime(2025, 1, 1, 0, 0, 0)

    assert parsed == expected
553+
554+
555+
def test_parse_start_date_invalid_raises() -> None:
    """A string that no parsing strategy accepts must raise ValueError."""
    unparseable_value = "not-a-date"
    stream_reader = TestStreamReader()

    with pytest.raises(ValueError):
        stream_reader._parse_start_date(unparseable_value)
559+
560+
497561
def test_upload_with_file_transfer_reader():
498562
stream_reader = TestStreamReaderWithDefaultUpload()
499563

0 commit comments

Comments
 (0)