Skip to content

Commit efad73e

Browse files
fix: use flexible datetime parsing for start_date in file-based connectors (#887)
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 313db66 commit efad73e

File tree

3 files changed

+64
-7
lines changed

3 files changed

+64
-7
lines changed

airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py

Lines changed: 31 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,13 @@
77
from typing import Any, Dict, List, Literal, Optional, Union
88

99
import dpath
10-
from pydantic.v1 import AnyUrl, BaseModel, Field
10+
from pydantic.v1 import AnyUrl, BaseModel, Field, validator
1111

1212
from airbyte_cdk import OneOfOptionConfig
1313
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
1414
from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
1515
from airbyte_cdk.sources.utils import schema_helpers
16+
from airbyte_cdk.utils.datetime_helpers import ab_datetime_try_parse
1617

1718

1819
class DeliverRecords(BaseModel):
@@ -53,13 +54,39 @@ class AbstractFileBasedSpec(BaseModel):
5354
start_date: Optional[str] = Field(
5455
title="Start Date",
5556
description="UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.",
56-
examples=["2021-01-01T00:00:00.000000Z"],
57+
examples=[
58+
"2021-01-01",
59+
"2021-01-01T00:00:00Z",
60+
"2021-01-01T00:00:00.000Z",
61+
"2021-01-01T00:00:00.000000Z",
62+
],
5763
format="date-time",
58-
pattern="^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$",
59-
pattern_descriptor="YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
64+
pattern=r"^[0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?(Z|[+-][0-9]{2}:[0-9]{2})?)?$",
65+
pattern_descriptor="YYYY-MM-DD, YYYY-MM-DDTHH:mm:ssZ, or YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
6066
order=1,
6167
)
6268

69+
@validator("start_date", pre=True)
70+
def validate_start_date(
71+
cls, # noqa: N805 # Pydantic validators use cls, not self
72+
v: Optional[str],
73+
) -> Optional[str]:
74+
"""Validate that start_date is a parseable datetime string.
75+
76+
Uses ab_datetime_try_parse which accepts any common ISO8601/RFC3339 format,
77+
including formats with or without microseconds (e.g., both
78+
'2021-01-01T00:00:00Z' and '2021-01-01T00:00:00.000000Z' are valid).
79+
"""
80+
if v is None:
81+
return v
82+
parsed = ab_datetime_try_parse(v)
83+
if parsed is None:
84+
raise ValueError(
85+
f"'{v}' is not a valid datetime string. "
86+
"Please use a format like '2021-01-01T00:00:00Z' or '2021-01-01T00:00:00.000000Z'."
87+
)
88+
return v
89+
6390
streams: List[FileBasedStreamConfig] = Field(
6491
title="The list of streams to sync",
6592
description='Each instance of this configuration defines a <a href="https://docs.airbyte.com/cloud/core-concepts#stream">stream</a>. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.',

unit_tests/sources/file_based/config/test_abstract_file_based_spec.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
from jsonschema import ValidationError, validate
99
from pydantic.v1 import BaseModel
1010

11+
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
1112
from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
1213
AvroFormat,
1314
CsvFormat,
@@ -40,3 +41,27 @@ def test_parquet_file_type_is_not_a_valid_csv_file_type(
4041
validate(instance=format_config[file_type], schema=file_format.schema())
4142
else:
4243
validate(instance=format_config[file_type], schema=file_format.schema())
44+
45+
46+
@pytest.mark.parametrize(
47+
"start_date, should_pass",
48+
[
49+
pytest.param("2021-01-01T00:00:00.000000Z", True, id="with_microseconds"),
50+
pytest.param("2021-01-01T00:00:00Z", True, id="without_microseconds"),
51+
pytest.param("2021-01-01T00:00:00.000Z", True, id="with_milliseconds"),
52+
pytest.param("2025-01-01T00:00:00Z", True, id="terraform_provider_format"),
53+
pytest.param("2021-01-01T00:00:00+00:00", True, id="with_timezone_offset"),
54+
pytest.param("2021-01-01", True, id="date_only"),
55+
pytest.param(None, True, id="none_value"),
56+
pytest.param("not-a-date", False, id="invalid_string"),
57+
pytest.param("", False, id="empty_string"),
58+
],
59+
)
60+
def test_start_date_validation(start_date: str, should_pass: bool) -> None:
61+
"""Test that start_date accepts various valid ISO8601/RFC3339 formats."""
62+
if should_pass:
63+
result = AbstractFileBasedSpec.validate_start_date(start_date)
64+
assert result == start_date
65+
else:
66+
with pytest.raises(ValueError, match="is not a valid datetime string"):
67+
AbstractFileBasedSpec.validate_start_date(start_date)

unit_tests/sources/file_based/scenarios/csv_scenarios.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,10 +61,15 @@
6161
"start_date": {
6262
"title": "Start Date",
6363
"description": "UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.",
64-
"examples": ["2021-01-01T00:00:00.000000Z"],
64+
"examples": [
65+
"2021-01-01",
66+
"2021-01-01T00:00:00Z",
67+
"2021-01-01T00:00:00.000Z",
68+
"2021-01-01T00:00:00.000000Z",
69+
],
6570
"format": "date-time",
66-
"pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$",
67-
"pattern_descriptor": "YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
71+
"pattern": r"^[0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?(Z|[+-][0-9]{2}:[0-9]{2})?)?$",
72+
"pattern_descriptor": "YYYY-MM-DD, YYYY-MM-DDTHH:mm:ssZ, or YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
6873
"order": 1,
6974
"type": "string",
7075
},

0 commit comments

Comments
 (0)