Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "xml2db"
version = "0.13.2"
version = "0.13.3"
authors = [
{ name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
]
Expand Down
2 changes: 1 addition & 1 deletion src/xml2db/dialect/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def bulk_insert(self, conn: Any, table: Any, records: list) -> None:
sql = text(
f"INSERT INTO {full_name} ({insert_cols}) "
f"SELECT {select_exprs} "
f"FROM read_csv('{safe_path}', header=true, nullstr='', all_varchar=true)"
f"FROM read_csv('{safe_path}', header=true, nullstr='', all_varchar=true, quote='\"', escape='\"')"
)
conn.execute(sql)
finally:
Expand Down
21 changes: 21 additions & 0 deletions tests/test_bulk_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,27 @@ def test_duckdb_bulk_insert_scalar_column_default(duckdb_engine):
assert rows[1]["flag"] is False


def test_duckdb_bulk_insert_quoted_csv_field_after_large_unquoted_sample(duckdb_engine):
    """Regression test: a quoted CSV cell appearing after many unquoted rows.

    DuckDB's CSV sniffer only inspects roughly the first 20k rows. When none of
    those rows is quoted it concludes there is no quote character, and a later
    row whose value contains a comma (which csv.writer emits as a quoted field)
    then fails with a column-count error. Passing quote='"' explicitly to
    read_csv sidesteps the auto-detection, so this round trip must succeed.
    """
    # More plain rows than the sniffer samples, so the quoted row sits outside
    # the sampled region.
    plain_row_count = 25_000
    # The 'join' transform in document.py can yield strings shaped like this;
    # csv.writer wraps such a value in outer quotes, producing a quoted cell.
    tricky_label = '"val,ue",other_value'

    target_table = _make_table(duckdb_engine, "quoted_field_test")

    payload = [{"id": row_id, "label": "simple"} for row_id in range(plain_row_count)]
    payload.append({"id": plain_row_count, "label": tricky_label})

    fetched = _roundtrip(duckdb_engine, target_table, payload)

    assert len(fetched) == plain_row_count + 1
    assert fetched[-1]["label"] == tricky_label


def test_duckdb_bulk_insert_empty(duckdb_engine):
table = _make_table(duckdb_engine, "empty_test")
dialect = DuckDBDialect()
Expand Down
Loading