Skip to content

Commit f546a81

Browse files
Merge pull request #697 from pyathena-dev/fix/csv-engine-always-use-c
Always use C engine for CSV parsing (fix #696)
2 parents 981e786 + 16599b5 commit f546a81

2 files changed

Lines changed: 19 additions & 88 deletions

File tree

pyathena/pandas/result_set.py

Lines changed: 19 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -320,38 +320,33 @@ def _get_csv_engine(
320320
"""Determine the appropriate CSV engine based on configuration and compatibility.
321321
322322
Args:
323-
file_size_bytes: Size of the CSV file in bytes.
323+
file_size_bytes: Size of the CSV file in bytes. Only used for PyArrow
324+
compatibility checks (minimum file size threshold).
324325
chunksize: Chunksize parameter (overrides self._chunksize if provided).
325326
326327
Returns:
327328
CSV engine name ('pyarrow', 'c', or 'python').
328329
"""
329-
effective_chunksize = chunksize if chunksize is not None else self._chunksize
330+
if self._engine == "python":
331+
return "python"
330332

333+
# Use PyArrow only when explicitly requested and all compatibility
334+
# checks pass; otherwise fall through to the C engine default.
331335
if self._engine == "pyarrow":
332-
return self._get_pyarrow_engine(file_size_bytes, effective_chunksize)
333-
334-
if self._engine in ("c", "python"):
335-
return self._engine
336-
337-
# Auto-selection for "auto" or unknown engine values
338-
return self._get_optimal_csv_engine(file_size_bytes)
339-
340-
def _get_pyarrow_engine(self, file_size_bytes: int | None, chunksize: int | None) -> str:
341-
"""Get PyArrow engine if compatible, otherwise return optimal engine."""
342-
# Check parameter compatibility
343-
if chunksize is not None or self._quoting != 1 or self.converters:
344-
return self._get_optimal_csv_engine(file_size_bytes)
345-
346-
# Check file size compatibility
347-
if file_size_bytes is not None and file_size_bytes < self.PYARROW_MIN_FILE_SIZE_BYTES:
348-
return self._get_optimal_csv_engine(file_size_bytes)
336+
effective_chunksize = chunksize if chunksize is not None else self._chunksize
337+
is_compatible = (
338+
effective_chunksize is None
339+
and self._quoting == 1
340+
and not self.converters
341+
and (file_size_bytes is None or file_size_bytes >= self.PYARROW_MIN_FILE_SIZE_BYTES)
342+
)
343+
if is_compatible:
344+
try:
345+
return self._get_available_engine(["pyarrow"])
346+
except ImportError:
347+
pass
349348

350-
# Check availability
351-
try:
352-
return self._get_available_engine(["pyarrow"])
353-
except ImportError:
354-
return self._get_optimal_csv_engine(file_size_bytes)
349+
return "c"
355350

356351
def _get_available_engine(self, engine_candidates: list[str]) -> str:
357352
"""Get the first available engine from a list of candidates.
@@ -382,19 +377,6 @@ def _get_available_engine(self, engine_candidates: list[str]) -> str:
382377
f"{error_msgs}"
383378
)
384379

385-
def _get_optimal_csv_engine(self, file_size_bytes: int | None = None) -> str:
386-
"""Get the optimal CSV engine based on file size.
387-
388-
Args:
389-
file_size_bytes: Size of the CSV file in bytes.
390-
391-
Returns:
392-
'python' for large files (>50MB) to avoid C parser limits, otherwise 'c'.
393-
"""
394-
if file_size_bytes and file_size_bytes > self.LARGE_FILE_THRESHOLD_BYTES:
395-
return "python"
396-
return "c"
397-
398380
def _auto_determine_chunksize(self, file_size_bytes: int) -> int | None:
399381
"""Determine appropriate chunksize for large files based on file size.
400382
@@ -594,26 +576,6 @@ def _read_csv(self) -> TextFileReader | DataFrame:
594576

595577
except Exception as e:
596578
_logger.exception("Failed to read %s.", self.output_location)
597-
error_msg = str(e).lower()
598-
if any(
599-
phrase in error_msg
600-
for phrase in ["signed integer", "maximum", "overflow", "int32", "c parser"]
601-
):
602-
# Enhanced error message with specific recommendations
603-
file_mb = (length or 0) // (1024 * 1024)
604-
detailed_msg = (
605-
f"Large dataset processing error ({file_mb}MB file): {e}. "
606-
"This is likely due to pandas C parser limitations. "
607-
"Recommended solutions:\n"
608-
"1. Set chunksize: cursor = connection.cursor(PandasCursor, chunksize=50000)\n"
609-
"2. Enable auto-optimization: "
610-
"cursor = connection.cursor(PandasCursor, auto_optimize_chunksize=True)\n"
611-
"3. Use PyArrow engine: "
612-
"cursor = connection.cursor(PandasCursor, engine='pyarrow')\n"
613-
"4. Use Python engine: "
614-
"cursor = connection.cursor(PandasCursor, engine='python')"
615-
)
616-
raise OperationalError(detailed_msg) from e
617579
raise OperationalError(*e.args) from e
618580

619581
def _read_parquet(self, engine) -> DataFrame:

tests/pyathena/pandas/test_cursor.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -658,23 +658,6 @@ def test_no_ops(self):
658658
cursor.close()
659659
conn.close()
660660

661-
def test_get_optimal_csv_engine(self):
662-
"""Test _get_optimal_csv_engine method behavior."""
663-
664-
# Mock the parent class initialization
665-
with patch("pyathena.pandas.result_set.AthenaResultSet.__init__"):
666-
result_set = AthenaPandasResultSet.__new__(AthenaPandasResultSet)
667-
result_set._engine = "auto"
668-
result_set._chunksize = None # No chunking by default
669-
670-
# Small file should prefer C engine (compatibility-first approach)
671-
engine = result_set._get_optimal_csv_engine(1024) # 1KB
672-
assert engine == "c"
673-
674-
# Large file should prefer Python engine (avoid C parser int32 limits)
675-
engine = result_set._get_optimal_csv_engine(100 * 1024 * 1024) # 100MB
676-
assert engine == "python"
677-
678661
def test_auto_determine_chunksize(self):
679662
"""Test _auto_determine_chunksize method behavior."""
680663

@@ -747,70 +730,56 @@ def test_get_csv_engine_explicit_specification(self):
747730

748731
# Test PyArrow with incompatible chunksize (via parameter)
749732
with (
750-
patch.object(result_set, "_get_available_engine", return_value="pyarrow"),
751733
patch.object(
752734
type(result_set), "converters", new_callable=PropertyMock, return_value={}
753735
),
754-
patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
755736
):
756737
engine = result_set._get_csv_engine(chunksize=1000)
757738
assert engine == "c"
758-
mock_opt.assert_called_once()
759739

760740
# Test PyArrow with incompatible chunksize (via instance variable)
761741
result_set._chunksize = 1000
762742
with (
763-
patch.object(result_set, "_get_available_engine", return_value="pyarrow"),
764743
patch.object(
765744
type(result_set), "converters", new_callable=PropertyMock, return_value={}
766745
),
767-
patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
768746
):
769747
engine = result_set._get_csv_engine()
770748
assert engine == "c"
771-
mock_opt.assert_called_once()
772749

773750
# Test PyArrow with incompatible quoting
774751
result_set._chunksize = None
775752
result_set._quoting = 0 # Non-default quoting
776753
with (
777-
patch.object(result_set, "_get_available_engine", return_value="pyarrow"),
778754
patch.object(
779755
type(result_set), "converters", new_callable=PropertyMock, return_value={}
780756
),
781-
patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
782757
):
783758
engine = result_set._get_csv_engine()
784759
assert engine == "c"
785-
mock_opt.assert_called_once()
786760

787761
# Test PyArrow with incompatible converters
788762
result_set._quoting = 1 # Reset to default
789763
with (
790-
patch.object(result_set, "_get_available_engine", return_value="pyarrow"),
791764
patch.object(
792765
type(result_set),
793766
"converters",
794767
new_callable=PropertyMock,
795768
return_value={"col1": str},
796769
),
797-
patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
798770
):
799771
engine = result_set._get_csv_engine()
800772
assert engine == "c"
801-
mock_opt.assert_called_once()
802773

803774
# Test PyArrow specification (when unavailable)
804775
with (
805776
patch.object(result_set, "_get_available_engine", side_effect=ImportError),
806777
patch.object(
807778
type(result_set), "converters", new_callable=PropertyMock, return_value={}
808779
),
809-
patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
810780
):
811781
engine = result_set._get_csv_engine()
812782
assert engine == "c"
813-
mock_opt.assert_called_once()
814783

815784
@pytest.mark.parametrize(
816785
("pandas_cursor", "parquet_engine", "chunksize"),

0 commit comments

Comments
 (0)