Commit 7a05d4f

Merge pull request #598 from laughingman7743/feature/pandas-memory-optimization

feat: enhance PandasCursor with memory optimization and chunked processing

2 parents 35531a7 + 657c8e5, commit 7a05d4f

4 files changed: 474 additions, 38 deletions

docs/pandas.rst (116 additions, 0 deletions)
@@ -397,6 +397,27 @@ This object has exactly the same interface as the ``TextFileReader`` object and
     print(df.describe())
     print(df.head())
 
+**Memory-efficient iteration with iter_chunks()**
+
+PandasCursor provides an ``iter_chunks()`` method for convenient chunked processing:
+
+.. code:: python
+
+    from pyathena import connect
+    from pyathena.pandas.cursor import PandasCursor
+
+    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
+                     region_name="us-west-2",
+                     cursor_class=PandasCursor).cursor()
+
+    # Process large dataset in chunks
+    cursor.execute("SELECT * FROM large_table", chunksize=50_000)
+    for chunk in cursor.iter_chunks():
+        # Process each chunk
+        processed = chunk.groupby('category').sum()
+        # Memory can be freed after each chunk
+        del chunk
+
 You can also concatenate them into a single `pandas.DataFrame object`_ using `pandas.concat`_.
 
 .. code:: python
@@ -427,6 +448,101 @@ When all rows have been read, calling the ``get_chunk`` method will raise ``Stop
     df_iter.get_chunk(10)
     df_iter.get_chunk(10)  # raise StopIteration
 
+**Auto-optimization of chunksize**
+
+When enabled, PandasCursor can automatically determine an optimal chunksize based on the size of the result file:
+
+.. code:: python
+
+    from pyathena import connect
+    from pyathena.pandas.cursor import PandasCursor
+
+    # Enable auto-optimization (chunksize will be determined automatically for large files)
+    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
+                     region_name="us-west-2",
+                     cursor_class=PandasCursor).cursor(auto_optimize_chunksize=True)
+
+    # For large files, chunksize will be set automatically based on file size
+    cursor.execute("SELECT * FROM very_large_table")
+    for chunk in cursor.iter_chunks():
+        process_chunk(chunk)
+
+**Priority of chunksize settings:**
+
+1. **Explicit chunksize** (highest priority): Always respected
+2. **auto_optimize_chunksize=True**: Automatic determination for large files
+3. **auto_optimize_chunksize=False** (default): No chunking; load the entire DataFrame
+
+.. code:: python
+
+    # Explicit chunksize always takes precedence
+    cursor = connection.cursor(PandasCursor, chunksize=50_000, auto_optimize_chunksize=True)
+    # Will use chunksize=50_000; auto-optimization is ignored
+
+    # Auto-optimization applies only when chunksize is not specified
+    cursor = connection.cursor(PandasCursor, auto_optimize_chunksize=True)
+    # Will determine chunksize automatically for large files
+
+    # Default behavior - no chunking
+    cursor = connection.cursor(PandasCursor)
+    # Will load the entire DataFrame regardless of file size
+
+You can customize the automatic chunksize determination by modifying class attributes:
+
+.. code:: python
+
+    from pyathena.pandas.result_set import AthenaPandasResultSet
+
+    # Customize thresholds and chunk sizes for your use case
+    AthenaPandasResultSet.LARGE_FILE_THRESHOLD_BYTES = 100 * 1024 * 1024  # 100MB
+    AthenaPandasResultSet.AUTO_CHUNK_SIZE_LARGE = 200_000  # Larger chunks
+    AthenaPandasResultSet.AUTO_CHUNK_SIZE_MEDIUM = 100_000
+
+**Performance tuning options**
+
+PandasCursor accepts additional ``pandas.read_csv()`` options for performance optimization:
+
+.. code:: python
+
+    from pyathena import connect
+    from pyathena.pandas.cursor import PandasCursor
+
+    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
+                     region_name="us-west-2",
+                     cursor_class=PandasCursor).cursor()
+
+    # High-performance reading with the PyArrow engine
+    cursor.execute("SELECT * FROM large_table",
+                   engine="pyarrow",
+                   chunksize=100_000,
+                   use_threads=True)
+
+    # Memory-conscious reading with the Python engine
+    cursor.execute("SELECT * FROM huge_table",
+                   engine="python",
+                   chunksize=25_000,
+                   low_memory=True)
+
+    # Fine-tuned C engine with a custom buffer
+    cursor.execute("SELECT * FROM data_table",
+                   engine="c",
+                   chunksize=50_000,
+                   buffer_lines=100_000)
+
+    # Explicit data types for better performance
+    cursor.execute("SELECT * FROM typed_table",
+                   dtype={'col1': 'int64', 'col2': 'float32'},
+                   parse_dates=['timestamp_col'])
+
+Common performance options:
+
+- ``engine``: CSV parsing engine ('c', 'python', 'pyarrow')
+- ``use_threads``: Enable threading for the PyArrow engine
+- ``low_memory``: Use low-memory mode for the Python engine
+- ``buffer_lines``: Buffer size for the C engine
+- ``dtype``: Explicit column data types
+- ``parse_dates``: Columns to parse as dates
+
 Unload options
 ~~~~~~~~~~~~~~

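The docs above expose three tuning attributes (``LARGE_FILE_THRESHOLD_BYTES``, ``AUTO_CHUNK_SIZE_LARGE``, ``AUTO_CHUNK_SIZE_MEDIUM``) but leave the selection rule implicit. The following is a minimal sketch of how such a threshold-based choice could work; ``pick_chunksize`` is a hypothetical helper (the actual logic lives in ``AthenaPandasResultSet``), and the medium-file threshold is an assumed value, not taken from this commit:

```python
from typing import Optional

# Hypothetical sketch of threshold-based chunksize selection. The names
# LARGE_FILE_THRESHOLD_BYTES and AUTO_CHUNK_SIZE_* mirror the class attributes
# documented above; MEDIUM_FILE_THRESHOLD_BYTES is an assumed extra tier.
LARGE_FILE_THRESHOLD_BYTES = 100 * 1024 * 1024   # 100MB, as in the docs example
MEDIUM_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024   # assumed 10MB medium tier
AUTO_CHUNK_SIZE_LARGE = 200_000
AUTO_CHUNK_SIZE_MEDIUM = 100_000


def pick_chunksize(file_size_bytes: int) -> Optional[int]:
    """Return an auto-selected chunksize, or None to load the whole DataFrame."""
    if file_size_bytes >= LARGE_FILE_THRESHOLD_BYTES:
        return AUTO_CHUNK_SIZE_LARGE
    if file_size_bytes >= MEDIUM_FILE_THRESHOLD_BYTES:
        return AUTO_CHUNK_SIZE_MEDIUM
    return None  # small file: no chunking needed
```

Raising the thresholds trades memory for fewer parse passes; the priority rules above still apply, since an explicit ``chunksize`` bypasses this selection entirely.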
pyathena/pandas/cursor.py (98 additions, 2 deletions)
@@ -8,6 +8,7 @@
     Any,
     Callable,
     Dict,
+    Generator,
     Iterable,
     List,
     Optional,
@@ -24,7 +25,7 @@
     DefaultPandasTypeConverter,
     DefaultPandasUnloadTypeConverter,
 )
-from pyathena.pandas.result_set import AthenaPandasResultSet
+from pyathena.pandas.result_set import AthenaPandasResultSet, DataFrameIterator
 from pyathena.result_set import WithResultSet
 
 if TYPE_CHECKING:
@@ -34,6 +35,12 @@
 
 
 class PandasCursor(BaseCursor, CursorIterator, WithResultSet):
+    """Cursor for handling pandas DataFrame results from Athena queries.
+
+    This cursor provides memory-efficient DataFrame processing with chunking support
+    and automatic chunksize optimization for large result sets.
+    """
+
     def __init__(
         self,
         s3_staging_dir: Optional[str] = None,
@@ -52,9 +59,36 @@ def __init__(
         max_workers: int = (cpu_count() or 1) * 5,
         result_reuse_enable: bool = False,
         result_reuse_minutes: int = CursorIterator.DEFAULT_RESULT_REUSE_MINUTES,
+        auto_optimize_chunksize: bool = False,
         on_start_query_execution: Optional[Callable[[str], None]] = None,
         **kwargs,
     ) -> None:
+        """Initialize PandasCursor with configuration options.
+
+        Args:
+            s3_staging_dir: S3 directory for query result staging.
+            schema_name: Default schema name for queries.
+            catalog_name: Default catalog name for queries.
+            work_group: Athena workgroup name.
+            poll_interval: Query polling interval in seconds.
+            encryption_option: S3 encryption option.
+            kms_key: KMS key for encryption.
+            kill_on_interrupt: Cancel query on interrupt signal.
+            unload: Use UNLOAD statement for faster result retrieval.
+            engine: CSV parsing engine ('auto', 'c', 'python', 'pyarrow').
+            chunksize: Number of rows per chunk for memory-efficient processing.
+                If specified, takes precedence over auto_optimize_chunksize.
+            block_size: S3 read block size.
+            cache_type: S3 caching strategy.
+            max_workers: Maximum worker threads for parallel processing.
+            result_reuse_enable: Enable query result reuse.
+            result_reuse_minutes: Result reuse duration in minutes.
+            auto_optimize_chunksize: Enable automatic chunksize determination for
+                large files. Only effective when chunksize is None.
+                Default: False (no automatic chunking).
+            on_start_query_execution: Callback for query start events.
+            **kwargs: Additional arguments passed to pandas.read_csv.
+        """
         super().__init__(
             s3_staging_dir=s3_staging_dir,
             schema_name=schema_name,
@@ -74,6 +108,7 @@ def __init__(
         self._block_size = block_size
         self._cache_type = cache_type
         self._max_workers = max_workers
+        self._auto_optimize_chunksize = auto_optimize_chunksize
         self._on_start_query_execution = on_start_query_execution
         self._query_id: Optional[str] = None
         self._result_set: Optional[AthenaPandasResultSet] = None
@@ -185,10 +220,12 @@ def execute(
                 block_size=kwargs.pop("block_size", self._block_size),
                 cache_type=kwargs.pop("cache_type", self._cache_type),
                 max_workers=kwargs.pop("max_workers", self._max_workers),
+                auto_optimize_chunksize=self._auto_optimize_chunksize,
                 **kwargs,
             )
         else:
             raise OperationalError(query_execution.state_change_reason)
+
         return self
 
     def executemany(
@@ -231,8 +268,67 @@ def fetchall(
         result_set = cast(AthenaPandasResultSet, self.result_set)
         return result_set.fetchall()
 
-    def as_pandas(self) -> "DataFrame":
+    def as_pandas(self) -> Union["DataFrame", DataFrameIterator]:
+        """Return a DataFrame or DataFrameIterator based on the chunksize setting.
+
+        Returns:
+            DataFrame when chunksize is None, DataFrameIterator when chunksize is set.
+        """
         if not self.has_result_set:
             raise ProgrammingError("No result set.")
         result_set = cast(AthenaPandasResultSet, self.result_set)
         return result_set.as_pandas()
+
+    def iter_chunks(self) -> Generator["DataFrame", None, None]:
+        """Iterate over DataFrame chunks for memory-efficient processing.
+
+        This method provides an iterator interface for processing large result sets
+        in chunks, preventing memory exhaustion when working with datasets that are
+        too large to fit in memory as a single DataFrame.
+
+        Chunking behavior:
+        - If chunksize is explicitly set, uses that value
+        - If auto_optimize_chunksize=True and chunksize=None, automatically determines
+          optimal chunksize based on file size
+        - If auto_optimize_chunksize=False and chunksize=None, yields the entire DataFrame
+
+        Yields:
+            DataFrame: Individual chunks of the result set when chunking is enabled,
+                or the entire DataFrame as a single chunk when chunking is disabled.
+
+        Examples:
+            # Explicit chunksize
+            cursor = connection.cursor(PandasCursor, chunksize=50000)
+            cursor.execute("SELECT * FROM large_table")
+            for chunk in cursor.iter_chunks():
+                process_chunk(chunk)
+
+            # Auto-optimization enabled
+            cursor = connection.cursor(PandasCursor, auto_optimize_chunksize=True)
+            cursor.execute("SELECT * FROM large_table")
+            for chunk in cursor.iter_chunks():
+                process_chunk(chunk)  # Chunks determined automatically for large files
+
+            # No chunking (default behavior)
+            cursor = connection.cursor(PandasCursor)
+            cursor.execute("SELECT * FROM large_table")
+            for chunk in cursor.iter_chunks():
+                process_chunk(chunk)  # Single DataFrame regardless of size
+        """
+        if not self.has_result_set:
+            raise ProgrammingError("No result set.")
+
+        result = self.as_pandas()
+        if isinstance(result, DataFrameIterator):
+            # It's an iterator (chunked mode)
+            import gc
+
+            for chunk_count, chunk in enumerate(result, 1):
+                yield chunk
+
+                # Suggest garbage collection every 10 chunks for large datasets
+                if chunk_count % 10 == 0:
+                    gc.collect()
+        else:
+            # Single DataFrame - yield as one chunk
+            yield result

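The ``iter_chunks()`` API added above pairs naturally with partial aggregation: compute a per-chunk groupby, then combine the partial results with ``pandas.concat``, so only small intermediate Series stay in memory. A minimal pure-pandas sketch of the pattern (small in-memory DataFrames stand in for the cursor's chunks; no Athena connection or ``aggregate_chunks`` helper exists in this commit):

```python
import pandas as pd


def aggregate_chunks(chunks):
    """Sum 'value' per 'category' across an iterable of DataFrame chunks."""
    # Per-chunk partial aggregation keeps only small Series in memory.
    partials = [chunk.groupby("category")["value"].sum() for chunk in chunks]
    # Combine the partial sums into the final per-category totals.
    return pd.concat(partials).groupby(level=0).sum()


# Two small chunks standing in for cursor.iter_chunks() output.
chunks = [
    pd.DataFrame({"category": ["a", "b"], "value": [1, 2]}),
    pd.DataFrame({"category": ["a", "b"], "value": [3, 4]}),
]
totals = aggregate_chunks(chunks)  # a -> 4, b -> 6
```

In real use, ``cursor.iter_chunks()`` would replace the ``chunks`` list; the combine step works because summing partial sums is associative, which is what makes chunked aggregation safe here.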