88 Any ,
99 Callable ,
1010 Dict ,
11+ Generator ,
1112 Iterable ,
1213 List ,
1314 Optional ,
2425 DefaultPandasTypeConverter ,
2526 DefaultPandasUnloadTypeConverter ,
2627)
27- from pyathena .pandas .result_set import AthenaPandasResultSet
28+ from pyathena .pandas .result_set import AthenaPandasResultSet , DataFrameIterator
2829from pyathena .result_set import WithResultSet
2930
3031if TYPE_CHECKING :
3435
3536
3637class PandasCursor (BaseCursor , CursorIterator , WithResultSet ):
38+ """Cursor for handling pandas DataFrame results from Athena queries.
39+
40+ This cursor provides memory-efficient DataFrame processing with chunking support
41+ and automatic chunksize optimization for large result sets.
42+ """
43+
3744 def __init__ (
3845 self ,
3946 s3_staging_dir : Optional [str ] = None ,
@@ -52,9 +59,36 @@ def __init__(
5259 max_workers : int = (cpu_count () or 1 ) * 5 ,
5360 result_reuse_enable : bool = False ,
5461 result_reuse_minutes : int = CursorIterator .DEFAULT_RESULT_REUSE_MINUTES ,
62+ auto_optimize_chunksize : bool = False ,
5563 on_start_query_execution : Optional [Callable [[str ], None ]] = None ,
5664 ** kwargs ,
5765 ) -> None :
66+ """Initialize PandasCursor with configuration options.
67+
68+ Args:
69+ s3_staging_dir: S3 directory for query result staging.
70+ schema_name: Default schema name for queries.
71+ catalog_name: Default catalog name for queries.
72+ work_group: Athena workgroup name.
73+ poll_interval: Query polling interval in seconds.
74+ encryption_option: S3 encryption option.
75+ kms_key: KMS key for encryption.
76+ kill_on_interrupt: Cancel query on interrupt signal.
77+ unload: Use UNLOAD statement for faster result retrieval.
78+ engine: CSV parsing engine ('auto', 'c', 'python', 'pyarrow').
79+ chunksize: Number of rows per chunk for memory-efficient processing.
80+ If specified, takes precedence over auto_optimize_chunksize.
81+ block_size: S3 read block size.
82+ cache_type: S3 caching strategy.
83+ max_workers: Maximum worker threads for parallel processing.
84+ result_reuse_enable: Enable query result reuse.
85+ result_reuse_minutes: Result reuse duration in minutes.
86+ auto_optimize_chunksize: Enable automatic chunksize determination for
87+ large files. Only effective when chunksize is None.
88+ Default: False (no automatic chunking).
89+ on_start_query_execution: Callback for query start events.
90+ **kwargs: Additional arguments passed to pandas.read_csv.
91+ """
5892 super ().__init__ (
5993 s3_staging_dir = s3_staging_dir ,
6094 schema_name = schema_name ,
@@ -74,6 +108,7 @@ def __init__(
74108 self ._block_size = block_size
75109 self ._cache_type = cache_type
76110 self ._max_workers = max_workers
111+ self ._auto_optimize_chunksize = auto_optimize_chunksize
77112 self ._on_start_query_execution = on_start_query_execution
78113 self ._query_id : Optional [str ] = None
79114 self ._result_set : Optional [AthenaPandasResultSet ] = None
@@ -185,10 +220,12 @@ def execute(
185220 block_size = kwargs .pop ("block_size" , self ._block_size ),
186221 cache_type = kwargs .pop ("cache_type" , self ._cache_type ),
187222 max_workers = kwargs .pop ("max_workers" , self ._max_workers ),
223+ auto_optimize_chunksize = self ._auto_optimize_chunksize ,
188224 ** kwargs ,
189225 )
190226 else :
191227 raise OperationalError (query_execution .state_change_reason )
228+
192229 return self
193230
194231 def executemany (
@@ -231,8 +268,67 @@ def fetchall(
231268 result_set = cast (AthenaPandasResultSet , self .result_set )
232269 return result_set .fetchall ()
233270
234- def as_pandas (self ) -> "DataFrame" :
def as_pandas(self) -> Union["DataFrame", DataFrameIterator]:
    """Return the current result set in its pandas form.

    The call is delegated to ``AthenaPandasResultSet.as_pandas()``; this
    method only verifies that a result set exists first.

    Returns:
        A DataFrame when chunksize is None, or a DataFrameIterator when
        chunksize is set (the choice is made by the result set itself).

    Raises:
        ProgrammingError: If the cursor has no result set.
    """
    if self.has_result_set:
        # cast() only narrows the static type; it performs no runtime check.
        return cast(AthenaPandasResultSet, self.result_set).as_pandas()
    raise ProgrammingError("No result set.")
281+
def iter_chunks(self) -> Generator["DataFrame", None, None]:
    """Yield the query result as one or more DataFrames.

    The result of ``as_pandas()`` decides the shape of the iteration:

    - If it is a ``DataFrameIterator`` (chunked mode), every chunk it
      produces is yielded in order.
    - Otherwise the single DataFrame is yielded as the only item.

    Whether the result is chunked is decided by the result set, not here
    (presumably from the ``chunksize`` / ``auto_optimize_chunksize``
    settings supplied to the cursor -- verify against the result set).

    Because this is a generator function, nothing runs -- including the
    "No result set." check -- until the first item is requested.

    Yields:
        DataFrame: Each chunk in chunked mode, or the whole result once.

    Raises:
        ProgrammingError: If the cursor has no result set (raised on first
            iteration).

    Examples:
        cursor = connection.cursor(PandasCursor, chunksize=50000)
        cursor.execute("SELECT * FROM large_table")
        for chunk in cursor.iter_chunks():
            process_chunk(chunk)
    """
    if not self.has_result_set:
        raise ProgrammingError("No result set.")

    frames = self.as_pandas()
    if not isinstance(frames, DataFrameIterator):
        # Non-chunked mode: the whole result is a single DataFrame.
        yield frames
        return

    # Chunked mode. gc is imported lazily so the non-chunked path never
    # touches it.
    import gc

    gc_interval = 10  # run a collection after every 10th chunk
    for position, frame in enumerate(frames, start=1):
        yield frame

        # Runs once the consumer resumes the generator, i.e. after it has
        # finished with the chunk that was just handed out.
        if position % gc_interval == 0:
            gc.collect()
0 commit comments