Skip to content

Commit 4a31285

Browse files
authored
fix: use PyArrowFileIO for S3 access in get_dbt_model_as_dataframe (#2218)
* fix: use PyArrowFileIO for S3 access in get_dbt_model_as_dataframe
* revert
* fix: update exception handling in Glue table loading documentation
1 parent 1019b0d commit 4a31285

1 file changed

Lines changed: 18 additions & 7 deletions

File tree

packages/ol-orchestrate-lib/src/ol_orchestrate/lib/glue_helper.py

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -90,23 +90,34 @@ def create_or_update_table(
9090

9191

9292
def get_dbt_model_as_dataframe(database_name: str, table_name: str) -> pl.LazyFrame:
93-
"""Retrieve a dbt model from AWS Glue as a Polars DataFrame.
93+
"""Retrieve a dbt model from AWS Glue as a Polars LazyFrame.
9494
9595
This function fetches table metadata from AWS Glue and loads the Iceberg
96-
table data into a Polars DataFrame.
96+
table data into a Polars LazyFrame.
97+
98+
``PyArrowFileIO`` is used so that PyIceberg reads S3 data via PyArrow's
99+
native C++ S3 client instead of the default ``FsspecFileIO`` (which relies
100+
on aiobotocore / aiohttp). After the aiobotocore 3.4.0 → 3.5.0 bump
101+
deployed around 2026-04-27, botocore's lazy loader cache was populated
102+
inside aiobotocore's async event loop thread, blocking all pending S3
103+
coroutines and causing Dagster runs to hang indefinitely.
104+
``PyArrowFileIO`` bypasses aiobotocore entirely and is not affected.
97105
98106
Args:
99107
database_name: The Glue database name containing the table
100108
table_name: The name of the table to retrieve
101109
102110
Returns:
103-
A Polars DataFrame containing the table data
111+
A Polars LazyFrame containing the table data
104112
105113
Raises:
106-
KeyError: If the table metadata doesn't contain the expected fields
107-
boto3 exceptions: If the AWS Glue API call fails
114+
Exception: If loading the Iceberg table from Glue or converting it to
115+
a Polars LazyFrame fails.
108116
"""
109-
glue = GlueCatalog("default", client=boto3.client("glue", region_name="us-east-1"))
117+
glue = GlueCatalog(
118+
"default",
119+
client=boto3.client("glue", region_name="us-east-1"),
120+
**{"py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO"},
121+
)
110122
table = glue.load_table(f"{database_name}.{table_name}")
111-
112123
return table.to_polars()

0 commit comments

Comments
 (0)