Skip to content
This repository was archived by the owner on Nov 12, 2025. It is now read-only.

Commit 56d1b1f

Browse files
authored
fix: handle consuming streams with no data (#29)
1 parent 77b373b commit 56d1b1f

2 files changed

Lines changed: 146 additions & 5 deletions

File tree

google/cloud/bigquery_storage_v1/reader.py

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,13 @@ def to_arrow(self):
287287
record_batches = []
288288
for page in self.pages:
289289
record_batches.append(page.to_arrow())
290-
return pyarrow.Table.from_batches(record_batches)
290+
291+
if record_batches:
292+
return pyarrow.Table.from_batches(record_batches)
293+
294+
# No data, return an empty Table.
295+
self._stream_parser._parse_arrow_schema()
296+
return pyarrow.Table.from_batches([], schema=self._stream_parser._schema)
291297

292298
def to_dataframe(self, dtypes=None):
293299
"""Create a :class:`pandas.DataFrame` of all rows in the stream.
@@ -323,17 +329,66 @@ def to_dataframe(self, dtypes=None):
323329
# rarely no-copy, whereas pyarrow.Table.from_batches + to_pandas is
324330
# usually no-copy.
325331
schema_type = self._read_session.WhichOneof("schema")
332+
326333
if schema_type == "arrow_schema":
327334
record_batch = self.to_arrow()
328335
df = record_batch.to_pandas()
329336
for column in dtypes:
330337
df[column] = pandas.Series(df[column], dtype=dtypes[column])
331338
return df
332339

333-
frames = []
334-
for page in self.pages:
335-
frames.append(page.to_dataframe(dtypes=dtypes))
336-
return pandas.concat(frames)
340+
frames = [page.to_dataframe(dtypes=dtypes) for page in self.pages]
341+
342+
if frames:
343+
return pandas.concat(frames)
344+
345+
# No data, construct an empty dataframe with columns matching the schema.
346+
# The result should be consistent with what an empty ARROW stream would produce.
347+
self._stream_parser._parse_avro_schema()
348+
schema = self._stream_parser._avro_schema_json
349+
350+
column_dtypes = self._dtypes_from_avro(schema["fields"])
351+
column_dtypes.update(dtypes)
352+
353+
df = pandas.DataFrame(columns=column_dtypes.keys())
354+
for column in df:
355+
df[column] = pandas.Series([], dtype=column_dtypes[column])
356+
357+
return df
358+
359+
def _dtypes_from_avro(self, avro_fields):
    """Determine Pandas dtypes for columns in Avro schema.

    Args:
        avro_fields (Iterable[Mapping[str, Any]]):
            Avro fields' metadata. Each item must provide a ``"name"``
            and a ``"type"`` entry; the type may be a plain type name,
            a mapping describing a complex/logical type, or a list
            (union) of those.

    Returns:
        collections.OrderedDict[str, str]:
            Column names with their corresponding Pandas dtypes, in the
            same order as ``avro_fields``.
    """
    result = collections.OrderedDict()

    type_map = {"long": "int64", "double": "float64", "boolean": "bool"}

    for field_info in avro_fields:
        # Always start from this field's own type so that non-union fields
        # do not reuse (or lack) a value left over from a previous iteration.
        type_info = field_info["type"]

        # If a type is a union of multiple types, pick the first type
        # that is not "null". A union holding nothing but "null" falls
        # back to "null", which maps to the generic "object" dtype below.
        if isinstance(type_info, list):
            type_info = next(
                (item for item in type_info if item != "null"), "null"
            )

        if isinstance(type_info, dict):
            # Complex types are described by a mapping; only the
            # timestamp logical type gets a dedicated dtype.
            logical_type = type_info.get("logicalType")
            if logical_type == "timestamp-micros":
                field_dtype = "datetime64[ns, UTC]"
            else:
                field_dtype = "object"
        else:
            # Plain type name; anything not listed in type_map is "object".
            field_dtype = type_map.get(type_info, "object")

        result[field_info["name"]] = field_dtype

    return result
337392

338393

339394
class ReadRowsPage(object):

tests/unit/test_reader_v1.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,92 @@ def test_to_dataframe_w_dtypes_arrow(class_under_test):
645645
)
646646

647647

648+
def test_to_dataframe_empty_w_scalars_avro(class_under_test, mock_client):
    """An Avro stream with no blocks yields an empty, correctly typed frame."""
    avro_schema = _bq_to_avro_schema(SCALAR_COLUMNS)
    read_session = _generate_avro_read_session(avro_schema)
    avro_blocks = _bq_to_avro_blocks([], avro_schema)
    # mock_client is requested as a fixture argument, like the sibling
    # *_w_dtypes_* tests, rather than picked up as a module-level name.
    reader = class_under_test(avro_blocks, mock_client, "", 0, {})

    got = reader.to_dataframe(read_session)

    expected = pandas.DataFrame(columns=SCALAR_COLUMN_NAMES)
    expected["int_col"] = expected["int_col"].astype("int64")
    expected["float_col"] = expected["float_col"].astype("float64")
    expected["bool_col"] = expected["bool_col"].astype("bool")
    expected["ts_col"] = expected["ts_col"].astype("datetime64[ns, UTC]")

    pandas.testing.assert_frame_equal(
        got.reset_index(drop=True),  # reset_index to ignore row labels
        expected.reset_index(drop=True),
    )
666+
667+
668+
def test_to_dataframe_empty_w_scalars_arrow(class_under_test, mock_client):
    """An Arrow stream with no batches yields an empty, correctly typed frame."""
    arrow_schema = _bq_to_arrow_schema(SCALAR_COLUMNS)
    read_session = _generate_arrow_read_session(arrow_schema)
    arrow_batches = _bq_to_arrow_batches([], arrow_schema)
    # mock_client is requested as a fixture argument, like the sibling
    # *_w_dtypes_* tests, rather than picked up as a module-level name.
    reader = class_under_test(arrow_batches, mock_client, "", 0, {})

    got = reader.to_dataframe(read_session)

    expected = pandas.DataFrame([], columns=SCALAR_COLUMN_NAMES)
    expected["int_col"] = expected["int_col"].astype("int64")
    expected["float_col"] = expected["float_col"].astype("float64")
    expected["bool_col"] = expected["bool_col"].astype("bool")
    expected["ts_col"] = expected["ts_col"].astype("datetime64[ns, UTC]")

    pandas.testing.assert_frame_equal(
        got.reset_index(drop=True),  # reset_index to ignore row labels
        expected.reset_index(drop=True),
    )
686+
687+
688+
def test_to_dataframe_empty_w_dtypes_avro(class_under_test, mock_client):
    """Caller-supplied dtypes take precedence on an empty Avro stream."""
    bq_columns = [
        {"name": "bigfloat", "type": "float64"},
        {"name": "lilfloat", "type": "float64"},
    ]
    avro_schema = _bq_to_avro_schema(bq_columns)
    read_session = _generate_avro_read_session(avro_schema)
    reader = class_under_test(
        _bq_to_avro_blocks([], avro_schema), mock_client, "", 0, {}
    )

    actual = reader.to_dataframe(read_session, dtypes={"lilfloat": "float16"})

    # Build the reference frame column by column with the dtypes we expect.
    wanted = pandas.DataFrame([], columns=["bigfloat", "lilfloat"])
    for column, dtype in (("bigfloat", "float64"), ("lilfloat", "float16")):
        wanted[column] = wanted[column].astype(dtype)

    pandas.testing.assert_frame_equal(
        actual.reset_index(drop=True),  # reset_index to ignore row labels
        wanted.reset_index(drop=True),
    )
709+
710+
711+
def test_to_dataframe_empty_w_dtypes_arrow(class_under_test, mock_client):
    """Caller-supplied dtypes take precedence on an empty Arrow stream."""
    bq_columns = [
        {"name": "bigfloat", "type": "float64"},
        {"name": "lilfloat", "type": "float64"},
    ]
    arrow_schema = _bq_to_arrow_schema(bq_columns)
    read_session = _generate_arrow_read_session(arrow_schema)
    reader = class_under_test(
        _bq_to_arrow_batches([], arrow_schema), mock_client, "", 0, {}
    )

    actual = reader.to_dataframe(read_session, dtypes={"lilfloat": "float16"})

    # Build the reference frame column by column with the dtypes we expect.
    wanted = pandas.DataFrame([], columns=["bigfloat", "lilfloat"])
    for column, dtype in (("bigfloat", "float64"), ("lilfloat", "float16")):
        wanted[column] = wanted[column].astype(dtype)

    pandas.testing.assert_frame_equal(
        actual.reset_index(drop=True),  # reset_index to ignore row labels
        wanted.reset_index(drop=True),
    )
732+
733+
648734
def test_to_dataframe_by_page(class_under_test, mock_client):
649735
bq_columns = [
650736
{"name": "int_col", "type": "int64"},

0 commit comments

Comments
 (0)