Skip to content

Commit 129d7ef

Browse files
Add support for bare structured type keywords (i.e. without inner type details), and add tests
1 parent 302d109 commit 129d7ef

2 files changed

Lines changed: 185 additions & 5 deletions

File tree

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,8 +247,14 @@ def _parse_structured_type_str(type_str, max_string_size):
247247
result = _extract_paren_content(type_str)
248248
base_upper = result[0].upper() if result else type_str.upper()
249249

250-
if base_upper in _STRUCTURED_TYPE_KEYWORDS and result is not None:
251-
return _sf_type_to_type_object(type_str)
250+
if base_upper in _STRUCTURED_TYPE_KEYWORDS:
251+
if result is not None:
252+
return _sf_type_to_type_object(type_str)
253+
# Bare structured keyword (e.g. "OBJECT", "MAP", "ARRAY") without
254+
# inner type details — older backends may return these. Return
255+
# VariantType so column names are preserved and callers (e.g. SAS)
256+
# can apply their own structured-type discovery.
257+
return VariantType()
252258

253259
if result is None:
254260
return convert_sf_to_sp_type(base_upper, 0, 0, 0, max_string_size)
@@ -1549,7 +1555,13 @@ def _infer_schema_for_file_format(
15491555
format.lower() in ("parquet", "json")
15501556
and use_structured_type_infer_schema
15511557
):
1552-
if use_relaxed_types:
1558+
if isinstance(datatype, VariantType):
1559+
# datatype is VariantType, e.g. the backend returned a bare
1560+
# structured keyword with no inner details (NOTE(review): other VariantType columns presumably land here too). Skip the cast — $1:{name}
1561+
# extracts as variant and lets callers handle
1562+
# structured-type discovery.
1563+
identifier = f"$1:{name}"
1564+
elif use_relaxed_types:
15531565
identifier = f"$1:{name}::{convert_sp_to_sf_type(datatype)}"
15541566
else:
15551567
# INFER_SCHEMA may return NOT NULL annotations in

tests/unit/test_dataframe_reader_type_parsing.py

Lines changed: 170 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,22 @@ def test_nested_structured(self):
396396
assert inner.structured is True
397397
assert inner.fields[1].nullable is False
398398

399+
def test_bare_object_returns_variant(self):
400+
result = _parse_structured_type_str("OBJECT", MAX_STRING_SIZE)
401+
assert result == VariantType()
402+
403+
def test_bare_map_returns_variant(self):
404+
result = _parse_structured_type_str("MAP", MAX_STRING_SIZE)
405+
assert result == VariantType()
406+
407+
def test_bare_array_returns_variant(self):
408+
result = _parse_structured_type_str("ARRAY", MAX_STRING_SIZE)
409+
assert result == VariantType()
410+
411+
def test_bare_object_lowercase_returns_variant(self):
412+
result = _parse_structured_type_str("object", MAX_STRING_SIZE)
413+
assert result == VariantType()
414+
399415

400416
# ---------------------------------------------------------------------------
401417
# _infer_schema_for_file_format (mock-based)
@@ -427,7 +443,13 @@ def _build_infer_schema_rows(columns):
427443
class TestInferSchemaStructuredTypePath:
428444
"""Tests the structured-type branch inside _infer_schema_for_file_format."""
429445

430-
def _run_infer(self, columns, use_structured=True, use_relaxed_types=False):
446+
def _run_infer(
447+
self,
448+
columns,
449+
use_structured=True,
450+
use_relaxed_types=False,
451+
file_format="PARQUET",
452+
):
431453
session = _make_mock_session(use_structured=use_structured)
432454
rows = _build_infer_schema_rows(columns)
433455

@@ -453,7 +475,7 @@ def _run_infer(self, columns, use_structured=True, use_relaxed_types=False):
453475
schema_to_cast,
454476
transformations,
455477
exception,
456-
) = reader._infer_schema_for_file_format("@stage/path", "PARQUET")
478+
) = reader._infer_schema_for_file_format("@stage/path", file_format)
457479
assert exception is None, f"Unexpected exception: {exception}"
458480
return new_schema, schema_to_cast, transformations
459481

@@ -624,3 +646,149 @@ def test_mixed_columns(self):
624646

625647
assert len(schema_to_cast) == 5
626648
assert len(transformations) == 5
649+
650+
# --- bare structured keywords (older backends) ---
651+
652+
def test_bare_object_returns_variant_type(self):
653+
columns = [
654+
("address", "OBJECT", True, "$1:address::OBJECT"),
655+
]
656+
schema, schema_to_cast, _ = self._run_infer(columns)
657+
658+
assert schema[0].datatype == VariantType()
659+
assert schema_to_cast[0][0] == '$1:"address"'
660+
661+
def test_bare_map_returns_variant_type(self):
662+
columns = [
663+
("props", "MAP", True, "$1:props::MAP"),
664+
]
665+
schema, schema_to_cast, _ = self._run_infer(columns)
666+
667+
assert schema[0].datatype == VariantType()
668+
assert schema_to_cast[0][0] == '$1:"props"'
669+
670+
def test_bare_array_returns_variant_type(self):
671+
columns = [
672+
("tags", "ARRAY", True, "$1:tags::ARRAY"),
673+
]
674+
schema, schema_to_cast, _ = self._run_infer(columns)
675+
676+
assert schema[0].datatype == VariantType()
677+
assert schema_to_cast[0][0] == '$1:"tags"'
678+
679+
def test_mixed_bare_and_detailed_structured(self):
680+
columns = [
681+
("id", "NUMBER(38,0)", True, "$1:id::NUMBER(38,0)"),
682+
("addr", "OBJECT", True, "$1:addr::OBJECT"),
683+
(
684+
"tags",
685+
"ARRAY(VARCHAR NOT NULL)",
686+
True,
687+
"$1:tags::ARRAY(VARCHAR NOT NULL)",
688+
),
689+
("meta", "MAP", True, "$1:meta::MAP"),
690+
]
691+
schema, schema_to_cast, _ = self._run_infer(columns)
692+
693+
assert schema[0].datatype == LongType()
694+
assert schema[1].datatype == VariantType()
695+
assert isinstance(schema[2].datatype, ArrayType)
696+
assert schema[3].datatype == VariantType()
697+
# bare keywords get no cast; detailed types get the cast
698+
assert schema_to_cast[1][0] == '$1:"addr"'
699+
assert "::ARRAY(VARCHAR)" in schema_to_cast[2][0]
700+
assert schema_to_cast[3][0] == '$1:"meta"'
701+
702+
# --- JSON format path ---
703+
704+
def test_json_format_uses_structured_path(self):
705+
columns = [
706+
("id", "NUMBER(38,0)", True, "$1:id::NUMBER(38,0)"),
707+
("name", "TEXT", True, "$1:name::TEXT"),
708+
]
709+
schema, schema_to_cast, _ = self._run_infer(columns, file_format="JSON")
710+
711+
assert len(schema) == 2
712+
assert schema[0].datatype == LongType()
713+
assert schema[1].datatype == StringType()
714+
assert "::NUMBER(38,0)" in schema_to_cast[0][0]
715+
assert "::TEXT" in schema_to_cast[1][0]
716+
717+
def test_json_format_structured_array(self):
718+
columns = [
719+
(
720+
"tags",
721+
"ARRAY(VARCHAR NOT NULL)",
722+
True,
723+
"$1:tags::ARRAY(VARCHAR NOT NULL)",
724+
),
725+
]
726+
schema, schema_to_cast, _ = self._run_infer(columns, file_format="JSON")
727+
728+
dt = schema[0].datatype
729+
assert isinstance(dt, ArrayType)
730+
assert dt.structured is True
731+
assert dt.contains_null is False
732+
assert "NOT NULL" not in schema_to_cast[0][0]
733+
734+
def test_json_format_bare_map(self):
735+
columns = [
736+
("props", "MAP", True, "$1:props::MAP"),
737+
]
738+
schema, schema_to_cast, _ = self._run_infer(columns, file_format="JSON")
739+
740+
assert schema[0].datatype == VariantType()
741+
assert schema_to_cast[0][0] == '$1:"props"'
742+
743+
744+
# ---------------------------------------------------------------------------
745+
# Session parameter defaults
746+
# ---------------------------------------------------------------------------
747+
748+
749+
class TestSessionParameterDefaults:
750+
def test_structured_infer_schema_default_is_false(self):
751+
session = _make_mock_session(use_structured=False)
752+
assert session._use_structured_type_infer_schema is False
753+
754+
def test_structured_infer_schema_can_be_enabled(self):
755+
session = _make_mock_session(use_structured=True)
756+
assert session._use_structured_type_infer_schema is True
757+
758+
def test_flag_controls_parser_path(self):
759+
"""When the flag is True, structured types are parsed recursively;
760+
when False, the legacy identifier path is used."""
761+
struct_columns = [
762+
(
763+
"addr",
764+
"OBJECT(city VARCHAR, zip NUMBER(38,0))",
765+
True,
766+
"$1:addr::OBJECT(city VARCHAR, zip NUMBER(38,0))",
767+
),
768+
]
769+
770+
# With flag ON: recursive parser produces StructType
771+
session_on = _make_mock_session(use_structured=True)
772+
rows = _build_infer_schema_rows(struct_columns)
773+
session_on._conn.run_query.side_effect = [{}, {"data": rows}, {}]
774+
reader_on = DataFrameReader(session_on, _emit_ast=False)
775+
schema_on, _, _, exc_on = reader_on._infer_schema_for_file_format(
776+
"@stage/path", "PARQUET"
777+
)
778+
assert exc_on is None
779+
assert isinstance(schema_on[0].datatype, StructType)
780+
781+
# With flag OFF: uses the legacy identifier path (raw type string)
782+
simple_columns = [
783+
("id", "NUMBER(38,0)", True, "$1:id::NUMBER(38,0)"),
784+
]
785+
session_off = _make_mock_session(use_structured=False)
786+
rows = _build_infer_schema_rows(simple_columns)
787+
session_off._conn.run_query.side_effect = [{}, {"data": rows}, {}]
788+
reader_off = DataFrameReader(session_off, _emit_ast=False)
789+
schema_off, cast_off, _, exc_off = reader_off._infer_schema_for_file_format(
790+
"@stage/path", "PARQUET"
791+
)
792+
assert exc_off is None
793+
assert schema_off[0].datatype == LongType()
794+
assert "::NUMBER(38,0)" in cast_off[0][0]

0 commit comments

Comments
 (0)