Skip to content

Commit c63ad27

Browse files
SNOW-3237416: Support structured type schema string parsing (#4155)
1 parent 07689e0 commit c63ad27

5 files changed

Lines changed: 1290 additions & 14 deletions

File tree

src/snowflake/snowpark/_internal/type_utils.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,6 +1363,193 @@ def is_likely_struct(s: str) -> bool:
13631363
return top_level_space_found
13641364

13651365

1366+
# ---------------------------------------------------------------------------
1367+
# Structured-type INFER_SCHEMA parser
1368+
#
1369+
# These helpers parse Snowflake type strings as returned by the server's
1370+
# INFER_SCHEMA function, including the structured forms
1371+
# ``OBJECT(field type, ...)``, ``MAP(key, value)``, ``ARRAY(element)`` with
1372+
# arbitrary nesting and ``NOT NULL`` annotations.
1373+
#
1374+
# Used by ``DataFrameReader._infer_schema_for_file_format`` when the session
1375+
# flag ``Session._use_structured_type_infer_schema`` is enabled.
1376+
# ---------------------------------------------------------------------------
1377+
1378+
_STRUCTURED_TYPE_KEYWORDS = frozenset({"OBJECT", "MAP", "ARRAY"})
1379+
1380+
_SF_EXTRA_TYPE_MAPPINGS = {
1381+
"text": StringType,
1382+
"real": DoubleType,
1383+
"fixed": LongType,
1384+
}
1385+
1386+
1387+
def _lookup_simple_type(name: str, original: str) -> DataType:
1388+
"""Look up a simple Snowflake type name in the standard and extra mappings.
1389+
1390+
``name`` is the candidate keyword to look up (lower-cased and
1391+
whitespace-stripped before lookup). ``original`` is the caller's full
1392+
type string, used verbatim in the error message so that parenthesized
1393+
inputs like ``"FOO(1)"`` show up correctly when ``"FOO"`` isn't found.
1394+
1395+
Raises ``ValueError`` if the name isn't recognized.
1396+
"""
1397+
normalized = name.replace(" ", "").lower()
1398+
if normalized in DATA_TYPE_STRING_OBJECT_MAPPINGS:
1399+
return DATA_TYPE_STRING_OBJECT_MAPPINGS[normalized]()
1400+
if normalized in _SF_EXTRA_TYPE_MAPPINGS:
1401+
return _SF_EXTRA_TYPE_MAPPINGS[normalized]()
1402+
raise ValueError(f"'{original}' is not a supported type")
1403+
1404+
1405+
def _extract_paren_content(type_str: str) -> Optional[Tuple[str, str]]:
1406+
"""Extract the base keyword and content inside matching parentheses.
1407+
1408+
Returns (base, inner_content) if matching parens are found.
1409+
Returns None if ``type_str`` contains no ``(`` at all (normal for simple
1410+
types like ``VARCHAR`` or ``BOOLEAN``).
1411+
1412+
Raises ``ValueError`` if ``type_str`` contains a ``(`` that is never
1413+
closed. Reaching this branch implies a malformed type string from the
1414+
backend (``INFER_SCHEMA``), so we fail loudly rather than silently
1415+
degrade to ``VariantType``.
1416+
1417+
E.g. "OBJECT(city VARCHAR, zip NUMBER(38,0))" -> ("OBJECT", "city VARCHAR, zip NUMBER(38,0)")
1418+
"""
1419+
paren_idx = type_str.find("(")
1420+
if paren_idx == -1:
1421+
return None
1422+
base = type_str[:paren_idx].strip()
1423+
depth = 0
1424+
for i in range(paren_idx, len(type_str)):
1425+
if type_str[i] == "(":
1426+
depth += 1
1427+
elif type_str[i] == ")":
1428+
depth -= 1
1429+
if depth == 0:
1430+
return base, type_str[paren_idx + 1 : i]
1431+
raise ValueError(f"Unbalanced parentheses in type string: '{type_str}'")
1432+
1433+
1434+
def _sf_type_to_type_object(type_str: str) -> DataType:
1435+
"""Parse a Snowflake SQL type string directly into a Snowpark DataType.
1436+
1437+
Handles both simple types and structured types returned by INFER_SCHEMA:
1438+
- Simple: VARCHAR, NUMBER(38,0), BOOLEAN, TIMESTAMP_NTZ, etc.
1439+
- ARRAY(element_type [NOT NULL])
1440+
- MAP(key_type, value_type [NOT NULL])
1441+
- OBJECT(field1 type1, field2 type2 NOT NULL, ...)
1442+
- Nested combinations of the above
1443+
1444+
NOT NULL annotations are respected:
1445+
- On ARRAY elements: sets ArrayType.contains_null = False
1446+
- On MAP values: sets MapType.value_contains_null = False
1447+
- On OBJECT fields: sets StructField.nullable = False
1448+
"""
1449+
type_str = type_str.strip()
1450+
if not type_str:
1451+
raise ValueError("Empty type string")
1452+
1453+
# Strip a trailing top-level NOT NULL if present and discard the bool:
1454+
# top-level column nullability is carried by INFER_SCHEMA row metadata
1455+
# (handled in _infer_schema_for_file_format), not by the type string.
1456+
# Nested NOT NULL (inside ARRAY/MAP/OBJECT) is already consumed by those
1457+
# branches below before they recurse into this function, so the bool is
1458+
# redundant here.
1459+
type_str, _ = extract_nullable_keyword(type_str)
1460+
1461+
result = _extract_paren_content(type_str)
1462+
if result is None:
1463+
return _lookup_simple_type(type_str, type_str)
1464+
1465+
base, inner = result
1466+
base_upper = base.upper()
1467+
1468+
if base_upper == "ARRAY":
1469+
element_str, element_nullable = extract_nullable_keyword(inner)
1470+
element_type = _sf_type_to_type_object(element_str)
1471+
return ArrayType(element_type, structured=True, contains_null=element_nullable)
1472+
1473+
if base_upper == "MAP":
1474+
parts = split_top_level_comma_fields(inner)
1475+
if len(parts) != 2:
1476+
raise ValueError(f"Invalid MAP type definition: '{type_str}'")
1477+
key_type = _sf_type_to_type_object(parts[0])
1478+
value_str, value_nullable = extract_nullable_keyword(parts[1])
1479+
value_type = _sf_type_to_type_object(value_str)
1480+
return MapType(
1481+
key_type, value_type, structured=True, value_contains_null=value_nullable
1482+
)
1483+
1484+
if base_upper == "OBJECT":
1485+
# OBJECT() with no inner content is valid per the Snowflake grammar:
1486+
# "a structured OBJECT that contains no keys."
1487+
if not inner.strip():
1488+
return StructType([], structured=True)
1489+
fields = split_top_level_comma_fields(inner)
1490+
struct_fields = []
1491+
for field_def in fields:
1492+
field_def = field_def.strip()
1493+
if not field_def:
1494+
# A trailing comma or empty fragment is not valid Snowflake
1495+
# SQL grammar for OBJECT types; raise so backend bugs or
1496+
# malformed input surface loudly.
1497+
raise ValueError(f"Empty field in OBJECT type: '{type_str}'")
1498+
parts = field_def.split(None, 1)
1499+
if len(parts) != 2:
1500+
raise ValueError(f"Cannot parse OBJECT field definition: '{field_def}'")
1501+
field_name = parts[0]
1502+
type_part, nullable = extract_nullable_keyword(parts[1])
1503+
field_type = _sf_type_to_type_object(type_part)
1504+
struct_fields.append(StructField(field_name, field_type, nullable=nullable))
1505+
return StructType(struct_fields, structured=True)
1506+
1507+
precision_scale = get_number_precision_scale(type_str)
1508+
if precision_scale:
1509+
return DecimalType(*precision_scale)
1510+
length = get_string_length(type_str)
1511+
if length:
1512+
return StringType(length)
1513+
1514+
return _lookup_simple_type(base_upper, type_str)
1515+
1516+
1517+
def _parse_structured_type_str(type_str: str, max_string_size: int) -> DataType:
1518+
"""Parse a Snowflake type string from INFER_SCHEMA into a Snowpark DataType.
1519+
1520+
For structured types (OBJECT, MAP, ARRAY), uses the recursive parser.
1521+
For simple types, delegates to convert_sf_to_sp_type for precision/scale.
1522+
"""
1523+
type_str = type_str.strip()
1524+
if not type_str:
1525+
return VariantType()
1526+
1527+
result = _extract_paren_content(type_str)
1528+
base_upper = result[0].upper() if result else type_str.upper()
1529+
1530+
if base_upper in _STRUCTURED_TYPE_KEYWORDS:
1531+
if result is not None:
1532+
return _sf_type_to_type_object(type_str)
1533+
# Bare structured keyword (e.g. "OBJECT", "MAP", "ARRAY") without
1534+
# inner type details — older backends may return these. Return
1535+
# VariantType so column names are preserved and callers (e.g. SAS)
1536+
# can apply their own structured-type discovery.
1537+
return VariantType()
1538+
1539+
if result is None:
1540+
return convert_sf_to_sp_type(base_upper, 0, 0, 0, max_string_size)
1541+
1542+
inner = result[1]
1543+
parts = inner.split(",")
1544+
try:
1545+
precision = int(parts[0].strip())
1546+
scale = int(parts[1].strip()) if len(parts) > 1 else 0
1547+
except (ValueError, IndexError):
1548+
precision = 0
1549+
scale = 0
1550+
return convert_sf_to_sp_type(base_upper, precision, scale, 0, max_string_size)
1551+
1552+
13661553
def type_string_to_type_object(type_str: str) -> DataType:
13671554
type_str = type_str.strip()
13681555
if not type_str:

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
convert_sp_to_sf_type,
5555
most_permissive_type,
5656
type_string_to_type_object,
57+
_parse_structured_type_str,
5758
)
5859
from snowflake.snowpark._internal.udf_utils import get_types_from_type_hints
5960
from snowflake.snowpark._internal.xml_reader import DEFAULT_CHUNK_SIZE
@@ -112,6 +113,9 @@
112113

113114
logger = getLogger(__name__)
114115

116+
_NOT_NULL_RE = re.compile(r"\s+NOT\s+NULL", re.IGNORECASE)
117+
118+
115119
LOCAL_TESTING_SUPPORTED_FILE_FORMAT = ("JSON",)
116120

117121
_TIME_TRAVEL_OPTIONS_PARAMS_MAP = {
@@ -1341,25 +1345,39 @@ def _infer_schema_for_file_format(
13411345
schema_to_cast = []
13421346
transformations: List["snowflake.snowpark.column.Column"] = []
13431347
read_file_transformations = None
1348+
use_structured_type_infer_schema = (
1349+
self._session._use_structured_type_infer_schema
1350+
)
13441351
for r in results:
13451352
# Columns for r [column_name, type, nullable, expression, filenames, order_id]
13461353
column_name, type, nullable, expression = r[0], r[1], r[2], r[3]
13471354
name = quote_name_without_upper_casing(column_name)
1348-
# Parse the type returned by infer_schema command to
1349-
# pass to determine datatype for schema
1350-
data_type_parts = type.split("(")
1351-
parts_length = len(data_type_parts)
1352-
if parts_length == 1:
1353-
data_type = type
1354-
precision = 0
1355-
scale = 0
1355+
# Parse the type returned by infer_schema command.
1356+
if use_structured_type_infer_schema:
1357+
# handles both simple types and structured
1358+
# types (OBJECT, MAP, ARRAY with inner details).
1359+
datatype = _parse_structured_type_str(
1360+
type, self._session._conn.max_string_size
1361+
)
13561362
else:
1357-
data_type = data_type_parts[0]
1358-
precision = int(data_type_parts[1].split(",")[0])
1359-
scale = int(data_type_parts[1].split(",")[1][:-1])
1360-
datatype = convert_sf_to_sp_type(
1361-
data_type, precision, scale, 0, self._session._conn.max_string_size
1362-
)
1363+
data_type_parts = type.split("(")
1364+
parts_length = len(data_type_parts)
1365+
if parts_length == 1:
1366+
data_type = type
1367+
precision = 0
1368+
scale = 0
1369+
else:
1370+
data_type = data_type_parts[0]
1371+
precision = int(data_type_parts[1].split(",")[0])
1372+
scale = int(data_type_parts[1].split(",")[1][:-1])
1373+
datatype = convert_sf_to_sp_type(
1374+
data_type,
1375+
precision,
1376+
scale,
1377+
0,
1378+
self._session._conn.max_string_size,
1379+
)
1380+
13631381
if use_relaxed_types:
13641382
datatype = most_permissive_type(datatype)
13651383
new_schema.append(
@@ -1379,6 +1397,26 @@ def _infer_schema_for_file_format(
13791397
"GET_IGNORE_CASE"
13801398
):
13811399
identifier = expression
1400+
elif (
1401+
format.lower() in ("parquet", "json")
1402+
and use_structured_type_infer_schema
1403+
):
1404+
if isinstance(datatype, VariantType):
1405+
# Bare structured keyword was returned by the backend
1406+
# (no inner details). Skip the cast — $1:{name}
1407+
# extracts as variant and lets callers handle
1408+
# structured-type discovery.
1409+
identifier = f"$1:{name}"
1410+
elif use_relaxed_types:
1411+
identifier = f"$1:{name}::{convert_sp_to_sf_type(datatype)}"
1412+
else:
1413+
# INFER_SCHEMA may return NOT NULL annotations in
1414+
# structured type strings (e.g.,
1415+
# "ARRAY(NUMBER(10,0) NOT NULL)"). Strip them before
1416+
# embedding in SQL — nullable info is already captured
1417+
# in the parsed DataType objects.
1418+
cast_type = _NOT_NULL_RE.sub("", type)
1419+
identifier = f"$1:{name}::{cast_type}"
13821420
else:
13831421
identifier = f"$1:{name}::{convert_sp_to_sf_type(datatype) if use_relaxed_types else type}"
13841422

src/snowflake/snowpark/session.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,15 @@ def __init__(
710710
)
711711
)
712712

713+
# When True, INFER_SCHEMA results for parquet/json files with
714+
# structured types (OBJECT/MAP/ARRAY with inner type details) are
715+
# parsed into full Snowpark DataType objects by DataFrameReader and
716+
# TRY_CAST is used in the SELECT expression. Defaults to False;
717+
# opt in by direct assignment, e.g.
718+
# session._use_structured_type_infer_schema = True
719+
# (SAS / snowpark-connect enables this during session configuration.)
720+
self._use_structured_type_infer_schema: bool = False
721+
713722
self._large_query_breakdown_enabled: bool = self.is_feature_enabled_for_version(
714723
_PYTHON_SNOWPARK_USE_LARGE_QUERY_BREAKDOWN_OPTIMIZATION_VERSION
715724
)

0 commit comments

Comments
 (0)