@@ -1363,6 +1363,193 @@ def is_likely_struct(s: str) -> bool:
13631363 return top_level_space_found
13641364
13651365
1366+ # ---------------------------------------------------------------------------
1367+ # Structured-type INFER_SCHEMA parser
1368+ #
1369+ # These helpers parse Snowflake type strings as returned by the server's
1370+ # INFER_SCHEMA function, including the structured forms
1371+ # ``OBJECT(field type, ...)``, ``MAP(key, value)``, ``ARRAY(element)`` with
1372+ # arbitrary nesting and ``NOT NULL`` annotations.
1373+ #
1374+ # Used by ``DataFrameReader._infer_schema_for_file_format`` when the session
1375+ # flag ``Session._use_structured_type_infer_schema`` is enabled.
1376+ # ---------------------------------------------------------------------------
1377+
# Keywords that introduce a structured Snowflake type. A type string whose
# base keyword is in this set is routed to the recursive structured parser.
_STRUCTURED_TYPE_KEYWORDS = frozenset(("OBJECT", "MAP", "ARRAY"))

# Extra lower-case type names consulted by ``_lookup_simple_type`` after
# ``DATA_TYPE_STRING_OBJECT_MAPPINGS``. Values are type classes that are
# instantiated with no arguments at lookup time.
_SF_EXTRA_TYPE_MAPPINGS = dict(
    text=StringType,
    real=DoubleType,
    fixed=LongType,
)
1385+
1386+
def _lookup_simple_type(name: str, original: str) -> DataType:
    """Look up a simple Snowflake type name in the standard and extra mappings.

    Args:
        name: Candidate keyword to look up. Every space character is removed
            and the result is lower-cased before the lookup.
        original: The caller's full type string, used verbatim in the error
            message so that parenthesized inputs like ``"FOO(1)"`` show up
            correctly when ``"FOO"`` isn't found.

    Returns:
        The result of calling the mapped type factory with no arguments.
        ``DATA_TYPE_STRING_OBJECT_MAPPINGS`` is consulted first, then
        ``_SF_EXTRA_TYPE_MAPPINGS``.

    Raises:
        ValueError: If the normalized name is in neither mapping.
    """
    normalized = name.replace(" ", "").lower()
    # Order matters: the standard mapping wins over the extra names.
    for mapping in (DATA_TYPE_STRING_OBJECT_MAPPINGS, _SF_EXTRA_TYPE_MAPPINGS):
        if normalized in mapping:
            return mapping[normalized]()
    # The quotes hug the value so the message shows the input exactly.
    raise ValueError(f"'{original}' is not a supported type")
1403+
1404+
def _extract_paren_content(type_str: str) -> Optional[Tuple[str, str]]:
    """Extract the base keyword and the content inside matching parentheses.

    Scanning starts at the first ``(`` and stops at the ``)`` that balances
    it; any text after that closing parenthesis is ignored.

    E.g. "OBJECT(city VARCHAR, zip NUMBER(38,0))"
         -> ("OBJECT", "city VARCHAR, zip NUMBER(38,0)")

    Args:
        type_str: The type string to inspect.

    Returns:
        ``(base, inner_content)`` where ``base`` is the whitespace-stripped
        text before the first ``(`` and ``inner_content`` is the raw text
        between that ``(`` and its matching ``)``. Returns ``None`` if
        ``type_str`` contains no ``(`` at all (normal for simple types like
        ``VARCHAR`` or ``BOOLEAN``).

    Raises:
        ValueError: If ``type_str`` contains a ``(`` that is never closed.
            This implies a malformed type string, so we fail loudly rather
            than silently degrade.
    """
    open_idx = type_str.find("(")
    if open_idx == -1:
        return None

    base = type_str[:open_idx].strip()
    depth = 0
    for idx, char in enumerate(type_str[open_idx:], start=open_idx):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth == 0:
                # Found the parenthesis that balances the opening one.
                return base, type_str[open_idx + 1 : idx]
    raise ValueError(f"Unbalanced parentheses in type string: '{type_str}'")
1432+
1433+
def _sf_type_to_type_object(type_str: str) -> DataType:
    """Parse a Snowflake SQL type string directly into a Snowpark DataType.

    Handles both simple types and structured types returned by INFER_SCHEMA:
    - Simple: VARCHAR, NUMBER(38,0), BOOLEAN, TIMESTAMP_NTZ, etc.
    - ARRAY(element_type [NOT NULL])
    - MAP(key_type, value_type [NOT NULL])
    - OBJECT(field1 type1, field2 type2 NOT NULL, ...)
    - Nested combinations of the above

    NOT NULL annotations are respected:
    - On ARRAY elements: sets ArrayType.contains_null = False
    - On MAP values: sets MapType.value_contains_null = False
    - On OBJECT fields: sets StructField.nullable = False

    A trailing NOT NULL on ``type_str`` itself is stripped and discarded.

    Args:
        type_str: The type string to parse; surrounding whitespace is ignored.

    Returns:
        The parsed type. ARRAY, MAP and OBJECT results are built with
        ``structured=True``.

    Raises:
        ValueError: If ``type_str`` is empty, has unbalanced parentheses,
            is a MAP without exactly two top-level parts, contains an empty
            or unparsable OBJECT field, or names an unsupported type.
    """
    type_str = type_str.strip()
    if not type_str:
        raise ValueError("Empty type string")

    # Strip a trailing top-level NOT NULL if present and discard the bool:
    # top-level column nullability is carried by INFER_SCHEMA row metadata
    # (handled in _infer_schema_for_file_format), not by the type string.
    # Nested NOT NULL (inside ARRAY/MAP/OBJECT) is already consumed by those
    # branches below before they recurse into this function, so the bool is
    # redundant here.
    type_str, _ = extract_nullable_keyword(type_str)

    result = _extract_paren_content(type_str)
    if result is None:
        return _lookup_simple_type(type_str, type_str)

    base, inner = result
    base_upper = base.upper()

    if base_upper == "ARRAY":
        element_str, element_nullable = extract_nullable_keyword(inner)
        element_type = _sf_type_to_type_object(element_str)
        return ArrayType(element_type, structured=True, contains_null=element_nullable)

    if base_upper == "MAP":
        parts = split_top_level_comma_fields(inner)
        if len(parts) != 2:
            raise ValueError(f"Invalid MAP type definition: '{type_str}'")
        key_type = _sf_type_to_type_object(parts[0])
        value_str, value_nullable = extract_nullable_keyword(parts[1])
        value_type = _sf_type_to_type_object(value_str)
        return MapType(
            key_type, value_type, structured=True, value_contains_null=value_nullable
        )

    if base_upper == "OBJECT":
        # OBJECT() with no inner content is valid per the Snowflake grammar:
        # "a structured OBJECT that contains no keys."
        if not inner.strip():
            return StructType([], structured=True)
        struct_fields = []
        for field_def in split_top_level_comma_fields(inner):
            field_def = field_def.strip()
            if not field_def:
                # A trailing comma or empty fragment is not valid Snowflake
                # SQL grammar for OBJECT types; raise so backend bugs or
                # malformed input surface loudly.
                raise ValueError(f"Empty field in OBJECT type: '{type_str}'")
            # Split on the first whitespace run: "<name> <type ...>".
            parts = field_def.split(None, 1)
            if len(parts) != 2:
                raise ValueError(f"Cannot parse OBJECT field definition: '{field_def}'")
            field_name = parts[0]
            type_part, nullable = extract_nullable_keyword(parts[1])
            field_type = _sf_type_to_type_object(type_part)
            struct_fields.append(StructField(field_name, field_type, nullable=nullable))
        return StructType(struct_fields, structured=True)

    # Parenthesized but not structured: try the number and string forms
    # first, then fall back to a lookup of the bare keyword.
    precision_scale = get_number_precision_scale(type_str)
    if precision_scale:
        return DecimalType(*precision_scale)
    length = get_string_length(type_str)
    if length:
        return StringType(length)

    return _lookup_simple_type(base_upper, type_str)
1515+
1516+
def _parse_structured_type_str(type_str: str, max_string_size: int) -> DataType:
    """Turn an INFER_SCHEMA type string into a Snowpark DataType.

    Dispatch rules:
    - Blank input yields ``VariantType``.
    - ``OBJECT(...)``, ``MAP(...)`` and ``ARRAY(...)`` go to the recursive
      parser ``_sf_type_to_type_object``.
    - A bare ``OBJECT``, ``MAP`` or ``ARRAY`` keyword yields ``VariantType``.
    - Anything else is handed to ``convert_sf_to_sp_type``, with precision
      and scale read from the parenthesized arguments when there are any.

    Args:
        type_str: The type string; surrounding whitespace is ignored.
        max_string_size: Passed through to ``convert_sf_to_sp_type``.
    """
    stripped = type_str.strip()
    if not stripped:
        return VariantType()

    paren_parts = _extract_paren_content(stripped)
    has_parens = paren_parts is not None
    keyword = (paren_parts[0] if has_parens else stripped).upper()

    if keyword in _STRUCTURED_TYPE_KEYWORDS:
        if has_parens:
            return _sf_type_to_type_object(stripped)
        # Bare structured keyword (e.g. "OBJECT", "MAP", "ARRAY") without
        # inner type details: older backends may return these. Return
        # VariantType so column names are preserved and callers (e.g. SAS)
        # can apply their own structured-type discovery.
        return VariantType()

    precision, scale = 0, 0
    if has_parens:
        # Only the first two comma-separated arguments are considered. If
        # either one is not an integer, both fall back to zero.
        leading_args = paren_parts[1].split(",")[:2]
        try:
            numbers = [int(arg.strip()) for arg in leading_args]
        except ValueError:
            numbers = [0, 0]
        precision = numbers[0]
        scale = numbers[1] if len(numbers) > 1 else 0

    return convert_sf_to_sp_type(keyword, precision, scale, 0, max_string_size)
1551+
1552+
13661553def type_string_to_type_object (type_str : str ) -> DataType :
13671554 type_str = type_str .strip ()
13681555 if not type_str :
0 commit comments