@@ -1324,22 +1324,36 @@ def parse_struct_field_list(fields_str: str) -> Optional[StructType]:
13241324
def split_top_level_comma_fields(s: str) -> List[str]:
    """
    Break ``s`` into its top-level, comma-separated pieces.

    A comma acts as a separator only when it sits outside every ``<...>`` /
    ``(...)`` group and outside every ``"..."`` quoted identifier. Each piece
    is returned with its surrounding whitespace stripped.

    Example: "int, array<long>, decimal(10,2)" => ["int", "array<long>", "decimal(10,2)"].

    Quoted spans are skipped wholesale through ``_scan_quoted_identifier``,
    so ``,`` ``(`` ``)`` ``<`` ``>`` inside a name such as ``"a, b"`` or
    ``"x<y>z"`` neither split the string nor move the nesting counter.
    Inside a quoted identifier, ``""`` is the only escape for a literal quote.

    Angle brackets and parentheses share one nesting counter; the kind of
    closer is not matched against the kind of opener.

    Raises ``ValueError`` when a closing bracket appears without a preceding
    opener. An unterminated quoted identifier also surfaces as the
    ``ValueError`` raised by ``_scan_quoted_identifier``.
    """
    pieces = []
    nesting = 0
    piece_start = 0
    pos = 0
    length = len(s)
    while pos < length:
        ch = s[pos]
        if ch == '"':
            # Jump straight past the whole quoted identifier; nothing inside
            # it may count as a bracket or a separator.
            pos = _scan_quoted_identifier(s, pos)
            continue
        if ch in "<(":
            nesting += 1
        elif ch in ">)":
            nesting -= 1
            if nesting < 0:
                raise ValueError(f"Mismatched bracket in '{s}'.")
        elif ch == "," and nesting == 0:
            pieces.append(s[piece_start:pos].strip())
            piece_start = pos + 1
        pos += 1
    # Whatever follows the last separator (possibly empty) is the final piece.
    pieces.append(s[piece_start:].strip())
    return pieces
13451359
@@ -1406,6 +1420,60 @@ def _lookup_simple_type(name: str, original: str) -> DataType:
14061420 raise ValueError (f"'{ original } ' is not a supported type" )
14071421
14081422
1423+ def _scan_quoted_identifier (s : str , start : int ) -> int :
1424+ """Return the index just past a quoted identifier that begins at ``s[start] == '"'``.
1425+
1426+ Snowflake's identifier grammar (``SFSqlLexer.g`` ``QuotedString`` rule) allows
1427+ any character inside ``"..."`` and uses ``""`` as the only in-band escape for a
1428+ literal ``"``. The canonical inverse is
1429+ ``SqlIdentifierUtils.java::quote()`` which doubles every embedded ``"``
1430+ and nothing else.
1431+
1432+ Raises ``ValueError`` if the closing quote is missing.
1433+
1434+ Precondition: ``s[start] == '"'``. All current callers guard on this; we
1435+ do not re-check here because asserts are stripped under ``python -O`` and
1436+ promoting to ``raise`` would be overkill for a private helper.
1437+ """
1438+ i = start + 1
1439+ while i < len (s ):
1440+ if s [i ] == '"' :
1441+ if i + 1 < len (s ) and s [i + 1 ] == '"' :
1442+ i += 2 # escaped "" inside the name; keep scanning
1443+ continue
1444+ return i + 1 # index just past the closing quote
1445+ i += 1
1446+ raise ValueError (f"Unterminated quoted identifier in: { s !r} " )
1447+
1448+
1449+ def _split_object_field (field_def : str ) -> Tuple [str , str ]:
1450+ """Split a single OBJECT field definition into ``(name_token, remainder)``.
1451+
1452+ Quoted-identifier-aware:
1453+ ``foo NUMBER`` -> (``foo``, ``NUMBER``)
1454+ ``"col with space" NUMBER`` -> (``"col with space"``, ``NUMBER``)
1455+ ``"a, b" NUMBER`` -> (``"a, b"``, ``NUMBER``)
1456+
1457+ The returned ``name_token`` still carries any surrounding quotes so the
1458+ caller can decide whether to unquote (via ``_strip_quoted_identifier``)
1459+ while preserving the raw form for diagnostics.
1460+ """
1461+ field_def = field_def .lstrip ()
1462+ if not field_def :
1463+ raise ValueError ("Empty OBJECT field definition" )
1464+ if field_def [0 ] == '"' :
1465+ end = _scan_quoted_identifier (field_def , 0 )
1466+ name_token = field_def [:end ]
1467+ remainder = field_def [end :].lstrip ()
1468+ if not remainder :
1469+ raise ValueError (f"Cannot parse OBJECT field definition: { field_def !r} " )
1470+ return name_token , remainder
1471+ parts = field_def .split (None , 1 )
1472+ if len (parts ) != 2 :
1473+ raise ValueError (f"Cannot parse OBJECT field definition: { field_def !r} " )
1474+ return parts [0 ], parts [1 ]
1475+
1476+
14091477def _extract_paren_content (type_str : str ) -> Optional [Tuple [str , str ]]:
14101478 """Extract the base keyword and content inside matching parentheses.
14111479
@@ -1418,20 +1486,30 @@ def _extract_paren_content(type_str: str) -> Optional[Tuple[str, str]]:
14181486 backend (``INFER_SCHEMA``), so we fail loudly rather than silently
14191487 degrade to ``VariantType``.
14201488
1489+ Quoted-identifier-aware: ``(`` and ``)`` characters appearing inside a
1490+ ``"..."`` quoted name (``OBJECT("a(b)c" TEXT)``) are skipped over and do
1491+ not affect the depth counter.
1492+
14211493 E.g. "OBJECT(city VARCHAR, zip NUMBER(38,0))" -> ("OBJECT", "city VARCHAR, zip NUMBER(38,0)")
14221494 """
14231495 paren_idx = type_str .find ("(" )
14241496 if paren_idx == - 1 :
14251497 return None
14261498 base = type_str [:paren_idx ].strip ()
14271499 depth = 0
1428- for i in range (paren_idx , len (type_str )):
1429- if type_str [i ] == "(" :
1500+ i = paren_idx
1501+ while i < len (type_str ):
1502+ c = type_str [i ]
1503+ if c == '"' :
1504+ i = _scan_quoted_identifier (type_str , i )
1505+ continue
1506+ if c == "(" :
14301507 depth += 1
1431- elif type_str [ i ] == ")" :
1508+ elif c == ")" :
14321509 depth -= 1
14331510 if depth == 0 :
14341511 return base , type_str [paren_idx + 1 : i ]
1512+ i += 1
14351513 raise ValueError (f"Unbalanced parentheses in type string: '{ type_str } '" )
14361514
14371515
@@ -1499,13 +1577,17 @@ def _sf_type_to_type_object(type_str: str) -> DataType:
14991577 # SQL grammar for OBJECT types; raise so backend bugs or
15001578 # malformed input surface loudly.
15011579 raise ValueError (f"Empty field in OBJECT type: '{ type_str } '" )
1502- parts = field_def .split (None , 1 )
1503- if len (parts ) != 2 :
1504- raise ValueError (f"Cannot parse OBJECT field definition: '{ field_def } '" )
1505- field_name = parts [0 ]
1506- type_part , nullable = extract_nullable_keyword (parts [1 ])
1580+ # Quoted-identifier-aware split so OBJECT field names containing
1581+ # spaces, commas, parens, or other non-bare characters survive
1582+ # round-trip through INFER_SCHEMA. The name token is passed to
1583+ # ``StructField`` with its surrounding quotes intact so
1584+ # ``ColumnIdentifier``'s ``ALREADY_QUOTED`` branch preserves
1585+ # mixed-case names verbatim (without quotes the bare-identifier
1586+ # rule would case-fold ``"Foo"`` to ``"FOO"``).
1587+ name_token , type_remainder = _split_object_field (field_def )
1588+ type_part , nullable = extract_nullable_keyword (type_remainder )
15071589 field_type = _sf_type_to_type_object (type_part )
1508- struct_fields .append (StructField (field_name , field_type , nullable = nullable ))
1590+ struct_fields .append (StructField (name_token , field_type , nullable = nullable ))
15091591 return StructType (struct_fields , structured = True )
15101592
15111593 precision_scale = get_number_precision_scale (type_str )
0 commit comments