Skip to content

Commit e56041f

Browse files
SNOW-3440288: Enhance schema string parser for quotes (#4206)
1 parent 75260b9 commit e56041f

2 files changed

Lines changed: 376 additions & 13 deletions

File tree

src/snowflake/snowpark/_internal/type_utils.py

Lines changed: 95 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,22 +1324,36 @@ def parse_struct_field_list(fields_str: str) -> Optional[StructType]:
13241324

13251325
def split_top_level_comma_fields(s: str) -> List[str]:
    """
    Break ``s`` into its top-level, comma-separated pieces.

    A comma acts as a separator only when it sits outside every ``<...>`` /
    ``(...)`` group and outside every ``"..."`` quoted identifier. Each piece
    is whitespace-stripped before it is returned, and the text after the last
    separator is always emitted (so an empty input yields ``[""]``).

    Example: "int, array<long>, decimal(10,2)" => ["int", "array<long>", "decimal(10,2)"].

    Quoted identifiers (e.g. an OBJECT field name like ``"a, b"`` or
    ``"x<y>z"``) are skipped in one step via ``_scan_quoted_identifier``, so
    any ``,`` ``(`` ``)`` ``<`` ``>`` inside them neither moves the depth
    counter nor creates a split point. Snowflake uses ``""`` as the only
    in-band escape inside a quoted identifier.

    Notes:
        * ``<`` and ``(`` share a single depth counter; the kind of closer is
          not matched against the kind of opener.
        * Openers that are still unclosed when the end of ``s`` is reached
          are not reported here.

    Raises:
        ValueError: if a closing bracket appears without a preceding opener,
            or if the quoted-identifier scan finds no closing quote.
    """
    fields = []
    depth = 0
    field_start = 0
    pos = 0
    end = len(s)
    while pos < end:
        ch = s[pos]
        if ch == '"':
            # Jump straight past the whole quoted span; nothing inside it is
            # structural.
            pos = _scan_quoted_identifier(s, pos)
            continue
        if ch in "<(":
            depth += 1
        elif ch in ">)":
            depth -= 1
            if depth < 0:
                raise ValueError(f"Mismatched bracket in '{s}'.")
        elif ch == "," and depth == 0:
            fields.append(s[field_start:pos].strip())
            field_start = pos + 1
        pos += 1
    # Whatever follows the last top-level comma is the final piece.
    fields.append(s[field_start:].strip())
    return fields
13451359

@@ -1406,6 +1420,60 @@ def _lookup_simple_type(name: str, original: str) -> DataType:
14061420
raise ValueError(f"'{original}' is not a supported type")
14071421

14081422

1423+
def _scan_quoted_identifier(s: str, start: int) -> int:
    """Locate the end of the quoted identifier that opens at ``s[start]``.

    Scanning begins at ``start + 1``. A ``"`` immediately followed by another
    ``"`` is treated as an escaped literal quote and skipped as a pair; the
    first ``"`` that is not part of such a pair closes the identifier.

    Snowflake's identifier grammar (``SFSqlLexer.g`` ``QuotedString`` rule)
    allows any character inside ``"..."`` and uses ``""`` as the only in-band
    escape for a literal ``"``. The canonical inverse is
    ``SqlIdentifierUtils.java::quote()``, which doubles every embedded ``"``
    and nothing else.

    Precondition: ``s[start] == '"'``. It is deliberately not re-checked:
    every current caller guards on it, an ``assert`` would be stripped under
    ``python -O``, and a ``raise`` was judged overkill for a private helper.

    Returns:
        The index one past the closing quote.

    Raises:
        ValueError: if no closing quote is found before the end of ``s``.
    """
    length = len(s)
    pos = start + 1
    while pos < length:
        if s[pos] != '"':
            pos += 1
            continue
        following = pos + 1
        if following < length and s[following] == '"':
            # Doubled quote: an escaped ``"`` inside the name, keep scanning.
            pos = following + 1
            continue
        return following  # one past the closing quote
    raise ValueError(f"Unterminated quoted identifier in: {s!r}")
1447+
1448+
1449+
def _split_object_field(field_def: str) -> Tuple[str, str]:
    """Separate one OBJECT field definition into its name and the rest.

    Leading whitespace is dropped first. If the definition then starts with
    ``"``, the name is the whole quoted identifier (found with
    ``_scan_quoted_identifier``); otherwise the name is the first
    whitespace-delimited token. The remainder has its leading whitespace
    removed; trailing whitespace is left as-is.

    Examples:
        ``foo NUMBER``               -> (``foo``, ``NUMBER``)
        ``"col with space" NUMBER``  -> (``"col with space"``, ``NUMBER``)
        ``"a, b" NUMBER``            -> (``"a, b"``, ``NUMBER``)

    The returned name keeps any surrounding quotes, so the caller can decide
    whether to unquote (via ``_strip_quoted_identifier``) while still having
    the raw form available for diagnostics.

    Raises:
        ValueError: if the definition is empty or whitespace-only, if nothing
            follows the name, or if a quoted name is never closed.
    """
    text = field_def.lstrip()
    if not text:
        raise ValueError("Empty OBJECT field definition")

    if text[0] == '"':
        name_end = _scan_quoted_identifier(text, 0)
        name_token = text[:name_end]
        remainder = text[name_end:].lstrip()
    else:
        pieces = text.split(None, 1)
        name_token = pieces[0]
        # A lone token (even with trailing whitespace) yields a single piece.
        remainder = pieces[1] if len(pieces) == 2 else ""

    if not remainder:
        raise ValueError(f"Cannot parse OBJECT field definition: {text!r}")
    return name_token, remainder
1475+
1476+
14091477
def _extract_paren_content(type_str: str) -> Optional[Tuple[str, str]]:
14101478
"""Extract the base keyword and content inside matching parentheses.
14111479
@@ -1418,20 +1486,30 @@ def _extract_paren_content(type_str: str) -> Optional[Tuple[str, str]]:
14181486
backend (``INFER_SCHEMA``), so we fail loudly rather than silently
14191487
degrade to ``VariantType``.
14201488
1489+
Quoted-identifier-aware: ``(`` and ``)`` characters appearing inside a
1490+
``"..."`` quoted name (``OBJECT("a(b)c" TEXT)``) are skipped over and do
1491+
not affect the depth counter.
1492+
14211493
E.g. "OBJECT(city VARCHAR, zip NUMBER(38,0))" -> ("OBJECT", "city VARCHAR, zip NUMBER(38,0)")
14221494
"""
14231495
paren_idx = type_str.find("(")
14241496
if paren_idx == -1:
14251497
return None
14261498
base = type_str[:paren_idx].strip()
14271499
depth = 0
1428-
for i in range(paren_idx, len(type_str)):
1429-
if type_str[i] == "(":
1500+
i = paren_idx
1501+
while i < len(type_str):
1502+
c = type_str[i]
1503+
if c == '"':
1504+
i = _scan_quoted_identifier(type_str, i)
1505+
continue
1506+
if c == "(":
14301507
depth += 1
1431-
elif type_str[i] == ")":
1508+
elif c == ")":
14321509
depth -= 1
14331510
if depth == 0:
14341511
return base, type_str[paren_idx + 1 : i]
1512+
i += 1
14351513
raise ValueError(f"Unbalanced parentheses in type string: '{type_str}'")
14361514

14371515

@@ -1499,13 +1577,17 @@ def _sf_type_to_type_object(type_str: str) -> DataType:
14991577
# SQL grammar for OBJECT types; raise so backend bugs or
15001578
# malformed input surface loudly.
15011579
raise ValueError(f"Empty field in OBJECT type: '{type_str}'")
1502-
parts = field_def.split(None, 1)
1503-
if len(parts) != 2:
1504-
raise ValueError(f"Cannot parse OBJECT field definition: '{field_def}'")
1505-
field_name = parts[0]
1506-
type_part, nullable = extract_nullable_keyword(parts[1])
1580+
# Quoted-identifier-aware split so OBJECT field names containing
1581+
# spaces, commas, parens, or other non-bare characters survive
1582+
# round-trip through INFER_SCHEMA. The name token is passed to
1583+
# ``StructField`` with its surrounding quotes intact so
1584+
# ``ColumnIdentifier``'s ``ALREADY_QUOTED`` branch preserves
1585+
# mixed-case names verbatim (without quotes the bare-identifier
1586+
# rule would case-fold ``"Foo"`` to ``"FOO"``).
1587+
name_token, type_remainder = _split_object_field(field_def)
1588+
type_part, nullable = extract_nullable_keyword(type_remainder)
15071589
field_type = _sf_type_to_type_object(type_part)
1508-
struct_fields.append(StructField(field_name, field_type, nullable=nullable))
1590+
struct_fields.append(StructField(name_token, field_type, nullable=nullable))
15091591
return StructType(struct_fields, structured=True)
15101592

15111593
precision_scale = get_number_precision_scale(type_str)

0 commit comments

Comments
 (0)