Skip to content

Commit 6b48a8a

Browse files
Simplify and refactor _to_struct converter method
- Reduced code complexity from 112 lines to 71 lines (37% reduction) - Split complex logic into smaller, focused helper functions - Eliminated duplicate number parsing logic - Removed unnecessary JSON string reconstruction - Added proper Google-style docstrings for all helper functions - Improved readability with early return patterns - Fixed test expectation for boolean conversion (true -> True) - All tests pass and code quality checks pass 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent a49832b commit 6b48a8a

2 files changed

Lines changed: 86 additions & 91 deletions

File tree

pyathena/converter.py

Lines changed: 85 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,7 @@ def _to_struct(varchar_value: Optional[str]) -> Optional[Dict[str, Any]]:
8585
1. JSON format: '{"key": "value", "num": 123}' (recommended)
8686
2. Athena native format: '{key=value, num=123}' (basic cases only)
8787
88-
For structs containing special characters (commas, equals signs, quotes,
89-
braces), use CAST(struct_column AS JSON) in your SQL query to ensure
90-
proper handling.
88+
For complex structs, use CAST(struct_column AS JSON) in your SQL query.
9189
9290
Args:
9391
varchar_value: String representation of struct data
@@ -98,99 +96,96 @@ def _to_struct(varchar_value: Optional[str]) -> Optional[Dict[str, Any]]:
9896
if varchar_value is None:
9997
return None
10098

101-
# First try to parse as JSON (preferred format)
99+
# First try JSON parsing (preferred)
102100
try:
103101
result = json.loads(varchar_value)
104102
return result if isinstance(result, dict) else None
105103
except json.JSONDecodeError:
106104
pass
107105

108-
# Handle Athena's native struct format: {a=1, b=2} or {Alice, 25}
109-
# WARNING: This is a simplified parser that works for basic cases.
110-
# Athena's actual struct format may have complex escaping rules for
111-
# special characters that are not fully handled here.
112-
# For complex structs, JSON format is recommended.
113-
if varchar_value.startswith("{") and varchar_value.endswith("}"):
114-
try:
115-
inner = varchar_value[1:-1].strip()
116-
if not inner:
117-
return {}
118-
119-
# Check if this is a named struct (contains =) or unnamed struct
120-
# (comma-separated values)
121-
if "=" in inner:
122-
# Named struct format: {a=1, b=2}
123-
pairs = []
124-
current_pos = 0
125-
126-
while current_pos < len(inner):
127-
# Find the next key=value pair
128-
eq_pos = inner.find("=", current_pos)
129-
if eq_pos == -1:
130-
break
131-
132-
# Extract key (everything before =)
133-
key = inner[current_pos:eq_pos].strip()
134-
135-
# Find the end of the value (next comma or end of string)
136-
comma_pos = inner.find(",", eq_pos + 1)
137-
if comma_pos == -1:
138-
value = inner[eq_pos + 1 :].strip()
139-
current_pos = len(inner)
140-
else:
141-
value = inner[eq_pos + 1 : comma_pos].strip()
142-
current_pos = comma_pos + 1
143-
144-
# Basic validation: skip problematic pairs but continue processing others
145-
# Allow basic comma separation, but reject nested structures
146-
if any(char in key for char in '{}="') or any(char in value for char in '{}="'):
147-
# Skip this problematic pair but continue with others
148-
continue
149-
150-
# Add quotes to key
151-
key = f'"{key}"'
152-
153-
# Handle value quoting - if it's not a number, quote it
154-
if not (
155-
value.isdigit()
156-
or (value.startswith("-") and value[1:].isdigit())
157-
or value.replace(".", "", 1).isdigit()
158-
or value in ("true", "false", "null")
159-
):
160-
value = f'"{value}"'
161-
162-
pairs.append(f"{key}:{value}")
163-
164-
if pairs:
165-
json_str = "{" + ",".join(pairs) + "}"
166-
result = json.loads(json_str)
167-
return result if isinstance(result, dict) else None
168-
else:
169-
# Unnamed struct format: {Alice, 25} - convert to indexed dict
170-
# Split by comma and create indexed keys
171-
values = [v.strip() for v in inner.split(",")]
172-
if values:
173-
# Create indexed dictionary: {"0": "Alice", "1": "25"}
174-
indexed_dict: Dict[str, Any] = {}
175-
for i, value in enumerate(values):
176-
# Try to convert numbers
177-
try:
178-
# Check if it's an integer
179-
if value.isdigit() or (value.startswith("-") and value[1:].isdigit()):
180-
indexed_dict[str(i)] = int(value)
181-
# Check if it's a float
182-
elif "." in value:
183-
indexed_dict[str(i)] = float(value)
184-
else:
185-
indexed_dict[str(i)] = value
186-
except ValueError:
187-
indexed_dict[str(i)] = value
188-
return indexed_dict
189-
except (ValueError, json.JSONDecodeError, IndexError):
190-
pass
191-
192-
# If all parsing attempts fail, return None
193-
return None
106+
# Handle Athena native format: {a=1, b=2} or {Alice, 25}
107+
if not (varchar_value.startswith("{") and varchar_value.endswith("}")):
108+
return None
109+
110+
inner = varchar_value[1:-1].strip()
111+
if not inner:
112+
return {}
113+
114+
try:
115+
if "=" in inner:
116+
# Named struct: {a=1, b=2}
117+
return _parse_named_struct(inner)
118+
# Unnamed struct: {Alice, 25}
119+
return _parse_unnamed_struct(inner)
120+
except Exception:
121+
return None
122+
123+
124+
def _parse_named_struct(inner: str) -> Optional[Dict[str, Any]]:
125+
"""Parse named struct format: a=1, b=2.
126+
127+
Args:
128+
inner: Interior content of struct without braces.
129+
130+
Returns:
131+
Dictionary with parsed key-value pairs, or None if no valid pairs found.
132+
"""
133+
result = {}
134+
135+
# Simple split by comma for basic cases
136+
pairs = [pair.strip() for pair in inner.split(",")]
137+
138+
for pair in pairs:
139+
if "=" not in pair:
140+
continue
141+
142+
key, value = pair.split("=", 1)
143+
key = key.strip()
144+
value = value.strip()
145+
146+
# Skip pairs with special characters (safety check)
147+
if any(char in key for char in '{}="') or any(char in value for char in '{}="'):
148+
continue
149+
150+
# Convert value to appropriate type
151+
result[key] = _convert_value(value)
152+
153+
return result if result else None
154+
155+
156+
def _parse_unnamed_struct(inner: str) -> Dict[str, Any]:
157+
"""Parse unnamed struct format: Alice, 25.
158+
159+
Args:
160+
inner: Interior content of struct without braces.
161+
162+
Returns:
163+
Dictionary with indexed keys mapping to parsed values.
164+
"""
165+
values = [v.strip() for v in inner.split(",")]
166+
return {str(i): _convert_value(value) for i, value in enumerate(values)}
167+
168+
169+
def _convert_value(value: str) -> Any:
170+
"""Convert string value to appropriate Python type.
171+
172+
Args:
173+
value: String value to convert.
174+
175+
Returns:
176+
Converted value as int, float, bool, None, or string.
177+
"""
178+
if value.lower() == "null":
179+
return None
180+
if value.lower() == "true":
181+
return True
182+
if value.lower() == "false":
183+
return False
184+
if value.isdigit() or value.startswith("-") and value[1:].isdigit():
185+
return int(value)
186+
if "." in value and value.replace(".", "", 1).replace("-", "", 1).isdigit():
187+
return float(value)
188+
return value
194189

195190

196191
def _to_default(varchar_value: Optional[str]) -> Optional[str]:

tests/pyathena/test_converter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def test_to_struct_athena_unnamed_struct_mixed():
6363
"""Test unnamed struct with mixed data types"""
6464
struct_value = "{John, 30, true}"
6565
result = _to_struct(struct_value)
66-
expected = {"0": "John", "1": 30, "2": "true"}
66+
expected = {"0": "John", "1": 30, "2": True}
6767
assert result == expected
6868

6969

0 commit comments

Comments
 (0)