Skip to content

Commit 0c71d69

Browse files
committed
Quote bind parameter names containing non-identifier characters
Column names sourced from DataFrames frequently contain hyphens (e.g. `col-with-hyphen`). SQLAlchemy uses the column name as the default bind parameter name, and Databricks named-parameter markers (`:name`) only accept bare identifiers (`[A-Za-z_][A-Za-z0-9_]*`). The hyphen was being emitted verbatim, producing invalid SQL such as `:col-with-hyphen`, which the server rejects with UNBOUND_SQL_PARAMETER because it parses `-with-hyphen` as stray tokens.

Override `DatabricksStatementCompiler.bindparam_string` to wrap non-bare-identifier names in backticks (`` :`col-with-hyphen` ``), which the Spark/Databricks SQL grammar accepts as a quoted parameter identifier (`simpleIdentifier -> quotedIdentifier` in `SqlBaseParser.g4`). This mirrors Oracle's `:"name"` approach to the same problem.

The backticks are quoting syntax only — the parameter's logical name is still the text between them, so the params dict sent to the driver keeps the original unquoted key. `escaped_bind_names` is intentionally left empty so that `construct_params` passes keys through unchanged.

This covers hyphens, spaces, dots, brackets, leading digits, and any other character outside `[A-Za-z0-9_]`, with no risk of collisions between sibling columns like `col-name` and `col_name` (a concern with single-character escape-map approaches).

Co-authored-by: Isaac
1 parent ad9eb35 commit 0c71d69

2 files changed

Lines changed: 160 additions & 1 deletion

File tree

src/databricks/sqlalchemy/_ddl.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,51 @@ def get_column_specification(self, column, **kwargs):
8484

8585

8686
class DatabricksStatementCompiler(compiler.SQLCompiler):
87+
# Names that a bare Databricks named-parameter marker (`:name`) accepts:
88+
# a letter or underscore followed by letters, digits, or underscores.
89+
# Anything outside that set — hyphens, spaces, dots, brackets, a leading
90+
# digit, etc. — must be wrapped in backticks (`:`name``), which the
91+
# Spark/Databricks SQL grammar accepts as a quoted parameter identifier.
92+
_bindname_is_bare_identifier = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
93+
94+
def bindparam_string(self, name, **kw):
    """Render a bind parameter marker, backtick-quoting it when required.

    Databricks named parameter markers only accept bare identifiers
    ([A-Za-z_][A-Za-z0-9_]*) out of the box. DataFrame-origin column
    names frequently contain hyphens (e.g. ``col-with-hyphen``), which
    SQLAlchemy would otherwise pass through verbatim and produce an
    invalid marker ``:col-with-hyphen`` — the parser splits on ``-``
    and reports UNBOUND_SQL_PARAMETER.

    The Spark SQL grammar accepts a quoted form (a colon followed by the
    backtick-quoted name), mirroring Oracle's ``:"name"`` pattern. The
    backticks are *quoting* only: the parameter's logical name is still
    the text between them, so the params dict sent to the driver must
    keep the original unquoted key. The quoted marker is therefore
    emitted directly without populating ``escaped_bind_names`` — leaving
    the key translation in ``construct_params`` a no-op.

    :param name: the bind parameter name chosen by SQLAlchemy.
    :param kw: keyword arguments forwarded by the compiler. This method
        reads ``escaped_from``, ``post_compile``,
        ``accumulate_bind_names`` and ``visited_bindparam``; everything
        is passed through untouched when delegating to the base class.
    :return: the marker text to embed in the SQL string.
    """
    # ``fullmatch`` rather than ``match``: the pattern ends in ``$``,
    # which also matches just before a trailing newline, so with
    # ``match`` a name such as "col\n" would be treated as bare and
    # emitted unquoted.
    needs_quoting = (
        not kw.get("escaped_from")
        and not kw.get("post_compile", False)
        and not self._bindname_is_bare_identifier.fullmatch(name)
    )
    if not needs_quoting:
        # Bare identifiers (the common case), already-escaped names and
        # post-compile parameters keep the default rendering, so
        # existing INSERT/SELECT output stays unchanged.
        return super().bindparam_string(name, **kw)

    # Record the *unquoted* name in the collectors the caller supplied,
    # since that is the key the params dict will carry.
    accumulate = kw.get("accumulate_bind_names")
    if accumulate is not None:
        accumulate.add(name)
    visited = kw.get("visited_bindparam")
    if visited is not None:
        visited.append(name)

    # A backtick inside a backtick-quoted identifier is escaped by
    # doubling it; emitting it verbatim would terminate the quoted
    # identifier early and produce malformed SQL.
    quoted = "`" + name.replace("`", "``") + "`"
    if self.state is compiler.CompilerState.COMPILING:
        return self.compilation_bindtemplate % {"name": quoted}
    return self.bindtemplate % {"name": quoted}
131+
87132
def limit_clause(self, select, **kw):
88133
"""Identical to the default implementation of SQLCompiler.limit_clause except it writes LIMIT ALL instead of LIMIT -1,
89134
since Databricks SQL doesn't support the latter.

tests/test_local/test_ddl.py

Lines changed: 115 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import pytest
2-
from sqlalchemy import Column, MetaData, String, Table, Numeric, Integer, create_engine
2+
from sqlalchemy import Column, MetaData, String, Table, Numeric, Integer, create_engine, insert
33
from sqlalchemy.schema import (
44
CreateTable,
55
DropColumnComment,
@@ -114,3 +114,117 @@ def test_create_table_with_complex_type(self, metadata):
114114
assert "array_array_string ARRAY<ARRAY<STRING>>" in output
115115
assert "map_string_string MAP<STRING,STRING>" in output
116116
assert "variant_col VARIANT" in output
117+
118+
119+
class TestBindParamQuoting(DDLTestBase):
    """Regression coverage for bind markers built from awkward column names.

    A bare Databricks named-parameter marker (`:name`) cannot contain
    characters such as hyphens. Without the custom ``bindparam_string``
    override, a column like ``col-with-hyphen`` compiles to
    ``VALUES (:col-with-hyphen)``, which the server rejects with
    UNBOUND_SQL_PARAMETER. With the override, such names are wrapped in
    backticks at the marker site, which the Databricks SQL grammar accepts
    as a quoted parameter identifier.
    """

    @staticmethod
    def _string_table(*column_names):
        """Build a throwaway table ``t`` with one String column per name."""
        columns = [Column(column_name, String()) for column_name in column_names]
        return Table("t", MetaData(), *columns)

    def _compile_insert(self, table, values):
        """Compile ``INSERT INTO table VALUES (...)`` against the test engine."""
        statement = insert(table).values(values)
        return statement.compile(bind=self.engine)

    def test_hyphenated_column_renders_backticked_bind_marker(self):
        values = {"col-with-hyphen": "x", "normal_col": "y"}
        compiled = self._compile_insert(
            self._string_table("col-with-hyphen", "normal_col"), values
        )

        rendered = str(compiled)
        # The hyphenated name is backticked where the marker is emitted ...
        assert ":`col-with-hyphen`" in rendered
        # ... while the ordinary name is rendered as before.
        assert ":normal_col" in rendered

        # The params dict handed to the driver is keyed by the ORIGINAL,
        # unquoted names: a backticked marker binds against the plain key.
        bound = compiled.construct_params()
        for key, expected in values.items():
            assert bound[key] == expected
        assert "`col-with-hyphen`" not in bound

    def test_hyphen_and_underscore_columns_do_not_collide(self):
        """``col-name`` and ``col_name`` in one table must stay two distinct
        bind parameters with two distinct dict keys, so that neither value
        silently overwrites the other.
        """
        values = {"col-name": "hyphen_value", "col_name": "underscore_value"}
        compiled = self._compile_insert(
            self._string_table("col-name", "col_name"), values
        )

        rendered = str(compiled)
        assert ":`col-name`" in rendered
        assert ":col_name" in rendered

        bound = compiled.construct_params()
        for key, expected in values.items():
            assert bound[key] == expected

    def test_plain_identifier_bind_names_are_unchanged(self):
        """Ordinary column names must keep their un-backticked markers."""
        compiled = self._compile_insert(
            self._string_table("id", "name"), {"id": "1", "name": "n"}
        )

        rendered = str(compiled)
        for plain_marker in (":id", ":name"):
            assert plain_marker in rendered
        for quoted_marker in (":`id`", ":`name`"):
            assert quoted_marker not in rendered

    def test_space_and_dot_in_column_name_also_backticked(self):
        """Quoting applies to every character outside [A-Za-z0-9_], not only
        hyphens: names containing spaces or dots are wrapped as well.
        """
        values = {"col with space": "s", "col.with.dot": "d"}
        compiled = self._compile_insert(
            self._string_table("col with space", "col.with.dot"), values
        )

        rendered = str(compiled)
        assert ":`col with space`" in rendered
        assert ":`col.with.dot`" in rendered

        bound = compiled.construct_params()
        for key, expected in values.items():
            assert bound[key] == expected

    def test_leading_digit_column_is_backticked(self):
        """A name starting with a digit is not a bare identifier either."""
        compiled = self._compile_insert(self._string_table("1col"), {"1col": "x"})

        assert ":`1col`" in str(compiled)
        assert compiled.construct_params()["1col"] == "x"

0 commit comments

Comments
 (0)