Skip to content

Commit 221e05d

Browse files
committed
Simplify to unconditional backtick quoting, gate behind flag
Following review feedback: the conditional check based on whether the name matches a bare-identifier pattern is unnecessary. The Spark/ Databricks SQL grammar accepts :`name` for every valid identifier (verified empirically against a live SQL warehouse), so wrapping unconditionally keeps the compiler simpler and removes a class of edge-case bugs that the condition could miss. Add a ``quote_bind_params`` flag on DatabricksDialect (default True) that can be turned off via the URL query parameter ``?quote_bind_params=false`` as an escape hatch if the quoting ever introduces an unexpected regression. When disabled, we fall through entirely to stock SQLAlchemy bind-name rendering. Co-authored-by: Isaac
1 parent 0c71d69 commit 221e05d

3 files changed

Lines changed: 120 additions & 72 deletions

File tree

src/databricks/sqlalchemy/_ddl.py

Lines changed: 42 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -84,50 +84,52 @@ def get_column_specification(self, column, **kwargs):
8484

8585

8686
class DatabricksStatementCompiler(compiler.SQLCompiler):
87-
# Names that a bare Databricks named-parameter marker (`:name`) accepts:
88-
# a letter or underscore followed by letters, digits, or underscores.
89-
# Anything outside that set — hyphens, spaces, dots, brackets, a leading
90-
# digit, etc. — must be wrapped in backticks (`:`name``), which the
91-
# Spark/Databricks SQL grammar accepts as a quoted parameter identifier.
92-
_bindname_is_bare_identifier = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
93-
9487
def bindparam_string(self, name, **kw):
95-
"""Render a bind parameter marker.
96-
97-
Databricks named parameter markers only accept bare identifiers
98-
([A-Za-z_][A-Za-z0-9_]*) out of the box. DataFrame-origin column
99-
names frequently contain hyphens (e.g. ``col-with-hyphen``), which
100-
SQLAlchemy would otherwise pass through verbatim and produce an
101-
invalid marker ``:col-with-hyphen`` — the parser splits on ``-``
102-
and reports UNBOUND_SQL_PARAMETER.
103-
104-
The Spark SQL grammar accepts a quoted form ``:`col-with-hyphen```,
105-
mirroring Oracle's ``:"name"`` pattern. The backticks are *quoting*
106-
only: the parameter's logical name is still the text between them,
107-
so the params dict sent to the driver must keep the original
108-
unquoted key. We therefore emit the backticked marker directly
109-
without populating ``escaped_bind_names`` — leaving the key
110-
translation in ``construct_params`` a no-op.
111-
112-
For bare identifiers (the common case), we fall through to the
113-
default implementation so INSERT/SELECT output stays unchanged.
88+
"""Render a bind parameter marker wrapped in backticks.
89+
90+
Databricks named parameter markers accept two identifier forms
91+
(per ``SqlBaseParser.g4``): a bare ``IDENTIFIER``
92+
(``[A-Za-z_][A-Za-z0-9_]*``) or a ``quotedIdentifier`` wrapped in
93+
backticks. DataFrame-origin column names frequently contain
94+
hyphens (e.g. ``col-with-hyphen``), which SQLAlchemy would
95+
otherwise pass through verbatim and produce an invalid marker
96+
``:col-with-hyphen`` — the parser splits on ``-`` and reports
97+
UNBOUND_SQL_PARAMETER.
98+
99+
Backticks are valid for *every* identifier (plain names included),
100+
verified empirically against a Databricks SQL warehouse, so we
101+
wrap unconditionally. This mirrors Oracle's ``:"name"`` approach
102+
to the same grammar constraint and eliminates the collision risk
103+
that any single-character escape map would carry (e.g. ``col-name``
104+
vs ``col_name`` both mapping to ``:col_name``).
105+
106+
The backticks are SQL-side *quoting* only: the parameter's
107+
logical name is still the text between them, so the params dict
108+
sent to the driver keeps the original unquoted key. We therefore
109+
leave ``escaped_bind_names`` untouched — ``construct_params``
110+
passes keys through unchanged.
111+
112+
Gated by ``DatabricksDialect.quote_bind_params``. Set
113+
``?quote_bind_params=false`` on the SQLAlchemy URL to fall back
114+
to stock bind-name rendering.
114115
"""
115116
if (
116-
not kw.get("escaped_from")
117-
and not kw.get("post_compile", False)
118-
and not self._bindname_is_bare_identifier.match(name)
117+
kw.get("post_compile", False)
118+
or kw.get("escaped_from")
119+
or not getattr(self.dialect, "quote_bind_params", True)
119120
):
120-
accumulate = kw.get("accumulate_bind_names")
121-
if accumulate is not None:
122-
accumulate.add(name)
123-
visited = kw.get("visited_bindparam")
124-
if visited is not None:
125-
visited.append(name)
126-
quoted = f"`{name}`"
127-
if self.state is compiler.CompilerState.COMPILING:
128-
return self.compilation_bindtemplate % {"name": quoted}
129-
return self.bindtemplate % {"name": quoted}
130-
return super().bindparam_string(name, **kw)
121+
return super().bindparam_string(name, **kw)
122+
123+
accumulate = kw.get("accumulate_bind_names")
124+
if accumulate is not None:
125+
accumulate.add(name)
126+
visited = kw.get("visited_bindparam")
127+
if visited is not None:
128+
visited.append(name)
129+
quoted = f"`{name}`"
130+
if self.state is compiler.CompilerState.COMPILING:
131+
return self.compilation_bindtemplate % {"name": quoted}
132+
return self.bindtemplate % {"name": quoted}
131133

132134
def limit_clause(self, select, **kw):
133135
"""Identical to the default implementation of SQLCompiler.limit_clause except it writes LIMIT ALL instead of LIMIT -1,

src/databricks/sqlalchemy/base.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,15 @@ class DatabricksDialect(default.DefaultDialect):
6666
supports_sequences: bool = False
6767
supports_native_boolean: bool = True
6868

69+
# When True (default), every named bind parameter is rendered wrapped in
70+
# backticks (`` :`name` ``) so that column names containing characters
71+
# which are illegal in bare Databricks parameter identifiers (hyphens,
72+
# spaces, dots, leading digits, etc.) work transparently. Set to False
73+
# via the URL query string — ``?quote_bind_params=false`` — to fall back
74+
# to stock SQLAlchemy bind-name rendering if this quoting causes an
75+
# unexpected regression.
76+
quote_bind_params: bool = True
77+
6978
colspecs = {
7079
sqlalchemy.types.DateTime: dialect_type_impl.TIMESTAMP_NTZ,
7180
sqlalchemy.types.Time: dialect_type_impl.DatabricksTimeType,
@@ -117,6 +126,10 @@ def create_connect_args(self, url):
117126
self.schema = kwargs["schema"]
118127
self.catalog = kwargs["catalog"]
119128

129+
raw_quote_flag = url.query.get("quote_bind_params")
130+
if raw_quote_flag is not None:
131+
self.quote_bind_params = raw_quote_flag.lower() not in ("false", "0", "no")
132+
120133
self._force_paramstyle_to_native_mode()
121134

122135
return [], kwargs

tests/test_local/test_ddl.py

Lines changed: 65 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,24 @@ def test_create_table_with_complex_type(self, metadata):
117117

118118

119119
class TestBindParamQuoting(DDLTestBase):
120-
"""Regression tests for column names that contain characters which are not
121-
legal inside a bare Databricks named-parameter marker (`:name`). Without
122-
the custom ``bindparam_string`` override, a column like
123-
``col-with-hyphen`` produces SQL like ``VALUES (:col-with-hyphen)`` which
124-
fails with UNBOUND_SQL_PARAMETER on the server. The fix wraps such names
125-
in backticks (``VALUES (:`col-with-hyphen`)``), which the Databricks SQL
126-
grammar accepts as a quoted parameter identifier.
120+
"""Regression tests for bind-parameter quoting.
121+
122+
Databricks named parameter markers (``:name``) must be bare identifiers
123+
(``[A-Za-z_][A-Za-z0-9_]*``) unless wrapped in backticks. Because
124+
DataFrame-origin column names frequently contain hyphens (a character
125+
that's legal inside a backtick-quoted column identifier but not in a
126+
bare bind marker), the dialect wraps every bind name in backticks
127+
unconditionally. The backticks are SQL-side quoting only — the params
128+
dict sent to the driver keeps the original unquoted key.
129+
130+
The behavior is gated by ``DatabricksDialect.quote_bind_params`` which
131+
defaults to True; set ``?quote_bind_params=false`` in the URL to
132+
disable.
127133
"""
128134

129-
def _compile_insert(self, table, values):
135+
def _compile_insert(self, table, values, engine=None):
130136
stmt = insert(table).values(values)
131-
return stmt.compile(bind=self.engine)
137+
return stmt.compile(bind=engine or self.engine)
132138

133139
def test_hyphenated_column_renders_backticked_bind_marker(self):
134140
metadata = MetaData()
@@ -143,18 +149,18 @@ def test_hyphenated_column_renders_backticked_bind_marker(self):
143149
)
144150

145151
sql = str(compiled)
146-
# Hyphenated name is wrapped in backticks at the marker site
152+
# Both names are backticked at the marker site
147153
assert ":`col-with-hyphen`" in sql
148-
# Plain name is untouched
149-
assert ":normal_col" in sql
154+
assert ":`normal_col`" in sql
150155
# The params dict sent to the driver keeps the ORIGINAL unquoted key
151156
# — this matches what the Databricks server expects (verified
152-
# empirically: a backticked marker `:`name`` binds against a plain
153-
# `name` key in the params dict).
157+
# empirically: a backticked marker ``:`name``` binds against a plain
158+
# ``name`` key in the params dict).
154159
params = compiled.construct_params()
155160
assert params["col-with-hyphen"] == "x"
156161
assert params["normal_col"] == "y"
157162
assert "`col-with-hyphen`" not in params
163+
assert "`normal_col`" not in params
158164

159165
def test_hyphen_and_underscore_columns_do_not_collide(self):
160166
"""A table containing both ``col-name`` and ``col_name`` must produce
@@ -174,14 +180,17 @@ def test_hyphen_and_underscore_columns_do_not_collide(self):
174180

175181
sql = str(compiled)
176182
assert ":`col-name`" in sql
177-
assert ":col_name" in sql
183+
assert ":`col_name`" in sql
178184

179185
params = compiled.construct_params()
180186
assert params["col-name"] == "hyphen_value"
181187
assert params["col_name"] == "underscore_value"
182188

183-
def test_plain_identifier_bind_names_are_unchanged(self):
184-
"""No regression: ordinary column names must not be backticked."""
189+
def test_plain_identifier_bind_names_are_also_backticked(self):
190+
"""Every bind name is wrapped unconditionally — the Databricks SQL
191+
grammar accepts ``:`id``` identically to ``:id`` for plain names
192+
(verified against a live warehouse).
193+
"""
185194
metadata = MetaData()
186195
table = Table(
187196
"t",
@@ -191,15 +200,10 @@ def test_plain_identifier_bind_names_are_unchanged(self):
191200
)
192201
compiled = self._compile_insert(table, {"id": "1", "name": "n"})
193202
sql = str(compiled)
194-
assert ":id" in sql
195-
assert ":name" in sql
196-
assert ":`id`" not in sql
197-
assert ":`name`" not in sql
203+
assert ":`id`" in sql
204+
assert ":`name`" in sql
198205

199-
def test_space_and_dot_in_column_name_also_backticked(self):
200-
"""The bare-identifier check covers all non-[A-Za-z0-9_] characters,
201-
not just hyphens — spaces, dots, etc. should also be wrapped.
202-
"""
206+
def test_space_and_dot_in_column_name_are_backticked(self):
203207
metadata = MetaData()
204208
table = Table(
205209
"t",
@@ -218,13 +222,42 @@ def test_space_and_dot_in_column_name_also_backticked(self):
218222
assert params["col with space"] == "s"
219223
assert params["col.with.dot"] == "d"
220224

221-
def test_leading_digit_column_is_backticked(self):
222-
"""Databricks bind names cannot start with a digit either."""
225+
def test_quote_bind_params_can_be_disabled(self):
226+
"""Setting ``quote_bind_params=False`` on the dialect reverts to
227+
stock SQLAlchemy bind-name rendering (the pre-fix behavior).
228+
"""
229+
from databricks.sqlalchemy.base import DatabricksDialect
230+
231+
dialect = DatabricksDialect()
232+
dialect.paramstyle = "named"
233+
dialect.quote_bind_params = False
234+
223235
metadata = MetaData()
224-
table = Table("t", metadata, Column("1col", String()))
225-
compiled = self._compile_insert(table, {"1col": "x"})
236+
table = Table("t", metadata, Column("id", String()))
237+
compiled = insert(table).values({"id": "1"}).compile(dialect=dialect)
226238
sql = str(compiled)
227-
assert ":`1col`" in sql
239+
assert ":id" in sql
240+
assert ":`id`" not in sql
228241

229-
params = compiled.construct_params()
230-
assert params["1col"] == "x"
242+
def test_url_query_string_disables_quoting(self):
243+
"""The URL query parameter ``?quote_bind_params=false`` turns the
244+
flag off on the dialect.
245+
"""
246+
from sqlalchemy import create_engine
247+
248+
engine = create_engine(
249+
"databricks://token:****@****?http_path=****&catalog=****"
250+
"&schema=****&quote_bind_params=false"
251+
)
252+
# create_engine lazy-initializes; force the dialect to process the URL
253+
engine.dialect.create_connect_args(engine.url)
254+
assert engine.dialect.quote_bind_params is False
255+
256+
def test_url_query_string_defaults_to_quoting(self):
257+
from sqlalchemy import create_engine
258+
259+
engine = create_engine(
260+
"databricks://token:****@****?http_path=****&catalog=****&schema=****"
261+
)
262+
engine.dialect.create_connect_args(engine.url)
263+
assert engine.dialect.quote_bind_params is True

0 commit comments

Comments
 (0)