Skip to content

Commit 809b627

Browse files
authored
SNOW-3718333: escape backslashes and single quotes in stage/file path SQL generation (#4274)
1 parent c9b9303 commit 809b627

6 files changed

Lines changed: 181 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#### Bug Fixes
88

99
- Fixed a bug where stage paths and file format names that contain single quotes were not consistently escaped when generating SQL, which could produce malformed statements. This affects `INFER_SCHEMA` (used by `DataFrameReader.csv`/`json`/`parquet`/`orc`/`avro`) and `COPY FILES` (used by `FileOperation.copy_files`).
10+
- Fixed a bug where single quotes and backslashes in stage/file paths were not correctly escaped when generating `COPY INTO` / `PUT` / `GET` SQL, which could produce malformed statements. This affects `DataFrame.write.csv`/`copy_into_location` and the Snowpark-pandas `DataFrame.to_csv` stage path.
1011
- Fixed a bug where column names containing quote characters returned by an external database were not correctly escaped when generating the `SELECT` query for `DataFrameReader.dbapi`, which could produce malformed SQL. Embedded quote characters in identifiers are now doubled (backticks for Databricks/MySQL, double quotes for Oracle/PostgreSQL/SQL Server).
1112
- Fixed a bug where the destination passed to `DataFrameWriter.copy_into_location` (and `csv`/`json`/`parquet`/`save`) was embedded into the generated `COPY INTO` statement without quoting, which could produce malformed SQL for locations containing single quotes. The location is now consistently quoted and escaped, and a string that merely starts and ends with a single quote but contains unescaped interior quotes is no longer treated as an already-quoted literal; it is fully escaped so it stays a single SQL string literal.
1213
- Fixed a bug where UDF default argument values reconstructed from a source file in `register_from_file` were evaluated with `eval()`; they are now evaluated only against the documented set of supported default-value types, and unsupported expressions are ignored.

src/snowflake/snowpark/_internal/utils.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,17 @@ def normalize_path(path: str, is_local: bool) -> str:
481481
return path
482482
if is_local and OPERATING_SYSTEM == "Windows":
483483
path = path.replace("\\", "/")
484-
path = path.strip().replace("'", "\\'")
484+
# Escape the backslash before the single quote so the path stays a single
485+
# Snowflake string literal; the reverse order would let an escaped quote
486+
# close the literal early and produce invalid SQL. Constants keep the
487+
# replacements readable (no Python escape double-counting).
488+
BACKSLASH = "\\"
489+
SINGLE_QUOTE = "'"
490+
path = (
491+
path.strip()
492+
.replace(BACKSLASH, BACKSLASH * 2) # \ -> \\
493+
.replace(SINGLE_QUOTE, BACKSLASH + SINGLE_QUOTE) # ' -> \'
494+
)
485495
if not any(path.startswith(prefix) for prefix in prefixes):
486496
path = f"{prefixes[0]}{path}"
487497
return f"'{path}'"

tests/integ/modin/io/test_to_csv.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from numpy.testing import assert_equal
1414

1515
import snowflake.snowpark.modin.plugin # noqa: F401
16-
from tests.integ.utils.sql_counter import sql_count_checker
16+
from tests.integ.utils.sql_counter import sql_count_checker, SqlCounter
1717
from tests.utils import Utils
1818

1919
temp_dir = tempfile.TemporaryDirectory()
@@ -293,3 +293,57 @@ def test_timedeltaindex_to_csv_dataframe_local():
293293
pd.DataFrame(native_df).to_csv(snow_path)
294294

295295
assert_file_equal(snow_path, native_path, is_compressed=False)
296+
297+
298+
def test_to_csv_stage_path_escapes_special_characters(sf_stage, session):
    """Snowpark-pandas ``to_csv`` to a stage path must escape special characters
    in the path.

    ``DataFrame.to_csv(path_or_buf="@stage/...")`` routes server-side into
    ``snowpark_df.write.csv(location=...)`` -> ``normalize_path`` ->
    ``COPY INTO <location>``. A path containing a backslash immediately followed
    by a single quote must stay inside the stage-location string literal so the
    generated ``COPY INTO`` is valid and the path is treated as literal data.

    The downloaded copy of the unloaded file is read from a
    ``tempfile.TemporaryDirectory`` so the scratch directory is removed when
    the check finishes instead of being left behind on disk.
    """
    snow_df = pd.DataFrame({"A": ["one", "two", "three"], "B": [1, 2, 3]})
    # None index name is not supported when writing to a Snowflake stage.
    snow_df.index.set_names(["X"], inplace=True)

    # (a) Stage path whose directory name contains a single quote. The quote is
    # escaped as literal data, so the write succeeds and the file lands under
    # that exact name. ``to_csv`` to a stage emits one query (the COPY INTO);
    # downloading it back confirms the path was treated as a literal file
    # name and not parsed as SQL.
    quote_name = "o'clock/mods.csv"
    quote_path = f"@{sf_stage}/{quote_name}"
    with SqlCounter(query_count=1):
        snow_df.to_csv(quote_path, index=False)
    listed = [row[0] for row in session.sql(f"LIST '@{sf_stage}'").collect()]
    assert any(name.endswith(quote_name) for name in listed), listed

    # The context manager deletes the download directory (and the fetched
    # file) on exit, even when one of the assertions inside it fails.
    with tempfile.TemporaryDirectory() as download_dir:
        session.file.get(quote_path, download_dir)
        downloaded = [
            f
            for f in os.listdir(download_dir)
            if os.path.isfile(os.path.join(download_dir, f))
        ]
        assert len(downloaded) == 1, downloaded
        with open(
            os.path.join(download_dir, downloaded[0]), encoding="utf-8"
        ) as fh:
            content = fh.read()
    data_rows = [line for line in content.splitlines() if line.strip()]
    # Header ("A,B") + the DataFrame's own 3 data rows == 4 lines.
    assert len(data_rows) == 4, content
    assert content == "A,B\none,1\ntwo,2\nthree,3\n", content

    # (b) A file name mixing a backslash, a single quote, parentheses, a comma
    # and a trailing ``--`` must produce valid SQL: before the fix the
    # unescaped backslash/quote closed the location literal early. The write
    # must succeed with a single COPY INTO query -- if the path were parsed as
    # SQL the statement would error instead. Stage storage does not preserve a
    # literal backslash as a path character, so we assert the write succeeds
    # rather than reading back the exact name.
    special_name = "report\\' , (note) -- draft"
    special_path = f"@{sf_stage}/{special_name}"
    with SqlCounter(query_count=1):
        snow_df.to_csv(special_path, index=False)

tests/integ/scala/test_dataframe_writer_suite.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,45 @@ def test_writer_csv(session, temp_stage, caplog):
989989
Utils.check_answer(data7, df)
990990

991991

992+
@pytest.mark.skipif(
    "config.getoption('local_testing_mode', default=False)",
    reason="COPY INTO <location> is not supported in Local Testing",
)
def test_writer_csv_stage_path_escapes_special_characters(session, temp_stage):
    """Check that ``DataFrame.write.csv`` accepts stage paths with special characters.

    The destination is routed through ``normalize_path``, which must escape
    both backslashes and single quotes so that a backslash immediately followed
    by a single quote stays inside the stage-location string literal of the
    generated ``COPY INTO`` and the SQL is always valid.

    Every destination below contains characters that, before the fix, closed
    the location literal early and produced invalid SQL (a backslash, a single
    quote, a ``\\'`` combination, parentheses, a comma and a trailing ``--``).
    Each write must now succeed and unload the DataFrame's own rows, which
    shows the path is escaped as literal data rather than parsed as SQL.
    Note: stage storage does not preserve a literal backslash as a directory
    separator, so only the success of the write is asserted, not a read-back
    round-trip.
    """
    source_rows = [[1, 2], [3, 4]]
    frame = session.create_dataframe(source_rows, schema=["a", "b"])

    destinations = (
        # Backslash inside a directory name.
        f"{temp_stage}/back\\slash_dir/data.csv",
        # Single quote inside a directory name.
        f"{temp_stage}/o'clock/data.csv",
        # Backslash immediately followed by a quote inside a directory name.
        f"{temp_stage}/mix\\'both/data.csv",
        # File name combining a backslash-quote, parentheses, a comma and a
        # trailing ``--``; every character must be kept as literal path data.
        f"@{temp_stage}/out\\' , (note) -- draft",
    )
    for destination in destinations:
        unload_result = frame.write.csv(destination, single=True)
        first_row = unload_result[0]
        # Only the DataFrame's own two rows are unloaded; nothing in the path
        # was interpreted as SQL.
        assert first_row.rows_unloaded == 2, destination
1029+
1030+
9921031
@pytest.mark.skipif(
9931032
"config.getoption('local_testing_mode', default=False)",
9941033
reason="BUG: SNOW-1235716 should raise not implemented error not AttributeError: 'MockExecutionPlan' object has no attribute 'replace_repeated_subquery_with_cte', FEAT: parquet support",

tests/unit/scala/test_utils_suite.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,9 @@ def test_normalize_file(is_local):
167167
assert normalize_path(name2, is_local) == f"'{symbol}sta\\'ge'"
168168
name3 = "s ta\\'ge "
169169
assert normalize_path(name3, is_local) == (
170-
f"'{symbol}s ta/\\'ge'" if is_local and IS_WINDOWS else f"'{symbol}s ta\\\\'ge'"
170+
f"'{symbol}s ta/\\'ge'"
171+
if is_local and IS_WINDOWS
172+
else f"'{symbol}s ta\\\\\\'ge'"
171173
)
172174

173175

tests/unit/test_internal_utils.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,78 @@ def test_normalize_path(path: str, is_local: bool, expected: str) -> None:
7676
assert expected == actual
7777

7878

79+
def _decode_snowflake_literal(literal: str) -> str:
80+
"""Simulate Snowflake's decoding of a single-quoted string literal.
81+
82+
Snowflake treats ``\\`` as an escape character inside a single-quoted literal,
83+
so ``\\\\`` decodes to one backslash and ``\\'`` decodes to one single quote.
84+
An unescaped single quote closes the literal. This helper returns the decoded
85+
literal value and raises if the literal is closed early -- which would mean the
86+
path was not escaped correctly and the generated SQL is invalid.
87+
"""
88+
assert literal.startswith("'") and literal.endswith(
89+
"'"
90+
), f"not a quoted literal: {literal!r}"
91+
body = literal[1:-1]
92+
out = []
93+
i = 0
94+
while i < len(body):
95+
ch = body[i]
96+
if ch == "\\" and i + 1 < len(body):
97+
out.append(body[i + 1])
98+
i += 2
99+
elif ch == "'":
100+
raise AssertionError(
101+
f"unescaped quote closes literal early at index {i}: {literal!r}"
102+
)
103+
else:
104+
out.append(ch)
105+
i += 1
106+
return "".join(out)
107+
108+
109+
@pytest.mark.parametrize("is_local", [True, False])
@pytest.mark.parametrize(
    "raw_path",
    [
        # A backslash directly followed by a single quote, together with
        # parentheses, commas and a trailing ``--``. Before the fix the
        # backslash was left unescaped, so ``\'`` came out as ``\\'`` and the
        # literal was closed early, giving invalid SQL.
        "@~/out\\' , (note) FILE_FORMAT=(TYPE=CSV) -- draft",
        "report\\' , (v2) draft --",
        # Individual special characters that must survive as literal data.
        "@stage/o'clock/file.csv",
        "@stage/back\\slash/file.csv",
        "@stage/double\\\\back/file.csv",
        '@stage/dquote"/file.csv',
        "@stage/uniécode/file.csv",
        "@stage/all\\'\"mix/file.csv",
    ],
)
def test_normalize_path_escapes_backslash_and_quote(raw_path, is_local):
    """Check that ``normalize_path`` output decodes back to the original path.

    The result must be a well-formed Snowflake string literal: a backslash
    followed by a single quote has to remain inside the literal rather than
    closing it early, so the generated SQL stays valid and the path is handled
    as literal data."""
    # Expected suffix of the decoded value: the stripped input. Only the front
    # may differ, by an added ``@`` / ``file://`` scheme prefix.
    stripped_path = raw_path.strip()
    # Local Windows paths have backslashes turned into forward slashes before
    # escaping, so apply the same transform to the expectation. This touches
    # only the round-trip comparison; the guarantee that the literal never
    # closes early is still checked for every input on every platform.
    windows_local = is_local and utils.OPERATING_SYSTEM == "Windows"
    expected_tail = stripped_path.replace("\\", "/") if windows_local else stripped_path

    # Decoding raises if the literal is closed early, i.e. if the output is not
    # a well-formed single-quoted literal.
    decoded = _decode_snowflake_literal(utils.normalize_path(raw_path, is_local))

    tail_matches = decoded.endswith(expected_tail)
    assert tail_matches, f"decoded={decoded!r} does not end with {expected_tail!r}"
149+
150+
79151
def test__pandas_importer():
80152
imported_pandas = _pandas_importer()
81153
try:

0 commit comments

Comments
 (0)