From 318648e2000b089e008796f55f314bebde785920 Mon Sep 17 00:00:00 2001 From: Jamison Date: Wed, 24 Sep 2025 13:13:46 -0700 Subject: [PATCH 1/7] SNOW-2360274: Fix schema query sql generation for structured types --- CHANGELOG.md | 1 + .../_internal/analyzer/datatype_mapper.py | 23 +++++-- src/snowflake/snowpark/context.py | 3 + tests/integ/scala/test_datatype_suite.py | 68 +++++++++++++++++++ 4 files changed, 89 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0740db7fd..a7eade2c1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ #### Bug Fixes - Fixed a bug that `DataFrame.limit()` fail if there is parameter binding in the executed SQL. +- Fixed a bug in schema query generation that could cause invalid sql to be genrated when using nested structured types. #### New Features diff --git a/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py b/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py index cb7f2a7944..7ccbf52f4f 100644 --- a/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py +++ b/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py @@ -12,6 +12,7 @@ from decimal import Decimal from typing import Any +import snowflake.snowpark.context as context import snowflake.snowpark._internal.analyzer.analyzer_utils as analyzer_utils from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type from snowflake.snowpark._internal.utils import ( @@ -518,7 +519,12 @@ def schema_expression(data_type: DataType, is_nullable: bool) -> str: if isinstance(data_type, ArrayType): if data_type.structured: assert data_type.element_type is not None - element = schema_expression(data_type.element_type, data_type.contains_null) + if context._enable_fix_2360274: + element = "NULL" + else: + element = schema_expression( + data_type.element_type, data_type.contains_null + ) return f"to_array({element}) :: {convert_sp_to_sf_type(data_type)}" return "to_array(0)" if isinstance(data_type, MapType): @@ -526,10 +532,13 @@ def schema_expression(data_type: DataType, is_nullable: bool) -> str: assert data_type.key_type is not None and data_type.value_type is not None # Key values can never be null key = schema_expression(data_type.key_type, False) - # Value nullability is variable. Defaults to True - value = schema_expression( - data_type.value_type, data_type.value_contains_null - ) + if context._enable_fix_2360274: + # Value nullability is variable. Defaults to True + value = schema_expression( + data_type.value_type, data_type.value_contains_null + ) + else: + value = "NULL" return f"object_construct_keep_null({key}, {value}) :: {convert_sp_to_sf_type(data_type)}" return "to_object(parse_json('0'))" if isinstance(data_type, StructType): @@ -539,7 +548,9 @@ def schema_expression(data_type: DataType, is_nullable: bool) -> str: # Even if nulls are allowed the cast will fail due to schema mismatch when passed a null field. schema_strings += [ f"'{field.name}'", - schema_expression(field.datatype, is_nullable=False), + "NULL" + if context._enable_fix_2360274 + else schema_expression(field.datatype, is_nullable=False), ] return f"object_construct_keep_null({', '.join(schema_strings)}) :: {convert_sp_to_sf_type(data_type)}" return "to_object(parse_json('{}'))" diff --git a/src/snowflake/snowpark/context.py b/src/snowflake/snowpark/context.py index d3ee14c97e..471bbc86db 100644 --- a/src/snowflake/snowpark/context.py +++ b/src/snowflake/snowpark/context.py @@ -39,6 +39,9 @@ # This is an internal-only global flag, used to determine whether to enable query line tracking for tracing sql compilation errors. _enable_trace_sql_errors_to_dataframe = False +# Global flag for fix 2360274. When enabled schema queries will use NULL as a place holder for any values inside structured objects +_enable_fix_2360274 = False + def configure_development_features( *, diff --git a/tests/integ/scala/test_datatype_suite.py b/tests/integ/scala/test_datatype_suite.py index ccb036f85f..29dc058fa7 100644 --- a/tests/integ/scala/test_datatype_suite.py +++ b/tests/integ/scala/test_datatype_suite.py @@ -9,6 +9,7 @@ import logging import pytest +from unittest import mock import snowflake.snowpark.context as context from snowflake.connector.options import installed_pandas @@ -1763,3 +1764,70 @@ def test_lob_collect_max_size(session, server_side_max_string, type_string, data ) assert df.schema == StructType([StructField("DATA", datatype, nullable=False)]) assert len(df.collect()[0][0]) >= server_side_max_string - 16 + + +@pytest.mark.skipif( + "config.getoption('local_testing_mode', default=False)", + reason="Structured types are not supported in Local Testing", +) +@pytest.mark.parametrize("fix_enabled", [True, False]) +def test_snow_2360274_repro(session, fix_enabled): + if not structured_type_support: + pytest.skip("Test requires structured type support.") + expected_schema = StructType( + [ + StructField("ID", LongType(), nullable=False), + StructField( + "VALS", + ArrayType( + StructType([StructField('"value"', StringType(), nullable=True)]) + ), + nullable=True, + ), + StructField("TAG", StringType(2), nullable=False), + ] + ) + + def inner(): + agged = session.sql( + """ + WITH SRC(ID, VALUE) AS ( + SELECT + $1, + $2 + FROM + VALUES + (1, 'A'), + (1, 'B'), + (2, 'A') + ) + SELECT + ID, + CAST( + ARRAY_AGG(OBJECT_CONSTRUCT('value', VALUE)) AS ARRAY(OBJECT("value" STRING)) + ) AS VALS + FROM + SRC + GROUP BY + ID""" + ) + + reference = session.sql( + """ + SELECT $1 AS ID, $2 AS TAG FROM VALUES (1, 'AB'), (2, 'B') + """ + ) + + joined = agged.join(reference, on=agged.id == reference.id, how="inner").select( + agged.id.alias("ID"), "VALS", "TAG" + ) + Utils.is_schema_same(joined.schema, expected_schema, case_sensitive=False) + + with mock.patch.object(context, "_enable_fix_2360274", fix_enabled): + if fix_enabled: + inner() + else: + with pytest.raises( + SnowparkSQLException, match="Unsupported data type 'STRUCTURED_OBJECT'" + ): + inner() From 89643d4a6a233022c21305e94fef190c46c681a9 Mon Sep 17 00:00:00 2001 From: Jamison Date: Wed, 24 Sep 2025 14:19:11 -0700 Subject: [PATCH 2/7] Update comments/docs/and tests --- CHANGELOG.md | 2 +- src/snowflake/snowpark/context.py | 1 + tests/integ/scala/test_datatype_suite.py | 8 +++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a7eade2c1c..1d0e156f7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ #### Bug Fixes - Fixed a bug that `DataFrame.limit()` fail if there is parameter binding in the executed SQL. -- Fixed a bug in schema query generation that could cause invalid sql to be genrated when using nested structured types. +- Added an experimental fix for a bug in schema query generation that could cause invalid sql to be genrated when using nested structured types. #### New Features diff --git a/src/snowflake/snowpark/context.py b/src/snowflake/snowpark/context.py index 471bbc86db..86e92b6aa4 100644 --- a/src/snowflake/snowpark/context.py +++ b/src/snowflake/snowpark/context.py @@ -39,6 +39,7 @@ # This is an internal-only global flag, used to determine whether to enable query line tracking for tracing sql compilation errors. _enable_trace_sql_errors_to_dataframe = False +# SNOW-2362050: Enable this fix by default. # Global flag for fix 2360274. When enabled schema queries will use NULL as a place holder for any values inside structured objects _enable_fix_2360274 = False diff --git a/tests/integ/scala/test_datatype_suite.py b/tests/integ/scala/test_datatype_suite.py index 29dc058fa7..14574abb8d 100644 --- a/tests/integ/scala/test_datatype_suite.py +++ b/tests/integ/scala/test_datatype_suite.py @@ -1771,7 +1771,9 @@ def test_lob_collect_max_size(session, server_side_max_string, type_string, data reason="Structured types are not supported in Local Testing", ) @pytest.mark.parametrize("fix_enabled", [True, False]) -def test_snow_2360274_repro(session, fix_enabled): +def test_snow_2360274_repro( + structured_type_session, structured_type_support, fix_enabled +): if not structured_type_support: pytest.skip("Test requires structured type support.") expected_schema = StructType( @@ -1789,7 +1791,7 @@ def test_snow_2360274_repro(session, fix_enabled): ) def inner(): - agged = session.sql( + agged = structured_type_session.sql( """ WITH SRC(ID, VALUE) AS ( SELECT @@ -1812,7 +1814,7 @@ def inner(): ID""" ) - reference = session.sql( + reference = structured_type_session.sql( """ SELECT $1 AS ID, $2 AS TAG FROM VALUES (1, 'AB'), (2, 'B') """ From 219dc54ec0cbb01e1526a75d78113cbf496ad401 Mon Sep 17 00:00:00 2001 From: Jamison Date: Wed, 24 Sep 2025 14:41:44 -0700 Subject: [PATCH 3/7] test fix --- tests/integ/scala/test_datatype_suite.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/integ/scala/test_datatype_suite.py b/tests/integ/scala/test_datatype_suite.py index 14574abb8d..2cc84bff8f 100644 --- a/tests/integ/scala/test_datatype_suite.py +++ b/tests/integ/scala/test_datatype_suite.py @@ -1776,13 +1776,18 @@ def test_snow_2360274_repro( ): if not structured_type_support: pytest.skip("Test requires structured type support.") + nested_field_name = ( + "value" if context._should_use_structured_type_semantics() else '"value"' + ) expected_schema = StructType( [ StructField("ID", LongType(), nullable=False), StructField( "VALS", ArrayType( - StructType([StructField('"value"', StringType(), nullable=True)]) + StructType( + [StructField(nested_field_name, StringType(), nullable=True)] + ) ), nullable=True, ), @@ -1792,7 +1797,7 @@ def test_snow_2360274_repro( def inner(): agged = structured_type_session.sql( - """ + f""" WITH SRC(ID, VALUE) AS ( SELECT $1, @@ -1806,7 +1811,7 @@ def inner(): SELECT ID, CAST( - ARRAY_AGG(OBJECT_CONSTRUCT('value', VALUE)) AS ARRAY(OBJECT("value" STRING)) + ARRAY_AGG(OBJECT_CONSTRUCT('value', VALUE)) AS ARRAY(OBJECT({nested_field_name} STRING)) ) AS VALS FROM SRC From 12079975c5a0095328a24892a98ab7964a604584 Mon Sep 17 00:00:00 2001 From: Jamison Date: Wed, 24 Sep 2025 14:44:29 -0700 Subject: [PATCH 4/7] graphite fix --- src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py b/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py index 7ccbf52f4f..b4604df0bb 100644 --- a/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py +++ b/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py @@ -533,12 +533,12 @@ def schema_expression(data_type: DataType, is_nullable: bool) -> str: # Key values can never be null key = schema_expression(data_type.key_type, False) if context._enable_fix_2360274: + value = "NULL" + else: # Value nullability is variable. Defaults to True value = schema_expression( data_type.value_type, data_type.value_contains_null ) - else: - value = "NULL" return f"object_construct_keep_null({key}, {value}) :: {convert_sp_to_sf_type(data_type)}" return "to_object(parse_json('0'))" if isinstance(data_type, StructType): From e0f52398c638b7e236f6a937bd327f9206328c16 Mon Sep 17 00:00:00 2001 From: Jamison Date: Wed, 24 Sep 2025 15:59:14 -0700 Subject: [PATCH 5/7] coverage --- tests/integ/scala/test_datatype_suite.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/integ/scala/test_datatype_suite.py b/tests/integ/scala/test_datatype_suite.py index 2cc84bff8f..4b820eceeb 100644 --- a/tests/integ/scala/test_datatype_suite.py +++ b/tests/integ/scala/test_datatype_suite.py @@ -1783,7 +1783,7 @@ def test_snow_2360274_repro( [ StructField("ID", LongType(), nullable=False), StructField( - "VALS", + "VALS_OBJ", ArrayType( StructType( [StructField(nested_field_name, StringType(), nullable=True)] @@ -1791,6 +1791,16 @@ def test_snow_2360274_repro( ), nullable=True, ), + StructField( + "VALS_MAP", + ArrayType(MapType(StringType(), StringType())), + nullable=True, + ), + StructField( + "VALS_ARR", + ArrayType(ArrayType(StringType())), + nullable=True, + ), StructField("TAG", StringType(2), nullable=False), ] ) @@ -1812,7 +1822,13 @@ def inner(): ID, CAST( ARRAY_AGG(OBJECT_CONSTRUCT('value', VALUE)) AS ARRAY(OBJECT({nested_field_name} STRING)) - ) AS VALS + ) AS VALS_OBJ, + CAST( + ARRAY_AGG(OBJECT_CONSTRUCT('value', VALUE)) AS ARRAY(MAP(STRING, STRING)) + ) AS VALS_MAP, + CAST( + ARRAY_AGG(ARRAY_CONSTRUCT('value', VALUE)) AS ARRAY(ARRAY(STRING)) + ) AS VALS_ARR, FROM SRC GROUP BY @@ -1826,7 +1842,7 @@ def inner(): ) joined = agged.join(reference, on=agged.id == reference.id, how="inner").select( - agged.id.alias("ID"), "VALS", "TAG" + agged.id.alias("ID"), "VALS_OBJ", "VALS_MAP", "VALS_ARR", "TAG" ) Utils.is_schema_same(joined.schema, expected_schema, case_sensitive=False) From 64e1c2e51160af18b2a0296a2c137c5f4f10cd7d Mon Sep 17 00:00:00 2001 From: Jamison Date: Wed, 24 Sep 2025 17:36:32 -0700 Subject: [PATCH 6/7] additional test coverage --- tests/integ/scala/test_datatype_suite.py | 93 ++++++++++++++---------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/tests/integ/scala/test_datatype_suite.py b/tests/integ/scala/test_datatype_suite.py index 4b820eceeb..a16fc888a0 100644 --- a/tests/integ/scala/test_datatype_suite.py +++ b/tests/integ/scala/test_datatype_suite.py @@ -1776,6 +1776,9 @@ def test_snow_2360274_repro( ): if not structured_type_support: pytest.skip("Test requires structured type support.") + + agg_table_name = f"snowpark_2360274_repro_agg_{uuid.uuid4().hex[:5]}".upper() + nested_field_name = ( "value" if context._should_use_structured_type_semantics() else '"value"' ) @@ -1783,22 +1786,24 @@ def test_snow_2360274_repro( [ StructField("ID", LongType(), nullable=False), StructField( - "VALS_OBJ", + "VALS_ARR", ArrayType( StructType( - [StructField(nested_field_name, StringType(), nullable=True)] + [StructField(nested_field_name, StringType(10), nullable=True)] ) ), nullable=True, ), StructField( "VALS_MAP", - ArrayType(MapType(StringType(), StringType())), + MapType(StringType(10), StringType(10)), nullable=True, ), StructField( - "VALS_ARR", - ArrayType(ArrayType(StringType())), + "VALS_OBJ", + StructType( + [StructField(nested_field_name, StringType(10), nullable=True)] + ), nullable=True, ), StructField("TAG", StringType(2), nullable=False), @@ -1806,34 +1811,42 @@ def test_snow_2360274_repro( ) def inner(): - agged = structured_type_session.sql( + structured_type_session.sql( f""" - WITH SRC(ID, VALUE) AS ( + CREATE + OR REPLACE TABLE {agg_table_name} ( + ID INT NOT NULL, + VALS_ARR ARRAY(OBJECT({nested_field_name} STRING(10))) NOT NULL, + VALS_MAP MAP(STRING(10), STRING(10)) NOT NULL, + VALS_OBJ OBJECT({nested_field_name} STRING(10)) NOT NULL + ) AS WITH SRC(ID, VALUE) AS ( + SELECT + $1, + $2 + FROM + VALUES + (1, 'A'), + (1, 'B'), + (2, 'A') + ) SELECT - $1, - $2 + ID, + CAST( + ARRAY_AGG(OBJECT_CONSTRUCT('value', VALUE)) AS ARRAY(OBJECT({nested_field_name} STRING)) + ) AS VALS_ARR, + CAST( + OBJECT_CONSTRUCT('value', VALUE) AS MAP(STRING, STRING) + ) AS VALS_MAP, + CAST( + OBJECT_CONSTRUCT('value', VALUE) AS OBJECT({nested_field_name} STRING) + ) AS VALS_OBJ, FROM - VALUES - (1, 'A'), - (1, 'B'), - (2, 'A') - ) - SELECT - ID, - CAST( - ARRAY_AGG(OBJECT_CONSTRUCT('value', VALUE)) AS ARRAY(OBJECT({nested_field_name} STRING)) - ) AS VALS_OBJ, - CAST( - ARRAY_AGG(OBJECT_CONSTRUCT('value', VALUE)) AS ARRAY(MAP(STRING, STRING)) - ) AS VALS_MAP, - CAST( - ARRAY_AGG(ARRAY_CONSTRUCT('value', VALUE)) AS ARRAY(ARRAY(STRING)) - ) AS VALS_ARR, - FROM - SRC - GROUP BY - ID""" - ) + SRC + GROUP BY + ID, VALS_MAP, VALS_OBJ""" + ).collect() + + agged = structured_type_session.table(agg_table_name) reference = structured_type_session.sql( """ @@ -1842,15 +1855,19 @@ def inner(): ) joined = agged.join(reference, on=agged.id == reference.id, how="inner").select( - agged.id.alias("ID"), "VALS_OBJ", "VALS_MAP", "VALS_ARR", "TAG" + agged.id.alias("ID"), "VALS_ARR", "VALS_MAP", "VALS_OBJ", "TAG" ) Utils.is_schema_same(joined.schema, expected_schema, case_sensitive=False) - with mock.patch.object(context, "_enable_fix_2360274", fix_enabled): - if fix_enabled: - inner() - else: - with pytest.raises( - SnowparkSQLException, match="Unsupported data type 'STRUCTURED_OBJECT'" - ): + try: + with mock.patch.object(context, "_enable_fix_2360274", fix_enabled): + if fix_enabled: inner() + else: + with pytest.raises( + SnowparkSQLException, + match="Unsupported data type 'STRUCTURED_OBJECT'", + ): + inner() + finally: + Utils.drop_table(structured_type_session, agg_table_name) From 3ad0371ecaf1692c839c984c8cc2eead60d41ce8 Mon Sep 17 00:00:00 2001 From: Jamison Date: Thu, 25 Sep 2025 00:09:56 -0700 Subject: [PATCH 7/7] skip test --- tests/integ/test_stored_procedure.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integ/test_stored_procedure.py b/tests/integ/test_stored_procedure.py index 082e6cf45d..3ea7b0b458 100644 --- a/tests/integ/test_stored_procedure.py +++ b/tests/integ/test_stored_procedure.py @@ -2291,6 +2291,7 @@ def artifact_repo_test(_): @pytest.mark.skipif( sys.version_info < (3, 9), reason="artifact repository requires Python 3.9+" ) +@pytest.mark.skip("SNOW-2362946: Skip until root cause is found.") def test_sproc_artifact_repository_from_file(session, tmpdir): source = dedent( """