From 1a69ff1e7a6016da09a205257f7cf96ba1ed9902 Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Fri, 26 Sep 2025 16:34:35 -0600 Subject: [PATCH 1/5] Add support for semi-structured and structured data functions ( part 1) --- CHANGELOG.md | 187 ++++++++++-------- docs/source/snowpark/functions.rst | 4 + .../snowpark/_functions/scalar_functions.py | 126 ++++++++++++ 3 files changed, 234 insertions(+), 83 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbefc6fd34..7921b5b9c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,63 +19,80 @@ - `get_cloud_provider_token` - Added support for the following scalar functions in `functions.py`: - - `array_remove_at` - - `as_boolean` - - `booland` - - `boolnot` - - `boolor` - - `boolor_agg` - - `boolxor` - - `chr` - - `decode` - - `div0null` - - `dp_interval_high` - - `dp_interval_low` - - `greatest_ignore_nulls` - - `h3_cell_to_boundary` - - `h3_cell_to_children` - - `h3_cell_to_children_string` - - `h3_cell_to_parent` - - `h3_cell_to_point` - - `h3_compact_cells` - - `h3_compact_cells_strings` - - `h3_coverage` - - `h3_coverage_strings` - - `h3_get_resolution` - - `h3_grid_disk` - - `h3_grid_distance` - - `h3_int_to_string` - - `h3_polygon_to_cells` - - `h3_polygon_to_cells_strings` - - `h3_string_to_int` - - `h3_try_grid_path` - - `h3_try_polygon_to_cells` - - `h3_try_polygon_to_cells_strings` - - `h3_uncompact_cells` - - `h3_uncompact_cells_strings` - - `haversine` - - `h3_grid_path` - - `h3_is_pentagon` - - `h3_is_valid_cell` - - `h3_latlng_to_cell` - - `h3_latlng_to_cell_string` - - `h3_point_to_cell` - - `h3_point_to_cell_string` - - `h3_try_coverage` - - `h3_try_coverage_strings` - - `h3_try_grid_distance` - - `hex_decode_binary` - - `last_query_id` - - `last_transaction` - - `least_ignore_nulls` - - `nullif` - - `nvl2` - - `regr_valx` - - `st_area` - - `st_asewkb` - - `st_asewkt` - - `st_asgeojson` - - `st_aswkb` + - Conditional expression functions: + - `booland` + - `boolnot` + - `boolor` + - `boolxor` + - `boolor_agg` + - `decode` + - `greatest_ignore_nulls` + - `least_ignore_nulls` + - `nullif` + - `nvl2` + - `regr_valx` + + - Semi-structured and structured date functions: + - `array_remove_at` + - `as_boolean` + - `map_delete` + - `_map_insert` + - `map_pick` + - `map_size` + + - String & binary functions + - `chr` + - `hex_decode_binary` + + - Numeric functions: + - `div0null` + + - Differential privacy functions: + - `dp_interval_high` + - `dp_interval_low` + + - Context functions: + - `last_query_id` + - `last_transaction` + + - Geospatial functoins: + - `h3_cell_to_boundary` + - `h3_cell_to_children` + - `h3_cell_to_children_string` + - `h3_cell_to_parent` + - `h3_cell_to_point` + - `h3_compact_cells` + - `h3_compact_cells_strings` + - `h3_coverage` + - `h3_coverage_strings` + - `h3_get_resolution` + - `h3_grid_disk` + - `h3_grid_distance` + - `h3_int_to_string` + - `h3_polygon_to_cells` + - `h3_polygon_to_cells_strings` + - `h3_string_to_int` + - `h3_try_grid_path` + - `h3_try_polygon_to_cells` + - `h3_try_polygon_to_cells_strings` + - `h3_uncompact_cells` + - `h3_uncompact_cells_strings` + - `haversine` + - `h3_grid_path` + - `h3_is_pentagon` + - `h3_is_valid_cell` + - `h3_latlng_to_cell` + - `h3_latlng_to_cell_string` + - `h3_point_to_cell` + - `h3_point_to_cell_string` + - `h3_try_coverage` + - `h3_try_coverage_strings` + - `h3_try_grid_distance` + - `st_area` + - `st_asewkb` + - `st_asewkt` + - `st_asgeojson` + - `st_aswkb` #### Bug Fixes @@ -137,16 +154,17 @@ - Added support for parameter `use_vectorized_scanner` in function `Session.write_pandas()`. - Added support for parameter `session_init_statement` in udtf ingestion of `DataFrameReader.jdbc`(PrPr). - Added support for the following scalar functions in `functions.py`: - - `getdate` - - `getvariable` - - `invoker_role` - - `invoker_share` - - `is_application_role_in_session` - - `is_database_role_in_session` - - `is_granted_to_invoker_role` - - `is_role_in_session` - - `localtime` - - `systimestamp` + - Context functions: + - `getdate` + - `getvariable` + - `invoker_role` + - `invoker_share` + - `is_application_role_in_session` + - `is_database_role_in_session` + - `is_granted_to_invoker_role` + - `is_role_in_session` + - `localtime` + - `systimestamp` #### Bug Fixes @@ -221,22 +239,25 @@ - Added support for `DataFrameReader.jdbc`(PrPr) that allows ingesting external data source with jdbc driver. - Added support for `FileOperation.copy_files` to copy files from a source location to an output stage. - Added support for the following scalar functions in `functions.py`: - - `all_user_names` - - `bitand` - - `bitand_agg` - - `bitor` - - `bitor_agg` - - `bitxor` - - `bitxor_agg` - - `current_account_name` - - `current_client` - - `current_ip_address` - - `current_role_type` - - `current_organization_name` - - `current_organization_user` - - `current_secondary_roles` - - `current_transaction` - - `getbit` + - Context functions: + - `all_user_names` + - `current_account_name` + - `current_client` + - `current_ip_address` + - `current_role_type` + - `current_organization_name` + - `current_organization_user` + - `current_secondary_roles` + - `current_transaction` + + - Bitwise expression functions: + - `bitand` + - `bitand_agg` + - `bitor` + - `bitor_agg` + - `bitxor` + - `bitxor_agg` + - `getbit` #### Bug Fixes diff --git a/docs/source/snowpark/functions.rst b/docs/source/snowpark/functions.rst index f57e439cfc..8bfd35d68d 100644 --- a/docs/source/snowpark/functions.rst +++ b/docs/source/snowpark/functions.rst @@ -336,6 +336,10 @@ Functions map_concat map_contains_key map_keys + map_delete + map_insert + map_insert + map_size max max_by md5 diff --git a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index bc17f4a1c2..d3d5f14c8a 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -2064,3 +2064,129 @@ def h3_try_grid_distance( cell_id_1 = _to_col_if_str(cell_id_1, "h3_try_grid_distance") cell_id_2 = _to_col_if_str(cell_id_2, "h3_try_grid_distance") return builtin("h3_try_grid_distance", _emit_ast=_emit_ast)(cell_id_1, cell_id_2) + + +@publicapi +def map_delete( + map_col: ColumnOrName, + key1: ColumnOrName, + *keys: ColumnOrName, + _emit_ast: bool = True +) -> Column: + """Returns a map consisting of the input map with one or more keys removed. + + Args: + map_col (ColumnOrName): The map used to remove keys. + key1 (ColumnOrName): The first key to remove. + *key (ColumnOrName): Additional keys to remove. + + Returns: + Column: A map with the specified keys removed. + + Example:: + + >>> from snowflake.snowpark.functions import col, lit, to_variant + >>> df = session.sql(\""" + ... SELECT {'a':1,'b':2,'c':3}::MAP(VARCHAR,NUMBER) as map_col + ... union all + ... SELECT {'c':3,'d':4,'e':5}::MAP(VARCHAR,NUMBER) as map_col + ... \""") + >>> df.select(to_variant(map_delete(col("map_col"), lit("c"), lit("d"))).alias("result")).collect() + [Row(RESULT='{\\n "a": 1,\\n "b": 2\\n}'), Row(RESULT='{\\n "e": 5\\n}')] + + """ + m = _to_col_if_str(map_col, "map_delete") + k1 = _to_col_if_str(key1, "map_delete") + ks = [_to_col_if_str(k, "map_delete") for k in keys] + return builtin("map_delete", _emit_ast=_emit_ast)(m, k1, *ks) + + +@publicapi +def map_insert( + map_col: ColumnOrName, + key: ColumnOrName, + value: ColumnOrName, + update_flag: Optional[ColumnOrName] = None, + _emit_ast: bool = True, +) -> Column: + """ + Returns a map containing all key-value pairs from the source map as well as the new key-value pair. + If the key already exists in the map, the value is updated with the new value unless update_flag is False. + + Args: + map_col (ColumnOrName): Column containing the source map + key (ColumnOrName): Column containing the key to insert or update + value (ColumnOrName): Column containing the value to associate with the key + update_flag (Optional[ColumnOrName]): Column containing a boolean flag indicating whether to update existing keys. If None or True, existing keys are updated. If False, existing keys are not updated. + + Returns: + Column: A new map with the key-value pair inserted or updated + + Examples: + >>> from snowflake.snowpark.functions import lit, to_variant, col + >>> df = session.sql("SELECT {'a': 1, 'b': 2}::MAP(VARCHAR, NUMBER) as MAP_COL") + >>> df.select(to_variant(map_insert(col("MAP_COL"), lit("c"), lit(3))).alias("RESULT")).collect() + [Row(RESULT='{\\n "a": 1,\\n "b": 2,\\n "c": 3\\n}')] + """ + m = _to_col_if_str(map_col, "map_insert") + k = _to_col_if_str(key, "map_insert") + v = _to_col_if_str(value, "map_insert") + uf = _to_col_if_str(update_flag, "map_insert") if update_flag is not None else None + if uf is not None: + return builtin("map_insert", _emit_ast=_emit_ast)(m, k, v, uf) + else: + return builtin("map_insert", _emit_ast=_emit_ast)(m, k, v) + + +@publicapi +def map_pick( + map_col: ColumnOrName, + key1: ColumnOrName, + *keys: ColumnOrName, + _emit_ast: bool = True +) -> Column: + """ + Returns a new map containing some of the key-value pairs from an existing map. + + To identify the key-value pairs to include in the new map, pass in the keys as arguments. + If a specified key is not present in the input map, the key is ignored. + + Args: + map_col (ColumnOrName): The map column to pick from + key1 (ColumnOrName): The first key to pick + *keys (ColumnOrName): Additional keys to pick + + Returns: + Column: A new map containing the selected key-value pairs + + Examples: + >>> from snowflake.snowpark.functions import lit, to_variant, col + >>> df = session.sql("SELECT {'a':1,'b':2,'c':3}::MAP(VARCHAR,NUMBER) as map_col") + >>> df.select(to_variant(map_pick(df["map_col"], lit("a"), lit("b"))).alias("result")).collect() + [Row(RESULT='{\\n "a": 1,\\n "b": 2\\n}')] + """ + m = _to_col_if_str(map_col, "map_pick") + k1 = _to_col_if_str(key1, "map_pick") + ks = [_to_col_if_str(k, "map_pick") for k in keys] + return builtin("map_pick", _emit_ast=_emit_ast)(m, k1, *ks) + + +@publicapi +def map_size(map_col: ColumnOrName, _emit_ast: bool = True) -> Column: + """ + Returns the size of the input MAP. Returns None if the input column is not a MAP type. + + Args: + map_col (ColumnOrName): The map values. + + Returns: + Column: The size of the map. + + Examples: + >>> from snowflake.snowpark.functions import col + >>> df = session.sql("SELECT {'a': 1, 'b': 2}::MAP(VARCHAR, NUMBER) as MAP_COL") + >>> df.select(map_size(col("MAP_COL")).alias("MAP_SIZE")).collect() + [Row(MAP_SIZE=2)] + """ + c = _to_col_if_str(map_col, "map_size") + return builtin("map_size", _emit_ast=_emit_ast)(c) From 6180373f39ece1cdc16708f78680610cfb8ff27a Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Fri, 26 Sep 2025 16:38:46 -0600 Subject: [PATCH 2/5] Fix changelog issue --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7921b5b9c0..ef60a8e52a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,7 +36,7 @@ - `array_remove_at` - `as_boolean` - `map_delete` - - `_map_insert` + - `map_insert` - `map_pick` - `map_size` From c5bb9206f93eb44f715b7dea3df23643f5760a3c Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Fri, 26 Sep 2025 16:39:29 -0600 Subject: [PATCH 3/5] Fix changelog issue --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef60a8e52a..64ba633da3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,7 +40,7 @@ - `map_pick` - `map_size` - - String & binary functions + - String & binary functions: - `chr` - `hex_decode_binary` @@ -55,7 +55,7 @@ - `last_query_id` - `last_transaction` - - Geospatial functoins: + - Geospatial functions: - `h3_cell_to_boundary` - `h3_cell_to_children` - `h3_cell_to_children_string` From 6ca2861790beefa3a67cfe584480ac24fca43217 Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Fri, 26 Sep 2025 16:41:15 -0600 Subject: [PATCH 4/5] Update docs/source/snowpark/functions.rst Co-authored-by: Jamison Rose --- docs/source/snowpark/functions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/snowpark/functions.rst b/docs/source/snowpark/functions.rst index 8bfd35d68d..c09c1304eb 100644 --- a/docs/source/snowpark/functions.rst +++ b/docs/source/snowpark/functions.rst @@ -338,7 +338,7 @@ Functions map_keys map_delete map_insert - map_insert + map_pick map_size max max_by From d66a72cc281d0db8113cbe6cc1fc28fcf53f60a6 Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Mon, 29 Sep 2025 11:20:37 -0600 Subject: [PATCH 5/5] Fix comments --- CHANGELOG.md | 56 +++++++++---------- .../snowpark/_functions/scalar_functions.py | 40 ++++++------- 2 files changed, 47 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d7f2198b8..097bb41af1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -155,17 +155,16 @@ - Added support for parameter `use_vectorized_scanner` in function `Session.write_pandas()`. - Added support for parameter `session_init_statement` in udtf ingestion of `DataFrameReader.jdbc`(PrPr). - Added support for the following scalar functions in `functions.py`: - - Context functions: - - `getdate` - - `getvariable` - - `invoker_role` - - `invoker_share` - - `is_application_role_in_session` - - `is_database_role_in_session` - - `is_granted_to_invoker_role` - - `is_role_in_session` - - `localtime` - - `systimestamp` + - `getdate` + - `getvariable` + - `invoker_role` + - `invoker_share` + - `is_application_role_in_session` + - `is_database_role_in_session` + - `is_granted_to_invoker_role` + - `is_role_in_session` + - `localtime` + - `systimestamp` #### Bug Fixes @@ -240,25 +239,22 @@ - Added support for `DataFrameReader.jdbc`(PrPr) that allows ingesting external data source with jdbc driver. - Added support for `FileOperation.copy_files` to copy files from a source location to an output stage. - Added support for the following scalar functions in `functions.py`: - - Context functions: - - `all_user_names` - - `current_account_name` - - `current_client` - - `current_ip_address` - - `current_role_type` - - `current_organization_name` - - `current_organization_user` - - `current_secondary_roles` - - `current_transaction` - - - Bitwise expression functions: - - `bitand` - - `bitand_agg` - - `bitor` - - `bitor_agg` - - `bitxor` - - `bitxor_agg` - - `getbit` + - `all_user_names` + - `bitand` + - `bitand_agg` + - `bitor` + - `bitor_agg` + - `bitxor` + - `bitxor_agg` + - `current_account_name` + - `current_client` + - `current_ip_address` + - `current_role_type` + - `current_organization_name` + - `current_organization_user` + - `current_secondary_roles` + - `current_transaction` + - `getbit` #### Bug Fixes diff --git a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index d3d5f14c8a..89aa2874d0 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -2068,17 +2068,13 @@ def h3_try_grid_distance( @publicapi def map_delete( - map_col: ColumnOrName, - key1: ColumnOrName, - *keys: ColumnOrName, - _emit_ast: bool = True + map_col: ColumnOrName, *keys: ColumnOrName, _emit_ast: bool = True ) -> Column: """Returns a map consisting of the input map with one or more keys removed. Args: map_col (ColumnOrName): The map used to remove keys. - key1 (ColumnOrName): The first key to remove. - *key (ColumnOrName): Additional keys to remove. + *keys (ColumnOrName): Keys to remove. Returns: Column: A map with the specified keys removed. @@ -2096,9 +2092,8 @@ def map_delete( """ m = _to_col_if_str(map_col, "map_delete") - k1 = _to_col_if_str(key1, "map_delete") ks = [_to_col_if_str(k, "map_delete") for k in keys] - return builtin("map_delete", _emit_ast=_emit_ast)(m, k1, *ks) + return builtin("map_delete", _emit_ast=_emit_ast)(m, *ks) @publicapi @@ -2114,10 +2109,10 @@ def map_insert( If the key already exists in the map, the value is updated with the new value unless update_flag is False. Args: - map_col (ColumnOrName): Column containing the source map - key (ColumnOrName): Column containing the key to insert or update - value (ColumnOrName): Column containing the value to associate with the key - update_flag (Optional[ColumnOrName]): Column containing a boolean flag indicating whether to update existing keys. If None or True, existing keys are updated. If False, existing keys are not updated. + map_col (ColumnOrName): The source map + key (ColumnOrName): The key to insert or update + value (ColumnOrName): The value to associate with the key + update_flag (Optional[ColumnOrName]): A boolean flag indicating whether to update existing keys. If None or True, existing keys are updated. If False, existing keys are not updated. Returns: Column: A new map with the key-value pair inserted or updated @@ -2127,6 +2122,12 @@ def map_insert( >>> df = session.sql("SELECT {'a': 1, 'b': 2}::MAP(VARCHAR, NUMBER) as MAP_COL") >>> df.select(to_variant(map_insert(col("MAP_COL"), lit("c"), lit(3))).alias("RESULT")).collect() [Row(RESULT='{\\n "a": 1,\\n "b": 2,\\n "c": 3\\n}')] + + # Example using update flag + >>> from snowflake.snowpark.functions import lit, to_variant, col + >>> df = session.sql("SELECT {'a': 1, 'b': 2}::MAP(VARCHAR, NUMBER) as MAP_COL") + >>> df.select(to_variant(map_insert(col("MAP_COL"), lit("a"), lit(20), lit(True))).alias("RESULT")).collect() + [Row(RESULT='{\\n "a": 20,\\n "b": 2\\n}')] """ m = _to_col_if_str(map_col, "map_insert") k = _to_col_if_str(key, "map_insert") @@ -2140,10 +2141,7 @@ def map_insert( @publicapi def map_pick( - map_col: ColumnOrName, - key1: ColumnOrName, - *keys: ColumnOrName, - _emit_ast: bool = True + map_col: ColumnOrName, *keys: ColumnOrName, _emit_ast: bool = True ) -> Column: """ Returns a new map containing some of the key-value pairs from an existing map. @@ -2153,7 +2151,6 @@ def map_pick( Args: map_col (ColumnOrName): The map column to pick from - key1 (ColumnOrName): The first key to pick *keys (ColumnOrName): Additional keys to pick Returns: @@ -2164,11 +2161,16 @@ def map_pick( >>> df = session.sql("SELECT {'a':1,'b':2,'c':3}::MAP(VARCHAR,NUMBER) as map_col") >>> df.select(to_variant(map_pick(df["map_col"], lit("a"), lit("b"))).alias("result")).collect() [Row(RESULT='{\\n "a": 1,\\n "b": 2\\n}')] + + # Examlpe sending an array of keys + >>> from snowflake.snowpark.functions import map_pick, to_variant, col + >>> df = session.sql("SELECT {'a':1,'b':2,'c':3}::MAP(VARCHAR,NUMBER) as map_col, ARRAY_CONSTRUCT('a','b') as keys_arr") + >>> df.select(to_variant(map_pick(col("map_col"), col("keys_arr"))).alias("RESULT")).collect() + [Row(RESULT='{\\n "a": 1,\\n "b": 2\\n}')] """ m = _to_col_if_str(map_col, "map_pick") - k1 = _to_col_if_str(key1, "map_pick") ks = [_to_col_if_str(k, "map_pick") for k in keys] - return builtin("map_pick", _emit_ast=_emit_ast)(m, k1, *ks) + return builtin("map_pick", _emit_ast=_emit_ast)(m, *ks) @publicapi