fix: ensure numpy version matches in remote_function deployment (#798)

tswast · web-flow · commit 324d93cb3119 · 2024-06-18T18:13:05.000-05:00
* fix: ensure numpy version matches in `remote_function` deployment

* add comment linking to the bug

* restore noxfile

* restore flaky decoration

* skip tests that reuse functions with stale packages
diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py
@@ -39,6 +39,7 @@
 import warnings
 
 import ibis
+import numpy
 import pandas
 import pyarrow
 import requests
@@ -280,6 +281,9 @@ def generate_cloud_function_code(
         if is_row_processor:
             # bigframes remote function will send an entire row of data as json,
             # which would be converted to a pandas series and processed
+            # Ensure numpy versions match to avoid unpickling problems. See
+            # internal issue b/347934471.
+            requirements.append(f"numpy=={numpy.__version__}")
             requirements.append(f"pandas=={pandas.__version__}")
             requirements.append(f"pyarrow=={pyarrow.__version__}")
         if package_requirements:
diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py
@@ -742,109 +742,6 @@ def test_read_gbq_function_enforces_explicit_types(
         )
 
 
-@pytest.mark.flaky(retries=2, delay=120)
-def test_df_apply_axis_1(session, scalars_dfs):
-    columns = [
-        "bool_col",
-        "int64_col",
-        "int64_too",
-        "float64_col",
-        "string_col",
-        "bytes_col",
-    ]
-    scalars_df, scalars_pandas_df = scalars_dfs
-
-    def add_ints(row):
-        return row["int64_col"] + row["int64_too"]
-
-    with pytest.warns(
-        bigframes.exceptions.PreviewWarning,
-        match="input_types=Series is in preview.",
-    ):
-        add_ints_remote = session.remote_function(
-            bigframes.series.Series,
-            int,
-        )(add_ints)
-
-    with pytest.warns(
-        bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview."
-    ):
-        bf_result = scalars_df[columns].apply(add_ints_remote, axis=1).to_pandas()
-
-    pd_result = scalars_pandas_df[columns].apply(add_ints, axis=1)
-
-    # bf_result.dtype is 'Int64' while pd_result.dtype is 'object', ignore this
-    # mismatch by using check_dtype=False.
-    #
-    # bf_result.to_numpy() produces an array of numpy.float64's
-    # (in system_prerelease tests), while pd_result.to_numpy() produces an
-    # array of ints, ignore this mismatch by using check_exact=False.
-    pd.testing.assert_series_equal(
-        pd_result, bf_result, check_dtype=False, check_exact=False
-    )
-
-
-@pytest.mark.flaky(retries=2, delay=120)
-def test_df_apply_axis_1_ordering(session, scalars_dfs):
-    columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"]
-    ordering_columns = ["bool_col", "int64_col"]
-    scalars_df, scalars_pandas_df = scalars_dfs
-
-    def add_ints(row):
-        return row["int64_col"] + row["int64_too"]
-
-    add_ints_remote = session.remote_function(bigframes.series.Series, int)(add_ints)
-
-    bf_result = (
-        scalars_df[columns]
-        .sort_values(ordering_columns)
-        .apply(add_ints_remote, axis=1)
-        .to_pandas()
-    )
-    pd_result = (
-        scalars_pandas_df[columns].sort_values(ordering_columns).apply(add_ints, axis=1)
-    )
-
-    # bf_result.dtype is 'Int64' while pd_result.dtype is 'object', ignore this
-    # mismatch by using check_dtype=False.
-    #
-    # bf_result.to_numpy() produces an array of numpy.float64's
-    # (in system_prerelease tests), while pd_result.to_numpy() produces an
-    # array of ints, ignore this mismatch by using check_exact=False.
-    pd.testing.assert_series_equal(
-        pd_result, bf_result, check_dtype=False, check_exact=False
-    )
-
-
-@pytest.mark.flaky(retries=2, delay=120)
-def test_df_apply_axis_1_multiindex(session):
-    pd_df = pd.DataFrame(
-        {"x": [1, 2, 3], "y": [1.5, 3.75, 5], "z": ["pq", "rs", "tu"]},
-        index=pd.MultiIndex.from_tuples([("a", 100), ("a", 200), ("b", 300)]),
-    )
-    bf_df = session.read_pandas(pd_df)
-
-    def add_numbers(row):
-        return row["x"] + row["y"]
-
-    add_numbers_remote = session.remote_function(bigframes.series.Series, float)(
-        add_numbers
-    )
-
-    bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas()
-    pd_result = pd_df.apply(add_numbers, axis=1)
-
-    # bf_result.dtype is 'Float64' while pd_result.dtype is 'float64', ignore this
-    # mismatch by using check_dtype=False.
-    #
-    # bf_result.index[0].dtype is 'string[pyarrow]' while
-    # pd_result.index[0].dtype is 'object', ignore this mismatch by using
-    # check_index_type=False.
-    pd.testing.assert_series_equal(
-        pd_result, bf_result, check_dtype=False, check_index_type=False
-    )
-
-
 def test_df_apply_axis_1_unsupported_callable(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"]