Merge branch 'main' into test-enable-assorted-tests

chalmerlowe · web-flow · commit 314a2b309a2f · 2026-04-10T12:14:14.000-04:00
diff --git a/packages/bigframes/noxfile.py b/packages/bigframes/noxfile.py
@@ -119,6 +119,7 @@
     # from GitHub actions.
     "unit_noextras",
     "system-3.10",  # No extras.
+    "system-3.12",  # No extras.
     f"system-{DEFAULT_PYTHON_VERSION}",  # All extras.
     "cover",
     # TODO(b/401609005): remove
@@ -364,6 +365,7 @@ def run_system(
     )
 
 
+
 @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
 @nox.parametrize("test_extra", [True, False])
 def system(session: nox.sessions.Session, test_extra):
diff --git a/packages/bigframes/tests/system/small/ml/test_cluster.py b/packages/bigframes/tests/system/small/ml/test_cluster.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
 import pandas as pd
 
 import bigframes.pandas as bpd
@@ -141,6 +142,26 @@ def test_kmeans_cluster_centers(penguins_kmeans_model: cluster.KMeans):
         .sort_values(["centroid_id", "feature"])
         .reset_index(drop=True)
     )
+
+    # FIX: Helper to ignore row order inside categorical_value lists
+    # and sign flipping of values inside numerical_value list.
+    # This prevents the test from failing if BQML returns [MALE, FEMALE] instead of [FEMALE, MALE]
+    # or 0.197 versus -0.197.
+    def sort_and_abs_categorical(val):
+        # Accept BOTH python lists AND numpy arrays
+        if isinstance(val, (list, np.ndarray)) and len(val) > 0:
+            # Take abs of value first, then sort
+            processed = [
+                {"category": x["category"], "value": abs(x["value"])} for x in val
+            ]
+            return sorted(processed, key=lambda x: x["category"])
+        return val
+
+    result["numerical_value"] = result["numerical_value"].abs()
+    result["categorical_value"] = result["categorical_value"].apply(
+        sort_and_abs_categorical
+    )
+
     expected = (
         pd.DataFrame(
             {
@@ -198,11 +219,18 @@ def test_kmeans_cluster_centers(penguins_kmeans_model: cluster.KMeans):
         .sort_values(["centroid_id", "feature"])
         .reset_index(drop=True)
     )
+
+    # Sort and sign flip expected values to match the output of the model.
+    expected["numerical_value"] = expected["numerical_value"].abs()
+    expected["categorical_value"] = expected["categorical_value"].apply(
+        sort_and_abs_categorical
+    )
+
     pd.testing.assert_frame_equal(
         result,
         expected,
         check_exact=False,
-        rtol=0.1,
+        rtol=0.1,  # Keep or slightly increase if numerical drift persists
         # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame
         check_index_type=False,
         check_dtype=False,
diff --git a/packages/bigframes/tests/system/small/ml/test_core.py b/packages/bigframes/tests/system/small/ml/test_core.py
@@ -15,6 +15,7 @@
 import typing
 from datetime import datetime
 
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
@@ -78,6 +79,16 @@ def test_model_eval_with_data(penguins_bqml_linear_model, penguins_df_default_in
 
 def test_model_centroids(penguins_bqml_kmeans_model: core.BqmlModel):
     result = penguins_bqml_kmeans_model.centroids().to_pandas()
+
+    # FIX: Helper to ignore row order inside categorical_value lists
+    # This prevents the test from failing if BQML returns [MALE, FEMALE] instead of [FEMALE, MALE]
+    def sort_categorical(val):
+        if isinstance(val, (list, np.ndarray)) and len(val) > 0:
+            return sorted(val, key=lambda x: x["category"])
+        return val
+
+    result["categorical_value"] = result["categorical_value"].apply(sort_categorical)
+
     expected = (
         pd.DataFrame(
             {
@@ -135,6 +146,12 @@ def test_model_centroids(penguins_bqml_kmeans_model: core.BqmlModel):
         .sort_values(["centroid_id", "feature"])
         .reset_index(drop=True)
     )
+
+    # Sort expected values to match the output of the model.
+    expected["categorical_value"] = expected["categorical_value"].apply(
+        sort_categorical
+    )
+
     pd.testing.assert_frame_equal(
         result,
         expected,
@@ -152,6 +169,26 @@ def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel)
 
     # result is too long, only check the first principal component here.
     result = result.head(7)
+
+    # FIX: Helper to ignore row order inside categorical_value lists
+    # and sign flipping of values inside numerical_value list.
+    # This prevents the test from failing if BQML returns [MALE, FEMALE] instead of [FEMALE, MALE]
+    # or 0.197 versus -0.197.
+    def sort_and_abs_categorical(val):
+        # Accept BOTH python lists AND numpy arrays
+        if isinstance(val, (list, np.ndarray)) and len(val) > 0:
+            # Take abs of value first, then sort
+            processed = [
+                {"category": x["category"], "value": abs(x["value"])} for x in val
+            ]
+            return sorted(processed, key=lambda x: x["category"])
+        return val
+
+    result["numerical_value"] = result["numerical_value"].abs()
+    result["categorical_value"] = result["categorical_value"].apply(
+        sort_and_abs_categorical
+    )
+
     expected = (
         pd.DataFrame(
             {
@@ -211,6 +248,12 @@ def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel)
         .reset_index(drop=True)
     )
 
+    # Sort and sign flip expected values to match the output of the model.
+    expected["numerical_value"] = expected["numerical_value"].abs()
+    expected["categorical_value"] = expected["categorical_value"].apply(
+        sort_and_abs_categorical
+    )
+
     utils.assert_pandas_df_equal_pca_components(
         result,
         expected,
diff --git a/packages/bigframes/tests/system/small/ml/test_decomposition.py b/packages/bigframes/tests/system/small/ml/test_decomposition.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
 import pandas as pd
 
 import bigframes.pandas as bpd
@@ -34,7 +35,7 @@ def test_pca_predict(
     )
 
     bigframes.testing.utils.assert_pandas_df_equal_pca(
-        predictions, expected, check_exact=False, rtol=0.1
+        predictions, expected, check_exact=False, rtol=0.2
     )
 
 
@@ -55,7 +56,7 @@ def test_pca_detect_anomalies(
         expected,
         check_exact=False,
         check_dtype=False,
-        rtol=0.1,
+        rtol=0.2,
     )
 
 
@@ -78,7 +79,7 @@ def test_pca_detect_anomalies_params(
         expected,
         check_exact=False,
         check_dtype=False,
-        rtol=0.1,
+        rtol=0.2,
     )
 
 
@@ -92,7 +93,7 @@ def test_pca_score(penguins_pca_model: decomposition.PCA):
         result,
         expected,
         check_exact=False,
-        rtol=0.1,
+        rtol=0.2,
         check_index_type=False,
     )
 
@@ -102,6 +103,26 @@ def test_pca_components_(penguins_pca_model: decomposition.PCA):
 
     # result is too long, only check the first principal component here.
     result = result.head(7)
+
+    # FIX: Helper to ignore row order inside categorical_value lists
+    # and sign flipping of values inside numerical_value list.
+    # This prevents the test from failing if BQML returns [MALE, FEMALE] instead of [FEMALE, MALE]
+    # or 0.197 versus -0.197.
+    def sort_and_abs_categorical(val):
+        # Accept BOTH python lists AND numpy arrays
+        if isinstance(val, (list, np.ndarray)) and len(val) > 0:
+            # Take abs of value first, then sort
+            processed = [
+                {"category": x["category"], "value": abs(x["value"])} for x in val
+            ]
+            return sorted(processed, key=lambda x: x["category"])
+        return val
+
+    result["numerical_value"] = result["numerical_value"].abs()
+    result["categorical_value"] = result["categorical_value"].apply(
+        sort_and_abs_categorical
+    )
+
     expected = (
         pd.DataFrame(
             {
@@ -161,11 +182,17 @@ def test_pca_components_(penguins_pca_model: decomposition.PCA):
         .reset_index(drop=True)
     )
 
+    # Sort and sign flip expected values to match the output of the model.
+    expected["numerical_value"] = expected["numerical_value"].abs()
+    expected["categorical_value"] = expected["categorical_value"].apply(
+        sort_and_abs_categorical
+    )
+
     bigframes.testing.utils.assert_pandas_df_equal_pca_components(
         result,
         expected,
         check_exact=False,
-        rtol=0.1,
+        rtol=0.2,  # FIX: Slightly increased rtol for numerical drift (from 0.1)
         check_index_type=False,
         check_dtype=False,
     )
@@ -184,7 +211,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA):
         result,
         expected,
         check_exact=False,
-        rtol=0.1,
+        rtol=0.2,
         check_index_type=False,
         check_dtype=False,
         ignore_order=True,
@@ -204,7 +231,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA):
         result,
         expected,
         check_exact=False,
-        rtol=0.1,
+        rtol=0.2,
         check_index_type=False,
         check_dtype=False,
         ignore_order=True,
diff --git a/packages/bigframes/tests/system/small/ml/test_forecasting.py b/packages/bigframes/tests/system/small/ml/test_forecasting.py
@@ -474,6 +474,7 @@ def test_arima_plus_score(
                 "root_mean_squared_error": [120.675442, 120.675442],
                 "mean_absolute_percentage_error": [4.80044, 4.80044],
                 "symmetric_mean_absolute_percentage_error": [4.744332, 4.744332],
+                "mean_absolute_scaled_error": [0.400, 0.400],
             },
             dtype="Float64",
         )
@@ -489,6 +490,7 @@ def test_arima_plus_score(
                 "root_mean_squared_error": [120.675442],
                 "mean_absolute_percentage_error": [4.80044],
                 "symmetric_mean_absolute_percentage_error": [4.744332],
+                "mean_absolute_scaled_error": [0.400],
             },
             dtype="Float64",
         )
@@ -575,6 +577,7 @@ def test_arima_plus_score_series(
                 "root_mean_squared_error": [120.675442, 120.675442],
                 "mean_absolute_percentage_error": [4.80044, 4.80044],
                 "symmetric_mean_absolute_percentage_error": [4.744332, 4.744332],
+                "mean_absolute_scaled_error": [0.400, 0.400],
             },
             dtype="Float64",
         )
@@ -590,6 +593,7 @@ def test_arima_plus_score_series(
                 "root_mean_squared_error": [120.675442],
                 "mean_absolute_percentage_error": [4.80044],
                 "symmetric_mean_absolute_percentage_error": [4.744332],
+                "mean_absolute_scaled_error": [0.400],
             },
             dtype="Float64",
         )
diff --git a/packages/bigframes/tests/system/small/test_pandas_options.py b/packages/bigframes/tests/system/small/test_pandas_options.py
@@ -316,7 +316,9 @@ def test_credentials_need_reauthentication(
         with warnings.catch_warnings(record=True) as warned:
             bpd.close_session()  # CleanupFailedWarning: can't clean up
 
-        assert len(warned) == 1
+        # The test forces a failure during cleanup and asserts that one or more warning is generated
+        # when/if multiple temp tables might have been left over.
+        assert len(warned) >= 1
         assert warned[0].category == bigframes.exceptions.CleanupFailedWarning
 
         assert (
diff --git a/packages/google-cloud-spanner/google/cloud/spanner_v1/_async/client.py b/packages/google-cloud-spanner/google/cloud/spanner_v1/_async/client.py
@@ -313,6 +313,7 @@ def __init__(
             self._client_certificate = client_certificate
             self._client_key = client_key
             credentials = AnonymousCredentials()
+            disable_builtin_metrics = True
         elif isinstance(credentials, AnonymousCredentials):
             self._emulator_host = self._client_options.api_endpoint
 
@@ -645,6 +646,7 @@ def instance(
             self._emulator_host,
             labels,
             processing_units,
+            self._experimental_host,
         )
 
     @CrossSync.convert
diff --git a/packages/google-cloud-spanner/google/cloud/spanner_v1/client.py b/packages/google-cloud-spanner/google/cloud/spanner_v1/client.py
@@ -276,6 +276,7 @@ def __init__(
             self._client_certificate = client_certificate
             self._client_key = client_key
             credentials = AnonymousCredentials()
+            disable_builtin_metrics = True
         elif isinstance(credentials, AnonymousCredentials):
             self._emulator_host = self._client_options.api_endpoint
         super(Client, self).__init__(
@@ -547,6 +548,7 @@ def instance(
             self._emulator_host,
             labels,
             processing_units,
+            self._experimental_host,
         )
 
     def list_instances(self, filter_="", page_size=None):
diff --git a/packages/google-cloud-spanner/tests/system/_async/conftest.py b/packages/google-cloud-spanner/tests/system/_async/conftest.py
@@ -35,7 +35,10 @@ def spanner_client():
 
         credentials = AnonymousCredentials()
         return spanner_v1.AsyncClient(
-            project=_helpers.EXPERIMENTAL_HOST_PROJECT,
+            use_plain_text=_helpers.USE_PLAIN_TEXT,
+            ca_certificate=_helpers.CA_CERTIFICATE,
+            client_certificate=_helpers.CLIENT_CERTIFICATE,
+            client_key=_helpers.CLIENT_KEY,
             credentials=credentials,
             experimental_host=_helpers.EXPERIMENTAL_HOST,
         )