revert: Roll back to state at 4134750

shuoweil · shuoweil · commit 474070844736 · 2026-02-06T20:07:33.000Z
diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py
@@ -16,7 +16,6 @@
 import dataclasses
 import functools
 import itertools
-import json
 from typing import cast, Literal, Optional, Sequence, Tuple, Type, TYPE_CHECKING
 
 import pandas as pd
@@ -430,68 +429,7 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
         @compile_op.register(json_ops.JSONDecode)
         def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
             assert isinstance(op, json_ops.JSONDecode)
-            target_dtype = _bigframes_dtype_to_polars_dtype(op.to_type)
-            if op.safe:
-                # Polars does not support safe JSON decoding (returning null on failure).
-                # We use map_elements to provide safe JSON decoding.
-                def safe_decode(val):
-                    if val is None:
-                        return None
-                    try:
-                        decoded = json.loads(val)
-                    except Exception:
-                        return None
-
-                    if decoded is None:
-                        return None
-
-                    if op.to_type == bigframes.dtypes.INT_DTYPE:
-                        if type(decoded) is bool:
-                            return None
-                        if isinstance(decoded, int):
-                            return decoded
-                        if isinstance(decoded, float):
-                            if decoded.is_integer():
-                                return int(decoded)
-                        if isinstance(decoded, str):
-                            try:
-                                return int(decoded)
-                            except Exception:
-                                pass
-                        return None
-
-                    if op.to_type == bigframes.dtypes.FLOAT_DTYPE:
-                        if type(decoded) is bool:
-                            return None
-                        if isinstance(decoded, (int, float)):
-                            return float(decoded)
-                        if isinstance(decoded, str):
-                            try:
-                                return float(decoded)
-                            except Exception:
-                                pass
-                        return None
-
-                    if op.to_type == bigframes.dtypes.BOOL_DTYPE:
-                        if isinstance(decoded, bool):
-                            return decoded
-                        if isinstance(decoded, str):
-                            if decoded.lower() == "true":
-                                return True
-                            if decoded.lower() == "false":
-                                return False
-                        return None
-
-                    if op.to_type == bigframes.dtypes.STRING_DTYPE:
-                        if isinstance(decoded, str):
-                            return decoded
-                        return None
-
-                    return decoded
-
-                return input.map_elements(safe_decode, return_dtype=target_dtype)
-
-            return input.str.json_decode(target_dtype)
+            return input.str.json_decode(_DTYPE_MAPPING[op.to_type])
 
         @compile_op.register(arr_ops.ToArrayOp)
         def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr:
diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py
@@ -391,7 +391,7 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression):
         return arg
 
     if arg.output_type == dtypes.JSON_DTYPE:
-        return json_ops.JSONDecode(cast_op.to_type, safe=cast_op.safe).as_expr(arg)
+        return json_ops.JSONDecode(cast_op.to_type).as_expr(arg)
     if (
         arg.output_type == dtypes.STRING_DTYPE
         and cast_op.to_type == dtypes.DATETIME_DTYPE
diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py
@@ -24,8 +24,7 @@
 """
 
 import abc
-import typing
-from typing import Optional, TypeVar, Union
+from typing import cast, Optional, TypeVar, Union
 import warnings
 
 import bigframes_vendored.sklearn.base
@@ -134,7 +133,7 @@ def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T:
                 self._bqml_model = self._create_bqml_model()  # type: ignore
             except AttributeError:
                 raise RuntimeError("A model must be trained before register.")
-        self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model)
+        self._bqml_model = cast(core.BqmlModel, self._bqml_model)
 
         self._bqml_model.register(vertex_ai_model_id)
         return self
@@ -287,7 +286,7 @@ def _predict_and_retry(
                 bpd.concat([df_result, df_succ]) if df_result is not None else df_succ
             )
 
-        df_result = typing.cast(
+        df_result = cast(
             bpd.DataFrame,
             bpd.concat([df_result, df_fail]) if df_result is not None else df_fail,
         )
@@ -307,7 +306,7 @@ def _extract_output_names(self):
 
         output_names = []
         for transform_col in self._bqml_model._model._properties["transformColumns"]:
-            transform_col_dict = typing.cast(dict, transform_col)
+            transform_col_dict = cast(dict, transform_col)
             # pass the columns that are not transformed
             if "transformSql" not in transform_col_dict:
                 continue
diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py
@@ -21,7 +21,7 @@
 import re
 import types
 import typing
-from typing import Iterable, List, Optional, Set, Tuple, Union
+from typing import cast, Iterable, List, Optional, Set, Tuple, Union
 
 from bigframes_vendored import constants
 import bigframes_vendored.sklearn.compose._column_transformer
@@ -218,7 +218,7 @@ def camel_to_snake(name):
 
         output_names = []
         for transform_col in bq_model._properties["transformColumns"]:
-            transform_col_dict = typing.cast(dict, transform_col)
+            transform_col_dict = cast(dict, transform_col)
             # pass the columns that are not transformed
             if "transformSql" not in transform_col_dict:
                 continue
@@ -282,7 +282,7 @@ def _merge(
             return self  # SQLScalarColumnTransformer only work inside ColumnTransformer
         feature_columns_sorted = sorted(
             [
-                typing.cast(str, feature_column.name)
+                cast(str, feature_column.name)
                 for feature_column in bq_model.feature_columns
             ]
         )
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
@@ -18,8 +18,7 @@
 
 import dataclasses
 import datetime
-import typing
-from typing import Callable, Iterable, Mapping, Optional, Union
+from typing import Callable, cast, Iterable, Mapping, Optional, Union
 import uuid
 
 from google.cloud import bigquery
@@ -377,7 +376,7 @@ def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel:
     def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel:
         if vertex_ai_model_id is None:
             # vertex id needs to start with letters. https://cloud.google.com/vertex-ai/docs/general/resource-naming
-            vertex_ai_model_id = "bigframes_" + typing.cast(str, self._model.model_id)
+            vertex_ai_model_id = "bigframes_" + cast(str, self._model.model_id)
 
         # truncate as Vertex ID only accepts 63 characters, easily exceeding the limit for temp models.
         # The possibility of conflicts should be low.
diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py
@@ -16,8 +16,7 @@
 
 from __future__ import annotations
 
-import typing
-from typing import Mapping, Optional
+from typing import cast, Mapping, Optional
 
 from google.cloud import bigquery
 
@@ -79,7 +78,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
             if self.model_path is None:
                 raise ValueError("Model GCS path must be provided.")
             self._bqml_model = self._create_bqml_model()
-        self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model)
+        self._bqml_model = cast(core.BqmlModel, self._bqml_model)
 
         (X,) = utils.batch_convert_to_dataframe(X)
 
@@ -100,7 +99,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel:
             if self.model_path is None:
                 raise ValueError("Model GCS path must be provided.")
             self._bqml_model = self._create_bqml_model()
-        self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model)
+        self._bqml_model = cast(core.BqmlModel, self._bqml_model)
 
         new_model = self._bqml_model.copy(model_name, replace)
         return new_model.session.read_gbq_model(model_name)
@@ -158,7 +157,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
             if self.model_path is None:
                 raise ValueError("Model GCS path must be provided.")
             self._bqml_model = self._create_bqml_model()
-        self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model)
+        self._bqml_model = cast(core.BqmlModel, self._bqml_model)
 
         (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
 
@@ -179,7 +178,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel:
             if self.model_path is None:
                 raise ValueError("Model GCS path must be provided.")
             self._bqml_model = self._create_bqml_model()
-        self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model)
+        self._bqml_model = cast(core.BqmlModel, self._bqml_model)
 
         new_model = self._bqml_model.copy(model_name, replace)
         return new_model.session.read_gbq_model(model_name)
@@ -277,7 +276,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
             if self.model_path is None:
                 raise ValueError("Model GCS path must be provided.")
             self._bqml_model = self._create_bqml_model()
-        self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model)
+        self._bqml_model = cast(core.BqmlModel, self._bqml_model)
 
         (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
 
@@ -298,7 +297,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel:
             if self.model_path is None:
                 raise ValueError("Model GCS path must be provided.")
             self._bqml_model = self._create_bqml_model()
-        self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model)
+        self._bqml_model = cast(core.BqmlModel, self._bqml_model)
 
         new_model = self._bqml_model.copy(model_name, replace)
         return new_model.session.read_gbq_model(model_name)
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
@@ -16,8 +16,7 @@
 
 from __future__ import annotations
 
-import typing
-from typing import Iterable, Literal, Mapping, Optional, Union
+from typing import cast, Iterable, Literal, Mapping, Optional, Union
 import warnings
 
 import bigframes_vendored.constants as constants
@@ -253,7 +252,7 @@ def predict(
 
         if len(X.columns) == 1:
             # BQML identified the column by name
-            col_label = typing.cast(blocks.Label, X.columns[0])
+            col_label = cast(blocks.Label, X.columns[0])
             X = X.rename(columns={col_label: "content"})
 
         options: dict = {}
@@ -392,7 +391,7 @@ def predict(
 
         if len(X.columns) == 1:
             # BQML identified the column by name
-            col_label = typing.cast(blocks.Label, X.columns[0])
+            col_label = cast(blocks.Label, X.columns[0])
             X = X.rename(columns={col_label: "content"})
 
         # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input
@@ -605,10 +604,7 @@ def fit(
         options["prompt_col"] = X.columns.tolist()[0]
 
         self._bqml_model = self._bqml_model_factory.create_llm_remote_model(
-            X,
-            y,
-            options=options,
-            connection_name=typing.cast(str, self.connection_name),
+            X, y, options=options, connection_name=cast(str, self.connection_name)
         )
         return self
 
@@ -739,7 +735,7 @@ def predict(
 
         if len(X.columns) == 1:
             # BQML identified the column by name
-            col_label = typing.cast(blocks.Label, X.columns[0])
+            col_label = cast(blocks.Label, X.columns[0])
             X = X.rename(columns={col_label: "prompt"})
 
         options: dict = {
@@ -824,8 +820,8 @@ def score(
             )
 
         # BQML identified the column by name
-        X_col_label = typing.cast(blocks.Label, X.columns[0])
-        y_col_label = typing.cast(blocks.Label, y.columns[0])
+        X_col_label = cast(blocks.Label, X.columns[0])
+        y_col_label = cast(blocks.Label, y.columns[0])
         X = X.rename(columns={X_col_label: "input_text"})
         y = y.rename(columns={y_col_label: "output_text"})
 
@@ -1037,7 +1033,7 @@ def predict(
 
         if len(X.columns) == 1:
             # BQML identified the column by name
-            col_label = typing.cast(blocks.Label, X.columns[0])
+            col_label = cast(blocks.Label, X.columns[0])
             X = X.rename(columns={col_label: "prompt"})
 
         options = {
diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
@@ -20,8 +20,7 @@
 import inspect
 from itertools import chain
 import time
-import typing
-from typing import Generator, List, Optional, Union
+from typing import cast, Generator, List, Optional, Union
 
 import bigframes_vendored.sklearn.model_selection._split as vendored_model_selection_split
 import bigframes_vendored.sklearn.model_selection._validation as vendored_model_selection_validation
@@ -100,10 +99,10 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
             train_dfs.append(train)
             test_dfs.append(test)
 
-        train_df = typing.cast(
+        train_df = cast(
             bpd.DataFrame, bpd.concat(train_dfs).drop(columns="bigframes_stratify_col")
         )
-        test_df = typing.cast(
+        test_df = cast(
             bpd.DataFrame, bpd.concat(test_dfs).drop(columns="bigframes_stratify_col")
         )
         return [train_df, test_df]
diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py
@@ -18,7 +18,7 @@
 from __future__ import annotations
 
 import typing
-from typing import Iterable, List, Literal, Optional, Union
+from typing import cast, Iterable, List, Literal, Optional, Union
 
 import bigframes_vendored.sklearn.preprocessing._data
 import bigframes_vendored.sklearn.preprocessing._discretization
@@ -470,7 +470,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[OneHotEncoder, str]:
         s = sql[sql.find("(") + 1 : sql.find(")")]
         col_label, drop_str, top_k, frequency_threshold = s.split(", ")
         drop = (
-            typing.cast(Literal["most_frequent"], "most_frequent")
+            cast(Literal["most_frequent"], "most_frequent")
             if drop_str.lower() == "'most_frequent'"
             else None
         )
diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py
@@ -220,7 +220,6 @@ def output_type(self, *input_types):
 class JSONDecode(base_ops.UnaryOp):
     name: typing.ClassVar[str] = "json_decode"
     to_type: dtypes.Dtype
-    safe: bool = False
 
     def output_type(self, *input_types):
         input_type = input_types[0]
diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py
@@ -34,7 +34,6 @@
     numeric_ops,
     string_ops,
 )
-import bigframes.operations.json_ops as json_ops
 from bigframes.session import executor, semi_executor
 
 if TYPE_CHECKING:
@@ -95,7 +94,6 @@
     string_ops.EndsWithOp,
     string_ops.StrContainsOp,
     string_ops.StrContainsRegexOp,
-    json_ops.JSONDecode,
 )
 _COMPATIBLE_AGG_OPS = (
     agg_ops.SizeOp,
diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py
@@ -4142,6 +4142,7 @@ def test_json_astype_others_raise_error(data, to_type):
         bf_series.astype(to_type, errors="raise").to_pandas()
 
 
+@pytest.mark.skip(reason="AssertionError: Series NA mask are different")
 @pytest.mark.parametrize(
     ("data", "to_type"),
     [