Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit a6c47a6

Browse files
authored
Merge branch 'main' into callable-df-mask
2 parents 3bbb4ce + 8f2cad2 commit a6c47a6

File tree

21 files changed

+552
-40
lines changed

21 files changed

+552
-40
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,15 @@
44

55
[1]: https://pypi.org/project/bigframes/#history
66

7+
## [2.17.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.16.0...v2.17.0) (2025-08-22)
8+
9+
10+
### Features
11+
12+
* Add isin local execution impl ([#1993](https://github.com/googleapis/python-bigquery-dataframes/issues/1993)) ([26df6e6](https://github.com/googleapis/python-bigquery-dataframes/commit/26df6e691bb27ed09322a81214faedbf3639b32e))
13+
* Add reset_index names, col_level, col_fill, allow_duplicates args ([#2017](https://github.com/googleapis/python-bigquery-dataframes/issues/2017)) ([c02a1b6](https://github.com/googleapis/python-bigquery-dataframes/commit/c02a1b67d27758815430bb8006ac3a72cea55a89))
14+
* Support callable for series mask method ([#2014](https://github.com/googleapis/python-bigquery-dataframes/issues/2014)) ([5ac32eb](https://github.com/googleapis/python-bigquery-dataframes/commit/5ac32ebe17cfda447870859f5dd344b082b4d3d0))
15+
716
## [2.16.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.15.0...v2.16.0) (2025-08-20)
817

918

bigframes/core/compile/sqlglot/expressions/unary_compiler.py

Lines changed: 97 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -177,14 +177,96 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
177177
)
178178

179179

180+
@UNARY_OP_REGISTRATION.register(ops.StrContainsOp)
181+
def _(op: ops.StrContainsOp, expr: TypedExpr) -> sge.Expression:
182+
return sge.Like(this=expr.expr, expression=sge.convert(f"%{op.pat}%"))
183+
184+
180185
@UNARY_OP_REGISTRATION.register(ops.StrContainsRegexOp)
181186
def _(op: ops.StrContainsRegexOp, expr: TypedExpr) -> sge.Expression:
182187
return sge.RegexpLike(this=expr.expr, expression=sge.convert(op.pat))
183188

184189

185-
@UNARY_OP_REGISTRATION.register(ops.StrContainsOp)
186-
def _(op: ops.StrContainsOp, expr: TypedExpr) -> sge.Expression:
187-
return sge.Like(this=expr.expr, expression=sge.convert(f"%{op.pat}%"))
190+
@UNARY_OP_REGISTRATION.register(ops.StrExtractOp)
191+
def _(op: ops.StrExtractOp, expr: TypedExpr) -> sge.Expression:
192+
return sge.RegexpExtract(
193+
this=expr.expr, expression=sge.convert(op.pat), group=sge.convert(op.n)
194+
)
195+
196+
197+
@UNARY_OP_REGISTRATION.register(ops.StrFindOp)
198+
def _(op: ops.StrFindOp, expr: TypedExpr) -> sge.Expression:
199+
# INSTR is 1-based, so we need to adjust the start position.
200+
start = sge.convert(op.start + 1) if op.start is not None else sge.convert(1)
201+
if op.end is not None:
202+
# BigQuery's INSTR doesn't support `end`, so we need to use SUBSTR.
203+
return sge.func(
204+
"INSTR",
205+
sge.Substring(
206+
this=expr.expr,
207+
start=start,
208+
length=sge.convert(op.end - (op.start or 0)),
209+
),
210+
sge.convert(op.substr),
211+
) - sge.convert(1)
212+
else:
213+
return sge.func(
214+
"INSTR",
215+
expr.expr,
216+
sge.convert(op.substr),
217+
start,
218+
) - sge.convert(1)
219+
220+
221+
@UNARY_OP_REGISTRATION.register(ops.StrLstripOp)
222+
def _(op: ops.StrLstripOp, expr: TypedExpr) -> sge.Expression:
223+
return sge.Trim(this=expr.expr, expression=sge.convert(op.to_strip), side="LEFT")
224+
225+
226+
@UNARY_OP_REGISTRATION.register(ops.StrPadOp)
227+
def _(op: ops.StrPadOp, expr: TypedExpr) -> sge.Expression:
228+
pad_length = sge.func(
229+
"GREATEST", sge.Length(this=expr.expr), sge.convert(op.length)
230+
)
231+
if op.side == "left":
232+
return sge.func(
233+
"LPAD",
234+
expr.expr,
235+
pad_length,
236+
sge.convert(op.fillchar),
237+
)
238+
elif op.side == "right":
239+
return sge.func(
240+
"RPAD",
241+
expr.expr,
242+
pad_length,
243+
sge.convert(op.fillchar),
244+
)
245+
else: # side == both
246+
lpad_amount = sge.Cast(
247+
this=sge.func(
248+
"SAFE_DIVIDE",
249+
sge.Sub(this=pad_length, expression=sge.Length(this=expr.expr)),
250+
sge.convert(2),
251+
),
252+
to="INT64",
253+
) + sge.Length(this=expr.expr)
254+
return sge.func(
255+
"RPAD",
256+
sge.func(
257+
"LPAD",
258+
expr.expr,
259+
lpad_amount,
260+
sge.convert(op.fillchar),
261+
),
262+
pad_length,
263+
sge.convert(op.fillchar),
264+
)
265+
266+
267+
@UNARY_OP_REGISTRATION.register(ops.StrRepeatOp)
268+
def _(op: ops.StrRepeatOp, expr: TypedExpr) -> sge.Expression:
269+
return sge.Repeat(this=expr.expr, times=sge.convert(op.repeats))
188270

189271

190272
@UNARY_OP_REGISTRATION.register(ops.date_op)
@@ -444,11 +526,6 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
444526
return sge.Extract(this=sge.Identifier(this="MONTH"), expression=expr.expr)
445527

446528

447-
@UNARY_OP_REGISTRATION.register(ops.StrLstripOp)
448-
def _(op: ops.StrLstripOp, expr: TypedExpr) -> sge.Expression:
449-
return sge.Trim(this=expr.expr, expression=sge.convert(op.to_strip), side="LEFT")
450-
451-
452529
@UNARY_OP_REGISTRATION.register(ops.neg_op)
453530
def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
454531
return sge.Neg(this=expr.expr)
@@ -484,6 +561,18 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
484561
return sge.Extract(this=sge.Identifier(this="QUARTER"), expression=expr.expr)
485562

486563

564+
@UNARY_OP_REGISTRATION.register(ops.ReplaceStrOp)
565+
def _(op: ops.ReplaceStrOp, expr: TypedExpr) -> sge.Expression:
566+
return sge.func("REPLACE", expr.expr, sge.convert(op.pat), sge.convert(op.repl))
567+
568+
569+
@UNARY_OP_REGISTRATION.register(ops.RegexReplaceStrOp)
570+
def _(op: ops.RegexReplaceStrOp, expr: TypedExpr) -> sge.Expression:
571+
return sge.func(
572+
"REGEXP_REPLACE", expr.expr, sge.convert(op.pat), sge.convert(op.repl)
573+
)
574+
575+
487576
@UNARY_OP_REGISTRATION.register(ops.reverse_op)
488577
def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
489578
return sge.func("REVERSE", expr.expr)

bigframes/series.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1904,9 +1904,22 @@ def _groupby_values(
19041904
)
19051905

19061906
def apply(
1907-
self, func, by_row: typing.Union[typing.Literal["compat"], bool] = "compat"
1907+
self,
1908+
func,
1909+
by_row: typing.Union[typing.Literal["compat"], bool] = "compat",
1910+
*,
1911+
args: typing.Tuple = (),
19081912
) -> Series:
1909-
# TODO(shobs, b/274645634): Support convert_dtype, args, **kwargs
1913+
# Note: This signature differs from pandas.Series.apply. Specifically,
1914+
# `args` is keyword-only and `by_row` is a custom parameter here. Full
1915+
# alignment would involve breaking changes. However, given that by_row
1916+
# is not frequently used, we defer any such changes until there is a
1917+
# clear need based on user feedback.
1918+
#
1919+
# See pandas docs for reference:
1920+
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.apply.html
1921+
1922+
# TODO(shobs, b/274645634): Support convert_dtype, **kwargs
19101923
# is actually a ternary op
19111924

19121925
if by_row not in ["compat", False]:
@@ -1950,10 +1963,19 @@ def apply(
19501963
raise
19511964

19521965
# We are working with bigquery function at this point
1953-
result_series = self._apply_unary_op(
1954-
ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True)
1955-
)
1966+
if args:
1967+
result_series = self._apply_nary_op(
1968+
ops.NaryRemoteFunctionOp(function_def=func.udf_def), args
1969+
)
1970+
# TODO(jialuo): Investigate why `_apply_nary_op` drops the series
1971+
# `name`. Manually reassigning it here as a temporary fix.
1972+
result_series.name = self.name
1973+
else:
1974+
result_series = self._apply_unary_op(
1975+
ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True)
1976+
)
19561977
result_series = func._post_process_series(result_series)
1978+
19571979
return result_series
19581980

19591981
def combine(
@@ -2113,13 +2135,8 @@ def duplicated(self, keep: str = "first") -> Series:
21132135
)
21142136

21152137
def mask(self, cond, other=None) -> Series:
2116-
if callable(cond):
2117-
if hasattr(cond, "bigframes_bigquery_function"):
2118-
cond = self.apply(cond)
2119-
else:
2120-
# For non-BigQuery function assume that it is applicable on Series
2121-
cond = self.apply(cond, by_row=False)
2122-
2138+
cond = self._apply_callable(cond)
2139+
other = self._apply_callable(other)
21232140
if not isinstance(cond, Series):
21242141
raise TypeError(
21252142
f"Only bigframes series condition is supported, received {type(cond).__name__}. "

bigframes/version.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
__version__ = "2.16.0"
15+
__version__ = "2.17.0"
1616

1717
# {x-release-please-start-date}
18-
__release_date__ = "2025-08-20"
18+
__release_date__ = "2025-08-22"
1919
# {x-release-please-end}

tests/system/large/functions/test_managed_function.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1103,7 +1103,7 @@ def func_for_other(x):
11031103
)
11041104

11051105

1106-
def test_managed_function_series_where(session, dataset_id, scalars_dfs):
1106+
def test_managed_function_series_where_mask(session, dataset_id, scalars_dfs):
11071107
try:
11081108

11091109
# The return type has to be bool type for callable where condition.
@@ -1124,8 +1124,8 @@ def _is_positive(s):
11241124
pd_int64 = scalars_pandas["int64_col"]
11251125
pd_int64_filtered = pd_int64.dropna()
11261126

1127-
# The cond is a callable (managed function) and the other is not a
1128-
# callable in series.where method.
1127+
# Test series.where method: the cond is a callable (managed function)
1128+
# and the other is not a callable.
11291129
bf_result = bf_int64_filtered.where(
11301130
cond=is_positive_mf, other=-bf_int64_filtered
11311131
).to_pandas()
@@ -1134,6 +1134,44 @@ def _is_positive(s):
11341134
# Ignore any dtype difference.
11351135
pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
11361136

1137+
# Test series.mask method: the cond is a callable (managed function)
1138+
# and the other is not a callable.
1139+
bf_result = bf_int64_filtered.mask(
1140+
cond=is_positive_mf, other=-bf_int64_filtered
1141+
).to_pandas()
1142+
pd_result = pd_int64_filtered.mask(cond=_is_positive, other=-pd_int64_filtered)
1143+
1144+
# Ignore any dtype difference.
1145+
pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
1146+
11371147
finally:
11381148
# Clean up the gcp assets created for the managed function.
11391149
cleanup_function_assets(is_positive_mf, session.bqclient, ignore_failures=False)
1150+
1151+
1152+
def test_managed_function_series_apply_args(session, dataset_id, scalars_dfs):
1153+
try:
1154+
1155+
with pytest.warns(bfe.PreviewWarning, match="udf is in preview."):
1156+
1157+
@session.udf(dataset=dataset_id, name=prefixer.create_prefix())
1158+
def foo_list(x: int, y0: float, y1: bytes, y2: bool) -> list[str]:
1159+
return [str(x), str(y0), str(y1), str(y2)]
1160+
1161+
scalars_df, scalars_pandas_df = scalars_dfs
1162+
1163+
bf_result = (
1164+
scalars_df["int64_too"]
1165+
.apply(foo_list, args=(12.34, b"hello world", False))
1166+
.to_pandas()
1167+
)
1168+
pd_result = scalars_pandas_df["int64_too"].apply(
1169+
foo_list, args=(12.34, b"hello world", False)
1170+
)
1171+
1172+
# Ignore any dtype difference.
1173+
pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
1174+
1175+
finally:
1176+
# Clean up the gcp assets created for the managed function.
1177+
cleanup_function_assets(foo_list, session.bqclient, ignore_failures=False)

tests/system/large/functions/test_remote_function.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2950,7 +2950,7 @@ def func_for_other(x):
29502950

29512951

29522952
@pytest.mark.flaky(retries=2, delay=120)
2953-
def test_remote_function_series_where(session, dataset_id, scalars_dfs):
2953+
def test_remote_function_series_where_mask(session, dataset_id, scalars_dfs):
29542954
try:
29552955

29562956
def _ten_times(x):
@@ -2971,8 +2971,8 @@ def _ten_times(x):
29712971
pd_int64 = scalars_pandas["float64_col"]
29722972
pd_int64_filtered = pd_int64.dropna()
29732973

2974-
# The cond is not a callable and the other is a callable (remote
2975-
# function) in series.where method.
2974+
# Test series.where method: the cond is not a callable and the other is
2975+
# a callable (remote function).
29762976
bf_result = bf_int64_filtered.where(
29772977
cond=bf_int64_filtered < 0, other=ten_times_mf
29782978
).to_pandas()
@@ -2983,6 +2983,55 @@ def _ten_times(x):
29832983
# Ignore any dtype difference.
29842984
pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
29852985

2986+
# Test series.mask method: the cond is not a callable and the other is
2987+
# a callable (remote function).
2988+
bf_result = bf_int64_filtered.mask(
2989+
cond=bf_int64_filtered < 0, other=ten_times_mf
2990+
).to_pandas()
2991+
pd_result = pd_int64_filtered.mask(cond=pd_int64_filtered < 0, other=_ten_times)
2992+
2993+
# Ignore any dtype difference.
2994+
pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
2995+
29862996
finally:
29872997
# Clean up the gcp assets created for the remote function.
29882998
cleanup_function_assets(ten_times_mf, session.bqclient, ignore_failures=False)
2999+
3000+
3001+
@pytest.mark.flaky(retries=2, delay=120)
3002+
def test_remote_function_series_apply_args(session, dataset_id, scalars_dfs):
3003+
try:
3004+
3005+
@session.remote_function(
3006+
dataset=dataset_id,
3007+
reuse=False,
3008+
cloud_function_service_account="default",
3009+
)
3010+
def foo(x: int, y: bool, z: float) -> str:
3011+
if y:
3012+
return f"{x}: y is True."
3013+
if z > 0.0:
3014+
return f"{x}: y is False and z is positive."
3015+
return f"{x}: y is False and z is non-positive."
3016+
3017+
scalars_df, scalars_pandas_df = scalars_dfs
3018+
3019+
args1 = (True, 10.0)
3020+
bf_result = scalars_df["int64_too"].apply(foo, args=args1).to_pandas()
3021+
pd_result = scalars_pandas_df["int64_too"].apply(foo, args=args1)
3022+
3023+
# Ignore any dtype difference.
3024+
pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
3025+
3026+
args2 = (False, -10.0)
3027+
foo_ref = session.read_gbq_function(foo.bigframes_bigquery_function)
3028+
3029+
bf_result = scalars_df["int64_too"].apply(foo_ref, args=args2).to_pandas()
3030+
pd_result = scalars_pandas_df["int64_too"].apply(foo, args=args2)
3031+
3032+
# Ignore any dtype difference.
3033+
pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
3034+
3035+
finally:
3036+
# Clean up the gcp assets created for the remote function.
3037+
cleanup_function_assets(foo, session.bqclient, ignore_failures=False)

tests/system/small/test_series.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3603,6 +3603,26 @@ def test_mask_custom_value(scalars_dfs):
36033603
assert_pandas_df_equal(bf_result, pd_result)
36043604

36053605

3606+
def test_mask_with_callable(scalars_df_index, scalars_pandas_df_index):
3607+
def _ten_times(x):
3608+
return x * 10
3609+
3610+
# Both cond and other are callable.
3611+
bf_result = (
3612+
scalars_df_index["int64_col"]
3613+
.mask(cond=lambda x: x > 0, other=_ten_times)
3614+
.to_pandas()
3615+
)
3616+
pd_result = scalars_pandas_df_index["int64_col"].mask(
3617+
cond=lambda x: x > 0, other=_ten_times
3618+
)
3619+
3620+
pd.testing.assert_series_equal(
3621+
bf_result,
3622+
pd_result,
3623+
)
3624+
3625+
36063626
@pytest.mark.parametrize(
36073627
("lambda_",),
36083628
[

0 commit comments

Comments
 (0)