Skip to content

Commit 8a95dfd

Browse files
authored
chore(bigframes): update the datetime_ops and isin_join in the sqlglot compiler (#16568)
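This update addresses issues identified during performance testing against the Ibis golden SQL.

- `datetime_ops`: Aligned with Ibis behavior by using a safe cast (`SAFE_CAST`) for string-to-datetime conversion instead of `PARSE_TIMESTAMP`, which raises an error on unparseable input.
- `isin_join`: Improved performance by replacing the `STRUCT ... IN (subquery)` form with an `EXISTS` subquery; benchmark details are available here: [screenshot/vmb8ACvJhDNpccp].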
1 parent a8fccef commit 8a95dfd

File tree

4 files changed

+21
-32
lines changed
  • packages/bigframes
    • bigframes/core/compile/sqlglot
    • tests/unit/core/compile/sqlglot
      • expressions/snapshots/test_datetime_ops/test_to_datetime
      • snapshots/test_compile_isin/test_compile_isin

4 files changed

+21
-32
lines changed

packages/bigframes/bigframes/core/compile/sqlglot/expressions/datetime_ops.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -366,10 +366,7 @@ def _(expr: TypedExpr, op: ops.ToDatetimeOp) -> sge.Expression:
366366
result = expr.expr
367367
if expr.dtype != dtypes.STRING_DTYPE:
368368
result = sge.Cast(this=result, to="STRING")
369-
result = sge.func(
370-
"PARSE_TIMESTAMP", sge.convert(op.format), result, sge.convert("UTC")
371-
)
372-
return sge.Cast(this=result, to="DATETIME")
369+
return sge.TryCast(this=result, to="DATETIME")
373370

374371
if expr.dtype in (
375372
dtypes.STRING_DTYPE,

packages/bigframes/bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -364,34 +364,25 @@ def isin_join(
364364
or conditions[1].dtype == dtypes.FLOAT_DTYPE
365365
):
366366
force_float_domain = True
367-
part1_id = sql.identifier("bfpart1")
368-
part2_id = sql.identifier("bfpart2")
369367
left_expr1, left_expr2 = _value_to_non_null_identity(
370368
conditions[0], force_float_domain
371369
)
372-
left_as_struct = sge.Struct(
373-
expressions=[
374-
sge.PropertyEQ(this=part1_id, expression=left_expr1),
375-
sge.PropertyEQ(this=part2_id, expression=left_expr2),
376-
]
377-
)
378370
right_expr1, right_expr2 = _value_to_non_null_identity(
379371
conditions[1], force_float_domain
380372
)
381-
right_select = right.expr.select(
382-
*[
383-
sge.Struct(
384-
expressions=[
385-
sge.PropertyEQ(this=part1_id, expression=right_expr1),
386-
sge.PropertyEQ(this=part2_id, expression=right_expr2),
387-
]
388-
)
389-
],
390-
)
391373

392-
new_column = sge.In(
393-
this=left_as_struct,
394-
expressions=[right_select.subquery()],
374+
# Use EXISTS for better performance.
375+
# Both sides are compared through the paired COALESCE expressions from
# _value_to_non_null_identity, so a NULL on the left matches only a NULL on the right.
376+
new_column = sge.Exists(
377+
this=sge.Select()
378+
.select(sge.convert(1))
379+
.from_(right.expr.as_from_item())
380+
.where(
381+
sge.and_(
382+
sge.EQ(this=left_expr1, expression=right_expr1),
383+
sge.EQ(this=left_expr2, expression=right_expr2),
384+
)
385+
)
395386
)
396387
else:
397388
new_column = sge.In(

packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ SELECT
33
SAFE_CAST(`string_col` AS DATETIME),
44
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`float64_col` * 0.001) AS INT64)) AS DATETIME) AS `float64_col`,
55
SAFE_CAST(`timestamp_col` AS DATETIME),
6-
CAST(PARSE_TIMESTAMP('%Y-%m-%d', `string_col`, 'UTC') AS DATETIME) AS `string_col_fmt`
6+
SAFE_CAST(`string_col` AS DATETIME) AS `string_col_fmt`
77
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0`

packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_isin/test_compile_isin/out.sql

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@ WITH `bfcte_0` AS (
2020
), `bfcte_4` AS (
2121
SELECT
2222
*,
23-
STRUCT(COALESCE(`bfcol_4`, 0) AS `bfpart1`, COALESCE(`bfcol_4`, 1) AS `bfpart2`) IN (
24-
(
25-
SELECT
26-
STRUCT(COALESCE(`bfcol_0`, 0) AS `bfpart1`, COALESCE(`bfcol_0`, 1) AS `bfpart2`)
27-
FROM `bfcte_3`
28-
)
23+
EXISTS(
24+
SELECT
25+
1
26+
FROM `bfcte_3`
27+
WHERE
28+
COALESCE(`bfcol_4`, 0) = COALESCE(`bfcol_0`, 0)
29+
AND COALESCE(`bfcol_4`, 1) = COALESCE(`bfcol_0`, 1)
2930
) AS `bfcol_5`
3031
FROM `bfcte_1`
3132
)

0 commit comments

Comments
 (0)