Skip to content

Commit 7f9c17a

Browse files
committed
filter null value during aggregation instead now that apache/datafusion#21011 is closed
1 parent 3f1d9a3 commit 7f9c17a

File tree

1 file changed

+3
-16
lines changed

1 file changed

+3
-16
lines changed

examples/tpch/q21_suppliers_kept_orders_waiting.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,6 @@
4848
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
4949
"n_nationkey", "n_name"
5050
)
51-
print("df_orders")
52-
df_orders.show()
53-
print("df_lineitem")
54-
df_lineitem.show()
55-
print("df_supplier")
56-
df_supplier.show()
57-
print("df_nation")
58-
df_nation.show()
59-
6051

6152
# Limit to suppliers in the nation of interest
6253
df_suppliers_of_interest = df_nation.filter(col("n_name") == lit(NATION_OF_INTEREST))
@@ -90,16 +81,12 @@
9081
[col("o_orderkey")],
9182
[
9283
F.array_agg(col("l_suppkey"), distinct=True).alias("all_suppliers"),
93-
F.array_agg(col("failed_supp"), distinct=True).alias("failed_suppliers"),
84+
F.array_agg(
85+
col("failed_supp"), filter=col("failed_supp").is_not_null(), distinct=True
86+
).alias("failed_suppliers"),
9487
],
9588
)
9689

97-
# Remove the null entries that will get returned by array_agg so we can test to see where we only
98-
# have a single failed supplier in a multiple supplier order
99-
df = df.with_column(
100-
"failed_suppliers", F.array_remove(col("failed_suppliers"), lit(None))
101-
)
102-
10390
# This is the check described above which will identify single failed supplier in a multiple
10491
# supplier order.
10592
df = df.filter(F.array_length(col("failed_suppliers")) == lit(1)).filter(

0 commit comments

Comments
 (0)