Skip to content

Commit 1b6480b

Browse files
Fix unionByName to properly handle missing columns from both DataFrames (#243)
When allowMissingColumns=True, the method now correctly handles missing columns from both the left and right DataFrames by:
- adding missing columns from the right DataFrame to the left as NULL;
- ensuring all columns from the left DataFrame are present in the right;
- properly aligning column order to match Spark's behavior.

This ensures the union result contains all columns from both DataFrames, with NULL values where columns are missing, matching PySpark behavior.
2 parents df345a2 + b94529d commit 1b6480b

3 files changed

Lines changed: 37 additions & 10 deletions

File tree

.github/workflows/release.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ defaults:
3939
jobs:
4040
build_sdist:
4141
name: Build an sdist and determine versions
42+
if: ${{ github.ref != 'refs/heads/main' }}
4243
uses: ./.github/workflows/packaging_sdist.yml
4344
with:
4445
testsuite: all

duckdb/experimental/spark/sql/dataframe.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1067,8 +1067,7 @@ def union(self, other: "DataFrame") -> "DataFrame":
10671067
unionAll = union
10681068

10691069
def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":
1070-
"""Returns a new :class:`DataFrame` containing union of rows in this and another
1071-
:class:`DataFrame`.
1070+
"""Returns a new :class:`DataFrame` containing union of rows in this and another :class:`DataFrame`.
10721071
10731072
This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
10741073
union (that does deduplication of elements), use this function followed by :func:`distinct`.
@@ -1121,15 +1120,27 @@ def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) ->
11211120
| 1| 2| 3|NULL|
11221121
|NULL| 4| 5| 6|
11231122
+----+----+----+----+
1124-
""" # noqa: D205
1123+
"""
11251124
if allowMissingColumns:
1126-
cols = []
1127-
for col in self.relation.columns:
1128-
if col in other.relation.columns:
1129-
cols.append(col)
1130-
else:
1131-
cols.append(spark_sql_functions.lit(None))
1132-
other = other.select(*cols)
1125+
df1 = self.select(
1126+
*self.relation.columns,
1127+
*[
1128+
spark_sql_functions.lit(None).alias(c)
1129+
for c in other.relation.columns
1130+
if c not in self.relation.columns
1131+
],
1132+
)
1133+
1134+
df2 = other.select(
1135+
*[
1136+
spark_sql_functions.col(c)
1137+
if c in other.relation.columns
1138+
else spark_sql_functions.lit(None).alias(c)
1139+
for c in df1.relation.columns
1140+
]
1141+
)
1142+
1143+
return df1.unionByName(df2, allowMissingColumns=False)
11331144
else:
11341145
other = other.select(*self.relation.columns)
11351146

tests/fast/spark/test_spark_union_by_name.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,18 @@ def test_union_by_name_allow_missing_cols(self, df1, df2):
5252
Row(name="Jeff", id=None),
5353
]
5454
assert res == expected
55+
56+
def test_union_by_name_allow_missing_cols_rev(self, df1, df2):
57+
rel = df2.drop("id").unionByName(df1, allowMissingColumns=True)
58+
res = rel.collect()
59+
expected = [
60+
Row(name="James", id=None),
61+
Row(name="Maria", id=None),
62+
Row(name="Jen", id=None),
63+
Row(name="Jeff", id=None),
64+
Row(name="James", id=34),
65+
Row(name="Michael", id=56),
66+
Row(name="Robert", id=30),
67+
Row(name="Maria", id=24),
68+
]
69+
assert res == expected

0 commit comments

Comments
 (0)