Skip to content

Commit 385da6c

Browse files
timsaucerclaude
andcommitted
Add DataFrame.column(), col(), and find_qualified_columns() methods
Expose upstream find_qualified_columns to resolve unqualified column names into fully qualified column expressions. This is especially useful for disambiguating columns after joins. - find_qualified_columns(*names) on Rust side calls upstream directly - DataFrame.column(name) and col(name) alias on Python side - Update join and join_on docstrings to reference DataFrame.col() - Add "Disambiguating Columns with DataFrame.col()" section to joins docs - Add tests for qualified column resolution, ambiguity, and join usage Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b46987e commit 385da6c

File tree

4 files changed

+213
-1
lines changed

4 files changed

+213
-1
lines changed

crates/core/src/dataframe.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,16 @@ impl PyDataFrame {
10151015
Ok(Self::new(df))
10161016
}
10171017

1018+
/// Return fully qualified column expressions for the given column names
1019+
fn find_qualified_columns(&self, names: Vec<String>) -> PyDataFusionResult<Vec<PyExpr>> {
1020+
let name_refs: Vec<&str> = names.iter().map(|s| s.as_str()).collect();
1021+
let qualified = self.df.find_qualified_columns(&name_refs)?;
1022+
Ok(qualified
1023+
.into_iter()
1024+
.map(|q| Expr::Column(Column::from(q)).into())
1025+
.collect())
1026+
}
1027+
10181028
/// Write a `DataFrame` to a CSV file.
10191029
fn write_csv(
10201030
&self,

docs/source/user-guide/common-operations/joins.rst

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,3 +134,36 @@ In contrast to the above example, if we wish to get both columns:
134134
.. ipython:: python
135135
136136
left.join(right, "id", how="inner", coalesce_duplicate_keys=False)
137+
138+
Disambiguating Columns with ``DataFrame.col()``
139+
------------------------------------------------
140+
141+
When both DataFrames contain non-key columns with the same name, you can use
142+
:py:meth:`~datafusion.dataframe.DataFrame.col` on each DataFrame **before** the
143+
join to create fully qualified column references. These references can then be
144+
used in the join predicate and when selecting from the result.
145+
146+
This is especially useful with :py:meth:`~datafusion.dataframe.DataFrame.join_on`,
147+
which accepts expression-based predicates.
148+
149+
.. ipython:: python
150+
151+
left = ctx.from_pydict(
152+
{
153+
"id": [1, 2, 3],
154+
"val": [10, 20, 30],
155+
}
156+
)
157+
158+
right = ctx.from_pydict(
159+
{
160+
"id": [1, 2, 3],
161+
"val": [40, 50, 60],
162+
}
163+
)
164+
165+
joined = left.join_on(
166+
right, left.col("id") == right.col("id"), how="inner"
167+
)
168+
169+
joined.select(left.col("id"), left.col("val"), right.col("val"))

python/datafusion/dataframe.py

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,80 @@ def schema(self) -> pa.Schema:
415415
"""
416416
return self.df.schema()
417417

418+
def column(self, name: str) -> Expr:
419+
"""Return a fully qualified column expression for ``name``.
420+
421+
Resolves an unqualified column name against this DataFrame's schema
422+
and returns an :py:class:`Expr` whose underlying column reference
423+
includes the table qualifier. This is especially useful after joins,
424+
where the same column name may appear in multiple relations.
425+
426+
Args:
427+
name: Unqualified column name to look up.
428+
429+
Returns:
430+
A fully qualified column expression.
431+
432+
Raises:
433+
Exception: If the column is not found or is ambiguous (exists in
434+
multiple relations).
435+
436+
Examples:
437+
Resolve a column from a simple DataFrame:
438+
439+
>>> ctx = dfn.SessionContext()
440+
>>> df = ctx.from_pydict({"a": [1, 2], "b": [3, 4]})
441+
>>> expr = df.column("a")
442+
>>> df.select(expr).to_pydict()
443+
{'a': [1, 2]}
444+
445+
Resolve qualified columns after a join:
446+
447+
>>> left = ctx.from_pydict({"id": [1, 2], "x": [10, 20]})
448+
>>> right = ctx.from_pydict({"id": [1, 2], "y": [30, 40]})
449+
>>> joined = left.join(right, on="id", how="inner")
450+
>>> expr = joined.column("y")
451+
>>> joined.select("id", expr).sort("id").to_pydict()
452+
{'id': [1, 2], 'y': [30, 40]}
453+
"""
454+
return self.find_qualified_columns(name)[0]
455+
456+
def col(self, name: str) -> Expr:
457+
"""Alias for :py:meth:`column`.
458+
459+
See Also:
460+
:py:meth:`column`
461+
"""
462+
return self.column(name)
463+
464+
def find_qualified_columns(self, *names: str) -> list[Expr]:
465+
"""Return fully qualified column expressions for the given names.
466+
467+
This is a batch version of :py:meth:`column` — it resolves each
468+
unqualified name against the DataFrame's schema and returns a list
469+
of qualified column expressions.
470+
471+
Args:
472+
names: Unqualified column names to look up.
473+
474+
Returns:
475+
List of fully qualified column expressions, one per name.
476+
477+
Raises:
478+
Exception: If any column is not found or is ambiguous.
479+
480+
Examples:
481+
Resolve multiple columns at once:
482+
483+
>>> ctx = dfn.SessionContext()
484+
>>> df = ctx.from_pydict({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
485+
>>> exprs = df.find_qualified_columns("a", "c")
486+
>>> df.select(*exprs).to_pydict()
487+
{'a': [1, 2], 'c': [5, 6]}
488+
"""
489+
raw_exprs = self.df.find_qualified_columns(list(names))
490+
return [Expr(e) for e in raw_exprs]
491+
418492
@deprecated(
419493
"select_columns() is deprecated. Use :py:meth:`~DataFrame.select` instead"
420494
)
@@ -887,7 +961,13 @@ def join(
887961
) -> DataFrame:
888962
"""Join this :py:class:`DataFrame` with another :py:class:`DataFrame`.
889963
890-
`on` has to be provided or both `left_on` and `right_on` in conjunction.
964+
``on`` has to be provided or both ``left_on`` and ``right_on`` in
965+
conjunction.
966+
967+
When non-key columns share the same name in both DataFrames, use
968+
:py:meth:`DataFrame.col` on each DataFrame **before** the join to
969+
obtain fully qualified column references that can disambiguate them.
970+
See :py:meth:`join_on` for an example.
891971
892972
Args:
893973
right: Other DataFrame to join with.
@@ -961,7 +1041,14 @@ def join_on(
9611041
built with :func:`datafusion.col`. On expressions are used to support
9621042
in-equality predicates. Equality predicates are correctly optimized.
9631043
1044+
Use :py:meth:`DataFrame.col` on each DataFrame **before** the join to
1045+
obtain fully qualified column references. These qualified references
1046+
can then be used in the join predicate and to disambiguate columns
1047+
with the same name when selecting from the result.
1048+
9641049
Examples:
1050+
Join with unique column names:
1051+
9651052
>>> ctx = dfn.SessionContext()
9661053
>>> left = ctx.from_pydict({"a": [1, 2], "x": ["a", "b"]})
9671054
>>> right = ctx.from_pydict({"b": [1, 2], "y": ["c", "d"]})
@@ -970,6 +1057,18 @@ def join_on(
9701057
... ).sort(col("x")).to_pydict()
9711058
{'a': [1, 2], 'x': ['a', 'b'], 'b': [1, 2], 'y': ['c', 'd']}
9721059
1060+
Use :py:meth:`col` to disambiguate shared column names:
1061+
1062+
>>> left = ctx.from_pydict({"id": [1, 2], "val": [10, 20]})
1063+
>>> right = ctx.from_pydict({"id": [1, 2], "val": [30, 40]})
1064+
>>> joined = left.join_on(
1065+
... right, left.col("id") == right.col("id"), how="inner"
1066+
... )
1067+
>>> joined.select(
1068+
... left.col("id"), left.col("val"), right.col("val")
1069+
... ).sort(left.col("id")).to_pydict()
1070+
{'id': [1, 2], 'val': [10, 20], 'val': [30, 40]}
1071+
9731072
Args:
9741073
right: Other DataFrame to join with.
9751074
on_exprs: single or multiple (in)-equality predicates.

python/tests/test_dataframe.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3621,6 +3621,76 @@ def test_union_by_name_distinct():
36213621
assert result.column(1).to_pylist() == [10]
36223622

36233623

3624+
def test_column_qualified():
3625+
"""DataFrame.column() returns a qualified column expression."""
3626+
ctx = SessionContext()
3627+
df = ctx.from_pydict({"a": [1, 2], "b": [3, 4]})
3628+
expr = df.column("a")
3629+
result = df.select(expr).collect()[0]
3630+
assert result.column(0).to_pylist() == [1, 2]
3631+
3632+
3633+
def test_column_not_found():
3634+
ctx = SessionContext()
3635+
df = ctx.from_pydict({"a": [1]})
3636+
with pytest.raises(Exception, match="not found"):
3637+
df.column("z")
3638+
3639+
3640+
def test_column_ambiguous():
3641+
"""After a join, duplicate column names that cannot be resolved raise an error."""
3642+
ctx = SessionContext()
3643+
left = ctx.from_pydict({"id": [1, 2], "val": [10, 20]})
3644+
right = ctx.from_pydict({"id": [1, 2], "val": [30, 40]})
3645+
joined = left.join(right, on="id", how="inner")
3646+
with pytest.raises(Exception, match="not found"):
3647+
joined.column("val")
3648+
3649+
3650+
def test_column_after_join():
3651+
"""Qualified column works for non-ambiguous columns after a join."""
3652+
ctx = SessionContext()
3653+
left = ctx.from_pydict({"id": [1, 2], "x": [10, 20]})
3654+
right = ctx.from_pydict({"id": [1, 2], "y": [30, 40]})
3655+
joined = left.join(right, on="id", how="inner")
3656+
expr = joined.column("y")
3657+
result = joined.select("id", expr).sort("id").collect()[0]
3658+
assert result.column(0).to_pylist() == [1, 2]
3659+
assert result.column(1).to_pylist() == [30, 40]
3660+
3661+
3662+
def test_col_join_disambiguate():
3663+
"""Use col() to disambiguate and select columns after a join."""
3664+
ctx = SessionContext()
3665+
df1 = ctx.from_pydict({"foo": [1, 2, 3], "bar": [5, 6, 7]})
3666+
df2 = ctx.from_pydict({"foo": [1, 2, 3], "baz": [8, 9, 10]})
3667+
joined = df1.join_on(df2, df1.col("foo") == df2.col("foo"), how="inner")
3668+
result = (
3669+
joined.select(df1.col("foo"), df1.col("bar"), df2.col("baz"))
3670+
.sort(df1.col("foo"))
3671+
.to_pydict()
3672+
)
3673+
assert result["bar"] == [5, 6, 7]
3674+
assert result["baz"] == [8, 9, 10]
3675+
3676+
3677+
def test_find_qualified_columns():
3678+
ctx = SessionContext()
3679+
df = ctx.from_pydict({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
3680+
exprs = df.find_qualified_columns("a", "c")
3681+
assert len(exprs) == 2
3682+
result = df.select(*exprs).collect()[0]
3683+
assert result.column(0).to_pylist() == [1, 2]
3684+
assert result.column(1).to_pylist() == [5, 6]
3685+
3686+
3687+
def test_find_qualified_columns_not_found():
3688+
ctx = SessionContext()
3689+
df = ctx.from_pydict({"a": [1]})
3690+
with pytest.raises(Exception, match="not found"):
3691+
df.find_qualified_columns("a", "z")
3692+
3693+
36243694
def test_distinct_on():
36253695
ctx = SessionContext()
36263696
df = ctx.from_pydict({"a": [1, 1, 2, 2], "b": [10, 20, 30, 40]})

0 commit comments

Comments
 (0)