Add DataFrame.column(), col(), and find_qualified_columns() methods

timsaucer · claude · timsaucer · commit 385da6c9c14c · 2026-04-07T10:38:31.000-04:00
Expose upstream find_qualified_columns to resolve unqualified column
names into fully qualified column expressions. This is especially
useful for disambiguating columns after joins.

- find_qualified_columns(*names) on Rust side calls upstream directly
- DataFrame.column(name) and col(name) alias on Python side
- Update join and join_on docstrings to reference DataFrame.col()
- Add "Disambiguating Columns with DataFrame.col()" section to joins docs
- Add tests for qualified column resolution, ambiguity, and join usage

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/crates/core/src/dataframe.rs b/crates/core/src/dataframe.rs
@@ -1015,6 +1015,16 @@ impl PyDataFrame {
         Ok(Self::new(df))
     }
 
+    /// Return one fully qualified column expression per name, propagating upstream lookup errors.
+    fn find_qualified_columns(&self, names: Vec<String>) -> PyDataFusionResult<Vec<PyExpr>> {
+        let unqualified: Vec<&str> = names.iter().map(String::as_str).collect();
+        let resolved = self.df.find_qualified_columns(&unqualified)?;
+        Ok(resolved
+            .into_iter()
+            .map(|field| Expr::Column(Column::from(field)).into())
+            .collect())
+    }
+
     /// Write a `DataFrame` to a CSV file.
     fn write_csv(
         &self,
diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst
@@ -134,3 +134,36 @@ In contrast to the above example, if we wish to get both columns:
 .. ipython:: python
 
     left.join(right, "id", how="inner", coalesce_duplicate_keys=False)
+
+Disambiguating Columns with ``DataFrame.col()``
+------------------------------------------------
+
+When both DataFrames contain non-key columns with the same name, you can use
+:py:meth:`~datafusion.dataframe.DataFrame.col` on each DataFrame **before** the
+join to create fully qualified column references. These references can then be
+used in the join predicate and when selecting from the result.
+
+This is especially useful with :py:meth:`~datafusion.dataframe.DataFrame.join_on`,
+which accepts expression-based predicates.
+
+.. ipython:: python
+
+    left = ctx.from_pydict(
+        {
+            "id": [1, 2, 3],
+            "val": [10, 20, 30],
+        }
+    )
+
+    right = ctx.from_pydict(
+        {
+            "id": [1, 2, 3],
+            "val": [40, 50, 60],
+        }
+    )
+
+    joined = left.join_on(
+        right, left.col("id") == right.col("id"), how="inner"
+    )
+
+    joined.select(left.col("id"), left.col("val"), right.col("val"))
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -415,6 +415,80 @@ def schema(self) -> pa.Schema:
         """
         return self.df.schema()
 
+    def column(self, name: str) -> Expr:
+        """Return a fully qualified column expression for ``name``.
+
+        Resolves an unqualified column name against this DataFrame's schema
+        and returns an :py:class:`Expr` whose underlying column reference
+        includes the table qualifier. Call it on each input DataFrame before
+        a join to disambiguate columns that share a name across relations.
+
+        Args:
+            name: Unqualified column name to look up.
+
+        Returns:
+            A fully qualified column expression.
+
+        Raises:
+            Exception: If the column is not found or is ambiguous (exists in
+                multiple relations).
+
+        Examples:
+            Resolve a column from a simple DataFrame:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df = ctx.from_pydict({"a": [1, 2], "b": [3, 4]})
+            >>> expr = df.column("a")
+            >>> df.select(expr).to_pydict()
+            {'a': [1, 2]}
+
+            Resolve a non-ambiguous column after a join:
+
+            >>> left = ctx.from_pydict({"id": [1, 2], "x": [10, 20]})
+            >>> right = ctx.from_pydict({"id": [1, 2], "y": [30, 40]})
+            >>> joined = left.join(right, on="id", how="inner")
+            >>> expr = joined.column("y")
+            >>> joined.select("id", expr).sort("id").to_pydict()
+            {'id': [1, 2], 'y': [30, 40]}
+        """
+        return self.find_qualified_columns(name)[0]
+
+    def col(self, name: str) -> Expr:
+        """Alias for :py:meth:`column`.
+
+        Forwards ``name`` to :py:meth:`column` and returns its result
+        unchanged; see that method for arguments, errors, and examples.
+        """
+        return self.column(name)
+
+    def find_qualified_columns(self, *names: str) -> list[Expr]:
+        """Return fully qualified column expressions for the given names.
+
+        This is a batch version of :py:meth:`column` — it resolves each
+        unqualified name against the DataFrame's schema and returns a list
+        of qualified column expressions.
+
+        Args:
+            names: Unqualified column names to look up.
+
+        Returns:
+            List of fully qualified column expressions, one per name.
+
+        Raises:
+            Exception: If any column is not found or is ambiguous.
+
+        Examples:
+            Resolve multiple columns at once:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df = ctx.from_pydict({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
+            >>> exprs = df.find_qualified_columns("a", "c")
+            >>> df.select(*exprs).to_pydict()
+            {'a': [1, 2], 'c': [5, 6]}
+        """
+        raw_exprs = self.df.find_qualified_columns(list(names))
+        return [Expr(e) for e in raw_exprs]
+
     @deprecated(
         "select_columns() is deprecated. Use :py:meth:`~DataFrame.select` instead"
     )
@@ -887,7 +961,13 @@ def join(
     ) -> DataFrame:
         """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`.
 
-        `on` has to be provided or both `left_on` and `right_on` in conjunction.
+        ``on`` has to be provided or both ``left_on`` and ``right_on`` in
+        conjunction.
+
+        When non-key columns share the same name in both DataFrames, use
+        :py:meth:`DataFrame.col` on each DataFrame **before** the join to
+        obtain fully qualified column references that can disambiguate them.
+        See :py:meth:`join_on` for an example.
 
         Args:
             right: Other DataFrame to join with.
@@ -961,7 +1041,14 @@ def join_on(
         built with :func:`datafusion.col`. On expressions are used to support
         in-equality predicates. Equality predicates are correctly optimized.
 
+        Use :py:meth:`DataFrame.col` on each DataFrame **before** the join to
+        obtain fully qualified column references. These qualified references
+        can then be used in the join predicate and to disambiguate columns
+        with the same name when selecting from the result.
+
         Examples:
+            Join with unique column names:
+
             >>> ctx = dfn.SessionContext()
             >>> left = ctx.from_pydict({"a": [1, 2], "x": ["a", "b"]})
             >>> right = ctx.from_pydict({"b": [1, 2], "y": ["c", "d"]})
@@ -970,6 +1057,18 @@ def join_on(
             ... ).sort(col("x")).to_pydict()
             {'a': [1, 2], 'x': ['a', 'b'], 'b': [1, 2], 'y': ['c', 'd']}
 
+            Use :py:meth:`DataFrame.col` to disambiguate shared column names:
+
+            >>> left = ctx.from_pydict({"id": [1, 2], "val": [10, 20]})
+            >>> right = ctx.from_pydict({"id": [1, 2], "val": [30, 40]})
+            >>> joined = left.join_on(
+            ...     right, left.col("id") == right.col("id"), how="inner"
+            ... )
+            >>> joined.select(
+            ...     left.col("id"), left.col("val"), right.col("val").alias("rval")
+            ... ).sort(left.col("id")).to_pydict()
+            {'id': [1, 2], 'val': [10, 20], 'rval': [30, 40]}
+
         Args:
             right: Other DataFrame to join with.
             on_exprs: single or multiple (in)-equality predicates.
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -3621,6 +3621,76 @@ def test_union_by_name_distinct():
     assert result.column(1).to_pylist() == [10]
 
 
+def test_column_qualified():
+    """DataFrame.column() returns a qualified column expression."""
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [1, 2], "b": [3, 4]})
+    expr = df.column("a")
+    result = df.select(expr).collect()[0]
+    assert result.column(0).to_pylist() == [1, 2]
+
+
+def test_column_not_found():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [1]})
+    with pytest.raises(Exception, match="not found"):
+        df.column("z")
+
+
+def test_column_ambiguous():
+    """Looking up a name shared by both join inputs on the joined frame raises."""
+    ctx = SessionContext()
+    left = ctx.from_pydict({"id": [1, 2], "val": [10, 20]})
+    right = ctx.from_pydict({"id": [1, 2], "val": [30, 40]})
+    joined = left.join(right, on="id", how="inner")
+    with pytest.raises(Exception, match="not found"):
+        joined.column("val")
+
+
+def test_column_after_join():
+    """Qualified column works for non-ambiguous columns after a join."""
+    ctx = SessionContext()
+    left = ctx.from_pydict({"id": [1, 2], "x": [10, 20]})
+    right = ctx.from_pydict({"id": [1, 2], "y": [30, 40]})
+    joined = left.join(right, on="id", how="inner")
+    expr = joined.column("y")
+    result = joined.select("id", expr).sort("id").collect()[0]
+    assert result.column(0).to_pylist() == [1, 2]
+    assert result.column(1).to_pylist() == [30, 40]
+
+
+def test_col_join_disambiguate():
+    """Use col() to disambiguate and select columns after a join."""
+    ctx = SessionContext()
+    df1 = ctx.from_pydict({"foo": [1, 2, 3], "bar": [5, 6, 7]})
+    df2 = ctx.from_pydict({"foo": [1, 2, 3], "baz": [8, 9, 10]})
+    joined = df1.join_on(df2, df1.col("foo") == df2.col("foo"), how="inner")
+    result = (
+        joined.select(df1.col("foo"), df1.col("bar"), df2.col("baz"))
+        .sort(df1.col("foo"))
+        .to_pydict()
+    )
+    assert result["bar"] == [5, 6, 7]
+    assert result["baz"] == [8, 9, 10]
+
+
+def test_find_qualified_columns():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
+    exprs = df.find_qualified_columns("a", "c")
+    assert len(exprs) == 2
+    result = df.select(*exprs).collect()[0]
+    assert result.column(0).to_pylist() == [1, 2]
+    assert result.column(1).to_pylist() == [5, 6]
+
+
+def test_find_qualified_columns_not_found():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [1]})
+    with pytest.raises(Exception, match="not found"):
+        df.find_qualified_columns("a", "z")
+
+
 def test_distinct_on():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [1, 1, 2, 2], "b": [10, 20, 30, 40]})