Give catalogs precedence over SupportsCatalogOptions path-based sources + test improvements

andreaschat-db · andreaschat-db · commit 88cc8d6fb9c2 · 2026-05-22T14:03:32.000Z
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/LookupCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/LookupCatalog.scala
@@ -121,17 +121,9 @@ private[sql] trait LookupCatalog extends Logging {
         // this custom catalog can't be accessed.
         Some((catalogManager.v2SessionCatalog, nameParts.asIdentifier))
       } else {
-        // Path-based data sources (e.g. `pathformat2.'/path/to/t'`) whose format declares a
-        // catalog via SupportsCatalogOptions are routed to that catalog. Both the catalog and
-        // the canonical identifier come from the connector.
-        val (catalogName, ident) =
-          Option(catalogManager.catalogAndIdentForDataSource(nameParts)).flatten match {
-            case Some((catName, providerIdent)) => (catName, providerIdent)
-            case None => (nameParts.head, nameParts.tail.asIdentifier)
-          }
-
         try {
-          val catalog = catalogManager.catalog(catalogName)
+          val catalog = catalogManager.catalog(nameParts.head)
+          val ident = nameParts.tail.asIdentifier
           if (CatalogV2Util.isSessionCatalog(catalog)) {
             // Reject only when namespace is empty (e.g. spark_catalog.t with no database).
             // Allow multi-part namespace for metadata tables (e.g. default.table.snapshots).
@@ -143,7 +135,18 @@ private[sql] trait LookupCatalog extends Logging {
           Some((catalog, ident))
         } catch {
           case _: CatalogNotFoundException =>
-            Some((currentCatalog, nameParts.asIdentifier))
+            // No catalog matched. As a fallback, try path-based data sources:
+            // formats implementing SupportsCatalogOptions (SCO), e.g. `pathformat.`/path/to/t``,
+            // route to the catalog the connector designates. If no SCO format claims the
+            // identifier head, fall through to currentCatalog and let later analysis raise
+            // table-not-found. This matches the v1 file-format precedence (catalog first,
+            // path-based as fallback).
+            Option(catalogManager.catalogAndIdentForDataSource(nameParts)).flatten match {
+              case Some((catName, providerIdent)) =>
+                Some((catalogManager.catalog(catName), providerIdent))
+              case None =>
+                Some((currentCatalog, nameParts.asIdentifier))
+            }
         }
       }
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/FakeV2Provider.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/FakeV2Provider.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.connector
 
 import java.util
+import java.util.Optional
 
 import org.apache.spark.sql.connector.catalog.{Identifier, SessionConfigSupport, SupportsCatalogOptions, SupportsV1OverwriteWithSaveAsTable, Table, TableProvider}
 import org.apache.spark.sql.connector.expressions.Transform
@@ -96,10 +97,11 @@ class FakeV2ProviderWithV1SaveAsTableOverwriteWriteOptionDisabled
 }
 
 /**
- * Simulates a path-based connector (e.g. Delta) that implements [[SupportsCatalogOptions]]
- * to route `pathformat.\`/path/to/t\`` SQL identifiers to the session catalog. We rely on
- * the default [[SupportsCatalogOptions#extractCatalog]] which returns
- * [[org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME]].
+ * Simulates a path-based connector that implements [[SupportsCatalogOptions]] and routes
+ * `pathformat.\`/path/to/t\`` SQL identifiers to a dedicated catalog (`pathformat_cat`).
+ * Tests register that catalog and assert against it so the SCO seam is exercised
+ * unambiguously: without SCO, `CatalogAndIdentifier` falls back to the current catalog
+ * (session catalog) and the target catalog stays empty.
  */
 class FakePathBasedSource
     extends FakeV2ProviderWithCustomSchema
@@ -108,8 +110,18 @@ class FakePathBasedSource
 
   override def shortName(): String = "pathformat"
 
+  override def extractCatalog(options: CaseInsensitiveStringMap): String =
+    FakePathBasedSource.CATALOG_NAME
+
   override def extractIdentifier(options: CaseInsensitiveStringMap): Identifier =
     Identifier.of(Array(shortName()), options.get("path"))
+
+  override def extractTimeTravelVersion(options: CaseInsensitiveStringMap): Optional[String] =
+    Optional.ofNullable(options.get("versionAsOf"))
+}
+
+object FakePathBasedSource {
+  val CATALOG_NAME: String = "pathformat_cat"
 }
 
 /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/PathBasedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/PathBasedTableSuite.scala
@@ -19,44 +19,50 @@ package org.apache.spark.sql.connector
 
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.connector.catalog.InMemoryTableCatalog
-import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.test.SharedSparkSession
 
 /**
  * Non-transactional tests for SQL resolution of path-based tables surfaced by a
  * [[org.apache.spark.sql.connector.catalog.SupportsCatalogOptions]] data source
- * (e.g. `pathformat.`/path/to/t``). Covers reads, DDL, CREATE/REPLACE, regression for v1
- * file-format direct queries, and the `runSQLOnFile` gate. Transactional behavior is
- * covered separately in [[PathBasedTableTransactionSuite]].
+ * (e.g. `pathformat.`/path/to/t``). [[FakePathBasedSource]] routes resolution to a
+ * dedicated `pathformat_cat` catalog rather than the session catalog, so assertions
+ * against that catalog unambiguously confirm the SCO seam fired — without SCO,
+ * `CatalogAndIdentifier`'s fallback lands in the (default) session catalog and the
+ * named catalog stays empty.
  */
 class PathBasedTableSuite extends QueryTest with SharedSparkSession {
 
   import testImplicits._
 
-  // FakePathBasedSource rewrites `pathformat.\`/path/to/t\`` to the session catalog with
-  // Identifier(ns = ["pathformat"], name = "/path/to/t"). InMemoryTableCatalog accepts
-  // arbitrary namespace/name shapes, so we plug it in as the v2 session catalog.
   private val tablePath = "pathformat.`/path/to/t`"
 
+  private def pathformatCat: InMemoryTableCatalog =
+    spark.sessionState.catalogManager.catalog(FakePathBasedSource.CATALOG_NAME)
+      .asInstanceOf[InMemoryTableCatalog]
+
   override def beforeEach(): Unit = {
     super.beforeEach()
     spark.conf.set(
-      V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[InMemoryTableCatalog].getName)
+      s"spark.sql.catalog.${FakePathBasedSource.CATALOG_NAME}",
+      classOf[InMemoryTableCatalog].getName)
   }
 
   override def afterEach(): Unit = {
-    // SharedSparkSession reuses one SparkSession across tests, so the in-memory catalog's
-    // table map would persist between tests. Reset clears registered catalogs so each test
-    // sees a fresh session catalog instance.
+    // SharedSparkSession reuses one SparkSession across tests. `reset()` drops registered
+    // non-session catalogs (including pathformat_cat), so the next test starts with a
+    // fresh InMemoryTableCatalog instance.
     spark.sessionState.catalogManager.reset()
-    spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key)
+    spark.conf.unset(s"spark.sql.catalog.${FakePathBasedSource.CATALOG_NAME}")
     super.afterEach()
   }
 
   test("CREATE then SELECT on path-based table") {
     sql(s"CREATE TABLE $tablePath (id INT, data STRING)")
     sql(s"INSERT INTO $tablePath VALUES (1, 'a'), (2, 'b')")
     checkAnswer(spark.table(tablePath), Row(1, "a") :: Row(2, "b") :: Nil)
+    // The SCO resolver routed creation/reads to pathformat_cat. Without the seam,
+    // CREATE would fall through to the (default) session catalog and fail.
+    assert(pathformatCat.listTables(Array("pathformat")).map(_.name()).contains("/path/to/t"))
   }
 
   test("DESCRIBE TABLE resolves path-based table") {
@@ -136,39 +142,43 @@ class PathBasedTableSuite extends QueryTest with SharedSparkSession {
     checkAnswer(sql(s"SELECT * FROM $base VERSION AS OF 'v1'"), Row(2))
   }
 
-  test("SCO precedence: data source name wins over same-named catalog") {
-    // Register a catalog under the same name as the SCO data source short name.
-    // Resolution should still route through the SCO resolver, i.e. the table is
-    // created under the session catalog (`spark_catalog`), not under "pathformat".
+  test("catalog precedence: same-named catalog wins over SCO data source") {
+    // Register a catalog under the same name as the SCO data source short name. SQL
+    // resolution should route to the catalog; the SCO resolver is consulted only when
+    // no catalog claims the head, matching v1 file-format precedence (ResolveSQLOnFile)
+    // and Delta's ResolveDeltaPathTable extension.
     withSQLConf("spark.sql.catalog.pathformat" -> classOf[InMemoryTableCatalog].getName) {
       sql(s"CREATE TABLE $tablePath (id INT, data STRING)")
       sql(s"INSERT INTO $tablePath VALUES (1, 'a')")
       checkAnswer(spark.table(tablePath), Row(1, "a") :: Nil)
 
-      // Table lives in the session catalog under namespace=["pathformat"], not in the
-      // catalog registered as "pathformat".
-      val sessionCat = spark.sessionState.catalogManager.v2SessionCatalog
-        .asInstanceOf[InMemoryTableCatalog]
-      assert(sessionCat.listTables(Array("pathformat")).map(_.name()).contains("/path/to/t"))
+      // Table lives in the homonym catalog at identifier (ns=[], name="/path/to/t"),
+      // and the SCO-targeted catalog (pathformat_cat) is untouched because the SCO
+      // resolver was never consulted.
       val homonymCat = spark.sessionState.catalogManager.catalog("pathformat")
         .asInstanceOf[InMemoryTableCatalog]
-      assert(homonymCat.listTables(Array.empty).isEmpty)
+      assert(homonymCat.listTables(Array.empty).map(_.name()).contains("/path/to/t"))
+      assert(!pathformatCat.namespaceExists(Array("pathformat")))
     }
   }
 
   test("CREATE TABLE AS SELECT on path-based table") {
-    sql("CREATE TABLE source (id INT, data STRING)")
-    sql("INSERT INTO source VALUES (1, 'a'), (2, 'b')")
-    sql(s"CREATE TABLE $tablePath AS SELECT * FROM source")
-    checkAnswer(spark.table(tablePath), Row(1, "a") :: Row(2, "b") :: Nil)
+    withTable("source") {
+      sql("CREATE TABLE source (id INT, data STRING)")
+      sql("INSERT INTO source VALUES (1, 'a'), (2, 'b')")
+      sql(s"CREATE TABLE $tablePath AS SELECT * FROM source")
+      checkAnswer(spark.table(tablePath), Row(1, "a") :: Row(2, "b") :: Nil)
+    }
   }
 
   test("REPLACE TABLE AS SELECT on path-based table") {
-    sql("CREATE TABLE source (id INT, data STRING)")
-    sql("INSERT INTO source VALUES (1, 'a'), (2, 'b'), (3, 'c')")
-    sql(s"CREATE TABLE $tablePath AS SELECT * FROM source")
-    sql(s"REPLACE TABLE $tablePath AS SELECT id FROM source WHERE id > 1")
-    checkAnswer(spark.table(tablePath), Row(2) :: Row(3) :: Nil)
+    withTable("source") {
+      sql("CREATE TABLE source (id INT, data STRING)")
+      sql("INSERT INTO source VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+      sql(s"CREATE TABLE $tablePath AS SELECT * FROM source")
+      sql(s"REPLACE TABLE $tablePath AS SELECT id FROM source WHERE id > 1")
+      checkAnswer(spark.table(tablePath), Row(2) :: Row(3) :: Nil)
+    }
   }
 
   test("INSERT OVERWRITE on path-based table") {
@@ -178,13 +188,53 @@ class PathBasedTableSuite extends QueryTest with SharedSparkSession {
     checkAnswer(spark.table(tablePath), Row(9, "z") :: Nil)
   }
 
-  test("DataFrame API regression: read still resolves via SCO") {
+  test("DataFrame API: read resolves via SCO") {
     // Create via SQL (exercises the new LookupCatalog SCO seam), read via DataFrame
     // (exercises the pre-existing DataFrameReader SCO path in DataSourceV2Utils).
-    // Both paths should land on the same Identifier in the session catalog.
+    // Both paths should land on the same Identifier in pathformat_cat.
     sql(s"CREATE TABLE $tablePath (id INT, data STRING)")
     sql(s"INSERT INTO $tablePath VALUES (1, 'a'), (2, 'b')")
     val df = spark.read.format("pathformat").load("/path/to/t")
     checkAnswer(df, Row(1, "a") :: Row(2, "b") :: Nil)
   }
+
+  test("DataFrame API: write via SCO, read via SQL") {
+    // Write through DataFrameWriter (exercises the refactored buildDsOptions in the v2
+    // write path), read back via SQL to confirm both entry points land on the same
+    // Identifier under pathformat_cat.
+    Seq((1, "a"), (2, "b")).toDF("id", "data")
+      .write.format("pathformat").save("/path/to/t")
+    checkAnswer(spark.table(tablePath), Row(1, "a") :: Row(2, "b") :: Nil)
+  }
+
+  test("DataFrame API: time travel via SCO") {
+    // InMemoryTableCatalog implements time travel by appending the version string to
+    // the identifier name. SCO time-travel options flow through the refactored
+    // extractCatalogAndIdentifier helper, so this also regression-tests that path.
+    sql("CREATE TABLE pathformat.`/p` (id INT)")
+    sql("CREATE TABLE pathformat.`/pv1` (id INT)")
+    sql("INSERT INTO pathformat.`/p` VALUES (1)")
+    sql("INSERT INTO pathformat.`/pv1` VALUES (2)")
+    val df = spark.read.format("pathformat").option("versionAsOf", "v1").load("/p")
+    checkAnswer(df, Row(2))
+  }
+
+  test("DataFrame API: pure write and read via SCO (no SQL)") {
+    // Uses only DataFrameWriter/DataFrameReader, so it exercises the v2 SCO entry point
+    // in DataSourceV2Utils.loadV2Source / the writer's getTableProviderCatalog branch
+    // independently of LookupCatalog. It keeps passing even if the SQL SCO seam is removed.
+    Seq((1, "a"), (2, "b")).toDF("id", "data")
+      .write.format("pathformat").save("/path/to/t")
+    val df = spark.read.format("pathformat").load("/path/to/t")
+    checkAnswer(df, Row(1, "a") :: Row(2, "b") :: Nil)
+    assert(pathformatCat.listTables(Array("pathformat")).map(_.name()).contains("/path/to/t"))
+  }
+
+  test("DataFrame API: pure time travel via SCO (no SQL)") {
+    // Pure DataFrame setup so the test does not depend on the SQL SCO seam.
+    Seq(1).toDF("id").write.format("pathformat").save("/p")
+    Seq(2).toDF("id").write.format("pathformat").save("/pv1")
+    val df = spark.read.format("pathformat").option("versionAsOf", "v1").load("/p")
+    checkAnswer(df, Row(2))
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/PathBasedTableTransactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/PathBasedTableTransactionSuite.scala