fix: restore deleted tests and fix inaccurate documentation

andygrove · claude · andygrove · commit 52ab85cd303f · 2026-01-16T11:56:39.000-07:00
- Restore original ignored tests for ROWS BETWEEN with PARTITION BY + ORDER BY
  (COUNT, SUM, AVG with various ROWS BETWEEN frames)
- Fix documentation to accurately reflect what's supported:
  - Remove AVG from supported list (has native implementation issues)
  - Clarify PARTITION BY/ORDER BY restriction (partition must be subset of order)
  - Clarify ROWS BETWEEN limitations
- Fix misleading test names ("COUNT with ROWS frame" -> "COUNT with PARTITION BY only")

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md
@@ -66,18 +66,23 @@ this can be overridden by setting `spark.comet.regexp.allowIncompatible=true`.
 
 ## Window Functions
 
-Comet supports window aggregate functions with ROWS BETWEEN frames. These are enabled by default when
-`spark.comet.exec.window.enabled=true` (the default). Ranking functions (ROW_NUMBER, RANK, etc.)
-and offset functions (LAG, LEAD) are not yet supported and will automatically fall back to Spark.
+Comet supports a subset of window aggregate functions. Native window execution is controlled by
+`spark.comet.exec.window.enabled`, which is `true` by default. Unsupported window functions
+automatically fall back to Spark.
 
 **Supported:**
 
-- Window aggregates: `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`
-- Frame types: `ROWS BETWEEN` with `UNBOUNDED PRECEDING`, `CURRENT ROW`, `UNBOUNDED FOLLOWING`, and numeric offsets
-- `PARTITION BY` and `ORDER BY` clauses (can be different columns)
+- Window aggregates: `COUNT`, `SUM`, `MIN`, `MAX`
+- `PARTITION BY` only (no `ORDER BY`)
+- `ORDER BY` only (no `PARTITION BY`)
+- `PARTITION BY` with `ORDER BY` when partition columns are a subset of order columns
+  (e.g., `PARTITION BY a ORDER BY a, b` works, but `PARTITION BY a ORDER BY b` does not)
 
-**Not Supported:**
+**Not Yet Supported:**
 
+- `AVG` window aggregate (native implementation has known issues)
+- `PARTITION BY` with `ORDER BY` when the partition columns are not a subset of the order columns (falls back to Spark)
+- `ROWS BETWEEN` frames when the `PARTITION BY` columns are not a subset of the `ORDER BY` columns
 - Ranking functions: `ROW_NUMBER`, `RANK`, `DENSE_RANK`, `PERCENT_RANK`, `NTILE`, `CUME_DIST`
 - Offset functions: `LAG`, `LEAD`
 - Value functions: `FIRST_VALUE`, `LAST_VALUE`, `NTH_VALUE`
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometWindowExecSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometWindowExecSuite.scala
@@ -365,8 +365,8 @@ class CometWindowExecSuite extends CometTestBase {
     }
   }
 
-  // COUNT with ROWS frame (ORDER BY with PARTITION BY not yet fully supported in native)
-  test("window: COUNT with ROWS frame") {
+  // COUNT with PARTITION BY only (no ORDER BY)
+  test("window: COUNT with PARTITION BY only") {
     withTempDir { dir =>
       (0 until 30)
         .map(i => (i % 3, i % 5, i))
@@ -386,7 +386,7 @@ class CometWindowExecSuite extends CometTestBase {
     }
   }
 
-  // SUM with PARTITION BY only (ORDER BY with PARTITION BY not yet fully supported in native)
+  // SUM with PARTITION BY only (no ORDER BY)
   test("window: SUM with PARTITION BY only") {
     withTempDir { dir =>
       (0 until 30)
@@ -407,6 +407,94 @@ class CometWindowExecSuite extends CometTestBase {
     }
   }
 
+  // TODO: enable once native execution handles COUNT with PARTITION BY a ORDER BY b correctly.
+  // Currently falls back to Spark - partition expressions must be a subset of order expressions
+  ignore("window: COUNT with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW") {
+    withTempDir { dir =>
+      (0 until 30)
+        .map(i => (i % 3, i % 5, i))
+        .toDF("a", "b", "c")
+        .repartition(3)
+        .write
+        .mode("overwrite")
+        .parquet(dir.toString)
+
+      spark.read.parquet(dir.toString).createOrReplaceTempView("window_test")
+      val df = sql("""
+        SELECT a, b, c,
+          COUNT(*) OVER (PARTITION BY a ORDER BY b ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as cnt
+        FROM window_test
+      """)
+      checkSparkAnswerAndOperator(df)
+    }
+  }
+
+  // TODO: enable once native execution handles SUM with PARTITION BY a ORDER BY b correctly.
+  // Currently falls back to Spark - partition expressions must be a subset of order expressions
+  ignore("window: SUM with ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING") {
+    withTempDir { dir =>
+      (0 until 30)
+        .map(i => (i % 3, i % 5, i))
+        .toDF("a", "b", "c")
+        .repartition(3)
+        .write
+        .mode("overwrite")
+        .parquet(dir.toString)
+
+      spark.read.parquet(dir.toString).createOrReplaceTempView("window_test")
+      val df = sql("""
+        SELECT a, b, c,
+          SUM(c) OVER (PARTITION BY a ORDER BY b ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) as sum_c
+        FROM window_test
+      """)
+      checkSparkAnswerAndOperator(df)
+    }
+  }
+
+  // TODO: enable once native AVG is fixed and PARTITION BY a ORDER BY b is supported natively.
+  // Currently falls back to Spark - partition expressions must be a subset of order expressions
+  ignore("window: AVG with ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING") {
+    withTempDir { dir =>
+      (0 until 30)
+        .map(i => (i % 3, i % 5, i))
+        .toDF("a", "b", "c")
+        .repartition(3)
+        .write
+        .mode("overwrite")
+        .parquet(dir.toString)
+
+      spark.read.parquet(dir.toString).createOrReplaceTempView("window_test")
+      val df = sql("""
+        SELECT a, b, c,
+          AVG(c) OVER (PARTITION BY a ORDER BY b ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as avg_c
+        FROM window_test
+      """)
+      checkSparkAnswerAndOperator(df)
+    }
+  }
+
+  // TODO: enable once native execution handles SUM with PARTITION BY a ORDER BY b correctly.
+  // Currently falls back to Spark - partition expressions must be a subset of order expressions
+  ignore("window: SUM with ROWS BETWEEN 2 PRECEDING AND CURRENT ROW") {
+    withTempDir { dir =>
+      (0 until 30)
+        .map(i => (i % 3, i % 5, i))
+        .toDF("a", "b", "c")
+        .repartition(3)
+        .write
+        .mode("overwrite")
+        .parquet(dir.toString)
+
+      spark.read.parquet(dir.toString).createOrReplaceTempView("window_test")
+      val df = sql("""
+        SELECT a, b, c,
+          SUM(c) OVER (PARTITION BY a ORDER BY b ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as sum_c
+        FROM window_test
+      """)
+      checkSparkAnswerAndOperator(df)
+    }
+  }
+
   // TODO: COUNT with ROWS BETWEEN not supported
   // Falls back to Spark Window operator - "Partitioning and sorting specifications must be the same"
   ignore("window: COUNT with ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING") {
@@ -1003,7 +1091,7 @@ class CometWindowExecSuite extends CometTestBase {
     }
   }
 
-  // Multiple aggregate functions in single query (TODO: fix AVG support in native window)
+  // Multiple aggregate functions in single query (COUNT, SUM, MIN, MAX with PARTITION BY only)
   test("window: multiple aggregate functions in single query") {
     withTempDir { dir =>
       (0 until 30)