Skip to content

Commit a116f63

Browse files
committed
Make RangeScanBoundaryUtils safe against CharType / VariantType key schemas
Address #55265 (comment): Literal.default's recursive "smallest" property does not hold for CharType or VariantType, so unguarded use can silently produce incorrect range-scan boundaries when such types appear in the key schema.

- CharType(n): override with n zero-bytes (U+0000 repeated, UTF-8 encoded), which is the byte-wise smallest legitimate CharType(n) value. This keeps preserveCharVarcharTypeInfo=true users supported.
- VariantType: assert-reject, because the Variant binary spec is @Unstable and a hand-encoded minimum would be brittle. Existing Spark analysis already blocks VariantType in grouping / hashing positions, so this check is defensive and should never fire in practice.

Adds RangeScanBoundaryUtilsSuite covering the happy path, a byte-wise sanity check for CharType, nested CharType in a struct, the VarcharType empty default, and VariantType rejection at the top level, nested in a struct, and nested in an array.
1 parent d8e9bd4 commit a116f63

2 files changed

Lines changed: 202 additions & 15 deletions

File tree

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RangeScanBoundaryUtils.scala

Lines changed: 60 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@
1818
package org.apache.spark.sql.execution.streaming.state
1919

2020
import org.apache.spark.sql.catalyst.InternalRow
21-
import org.apache.spark.sql.catalyst.expressions.{Literal, UnsafeProjection, UnsafeRow}
22-
import org.apache.spark.sql.types.StructType
21+
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Literal, UnsafeProjection, UnsafeRow}
22+
import org.apache.spark.sql.types.{CharType, DataType, StructType, UserDefinedType, VariantType}
23+
import org.apache.spark.unsafe.types.UTF8String
2324

2425
/**
2526
* Utilities for building boundary rows used as start / end keys in state store
@@ -31,18 +32,27 @@ import org.apache.spark.sql.types.StructType
3132
* The non-ordering columns must encode byte-wise no larger than any real entry at
3233
* the same ordering prefix, or `seek()` will silently skip matching entries.
3334
*
34-
* The helpers here fill non-ordering columns with `Literal.default` (recursive
35-
* zero / empty / false). In UnsafeRow, that encodes as the byte-wise smallest
36-
* representation for all standard Dataset/SQL-encoded types -- numerics (including
37-
* negatives, whose non-zero two's-complement bytes still sort greater than
38-
* all-zero), and variable-length types (size=0 beats any non-zero size).
35+
* The helpers here fill non-ordering columns with recursive defaults. For most
36+
* types, `Literal.default` (recursive zero / empty / false) is already byte-wise
37+
* smallest in UnsafeRow -- numerics (including negatives, whose non-zero two's-
38+
* complement bytes still sort greater than all-zero), and variable-length types
39+
* (size=0 beats any non-zero size).
3940
*
40-
* Caveats where the "smallest" property does NOT hold and which therefore should
41-
* not appear in caller key schemas:
42-
* - `CharType(n)`: default is space-padded (0x20), but real values may legally
43-
* contain control bytes (0x00..0x1F).
41+
* Exceptions handled explicitly:
42+
* - `CharType(n)`: `Literal.default` is space-padded (0x20) because every
43+
* stored value has exactly `n` characters, and real values may legally
44+
* contain control bytes (0x00..0x1F). We override with `n` bytes of `0x00`
45+
* (U+0000 is a legal 1-byte UTF-8 code point), giving the smallest possible
46+
* UnsafeRow encoding (minimum byte length `n`, minimum per-byte content).
47+
*
48+
* Types rejected at runtime because we cannot safely hand-encode a byte-wise
49+
* minimum:
4450
* - `VariantType`: the Variant binary layout is not guaranteed minimized by
45-
* `castToVariant(0, IntegerType)`.
51+
* `castToVariant(0, IntegerType)`, and the Variant spec is `@Unstable`, so
52+
* the minimum would be brittle against future spec changes. In practice,
53+
* Spark analysis (grouping / hashing checks in `ExprUtils`, `HashExpression`)
54+
* already rejects VariantType in state-store key positions before reaching
55+
* the range-scan helpers, so this assertion is defensive only.
4656
*
4757
* A cleaner long-term fix would extend the state store API to accept an
4858
* ordering-column-only range bound and avoid synthesizing boundary rows at all.
@@ -54,11 +64,13 @@ import org.apache.spark.sql.types.StructType
5464
private[sql] object RangeScanBoundaryUtils {
5565

5666
/**
 * Build an `InternalRow` whose fields are the recursive byte-wise-smallest
 * defaults of `schema`. See the object-level docstring for the ordering
 * guarantees and the CharType / VariantType handling.
 */
def defaultInternalRow(schema: StructType): InternalRow = {
  // Fail fast on schemas we cannot safely encode a minimum for.
  assertBoundarySchemaSupported(schema)
  val fieldValues = schema.fields.toSeq.map(field => recursiveDefaultValue(field.dataType))
  InternalRow.fromSeq(fieldValues)
}
6375

6476
/**
@@ -70,4 +82,37 @@ private[sql] object RangeScanBoundaryUtils {
7082
def defaultUnsafeRow(schema: StructType): UnsafeRow = {
  // Project the boundary InternalRow into the UnsafeRow format. copy() detaches
  // the result from the projection's buffer (UnsafeProjection presumably reuses
  // its output row across calls -- NOTE(review): confirm, the original also copied).
  val toUnsafe = UnsafeProjection.create(schema)
  toUnsafe(defaultInternalRow(schema)).copy()
}
85+
86+
/**
 * Produce the byte-wise smallest legitimate value for `dt`. Falls back to
 * `Literal.default` for types where it is already byte-wise smallest; overrides
 * `CharType` with `n` zero-bytes; recurses through `StructType` and unwraps
 * `UserDefinedType`. `ArrayType` / `MapType` defaults are empty and therefore
 * already smallest regardless of element type, so no recursion is needed.
 *
 * VariantType is rejected here as a second line of defense. The top-level check
 * in `assertBoundarySchemaSupported` scans the declared schema, but this method
 * unwraps a UDT to its `sqlType`; a VariantType nested inside a UDT's sqlType
 * could otherwise slip through to `Literal.default(VariantType)` -- exactly the
 * encoding the object-level docstring warns is not guaranteed minimal.
 * (Assumes `existsRecursively` does not descend into UDT sqlTypes -- TODO
 * confirm; the guard is harmless either way.)
 */
private def recursiveDefaultValue(dt: DataType): Any = dt match {
  case c: CharType =>
    // n copies of U+0000 (a legal 1-byte UTF-8 code point): minimum legal byte
    // length for CharType(n) with minimum per-byte content.
    UTF8String.fromBytes(new Array[Byte](c.length))
  case struct: StructType =>
    new GenericInternalRow(
      struct.fields.map(f => recursiveDefaultValue(f.dataType)))
  case udt: UserDefinedType[_] =>
    // A UDT is stored as its sqlType, so its minimum is the sqlType's minimum.
    recursiveDefaultValue(udt.sqlType)
  case _: VariantType =>
    // Defensive: normally unreachable because assertBoundarySchemaSupported
    // rejects VariantType up front. Same exception type as the top-level check
    // so callers (and the suite) observe a consistent failure mode.
    throw new AssertionError(
      "RangeScanBoundaryUtils cannot build a scan boundary value for " +
      "VariantType; see RangeScanBoundaryUtils docstring for details.")
  case _ =>
    // Literal.default is recursive zero / empty / false, which is byte-wise
    // smallest in UnsafeRow for all remaining supported types.
    Literal.default(dt).value
}
104+
105+
/**
 * Reject schemas containing types for which we cannot hand-encode a byte-wise
 * minimum (see object-level docstring). Existing Spark analysis already blocks
 * VariantType in grouping / hashing positions, so this assertion is defensive
 * and should never fire in practice.
 */
private def assertBoundarySchemaSupported(schema: StructType): Unit = {
  val containsVariant = schema.existsRecursively {
    case _: VariantType => true
    case _ => false
  }
  assert(!containsVariant,
    "RangeScanBoundaryUtils cannot build a scan boundary for a schema containing " +
    "VariantType; see RangeScanBoundaryUtils docstring for details. " +
    s"schema=${schema.catalogString}")
}
73118
}
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.execution.streaming.state
19+
20+
import org.apache.spark.SparkFunSuite
21+
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
22+
import org.apache.spark.sql.types._
23+
import org.apache.spark.unsafe.types.UTF8String
24+
25+
class RangeScanBoundaryUtilsSuite extends SparkFunSuite {

  test("defaultInternalRow / defaultUnsafeRow accept schemas with common types") {
    val schema = new StructType()
      .add("ts", LongType)
      .add("s", StringType)
      .add("i", IntegerType)
      .add("nested", new StructType().add("b", BinaryType).add("d", DoubleType))
      .add("arr", ArrayType(IntegerType))
      .add("m", MapType(StringType, IntegerType))

    val internalRow = RangeScanBoundaryUtils.defaultInternalRow(schema)
    assert(internalRow.numFields === schema.length)

    val unsafeRow = RangeScanBoundaryUtils.defaultUnsafeRow(schema)
    assert(unsafeRow.numFields === schema.length)
  }

  test("CharType field uses n zero-bytes, which is byte-wise <= any legitimate value") {
    val charLen = 5
    val schema = new StructType().add("ts", LongType).add("c", CharType(charLen))
    val boundaryRow = RangeScanBoundaryUtils.defaultUnsafeRow(schema)
    val charField = boundaryRow.getUTF8String(1)

    // UTF-8 encoding of charLen U+0000 code points is exactly charLen bytes of 0x00.
    assert(charField.numBytes() === charLen)
    for (i <- 0 until charLen) {
      assert(charField.getByte(i) === 0.toByte, s"byte $i is not 0x00")
    }

    // Sanity: a legitimate CharType(5) value written through UnsafeProjection
    // (e.g. "abcde") should produce a row whose raw bytes are >= the boundary row's.
    def encode(value: UTF8String): Array[Byte] = {
      val input = new GenericInternalRow(2)
      input.setLong(0, 0L)
      input.update(1, value)
      UnsafeProjection.create(schema).apply(input).getBytes
    }

    val boundaryBytes = boundaryRow.getBytes
    assert(compareBytes(boundaryBytes, encode(UTF8String.fromString("abcde"))) <= 0,
      "boundary bytes should be <= 'abcde' bytes")

    // Also <= a value with legitimate low code points (e.g. all U+0001 padded).
    val allOnes = UTF8String.fromBytes(Array.fill[Byte](charLen)(1.toByte))
    assert(compareBytes(boundaryBytes, encode(allOnes)) <= 0,
      "boundary bytes should be <= all-0x01 bytes")
  }

  test("CharType nested inside a struct is still handled") {
    val inner = new StructType().add("c", CharType(3))
    val schema = new StructType().add("ts", LongType).add("nested", inner)
    // Should not throw, and the nested struct's char field should be 3 zero-bytes.
    val outerRow = RangeScanBoundaryUtils.defaultInternalRow(schema)
    val nestedRow = outerRow.getStruct(1, inner.length)
    val charField = nestedRow.getUTF8String(0)
    assert(charField.numBytes() === 3)
    for (i <- 0 until 3) {
      assert(charField.getByte(i) === 0.toByte)
    }
  }

  test("VarcharType default (empty string) is still used and is byte-wise smallest") {
    val schema = new StructType().add("ts", LongType).add("v", VarcharType(10))
    // Should not throw; VarcharType's Literal.default is empty string (no padding).
    val row = RangeScanBoundaryUtils.defaultInternalRow(schema)
    assert(row.getUTF8String(1).numBytes() === 0)
  }

  test("defaultInternalRow rejects schemas containing VariantType at top level") {
    val schema = new StructType().add("ts", LongType).add("v", VariantType)
    val error = intercept[AssertionError] {
      RangeScanBoundaryUtils.defaultInternalRow(schema)
    }
    assert(error.getMessage.contains("VariantType"))
  }

  test("defaultInternalRow rejects schemas containing VariantType nested in array") {
    val schema = new StructType()
      .add("ts", LongType)
      .add("arr", ArrayType(VariantType))
    val error = intercept[AssertionError] {
      RangeScanBoundaryUtils.defaultInternalRow(schema)
    }
    assert(error.getMessage.contains("VariantType"))
  }

  test("defaultInternalRow rejects schemas containing VariantType nested in struct") {
    val schema = new StructType()
      .add("ts", LongType)
      .add("nested", new StructType().add("v", VariantType))
    val error = intercept[AssertionError] {
      RangeScanBoundaryUtils.defaultInternalRow(schema)
    }
    assert(error.getMessage.contains("VariantType"))
  }

  test("defaultUnsafeRow also triggers the schema check") {
    val schema = new StructType().add("ts", LongType).add("v", VariantType)
    val error = intercept[AssertionError] {
      RangeScanBoundaryUtils.defaultUnsafeRow(schema)
    }
    assert(error.getMessage.contains("VariantType"))
  }

  // Byte-wise unsigned lexicographic comparison, matching the RocksDB key order.
  private def compareBytes(a: Array[Byte], b: Array[Byte]): Int = {
    val common = math.min(a.length, b.length)
    (0 until common).find(i => (a(i) & 0xFF) != (b(i) & 0xFF)) match {
      case Some(i) => (a(i) & 0xFF) - (b(i) & 0xFF)
      case None => a.length - b.length
    }
  }
}

0 commit comments

Comments
 (0)