@@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
3232import org .apache .spark .sql .execution .metric .SQLMetric
3333import org .apache .spark .sql .execution .streaming .operators .stateful .{StatefulOperatorStateInfo , StatefulOpStateStoreCheckpointInfo , WatermarkSupport }
3434import org .apache .spark .sql .execution .streaming .operators .stateful .join .StreamingSymmetricHashJoinHelper ._
35- import org .apache .spark .sql .execution .streaming .state .{DropLastNFieldsStatePartitionKeyExtractor , KeyStateEncoderSpec , NoopStatePartitionKeyExtractor , NoPrefixKeyStateEncoderSpec , StatePartitionKeyExtractor , StateSchemaBroadcast , StateStore , StateStoreCheckpointInfo , StateStoreColFamilySchema , StateStoreConf , StateStoreErrors , StateStoreId , StateStoreMetrics , StateStoreProvider , StateStoreProviderId , SupportsFineGrainedReplay , TimestampAsPostfixKeyStateEncoderSpec , TimestampAsPrefixKeyStateEncoderSpec , TimestampKeyStateEncoder }
35+ import org .apache .spark .sql .execution .streaming .state .{DropLastNFieldsStatePartitionKeyExtractor , KeyStateEncoderSpec , NoopStatePartitionKeyExtractor , NoPrefixKeyStateEncoderSpec , RangeScanBoundaryUtils , StatePartitionKeyExtractor , StateSchemaBroadcast , StateStore , StateStoreCheckpointInfo , StateStoreColFamilySchema , StateStoreConf , StateStoreErrors , StateStoreId , StateStoreMetrics , StateStoreProvider , StateStoreProviderId , SupportsFineGrainedReplay , TimestampAsPostfixKeyStateEncoderSpec , TimestampAsPrefixKeyStateEncoderSpec , TimestampKeyStateEncoder }
3636import org .apache .spark .sql .internal .SQLConf
3737import org .apache .spark .sql .types .{BooleanType , DataType , LongType , NullType , StructField , StructType }
3838import org .apache .spark .util .NextIterator
@@ -184,15 +184,28 @@ trait SupportsEvictByCondition { self: SymmetricHashJoinStateManager =>
trait SupportsEvictByTimestamp { self: SymmetricHashJoinStateManager =>
  import SymmetricHashJoinStateManager._

  /**
   * Evicts state entries by timestamp and reports how many values were evicted.
   *
   * @param endTimestamp Inclusive upper bound; entries whose timestamp is
   *                     <= endTimestamp are evicted.
   * @param startTimestamp Exclusive lower bound. Entries whose timestamp is
   *                       <= startTimestamp are assumed to be gone already (for example,
   *                       evicted by the previous batch), so when a value is supplied the
   *                       scan begins at startTimestamp + 1.
   * @return the number of values evicted.
   */
  def evictByTimestamp(
      endTimestamp: Long,
      startTimestamp: Option[Long] = None): Long

  /**
   * Evicts state entries by timestamp and hands back the evicted key-value pairs.
   *
   * The caller is responsible for consuming the returned iterator to the end.
   *
   * @param endTimestamp Inclusive upper bound; entries whose timestamp is
   *                     <= endTimestamp are evicted.
   * @param startTimestamp Exclusive lower bound. Entries whose timestamp is
   *                       <= startTimestamp are assumed to be gone already (for example,
   *                       evicted by the previous batch), so when a value is supplied the
   *                       scan begins at startTimestamp + 1.
   * @return an iterator over the evicted key-value pairs.
   */
  def evictAndReturnByTimestamp(
      endTimestamp: Long,
      startTimestamp: Option[Long] = None): Iterator[KeyToValuePair]
}
197210
198211/**
@@ -519,11 +532,11 @@ class SymmetricHashJoinStateManagerV4(
519532 }
520533 }
521534
522- override def evictByTimestamp (endTimestamp : Long ): Long = {
535+ override def evictByTimestamp (endTimestamp : Long , startTimestamp : Option [ Long ] = None ): Long = {
523536 require(hasEventTime,
524537 " evictByTimestamp requires event time; secondary index was not populated" )
525538 var removed = 0L
526- tsWithKey.scanEvictedKeys(endTimestamp).foreach { evicted =>
539+ tsWithKey.scanEvictedKeys(endTimestamp, startTimestamp ).foreach { evicted =>
527540 val key = evicted.key
528541 val timestamp = evicted.timestamp
529542 val numValues = evicted.numValues
@@ -537,12 +550,13 @@ class SymmetricHashJoinStateManagerV4(
537550 removed
538551 }
539552
540- override def evictAndReturnByTimestamp (endTimestamp : Long ): Iterator [KeyToValuePair ] = {
553+ override def evictAndReturnByTimestamp (
554+ endTimestamp : Long , startTimestamp : Option [Long ] = None ): Iterator [KeyToValuePair ] = {
541555 require(hasEventTime,
542556 " evictAndReturnByTimestamp requires event time; secondary index was not populated" )
543557 val reusableKeyToValuePair = KeyToValuePair ()
544558
545- tsWithKey.scanEvictedKeys(endTimestamp).flatMap { evicted =>
559+ tsWithKey.scanEvictedKeys(endTimestamp, startTimestamp ).flatMap { evicted =>
546560 val key = evicted.key
547561 val timestamp = evicted.timestamp
548562 val values = keyWithTsToValues.get(key, timestamp)
@@ -663,14 +677,33 @@ class SymmetricHashJoinStateManagerV4(
663677
664678 /**
665679 * Returns entries where minTs <= timestamp <= maxTs (both inclusive), grouped by timestamp.
666- * Skips entries before minTs and stops iterating past maxTs (timestamps are sorted).
680+ * When maxTs is bounded (< Long.MaxValue), uses rangeScanWithMultiValues for efficient
681+ * range access; falls back to prefixScan otherwise to stay within the key's scope.
682+ *
683+ * When prefixScan is used (maxTs == Long.MaxValue), entries outside [minTs, maxTs] are
684+ * filtered out so both code paths produce identical results.
667685 */
668686 def getValuesInRange (
669687 key : UnsafeRow , minTs : Long , maxTs : Long ): Iterator [GetValuesResult ] = {
670688 val reusableGetValuesResult = new GetValuesResult ()
689+ // Only use rangeScan when maxTs < Long.MaxValue, since rangeScan requires
690+ // an exclusive end key (maxTs + 1) which would overflow at Long.MaxValue.
691+ val useRangeScan = maxTs < Long .MaxValue
671692
672693 new NextIterator [GetValuesResult ] {
673- private val iter = stateStore.prefixScanWithMultiValues(key, colFamilyName)
694+ private val iter = if (useRangeScan) {
695+ // startKey must be copied because the second createKeyRow call below reuses
696+ // the same projection buffer and would otherwise overwrite its contents.
697+ // endKey does not need a copy: rangeScanWithMultiValues encodes both bounds
698+ // to independent byte arrays eagerly at call time, and the scope of endKey
699+ // ends with the call of rangeScanWithMultiValues.
700+ val startKey = createKeyRow(key, minTs).copy()
701+ // rangeScanWithMultiValues endKey is exclusive, so use maxTs + 1
702+ val endKey = Some (createKeyRow(key, maxTs + 1 ))
703+ stateStore.rangeScanWithMultiValues(Some (startKey), endKey, colFamilyName)
704+ } else {
705+ stateStore.prefixScanWithMultiValues(key, colFamilyName)
706+ }
674707
675708 private var currentTs = - 1L
676709 private var pastUpperBound = false
@@ -697,6 +730,11 @@ class SymmetricHashJoinStateManagerV4(
697730 val unsafeRowPair = iter.next()
698731 val ts = TimestampKeyStateEncoder .extractTimestamp(unsafeRowPair.key)
699732
733+ if (useRangeScan && (ts < minTs || ts > maxTs)) {
734+ throw StateStoreErrors .streamStreamJoinRangeScanTimestampOutOfRange(
735+ ts, minTs, maxTs)
736+ }
737+
700738 if (ts > maxTs) {
701739 pastUpperBound = true
702740 getNext()
@@ -773,6 +811,8 @@ class SymmetricHashJoinStateManagerV4(
773811 isInternal = true
774812 )
775813
814+ // Returns an UnsafeRow backed by a reused projection buffer. Callers that need to
815+ // hold the row beyond the immediate state store call must invoke copy() on the result.
776816 private def createKeyRow (key : UnsafeRow , timestamp : Long ): UnsafeRow = {
777817 TimestampKeyStateEncoder .attachTimestamp(
778818 attachTimestampProjection, keySchemaWithTimestamp, key, timestamp)
@@ -788,9 +828,60 @@ class SymmetricHashJoinStateManagerV4(
788828
/**
 * One element produced by the eviction scan: a key, the timestamp it was found under,
 * and `numValues` (presumably the count of values stored for that key/timestamp pair;
 * confirm against the scan implementation).
 */
case class EvictedKeysResult(
    key: UnsafeRow,
    timestamp: Long,
    numValues: Int)
790830
791- // NOTE: This assumes we consume the whole iterator to trigger completion.
792- def scanEvictedKeys (endTimestamp : Long ): Iterator [EvictedKeysResult ] = {
793- val evictIterator = stateStore.iteratorWithMultiValues(colFamilyName)
// Shared placeholder key row used when building scan boundaries (rationale lives in
// [[RangeScanBoundaryUtils]]). Reuse is safe: createKeyRow only reads from this row
// (through BoundReference evaluations) and writes into the projection's own internal
// buffer. Correctness depends on real stored entries never carrying internally-null
// key fields, which holds because join-key expressions are evaluated via the user's
// expression encoder. Keep that invariant intact if the write path for entries changes.
private lazy val defaultKey: UnsafeRow =
  RangeScanBoundaryUtils.defaultUnsafeRow(keySchema)
839+
/**
 * Builds a boundary row to pass to rangeScan for the given timestamp.
 *
 * TsWithKeyTypeStore uses TimestampAsPrefixKeyStateEncoder, which lays the row out as
 * [timestamp][key_fields]. The encoder expects every key column to be present, so a
 * full-schema row is required rather than a bare timestamp. The key columns are filled
 * with default values, since only the timestamp affects ordering under the prefix encoder.
 *
 * @param timestamp the timestamp to place in the boundary row.
 * @return a copy of the boundary row, detached from the projection buffer that
 *         createKeyRow reuses.
 */
private def createScanBoundaryRow(timestamp: Long): UnsafeRow = {
  val boundaryRow = createKeyRow(defaultKey, timestamp)
  // Copy so the result is not overwritten by a later createKeyRow call.
  boundaryRow.copy()
}
850+
851+ /**
852+ * Scan keys eligible for eviction within the timestamp range.
853+ *
854+ * This assumes we consume the whole iterator to trigger completion.
855+ *
856+ * @param endTimestamp Inclusive upper bound: entries with timestamp <= endTimestamp are
857+ * eligible for eviction.
858+ * @param startTimestamp Exclusive lower bound: entries with timestamp <= startTimestamp
859+ * are assumed to have been evicted already. The scan starts from startTimestamp + 1.
860+ */
861+ def scanEvictedKeys (
862+ endTimestamp : Long ,
863+ startTimestamp : Option [Long ] = None ): Iterator [EvictedKeysResult ] = {
864+ // If startTimestamp == Long.MaxValue, everything has already been evicted;
865+ // nothing can match, so return immediately.
866+ if (startTimestamp.contains(Long .MaxValue )) {
867+ return Iterator .empty
868+ }
869+
870+ // rangeScanWithMultiValues: startKey is inclusive, endKey is exclusive.
871+ // startTimestamp is exclusive (already evicted), so we seek from st + 1.
872+ val startKeyRow = startTimestamp.map { st =>
873+ createScanBoundaryRow(st + 1 )
874+ }
875+ // endTimestamp is inclusive, so we use endTimestamp + 1 as the exclusive upper bound.
876+ // When endTimestamp == Long.MaxValue we cannot add 1, so endKeyRow is None. This is
877+ // safe because rangeScanWithMultiValues with no end key uses the column-family prefix
878+ // as the upper bound, naturally scoping the scan within this column family.
879+ val endKeyRow = if (endTimestamp < Long .MaxValue ) {
880+ Some (createScanBoundaryRow(endTimestamp + 1 ))
881+ } else {
882+ None
883+ }
884+ val evictIterator = stateStore.rangeScanWithMultiValues(startKeyRow, endKeyRow, colFamilyName)
794885 new NextIterator [EvictedKeysResult ]() {
795886 var currentKeyRow : UnsafeRow = null
796887 var currentEventTime : Long = - 1L
0 commit comments