data-catering
diff --git a/‎app/src/integrationTest/scala/io/github/datacatering/datacaterer/core/foreignkey/ForeignKeyEndToEndIntegrationTest.scala‎
Lines changed: 17 additions & 7 deletions b/‎app/src/integrationTest/scala/io/github/datacatering/datacaterer/core/foreignkey/ForeignKeyEndToEndIntegrationTest.scala‎
Lines changed: 17 additions & 7 deletions
diff --git a/‎app/src/main/scala/io/github/datacatering/datacaterer/core/foreignkey/strategy/CardinalityStrategy.scala‎
Lines changed: 13 additions & 2 deletions b/‎app/src/main/scala/io/github/datacatering/datacaterer/core/foreignkey/strategy/CardinalityStrategy.scala‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎app/src/main/scala/io/github/datacatering/datacaterer/core/foreignkey/strategy/DistributedSamplingStrategy.scala‎
Lines changed: 11 additions & 3 deletions b/‎app/src/main/scala/io/github/datacatering/datacaterer/core/foreignkey/strategy/DistributedSamplingStrategy.scala‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎app/src/main/scala/io/github/datacatering/datacaterer/core/foreignkey/strategy/GenerationModeStrategy.scala‎
Lines changed: 21 additions & 4 deletions b/‎app/src/main/scala/io/github/datacatering/datacaterer/core/foreignkey/strategy/GenerationModeStrategy.scala‎
Lines changed: 21 additions & 4 deletions
diff --git a/‎app/src/main/scala/io/github/datacatering/datacaterer/core/foreignkey/strategy/NullabilityStrategy.scala‎
Lines changed: 25 additions & 4 deletions b/‎app/src/main/scala/io/github/datacatering/datacaterer/core/foreignkey/strategy/NullabilityStrategy.scala‎
Lines changed: 25 additions & 4 deletions
diff --git a/‎app/src/main/scala/io/github/datacatering/datacaterer/core/generator/execution/PatternBasedExecutionStrategy.scala‎
Lines changed: 8 additions & 2 deletions b/‎app/src/main/scala/io/github/datacatering/datacaterer/core/generator/execution/PatternBasedExecutionStrategy.scala‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metrics/PerformanceMetrics.scala‎
Lines changed: 5 additions & 5 deletions b/‎app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metrics/PerformanceMetrics.scala‎
Lines changed: 5 additions & 5 deletions
@@ -393,8 +393,14 @@ class ForeignKeyEndToEndIntegrationTest extends SparkSuite {
 
     // Count null FKs (violations)
     val nullCount = updatedReviewsDf.filter(updatedReviewsDf("product_id").isNull).count()
-
-    // With seed=1 and 25% ratio, expect exactly 4 nulls out of 12 records
+    val nullRowIds = updatedReviewsDf.filter(updatedReviewsDf("product_id").isNull)
+      .select("review_id").collect().map(_.getString(0)).sorted.toList
+
+    // With seed=1 and 25% nullability ratio on 12 records, we get exactly these null rows
+    // This verifies the hash-based approach is deterministic across environments
+    val expectedNullRows = List("REV004", "REV007", "REV008", "REV011")
+    assert(nullRowIds == expectedNullRows,
+      s"Expected exactly $expectedNullRows to be null with seed=1, but got $nullRowIds")
     assert(nullCount == 4, s"Expected exactly 4 nulls with seed=1, got $nullCount")
 
     // Verify non-null FKs are valid
@@ -565,11 +571,15 @@ class ForeignKeyEndToEndIntegrationTest extends SparkSuite {
 
     // Count nulls
     val nullCount = updatedSalesDf.filter(updatedSalesDf("store_id").isNull).count()
-
-    // With seed=12349 and 20% ratio, we expect around 2 nulls out of 10 records
-    // Exact count depends on seed randomness
-    assert(nullCount >= 0 && nullCount <= 4,
-      s"Expected 0-4 nulls with 20% ratio (seed variance), got $nullCount")
+    val nullRowIds = updatedSalesDf.filter(updatedSalesDf("store_id").isNull)
+      .select("sale_id").collect().map(_.getString(0)).sorted.toList
+
+    // With seed=12349 and 20% nullability ratio on 10 records, we get exactly these null rows
+    // This verifies the hash-based approach is deterministic across environments
+    val expectedNullRows = List("SALE001", "SALE004")
+    assert(nullRowIds == expectedNullRows,
+      s"Expected exactly $expectedNullRows to be null with seed=12349, but got $nullRowIds")
+    assert(nullCount == 2, s"Expected exactly 2 nulls with seed=12349, got $nullCount")
 
     // Non-null values should be valid store IDs
     val validStoreIds = storesDf.select("store_id").collect().map(_.getString(0)).toSet
 
@@ -24,6 +24,9 @@ class CardinalityStrategy extends ForeignKeyStrategy {
 
   private val LOGGER = Logger.getLogger(getClass.getName)
 
+  /** Minimum source count required for modulo operations to avoid division by zero */
+  private val MIN_SOURCE_COUNT_FOR_MODULO = 1L
+
   override def name: String = "CardinalityStrategy"
 
   /**
@@ -102,6 +105,12 @@ class CardinalityStrategy extends ForeignKeyStrategy {
 
     LOGGER.info(s"Source has $sourceCount distinct parent records")
 
+    // Guard against empty source DataFrame to prevent division by zero in modulo operations
+    if (sourceCount == 0) {
+      LOGGER.warn("Source DataFrame has no records - cannot apply cardinality. Returning target DataFrame unchanged.")
+      return targetDf
+    }
+
     // Check if target has perField config that creates grouping structure
     // If so, use group-based approach which preserves the generated groups
     val hasMatchingPerFieldConfig = targetPerFieldCount.exists { pfc =>
@@ -127,8 +136,10 @@ class CardinalityStrategy extends ForeignKeyStrategy {
           1.0
       }
 
-      LOGGER.info(s"Using INDEX-BASED approach: assigning FKs by row position (${recordsPerParent} records per parent)")
-      applyCardinalityWithIndex(sourceDf, targetDf, sourceFields, targetFields, sourceCount, recordsPerParent.toLong)
+      // Use ceil to match calculateRequiredCount behavior and avoid generating fewer records than expected
+      val recordsPerParentCeiled = math.ceil(recordsPerParent).toLong
+      LOGGER.info(s"Using INDEX-BASED approach: assigning FKs by row position ($recordsPerParentCeiled records per parent)")
+      applyCardinalityWithIndex(sourceDf, targetDf, sourceFields, targetFields, sourceCount, recordsPerParentCeiled)
     }
   }
 
 
@@ -68,9 +68,17 @@ class DistributedSamplingStrategy extends ForeignKeyStrategy {
         .withColumn("_fk_idx", row_number().over(windowSpec) - 1)
 
       // Assign random index to each target row (0 to sourceCount-1)
-      val randExpr = config.seed.map(s => rand(s)).getOrElse(rand())
-      val targetWithIndex = targetDf
-        .withColumn("_fk_idx", floor(randExpr * sourceCount).cast(LongType))
+      // Use hash-based approach when seed is provided for deterministic behavior across environments
+      // (Spark's rand(seed) is partition-dependent and not truly deterministic)
+      val targetWithIndex = config.seed match {
+        case Some(s) =>
+          val allCols = targetDf.columns.map(col)
+          val hashExpr = xxhash64(allCols :+ lit(s): _*)
+          // Use absolute hash value modulo sourceCount for uniform distribution
+          targetDf.withColumn("_fk_idx", abs(hashExpr) % sourceCount)
+        case None =>
+          targetDf.withColumn("_fk_idx", floor(rand() * sourceCount).cast(LongType))
+      }
 
       // Rename source fields to avoid ambiguity
       val renamedSource = sourceFields.foldLeft(sourceWithIndex) { case (df, field) =>
 
@@ -125,17 +125,34 @@ class GenerationModeStrategy(generationMode: String = "all-exist") extends Forei
       val shouldInvalidate = (col("_combination_id") % totalCombinations).bitwiseAND(1 << fieldIdx) === 0
 
       // Generate random invalid values for this field
+      // Use hash-based approach when seed is provided for deterministic behavior across environments
+      // (Spark's rand(seed) is partition-dependent and not truly deterministic)
       val dataType = result.schema(targetField).dataType
-      val randExpr = relation.config.seed.map(s => rand(s)).getOrElse(rand())
       val invalidValue = dataType match {
         case StringType =>
           // Use deterministic hash-based approach when seed is available
           relation.config.seed match {
-            case Some(s) => concat(lit("INVALID_"), expr(s"MD5(CONCAT('$s', CAST(monotonically_increasing_id() AS STRING)))"))
+            case Some(s) =>
+              val allCols = result.columns.map(col)
+              concat(lit("INVALID_"), substring(md5(concat(allCols :+ lit(s): _*)), 1, 8))
             case None => concat(lit("INVALID_"), expr("uuid()"))
           }
-        case IntegerType => (randExpr * 999999999).cast(IntegerType)
-        case LongType => (randExpr * 999999999999L).cast(LongType)
+        case IntegerType =>
+          relation.config.seed match {
+            case Some(s) =>
+              val allCols = result.columns.map(col)
+              val hashExpr = xxhash64(allCols :+ lit(s) :+ lit(fieldIdx): _*)
+              (abs(hashExpr) % 999999999).cast(IntegerType)
+            case None => (rand() * 999999999).cast(IntegerType)
+          }
+        case LongType =>
+          relation.config.seed match {
+            case Some(s) =>
+              val allCols = result.columns.map(col)
+              val hashExpr = xxhash64(allCols :+ lit(s) :+ lit(fieldIdx): _*)
+              abs(hashExpr) % 999999999999L
+            case None => (rand() * 999999999999L).cast(LongType)
+          }
         case _ => lit(null).cast(dataType)
       }
 
 
@@ -92,10 +92,24 @@ class NullabilityStrategy extends PostProcessingStrategy {
     }
 
     // Add a column to determine which records get null FKs
+    // For deterministic behavior with seed, we use a hash-based approach instead of rand()
+    // because Spark's rand(seed) is partition-dependent and not truly deterministic across environments
     val withNullFlag = strategy match {
       case "random" =>
-        val randExpr = uniqueSeed.map(s => rand(s)).getOrElse(rand())
-        targetDf.withColumn("_should_null_fk", randExpr < percentage)
+        uniqueSeed match {
+          case Some(s) =>
+            // Use hash-based deterministic selection: hash all columns + seed, then check if < percentage
+            // This ensures the same rows are selected regardless of partitioning
+            val allCols = targetDf.columns.map(col)
+            // Use xxhash64 for better distribution (returns Long), then normalize to [0, 1)
+            val hashExpr = xxhash64(allCols :+ lit(s): _*)
+            // Convert to unsigned by bitwise AND with max long, then normalize
+            val normalizedHash = (hashExpr.bitwiseAND(lit(Long.MaxValue))).cast("double") / lit(Long.MaxValue.toDouble)
+            targetDf.withColumn("_should_null_fk", normalizedHash < percentage)
+          case None =>
+            // No seed provided - use non-deterministic rand()
+            targetDf.withColumn("_should_null_fk", rand() < percentage)
+        }
 
       case "head" =>
         // First N% of records get null FKs
@@ -117,8 +131,15 @@ class NullabilityStrategy extends PostProcessingStrategy {
 
       case _ =>
         LOGGER.warn(s"Unknown nullability strategy: $strategy, using random")
-        val randExpr = uniqueSeed.map(s => rand(s)).getOrElse(rand())
-        targetDf.withColumn("_should_null_fk", randExpr < percentage)
+        uniqueSeed match {
+          case Some(s) =>
+            val allCols = targetDf.columns.map(col)
+            val hashExpr = xxhash64(allCols :+ lit(s): _*)
+            val normalizedHash = (hashExpr.bitwiseAND(lit(Long.MaxValue))).cast("double") / lit(Long.MaxValue.toDouble)
+            targetDf.withColumn("_should_null_fk", normalizedHash < percentage)
+          case None =>
+            targetDf.withColumn("_should_null_fk", rand() < percentage)
+        }
     }
 
     // Apply nulls to target fields
 
@@ -17,6 +17,12 @@ class PatternBasedExecutionStrategy(
                                    ) extends ExecutionStrategy {
 
   private val LOGGER = Logger.getLogger(getClass.getName)
+
+  /**
+   * Threshold for rate change detection. Only update the rate limiter when the rate
+   * changes by more than this fraction (10%) to avoid excessive rate limiter recreation.
+   */
+  private val RATE_CHANGE_THRESHOLD = 0.1
   private val metricsCollector = new PerformanceMetricsCollector()
 
   // Extract pattern configuration from first step with pattern configured
@@ -85,9 +91,9 @@ class PatternBasedExecutionStrategy(
     val elapsedSeconds = durationTracker.getElapsedTimeMs / 1000.0
     val targetRate = loadPattern.getRateAt(elapsedSeconds, totalDurationSeconds)
 
-    // Only create a new rate limiter if the rate has changed significantly (>10% change or first time)
+    // Only create a new rate limiter if the rate has changed significantly or this is the first time
     val shouldUpdate = currentRateLimiter.isEmpty ||
-      math.abs(targetRate - currentRate).toDouble / currentRate > 0.1
+      math.abs(targetRate - currentRate).toDouble / currentRate > RATE_CHANGE_THRESHOLD
 
     if (shouldUpdate) {
       currentRate = targetRate
 
@@ -76,7 +76,7 @@ case class PerformanceMetrics(
 
   /**
    * Calculate percentile using exact or approximate method based on dataset size.
-   * For large datasets (>100k samples), uses T-Digest for memory efficiency.
+   * For large datasets (>100k samples), uses SimplePercentileCalculator for memory efficiency.
    * For smaller datasets, uses exact sorting.
    * Phase 3 optimization.
    */
@@ -85,10 +85,10 @@ case class PerformanceMetrics(
 
     val latencies = batchMetrics.map(_.batchDurationMs.toDouble)
 
-    // Use T-Digest for large datasets (Phase 3 optimization)
-    if (latencies.size > TDigest.LARGE_DATASET_THRESHOLD) {
-      val digest = TDigest.fromValues(latencies)
-      digest.quantile(percentile)
+    // Use SimplePercentileCalculator for large datasets (Phase 3 optimization)
+    if (latencies.size > SimplePercentileCalculator.LARGE_DATASET_THRESHOLD) {
+      val calculator = SimplePercentileCalculator.fromValues(latencies)
+      calculator.quantile(percentile)
     } else {
       // Exact calculation for smaller datasets
       val sorted = latencies.sorted