Refactor YAML plan and task handling, enhance foreign key processing

pflooky · pflooky · commit d4255c3fd5e1 · 2025-12-18T21:26:14.000+11:00
- Removed obsolete integration test steps from GitHub Actions workflow.
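- Improved logging in StepDataCoordinator for better debugging during record generation: the debug message is now emitted per branch, so the stop case (target reached or max retries hit) and the continue case (more records needed) each log an accurate message.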
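- Updated CardinalityCountAdjustmentProcessor to match foreign key targets on both data source and step name, so only foreign key target steps are modified; non-target steps (such as the source step in the same task) keep their original counts.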
- Added new YAML plan and task files for account balances and transactions, including foreign key relationships linking transactions to balances on account_number (validated by the integration tests below).
- Introduced integration tests for YAML plan execution to verify record counts and foreign key integrity.
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -23,17 +23,3 @@ jobs:
       - name: Run gradle integration tests
         run: |
           ./gradlew :app:integrationTest --info
-      - name: Run intsa-integration tests
-        id: tests
-        uses: data-catering/insta-integration@v4
-      - name: Print results
-        run: |
-          echo "Records generated:         ${{ steps.tests.outputs.num_records_generated }}"
-          echo "Successful validations:    ${{ steps.tests.outputs.num_success_validations }}"
-          echo "Failed validations:        ${{ steps.tests.outputs.num_failed_validations }}"
-          echo "Number of validations:     ${{ steps.tests.outputs.num_validations }}"
-          echo "Validation success rate:   ${{ steps.tests.outputs.validation_success_rate }}"
-
-          if [ "${{ steps.tests.outputs.num_failed_validations }}" -gt 0 ]; then
-            exit 1
-          fi
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/StepDataCoordinator.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/StepDataCoordinator.scala
@@ -211,12 +211,14 @@ class StepDataCoordinator(
     targetNumRecords: Long,
     retries: Int
   ): (DataFrame, Long) = {
-    LOGGER.debug(s"Record count does not reach expected num records for batch, generating more records until reached, " +
-      s"target-num-records=$targetNumRecords, actual-num-records=$currentRecordCount, num-retries=$retries, max-retries=$maxRetries")
     
     if (targetNumRecords == currentRecordCount || retries >= maxRetries) {
+      LOGGER.debug(s"Record count reaches expected num records for batch or reached max retries, stopping generation, " +
+        s"target-num-records=$targetNumRecords, actual-num-records=$currentRecordCount, num-retries=$retries, max-retries=$maxRetries")
       (currentDf, currentRecordCount)
     } else {
+      LOGGER.debug(s"Record count does not reach expected num records for batch, generating more records until reached, " +
+        s"target-num-records=$targetNumRecords, actual-num-records=$currentRecordCount, num-retries=$retries, max-retries=$maxRetries")
       val (newDf, newRecordCount, newBaseRecordCount) = generateAdditionalRecords(
         batch, step, task, dataSourceStepName, stepRecords, currentDf, currentRecordCount, currentBaseRecordCount, targetNumRecords
       )
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/CardinalityCountAdjustmentProcessor.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/CardinalityCountAdjustmentProcessor.scala
@@ -65,10 +65,14 @@ class CardinalityCountAdjustmentProcessor(val dataCatererConfiguration: DataCate
 
       dataSourceNameOpt match {
         case Some(dataSourceName) =>
-          // Check if this task is a target in any FK relationship with cardinality
+          // Get all step names in this task to check if any are FK targets
+          val taskStepNames = task.steps.map(_.name).toSet
+          
+          // Check if any step in this task is a target in any FK relationship with cardinality
+          // Must match BOTH dataSource AND step name to avoid incorrect matching
           val targetRelationOpt = enhancedForeignKeys
             .flatMap(_.generate)
-            .find(target => target.dataSource == dataSourceName && target.cardinality.isDefined)
+            .find(target => target.dataSource == dataSourceName && target.cardinality.isDefined && taskStepNames.contains(target.step))
 
           targetRelationOpt match {
             case Some(targetRelation) =>
@@ -85,103 +89,109 @@ class CardinalityCountAdjustmentProcessor(val dataCatererConfiguration: DataCate
                     LOGGER.debug(s"Adjusting task count due to cardinality: data-source=$dataSourceName, " +
                       s"task=${task.name}, original-count=$originalCount, adjusted-count=$requiredCount")
 
-                    // Update the count for all steps in this task
-                    // Also set up perField configuration to match the cardinality grouping
+                    // Update only steps that are FK targets with cardinality config
+                    // DO NOT modify steps that are not FK targets (like the source step in the same task)
                     val updatedSteps = task.steps.map { step =>
                       // Get the target relation for this step from the foreign key config
                       val targetRelationOpt = enhancedForeignKeys
                         .flatMap(_.generate)
                         .find(target => target.dataSource == dataSourceName && target.step == step.name)
 
-                      val fkFieldNames = targetRelationOpt.map(_.fields).getOrElse(List()).distinct
-
-                      // Get the cardinality configuration from the target relation
-                      val cardinalityConfigOpt = targetRelationOpt.flatMap(_.cardinality)
-
-                      // Get the source FK for this step
-                      val fkOpt = enhancedForeignKeys
-                        .find(fk => fk.generate.exists(g => g.dataSource == dataSourceName && g.step == step.name))
-
-                      val sourceCount = fkOpt
-                        .map { fk =>
-                          tasksByDataSource.get(fk.source.dataSource)
-                            .flatMap(_.steps.headOption)
-                            .flatMap(_.count.records)
+                      // Only process steps that are actual FK targets
+                      targetRelationOpt match {
+                        case None =>
+                          // This step is NOT a FK target - leave it unchanged
+                          LOGGER.debug(s"Step ${step.name} is not a FK target, leaving count unchanged: ${step.count.records}")
+                          step
+                          
+                        case Some(targetRel) =>
+                          val fkFieldNames = targetRel.fields.distinct
+                          val cardinalityConfigOpt = targetRel.cardinality
+
+                          // Get the source FK for this step
+                          val fkOpt = enhancedForeignKeys
+                            .find(fk => fk.generate.exists(g => g.dataSource == dataSourceName && g.step == step.name))
+
+                          val sourceCount = fkOpt
+                            .map { fk =>
+                              tasksByDataSource.get(fk.source.dataSource)
+                                .flatMap(_.steps.find(_.name == fk.source.step))
+                                .flatMap(_.count.records)
+                                .getOrElse(1L)
+                            }
                             .getOrElse(1L)
-                        }
-                        .getOrElse(1L)
-
-                      // Check if step originally had perField config on FK fields (before our processing)
-                      val hadOriginalPerField = step.count.perField.exists { pfc =>
-                        fkFieldNames.exists(pfc.fieldNames.contains)
-                      }
 
-                      // Determine if we should set perField configuration
-                      // - If step HAD perField on FK fields: DON'T set it (causes double-grouping with random values)
-                      // - If step DIDN'T have perField: SET it (enables proper grouping during generation)
-                      val updatedCount = if (fkFieldNames.nonEmpty && cardinalityConfigOpt.isDefined && !hadOriginalPerField) {
-                        val cardinalityConfig = cardinalityConfigOpt.get
-
-                        cardinalityConfig match {
-                          case config if config.min.isDefined && config.max.isDefined =>
-                            // Bounded: set perField with min/max options
-                            LOGGER.debug(s"Setting perField config for step ${step.name}: fields=${fkFieldNames.mkString(",")}, " +
-                              s"records=$sourceCount, min=${config.min.get}, max=${config.max.get}, distribution=${config.distribution}")
-                            step.count.copy(
-                              records = Some(sourceCount), // Use source count for bounded
-                              perField = Some(io.github.datacatering.datacaterer.api.model.PerFieldCount(
-                                fieldNames = fkFieldNames,
-                                count = None,
-                                options = Map(
-                                  "min" -> config.min.get,
-                                  "max" -> config.max.get,
-                                  "distribution" -> config.distribution
+                          // Check if step originally had perField config on FK fields (before our processing)
+                          val hadOriginalPerField = step.count.perField.exists { pfc =>
+                            fkFieldNames.exists(pfc.fieldNames.contains)
+                          }
+
+                          // Determine if we should set perField configuration
+                          // - If step HAD perField on FK fields: DON'T set it (causes double-grouping with random values)
+                          // - If step DIDN'T have perField: SET it (enables proper grouping during generation)
+                          val updatedCount = if (fkFieldNames.nonEmpty && cardinalityConfigOpt.isDefined && !hadOriginalPerField) {
+                            val cardinalityConfig = cardinalityConfigOpt.get
+
+                            cardinalityConfig match {
+                              case config if config.min.isDefined && config.max.isDefined =>
+                                // Bounded: set perField with min/max options
+                                LOGGER.debug(s"Setting perField config for step ${step.name}: fields=${fkFieldNames.mkString(",")}, " +
+                                  s"records=$sourceCount, min=${config.min.get}, max=${config.max.get}, distribution=${config.distribution}")
+                                step.count.copy(
+                                  records = Some(sourceCount), // Use source count for bounded
+                                  perField = Some(io.github.datacatering.datacaterer.api.model.PerFieldCount(
+                                    fieldNames = fkFieldNames,
+                                    count = None,
+                                    options = Map(
+                                      "min" -> config.min.get,
+                                      "max" -> config.max.get,
+                                      "distribution" -> config.distribution
+                                    )
+                                  ))
                                 )
-                              ))
-                            )
-
-                          case config if config.ratio.isDefined =>
-                            // Ratio: set perField with fixed count
-                            // Use requiredCount for total records, perField count for records per parent
-                            val recordsPerParent = config.ratio.get.toInt
-                            LOGGER.debug(s"Setting perField config for step ${step.name}: fields=${fkFieldNames.mkString(",")}, " +
-                              s"records=$sourceCount, count=$recordsPerParent, distribution=${config.distribution}")
-
-                            if (config.distribution == "uniform") {
-                              step.count.copy(
-                                records = Some(sourceCount),
-                                perField = Some(io.github.datacatering.datacaterer.api.model.PerFieldCount(
-                                  fieldNames = fkFieldNames,
-                                  count = Some(recordsPerParent.toLong)
-                                ))
-                              )
-                            } else {
-                              step.count.copy(
-                                records = Some(sourceCount),
-                                perField = Some(io.github.datacatering.datacaterer.api.model.PerFieldCount(
-                                  fieldNames = fkFieldNames,
-                                  count = None,
-                                  options = Map(
-                                    "min" -> recordsPerParent,
-                                    "max" -> recordsPerParent,
-                                    "distribution" -> config.distribution
+
+                              case config if config.ratio.isDefined =>
+                                // Ratio: set perField with fixed count
+                                val recordsPerParent = config.ratio.get.toInt
+                                LOGGER.debug(s"Setting perField config for step ${step.name}: fields=${fkFieldNames.mkString(",")}, " +
+                                  s"records=$sourceCount, count=$recordsPerParent, distribution=${config.distribution}")
+
+                                if (config.distribution == "uniform") {
+                                  step.count.copy(
+                                    records = Some(sourceCount),
+                                    perField = Some(io.github.datacatering.datacaterer.api.model.PerFieldCount(
+                                      fieldNames = fkFieldNames,
+                                      count = Some(recordsPerParent.toLong)
+                                    ))
                                   )
-                                ))
-                              )
-                            }
+                                } else {
+                                  step.count.copy(
+                                    records = Some(sourceCount),
+                                    perField = Some(io.github.datacatering.datacaterer.api.model.PerFieldCount(
+                                      fieldNames = fkFieldNames,
+                                      count = None,
+                                      options = Map(
+                                        "min" -> recordsPerParent,
+                                        "max" -> recordsPerParent,
+                                        "distribution" -> config.distribution
+                                      )
+                                    ))
+                                  )
+                                }
 
-                          case _ =>
-                            step.count.copy(records = Some(sourceCount), perField = None)
-                        }
-                      } else if (hadOriginalPerField) {
-                        // Step had original perField on FK fields - remove it to avoid double-grouping
-                        LOGGER.debug(s"Removing original perField config from step ${step.name} to avoid double-grouping (FK fields: ${fkFieldNames.mkString(",")})")
-                        step.count.copy(records = Some(requiredCount), perField = None)
-                      } else {
-                        step.count.copy(records = Some(requiredCount))
+                              case _ =>
+                                step.count.copy(records = Some(sourceCount), perField = None)
+                            }
+                          } else if (hadOriginalPerField) {
+                            // Step had original perField on FK fields - remove it to avoid double-grouping
+                            LOGGER.debug(s"Removing original perField config from step ${step.name} to avoid double-grouping (FK fields: ${fkFieldNames.mkString(",")})")
+                            step.count.copy(records = Some(requiredCount), perField = None)
+                          } else {
+                            step.count.copy(records = Some(requiredCount))
+                          }
+
+                          step.copy(count = updatedCount)
                       }
-
-                      step.copy(count = updatedCount)
                     }
                     task.copy(steps = updatedSteps)
                   } else {
diff --git a/app/src/test/resources/sample/plan/account_balance_and_transactions_create_plan.yaml b/app/src/test/resources/sample/plan/account_balance_and_transactions_create_plan.yaml
@@ -0,0 +1,19 @@
+---
+name: "account_balance_and_transactions_create_plan"
+description: "Create balances and transactions in Parquet files"
+tasks: []
+sinkOptions:
+  foreignKeys:
+  - source:
+      dataSource: "parquet_ds"
+      step: "balances"
+      fields:
+      - "account_number"
+    generate:
+    - dataSource: "parquet_ds"
+      step: "transactions"
+      fields:
+      - "account_number"
+    delete: []
+validations: []
+runId: "92f4fb44-c6cc-41db-9a42-3988c08c1254"
diff --git a/app/src/test/resources/sample/plan/parquet-balance-transaction-plan.yaml b/app/src/test/resources/sample/plan/parquet-balance-transaction-plan.yaml
@@ -0,0 +1,16 @@
+name: "parquet_balance_and_transactions_create_plan"
+description: "Create balances and transactions in Parquet files"
+tasks:
+  - name: "parquet_balance_and_transactions"
+    dataSourceName: "parquet"
+
+sinkOptions:
+  foreignKeys:
+    - source:
+        dataSource: "parquet"
+        step: "balances"
+        fields: [ "account_number" ]
+      generate:
+        - dataSource: "parquet"
+          step: "transactions"
+          fields: [ "account_number" ]
diff --git a/app/src/test/resources/sample/plan/test_plan_500.yaml b/app/src/test/resources/sample/plan/test_plan_500.yaml
@@ -0,0 +1,19 @@
+---
+name: "test_plan_500"
+description: "Test with 500 balances"
+tasks: []
+sinkOptions:
+  foreignKeys:
+  - source:
+      dataSource: "parquet_ds"
+      step: "balances"
+      fields:
+      - "account_number"
+    generate:
+    - dataSource: "parquet_ds"
+      step: "transactions"
+      fields:
+      - "account_number"
+    delete: []
+validations: []
+runId: "068a8494-0dd4-4ac3-8022-c23fe04867c8"
diff --git a/app/src/test/resources/sample/task/file/parquet-balance-transaction-task.yaml b/app/src/test/resources/sample/task/file/parquet-balance-transaction-task.yaml
@@ -0,0 +1,42 @@
+name: "parquet_balance_and_transactions"
+steps:
+  - name: "balances"
+    type: "parquet"
+    count:
+      records: 1000
+    options:
+      path: "/tmp/data-caterer-parquet-fk-test/balances"
+    fields:
+      - name: "account_number"
+        options:
+          regex: "ACC1[0-9]{5,10}"
+          isUnique: true
+      - name: "create_time"
+        type: "timestamp"
+      - name: "account_status"
+        type: "string"
+        options:
+          oneOf:
+            - "open"
+            - "closed"
+            - "suspended"
+      - name: "balance"
+        type: "double"
+  - name: "transactions"
+    type: "parquet"
+    count:
+      perField:
+        fieldNames:
+          - "account_number"
+        count: 5
+    options:
+      path: "/tmp/data-caterer-parquet-fk-test/transactions"
+    fields:
+      - name: "account_number"
+      - name: "create_time"
+        type: "timestamp"
+      - name: "transaction_id"
+        options:
+          regex: "txn-[0-9]{10}"
+      - name: "amount"
+        type: "double"
diff --git a/app/src/test/resources/sample/task/postgres/postgres-balance-transaction-task.yaml b/app/src/test/resources/sample/task/postgres/postgres-balance-transaction-task.yaml
@@ -9,7 +9,7 @@ steps:
     fields:
       - name: "account_number"
         options:
-          regex: "ACC1[0-9]{5,10}"
+          regex: "ACC1[0-9]{10}"
       - name: "create_time"
         type: "timestamp"
       - name: "account_status"
diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/YamlPlanExecutionIntegrationTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/YamlPlanExecutionIntegrationTest.scala