Generate project-context.md for sourcedb-to-spanner

aasthabharill · aasthabharill · commit ac452beb86ee · 2026-06-09T06:26:46.000Z
diff --git a/v2/sourcedb-to-spanner/architecture.dot b/v2/sourcedb-to-spanner/architecture.dot
@@ -0,0 +1,35 @@
+digraph Architecture {  // Component-level data-flow diagram of the sourcedb-to-spanner pipeline.
+    node [shape=box, style=filled, color=lightblue];  // Default style for every node: filled light-blue boxes.
+    // External system the pipeline reads from.
+    SourceDb [label="Source Database\n(Cassandra, MySQL, PostgreSQL)"];
+    // Each cluster below is labelled with the Java package that holds its components.
+    subgraph cluster_Reader {
+        label = "com.google.cloud.teleport.v2.source.reader";
+        ReaderImpl [label="ReaderImpl"];
+        IoWrapper [label="IoWrapper (Cassandra, JDBC)"];
+        RowMapper [label="RowMapper"];
+    }
+    // Stage that receives SourceRow elements and emits Mutations (see the edge labels below).
+    subgraph cluster_Transformer {
+        label = "com.google.cloud.teleport.v2.transformer";
+        SourceRowToMutation [label="SourceRowToMutationDoFn"];
+    }
+    // Stage that writes to Spanner and routes failed mutations to the dead-letter queue.
+    subgraph cluster_Writer {
+        label = "com.google.cloud.teleport.v2.writer";
+        SpannerWriter [label="SpannerWriter"];
+        DLQ [label="DeadLetterQueue"];
+    }
+    // External sinks.
+    Spanner [label="Cloud Spanner"];
+    GCS [label="GCS (DLQ)"];
+    // Edges point in the direction data moves; an edge label names the payload carried on that hop.
+    SourceDb -> IoWrapper;
+    IoWrapper -> RowMapper;
+    RowMapper -> ReaderImpl;
+    ReaderImpl -> SourceRowToMutation [label="SourceRow"];
+    SourceRowToMutation -> SpannerWriter [label="Mutation"];
+    SpannerWriter -> Spanner;
+    SpannerWriter -> DLQ [label="Failed Mutations"];  // Error path: only failed mutations take this edge.
+    DLQ -> GCS;
+}
diff --git a/v2/sourcedb-to-spanner/architecture.svg b/v2/sourcedb-to-spanner/architecture.svg
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 14.1.2 (0)
+ -->
+<!-- Title: Architecture Pages: 1 -->
+<svg width="335pt" height="674pt"
+ viewBox="0.00 0.00 335.00 674.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 670)">
+<title>Architecture</title>
+<polygon fill="white" stroke="none" points="-4,4 -4,-670 330.5,-670 330.5,4 -4,4"/>
+<g id="clust1" class="cluster">
+<title>cluster_Reader</title>
+<polygon fill="none" stroke="black" points="36,-375.5 36,-598 289,-598 289,-375.5 36,-375.5"/>
+<text xml:space="preserve" text-anchor="middle" x="162.5" y="-580.7" font-family="Times,serif" font-size="14.00">com.google.cloud.teleport.v2.source.reader</text>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_Transformer</title>
+<polygon fill="none" stroke="black" points="38,-264.5 38,-341 282,-341 282,-264.5 38,-264.5"/>
+<text xml:space="preserve" text-anchor="middle" x="160" y="-323.7" font-family="Times,serif" font-size="14.00">com.google.cloud.teleport.v2.transformer</text>
+</g>
+<g id="clust3" class="cluster">
+<title>cluster_Writer</title>
+<polygon fill="none" stroke="black" points="8,-65 8,-230 221,-230 221,-65 8,-65"/>
+<text xml:space="preserve" text-anchor="middle" x="114.5" y="-212.7" font-family="Times,serif" font-size="14.00">com.google.cloud.teleport.v2.writer</text>
+</g>
+<!-- SourceDb -->
+<g id="node1" class="node">
+<title>SourceDb</title>
+<polygon fill="lightblue" stroke="lightblue" points="245.25,-666 38.75,-666 38.75,-625 245.25,-625 245.25,-666"/>
+<text xml:space="preserve" text-anchor="middle" x="142" y="-648.7" font-family="Times,serif" font-size="14.00">Source Database</text>
+<text xml:space="preserve" text-anchor="middle" x="142" y="-632.2" font-family="Times,serif" font-size="14.00">(Cassandra, MySQL, PostgreSQL)</text>
+</g>
+<!-- IoWrapper -->
+<g id="node3" class="node">
+<title>IoWrapper</title>
+<polygon fill="lightblue" stroke="lightblue" points="233.62,-565.5 50.38,-565.5 50.38,-529.5 233.62,-529.5 233.62,-565.5"/>
+<text xml:space="preserve" text-anchor="middle" x="142" y="-542.45" font-family="Times,serif" font-size="14.00">IoWrapper (Cassandra, JDBC)</text>
+</g>
+<!-- SourceDb&#45;&gt;IoWrapper -->
+<g id="edge1" class="edge">
+<title>SourceDb&#45;&gt;IoWrapper</title>
+<path fill="none" stroke="black" d="M142,-624.61C142,-610.91 142,-592.51 142,-577.23"/>
+<polygon fill="black" stroke="black" points="145.5,-577.3 142,-567.3 138.5,-577.3 145.5,-577.3"/>
+</g>
+<!-- ReaderImpl -->
+<g id="node2" class="node">
+<title>ReaderImpl</title>
+<polygon fill="lightblue" stroke="lightblue" points="182.25,-419.5 101.75,-419.5 101.75,-383.5 182.25,-383.5 182.25,-419.5"/>
+<text xml:space="preserve" text-anchor="middle" x="142" y="-396.45" font-family="Times,serif" font-size="14.00">ReaderImpl</text>
+</g>
+<!-- SourceRowToMutation -->
+<g id="node5" class="node">
+<title>SourceRowToMutation</title>
+<polygon fill="lightblue" stroke="lightblue" points="229.5,-308.5 54.5,-308.5 54.5,-272.5 229.5,-272.5 229.5,-308.5"/>
+<text xml:space="preserve" text-anchor="middle" x="142" y="-285.45" font-family="Times,serif" font-size="14.00">SourceRowToMutationDoFn</text>
+</g>
+<!-- ReaderImpl&#45;&gt;SourceRowToMutation -->
+<g id="edge4" class="edge">
+<title>ReaderImpl&#45;&gt;SourceRowToMutation</title>
+<path fill="none" stroke="black" d="M142,-383.47C142,-366.53 142,-340.23 142,-320.09"/>
+<polygon fill="black" stroke="black" points="145.5,-320.28 142,-310.28 138.5,-320.28 145.5,-320.28"/>
+<text xml:space="preserve" text-anchor="middle" x="173.5" y="-352.2" font-family="Times,serif" font-size="14.00">SourceRow</text>
+</g>
+<!-- RowMapper -->
+<g id="node4" class="node">
+<title>RowMapper</title>
+<polygon fill="lightblue" stroke="lightblue" points="184.12,-492.5 99.88,-492.5 99.88,-456.5 184.12,-456.5 184.12,-492.5"/>
+<text xml:space="preserve" text-anchor="middle" x="142" y="-469.45" font-family="Times,serif" font-size="14.00">RowMapper</text>
+</g>
+<!-- IoWrapper&#45;&gt;RowMapper -->
+<g id="edge2" class="edge">
+<title>IoWrapper&#45;&gt;RowMapper</title>
+<path fill="none" stroke="black" d="M142,-529.31C142,-521.73 142,-512.6 142,-504.04"/>
+<polygon fill="black" stroke="black" points="145.5,-504.04 142,-494.04 138.5,-504.04 145.5,-504.04"/>
+</g>
+<!-- RowMapper&#45;&gt;ReaderImpl -->
+<g id="edge3" class="edge">
+<title>RowMapper&#45;&gt;ReaderImpl</title>
+<path fill="none" stroke="black" d="M142,-456.31C142,-448.73 142,-439.6 142,-431.04"/>
+<polygon fill="black" stroke="black" points="145.5,-431.04 142,-421.04 138.5,-431.04 145.5,-431.04"/>
+</g>
+<!-- SpannerWriter -->
+<g id="node6" class="node">
+<title>SpannerWriter</title>
+<polygon fill="lightblue" stroke="lightblue" points="189.75,-197.5 94.25,-197.5 94.25,-161.5 189.75,-161.5 189.75,-197.5"/>
+<text xml:space="preserve" text-anchor="middle" x="142" y="-174.45" font-family="Times,serif" font-size="14.00">SpannerWriter</text>
+</g>
+<!-- SourceRowToMutation&#45;&gt;SpannerWriter -->
+<g id="edge5" class="edge">
+<title>SourceRowToMutation&#45;&gt;SpannerWriter</title>
+<path fill="none" stroke="black" d="M142,-272.47C142,-255.53 142,-229.23 142,-209.09"/>
+<polygon fill="black" stroke="black" points="145.5,-209.28 142,-199.28 138.5,-209.28 145.5,-209.28"/>
+<text xml:space="preserve" text-anchor="middle" x="167.12" y="-241.2" font-family="Times,serif" font-size="14.00">Mutation</text>
+</g>
+<!-- DLQ -->
+<g id="node7" class="node">
+<title>DLQ</title>
+<polygon fill="lightblue" stroke="lightblue" points="176,-109 64,-109 64,-73 176,-73 176,-109"/>
+<text xml:space="preserve" text-anchor="middle" x="120" y="-85.95" font-family="Times,serif" font-size="14.00">DeadLetterQueue</text>
+</g>
+<!-- SpannerWriter&#45;&gt;DLQ -->
+<g id="edge7" class="edge">
+<title>SpannerWriter&#45;&gt;DLQ</title>
+<path fill="none" stroke="black" d="M137.65,-161.41C134.66,-149.64 130.61,-133.73 127.15,-120.11"/>
+<polygon fill="black" stroke="black" points="130.64,-119.65 124.79,-110.82 123.86,-121.37 130.64,-119.65"/>
+<text xml:space="preserve" text-anchor="middle" x="179.47" y="-130.2" font-family="Times,serif" font-size="14.00">Failed Mutations</text>
+</g>
+<!-- Spanner -->
+<g id="node8" class="node">
+<title>Spanner</title>
+<polygon fill="lightblue" stroke="lightblue" points="326.5,-109 229.5,-109 229.5,-73 326.5,-73 326.5,-109"/>
+<text xml:space="preserve" text-anchor="middle" x="278" y="-85.95" font-family="Times,serif" font-size="14.00">Cloud Spanner</text>
+</g>
+<!-- SpannerWriter&#45;&gt;Spanner -->
+<g id="edge6" class="edge">
+<title>SpannerWriter&#45;&gt;Spanner</title>
+<path fill="none" stroke="black" d="M190.06,-163.79C203.6,-158.5 217.92,-151.76 230,-143.5 240.29,-136.47 250,-126.86 257.96,-117.85"/>
+<polygon fill="black" stroke="black" points="260.55,-120.21 264.32,-110.31 255.2,-115.7 260.55,-120.21"/>
+</g>
+<!-- GCS -->
+<g id="node9" class="node">
+<title>GCS</title>
+<polygon fill="lightblue" stroke="lightblue" points="161.38,-36 78.62,-36 78.62,0 161.38,0 161.38,-36"/>
+<text xml:space="preserve" text-anchor="middle" x="120" y="-12.95" font-family="Times,serif" font-size="14.00">GCS (DLQ)</text>
+</g>
+<!-- DLQ&#45;&gt;GCS -->
+<g id="edge8" class="edge">
+<title>DLQ&#45;&gt;GCS</title>
+<path fill="none" stroke="black" d="M120,-72.81C120,-65.23 120,-56.1 120,-47.54"/>
+<polygon fill="black" stroke="black" points="123.5,-47.54 120,-37.54 116.5,-47.54 123.5,-47.54"/>
+</g>
+</g>
+</svg>
diff --git a/v2/sourcedb-to-spanner/project-context.md b/v2/sourcedb-to-spanner/project-context.md
@@ -0,0 +1,78 @@
+# Project Context: SourceDb to Spanner
+
+<!-- AI Agent: Please parse this document to understand the project's context before making changes. -->
+
+## Overview
+
+*   **Core Intent:** A Dataflow pipeline for bulk migration of data from source databases (MySQL, PostgreSQL, Cassandra) into Cloud Spanner. It handles both sharded and non-sharded source databases.
+*   **Primary Users:** Internal SREs, external customers migrating to Cloud Spanner, and users of Spanner Migration Tool.
+*   **Critical SLOs/Guarantees:** Must effectively handle bulk data extraction and mapping to Cloud Spanner mutations while maintaining data integrity. Features a Dead Letter Queue (DLQ) for failed mutations.
+*   **Terminology:** 
+    *   **DLQ:** Dead Letter Queue (for failed records).
+    *   **SourceRow:** Intermediate representation of a row read from the source database.
+    *   **Mutation:** Spanner mutation to be applied.
+
+## Technical Details
+
+*   **Tech Stack & Versions:**
+    *   **Languages:** Java 17
+    *   **Frameworks/Libraries:** Apache Beam 2.73.0, Maven
+    *   **Key Google Technologies:** Cloud Spanner, Cloud Storage (GCS), Dataflow
+*   **Code Location:** `v2/sourcedb-to-spanner` (relative to the root of the DataflowTemplates repository)
+*   **Data Flow:** Data is read from the source database (MySQL/PostgreSQL/Cassandra) using JDBC or the DataStax driver -> mapped into SourceRows -> transformed into Spanner Mutations -> written to Cloud Spanner. Failed mutations are written to a DLQ on GCS.
+*   **Project Structure (Logical Architecture Mapping):**
+    *   `src/main/java/com/google/cloud/teleport/v2/source/reader`: Source Readers (IoWrappers for Cassandra, JDBC, etc., RowMappers)
+    *   `src/main/java/com/google/cloud/teleport/v2/transformer`: Transformers (e.g., `SourceRowToMutationDoFn`)
+    *   `src/main/java/com/google/cloud/teleport/v2/writer`: Writers and error handling (`SpannerWriter`, `DeadLetterQueue`)
+    *   `src/main/java/com/google/cloud/teleport/v2/templates`: Main pipeline definition (`SourceDbToSpanner`)
+*   **Build/Run Commands:**
+    ```bash
+    # To build the flex template
+    export PROJECT_ID=span-cloud-ck-testing-external
+    export BUCKET_NAME=ea-functional-tests
+    mvn clean package -PtemplatesStage -DskipTests -DprojectId="$PROJECT_ID" -DbucketName="$BUCKET_NAME" -DstagePrefix="templates-<replace-with-your-prefix>" -DtemplateName="Sourcedb_to_Spanner_Flex" -pl v2/sourcedb-to-spanner -am
+
+    # To run tests
+    mvn clean test -pl v2/sourcedb-to-spanner -am
+
+    # To run pipeline (PROJECT_ID, REGION and every other $VARIABLE referenced below must be exported first)
+    export JOB_NAME="bulk-migrate-to-spanner-$(date +%Y%m%d-%H%M%S)"
+    export OUTPUT_DIR="gs://${BUCKET_NAME}/bulk-migration"
+    gcloud dataflow flex-template run $JOB_NAME \
+      --project=$PROJECT_ID \
+      --region=$REGION \
+      --template-file-gcs-location="gs://dataflow-templates-${REGION}/latest/flex/Sourcedb_to_Spanner_Flex" \
+      --max-workers=2 \
+      --num-workers=1 \
+      --worker-machine-type=n2-highmem-8 \
+      --parameters sourceConfigURL=$GCS_SHARDING_PATH,instanceId=$SPANNER_INSTANCE_NAME,databaseId=$SPANNER_DATABASE_NAME,projectId=$PROJECT_ID,outputDirectory=$OUTPUT_DIR,username=$SOURCE_DB_USERNAME,password=$SOURCE_DB_PASSWORD,schemaOverridesFilePath=$GCS_OVERRIDES_PATH,transformationJarPath=$CUSTOM_JAR_PATH,transformationClassName=com.custom.CustomTransformationFetcher
+    ```
+
+## Project Management
+
+*   **Buganizer Component (Google-internal):** [Infrastructure > Spanner > Cloud > Migrations](https://b.corp.google.com/issues?q=componentid:1008064) - (Cloud Spanner migrations component)
+*   **Key Contacts:**
+    *   **Recent Contributors:** darshan-sj, aasthabharill, shreyakhajanchi, sm745052
+
+## Documentation
+
+*   **Key Design Docs (Google-internal `go/` links, reachable only from the corporate network):**
+    *   [Bulk Migration to Spanner Design](http://go/bulk-migration-to-spanner-design) - Overall pipeline design.
+    *   [CS Reader for Bulk Migration](http://go/cs-reader-for-bulk-migration-to-spanner) - Reader design.
+    *   [Spanner Bulk Migration User Guide](http://go/spanner-bulk-migration-user-guide) - Usage instructions.
+*   **Architecture Diagram:** [architecture.svg](architecture.svg)
+
+## AI Agent Tips
+
+*   **Common Tasks:** Adding new JDBC dialects, fixing parsing errors, implementing new transformations or schema overrides, adding new source reader capabilities.
+*   **Coding Standards & Best Practices:**
+    *   Use `AutoValue` for POJOs.
+    *   Strict adherence to Apache Beam paradigms (PTransforms, DoFns). Use `TupleTag` for side outputs like the DLQ.
+    *   Use structured logging (`com.google.cloud.teleport.structured-logging`).
+*   **Testing Frameworks & Guidelines:**
+    *   **Frameworks:** JUnit 4, Google Truth for assertions, Mockito for mocking.
+    *   **Rules:** Ensure tests use `@RunWith(JUnit4.class)`. Use embedded databases for testing when possible (e.g. `derby` or `embedded-cassandra`).
+*   **Areas to be Careful:** Cross-shard querying logic, causal ordering around the DLQ, and schema mappings parsing.
+*   **Example CLs:**
+    *   [39a8ae5e0](https://github.com/GoogleCloudPlatform/DataflowTemplates/commit/39a8ae5e0) - Fix GCS Avro Export flow
+    *   [90964dca6](https://github.com/GoogleCloudPlatform/DataflowTemplates/commit/90964dca6) - Add Support for UUID-based Partitioning