googleapis
diff --git a/‎bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/README.md‎
Lines changed: 180 additions & 51 deletions b/‎bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/README.md‎
Lines changed: 180 additions & 51 deletions
diff --git a/‎bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/src/main/java/com/google/cloud/bigtable/mapreduce/Driver.java‎
Lines changed: 10 additions & 0 deletions b/‎bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/src/main/java/com/google/cloud/bigtable/mapreduce/Driver.java‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/src/main/java/com/google/cloud/bigtable/mapreduce/hbasesnapshots/ImportHBaseSnapshotJob.java‎
Lines changed: 3 additions & 0 deletions b/‎bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/src/main/java/com/google/cloud/bigtable/mapreduce/hbasesnapshots/ImportHBaseSnapshotJob.java‎
Lines changed: 3 additions & 0 deletions
@@ -2,51 +2,89 @@
 
 This module provides a work alike to some of the jobs implemented in hbase-server.
 Specifically this currently has the ability to export and import SequenceFiles
-from/to Cloud Bigtable and import HBase snapshots using a Map Reduce cluster (ie. dataproc).
+from/to Cloud Bigtable, import HBase snapshots using a Map Reduce cluster (e.g.
+Dataproc), and run HashTable/SyncTable jobs for validation.
 
-## Expected Usage 
+## Setup
+
+To use the tools in this folder, you can download them from the maven repository, or
+you can build them using Maven.
 
 [//]: # ({x-version-update-start:bigtable-client-parent:released})
+### Download the jars
+Download [bigtable-hbase-1.x-mapreduce jars](https://search.maven.org/artifact/com.google.cloud.bigtable/bigtable-hbase-1.x-mapreduce), which is an aggregation of all required jars.
+
+### Build the jars yourself
+
+Go to the top level directory and build the repo
+then return to this sub directory.
+
+```
+cd ../../
+mvn clean install -DskipTests=true
+cd bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce
+```
+
+## Expected Usage 
+
 ### On-prem Hadoop
 
-1. Download or build bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar
+1. Download or build bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar
 2. Download service account credentials json from Google Cloud Console.
 3. Submit the job using your edge node's hadoop installation. 
    ```bash
-   # Export to SequenceFiles
-   GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json \
-   hadoop jar bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar \
-       export-table \
-       -Dgoogle.bigtable.project.id=<project-id> \
-       -Dgoogle.bigtable.instance.id=<instance-id> \
-       <table-id> \
-       <outputdir>
+    # Export to SequenceFiles
+    GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json \
+    hadoop jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
+        export-table \
+        -Dgoogle.bigtable.project.id=<project-id> \
+        -Dgoogle.bigtable.instance.id=<instance-id> \
+        <table-id> \
+        <outputdir>
 
-   # Import from SequenceFiles
-      GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json \
-      hadoop jar bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar \
-          import-table \
-          -Dgoogle.bigtable.project.id=<project-id> \
-          -Dgoogle.bigtable.instance.id=<instance-id> \
-          <table-id> \
-          <inputdir>
+    # Import from SequenceFiles
+    GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json \
+    hadoop jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
+        import-table \
+        -Dgoogle.bigtable.project.id=<project-id> \
+        -Dgoogle.bigtable.instance.id=<instance-id> \
+        <table-id> \
+        <inputdir>
+   
+    # Import from HBase snapshot
+    GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json \
+    hadoop jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
+        import-snapshot \
+        -Dgoogle.bigtable.project.id=<project-id> \
+        -Dgoogle.bigtable.instance.id=<instance-id> \
+        <snapshot-name> \
+        <snapshot-dir> \
+        <table-id> \
+        <tmp-dir>
+   
+    # HashTable on HBase
+    GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json \
+    hadoop jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
+        hash-table \
+        -Dhbase.zookeeper.quorum=<source-zk-quorum> \
+        <source-table-id> \
+        <hash-outputdir-hbase>
 
-   # Import from HBase snapshot
-      GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json \
-      hadoop jar bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar \
-          import-snapshot \
-          -Dgoogle.bigtable.project.id=<project-id> \
-          -Dgoogle.bigtable.instance.id=<instance-id> \
-          <snapshot-name> \
-          <snapshot-dir> \
-          <table-id> \
-          <tmp-dir>
+    # SyncTable on Bigtable (dryrun enabled by default)
+    GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json \
+    hadoop jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
+        sync-table \
+        --sourcezkcluster=<source-zk-quorum> \
+        --targetbigtableproject=<project-id> \
+        --targetbigtableinstance=<instance-id> \
+        <hash-outputdir-hbase> \
+        <source-table-id> \
+        <target-table-id>
    ```
 
-
 ### Dataproc
 
-1. Download or build bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar.
+1. Download or build bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar.
 2. Install the gcloud sdk.
 3. Configure [Bigtable IAM roles](https://cloud.google.com/bigtable/docs/access-control#roles) 
     for the [Dataproc Service Account](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/service-accounts#what_are_service_accounts) 
@@ -55,9 +93,10 @@ from/to Cloud Bigtable and import HBase snapshots using a Map Reduce cluster (ie
    ```bash
     # Export to SequenceFiles
     gcloud dataproc jobs submit hadoop \
+        --project <project-id> \
         --cluster <dataproc-cluster> \
         --region <dataproc-region> \
-        --jar bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar \
+        --jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
         -- \
         export-table \
         -Dgoogle.bigtable.project.id=<project-id> \
@@ -67,9 +106,10 @@ from/to Cloud Bigtable and import HBase snapshots using a Map Reduce cluster (ie
 
     # Import from SequenceFiles
     gcloud dataproc jobs submit hadoop \
+        --project <project-id> \
         --cluster <dataproc-cluster> \
         --region <dataproc-region> \
-        --jar bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar \
+        --jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
         -- \
         import-table \
         -Dgoogle.bigtable.project.id=<project-id> \
@@ -79,17 +119,45 @@ from/to Cloud Bigtable and import HBase snapshots using a Map Reduce cluster (ie
 
     # Import from HBase snapshot
     gcloud dataproc jobs submit hadoop \
+        --project <project-id> \
         --cluster <dataproc-cluster> \
         --region <dataproc-region> \
-        --jar bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar \
+        --jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
         -- \
         import-snapshot \
         -Dgoogle.bigtable.project.id=<project-id> \
         -Dgoogle.bigtable.instance.id=<instance-id> \
         <snapshot-name> \
         <snapshot-dir> \
         <table-id> \
-        <tmp-dir>   
+        <tmp-dir>
+   
+    # HashTable on HBase
+    gcloud dataproc jobs submit hadoop \
+        --project <project-id> \
+        --cluster <dataproc-cluster> \
+        --region <dataproc-region> \
+        --jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
+        -- \
+        hash-table \
+        -Dhbase.zookeeper.quorum=<source-zk-quorum> \
+        <source-table-id> \
+        <hash-outputdir-hbase>
+   
+    # SyncTable on Bigtable (dryrun enabled by default)
+    gcloud dataproc jobs submit hadoop \
+        --project <project-id> \
+        --cluster <dataproc-cluster> \
+        --region <dataproc-region> \
+        --jar bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar \
+        -- \
+        sync-table \
+        --sourcezkcluster=<source-zk-quorum> \
+        --targetbigtableproject=<project-id> \
+        --targetbigtableinstance=<instance-id> \
+        <hash-outputdir-hbase> \
+        <source-table-id> \
+        <target-table-id>
    ```
 
 ## Examples
@@ -109,7 +177,7 @@ for the on-prem application to write to GCS).
     ```bash
     hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot \
      -snapshot <snapshotName> \
-     -copy-to gs://<bucket/<snapshot-dir> \
+     -copy-to gs://<bucket>/<snapshot-dir> \
      -mappers <num-mappers>
     ```
 
@@ -121,24 +189,24 @@ environment variables for running the subsequent steps.
     GCP environment properties 
     ```bash
     # set env properties
-    export PROJECT_ID=<PROJECT_ID>
-    export ZONE=<ZONE>
-    export REGION=${ZONE%-*}
-    export DATAPROC_CLUSTER=<DATAPROC_CLUSTER_NAME>
+    PROJECT_ID=<PROJECT_ID>
+    ZONE=<ZONE>
+    REGION=${ZONE%-*}
+    DATAPROC_CLUSTER=<DATAPROC_CLUSTER_NAME>
     
     # bigtable table properties
-    export CBT_INSTANCE=<BIGTABLE_INSTANCE>
-    export CBT_CLUSTER=<BIGTABLE_CLUSTER>
-    export CBT_TABLENAME=<TABLENAME>
-    export CBT_COLUMN_FAMILY=<CF1[,CF]>
+    CBT_INSTANCE=<BIGTABLE_INSTANCE>
+    CBT_CLUSTER=<BIGTABLE_CLUSTER>
+    CBT_TABLENAME=<TABLENAME>
+    CBT_COLUMN_FAMILY=<CF1[,CF]>
     
     # dataproc job jar
-    export JOB_JAR=bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar
+    JOB_JAR=bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar
     
     # dataproc job args
-    export JOB_ARG_SNAPSHOT_NAME=<SNAPSHOT_NAME>
-    export JOB_ARG_SNAPSHOT_DIR=<SNAPSHOT_DIR>
-    export JOB_ARG_TEMP_DIR=<JOB_TEMP_DIR>
+    JOB_ARG_SNAPSHOT_NAME=<SNAPSHOT_NAME>
+    JOB_ARG_SNAPSHOT_DIR=<SNAPSHOT_DIR>
+    JOB_ARG_TEMP_DIR=<JOB_TEMP_DIR>
     ```
 
 2. [Create a Dataproc Cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) for executing the import snapshot job.
@@ -172,13 +240,14 @@ the command:
 
 #### Run the import snapshot job
 
-1. Run the following command to start the import snapshot job on the Dataproc cluster
+1. Run the following command to start the `import-snapshot` job on the Dataproc cluster
 that was created. Slowly scale the dataproc cluster to increase/decrease throughput 
 and similarly scale up/down the bigtable cluster to meet the throughput demand. See 
 Bigtable [scaling limitations](https://cloud.google.com/bigtable/docs/scaling#limitations) if observing slower performance than expected.
 
     ```bash
     gcloud dataproc jobs submit hadoop \
+        --project ${PROJECT_ID} \
         --cluster ${DATAPROC_CLUSTER} \
         --region ${REGION} \
@@ -226,12 +295,72 @@ setting the properties for the job. For example:
     -Dhbase.snapshot.thread.pool.max=10
     ```
 
+### Example jobs to validate the data migrated from source to target
+
+1. Set the following additional environment variables for running the validation steps.
+    ```bash
+    # hash-table validation job
+    HBASE_TABLENAME=<HBASE_TABLENAME>
+    # hbase zookeeper quorum (e.g. zk1.example.com:2181)
+    HBASE_ZK_QUORUM=<ZK_QUORUM>
+    HASH_OUTPUTDIR=<HASH_OUTPUTDIR>
+
+    # sync-table validation job
+    HBASE_ZK_QUORUM_FULL=${HBASE_ZK_QUORUM}:/hbase
+    ```
+
+2. Run `hash-table` and compute hashes for ranges on the source table and output 
+   results to a GCS bucket (See [HashTable/SyncTable](https://hbase.apache.org/book.html#_step_1_hashtable) doc for more details).
+    ```bash
+    hadoop jar ${JOB_JAR} \
+        hash-table \
+        -Dhbase.zookeeper.quorum=${HBASE_ZK_QUORUM} \
+        ${HBASE_TABLENAME} \
+        ${HASH_OUTPUTDIR}
+    ```
+
+3. Run `sync-table` to generate hashes on the target table and compare these hashes with
+   the output from `hash-table`. For diverging hashes, a cell-level comparison is performed    
+   between the source and target and summarized in the job counters. 
+    ```bash
+    # dryrun mode (readonly) enabled by default 
+    gcloud dataproc jobs submit hadoop \
+        --project ${PROJECT_ID} \
+        --cluster ${DATAPROC_CLUSTER} \
+        --region ${REGION} \
+        --jar ${JOB_JAR} \
+        -- \
+        sync-table \
+        --sourcezkcluster=${HBASE_ZK_QUORUM_FULL} \
+        --targetbigtableproject=${PROJECT_ID} \
+        --targetbigtableinstance=${CBT_INSTANCE} \
+        ${HASH_OUTPUTDIR} \
+        ${HBASE_TABLENAME} \
+        ${CBT_TABLENAME}
+    ```
+Note: Connection with the source is required for providing cell-level comparison. Users may 
+enable debug mode `--properties mapreduce.map.log.level=DEBUG` on the job to provide additional 
+details on the diverging hash ranges and cell mismatches if divergence is detected. Job 
+configurations may also be updated to run `hash-table` against Bigtable and `sync-table`
+against HBase.
+
+Additional Options:
+
+1. Disable dry run mode to perform synchronization between source and target for diverging hash ranges.
+
+    ```bash
+    --dryrun=false
+    ```
+
+2. Other job configuration and details may be referred to in [HBase SyncTable description](https://hbase.apache.org/book.html#_step_2_synctable).
+
 ## Backwards compatibility
 
 To maintain backwards compatibility of this artifact, we still provide
-`bigtable-hbase-1.x-mapreduce-2.0.0-alpha1.jar` artifact that includes
+`bigtable-hbase-1.x-mapreduce-2.5.0-shaded.jar` artifact that includes
 hadoop jars. However we encourage our users to migrate to 
-`bigtable-hbase-1.x-mapreduce-2.0.0-alpha1-hadoop.jar` to avoid dependency
+`bigtable-hbase-1.x-mapreduce-2.5.0-shaded-byo-hadoop.jar` to avoid dependency
 conflicts with the existing classpath on Hadoop workers.
 
 [//]: # ({x-version-update-end})
@@ -16,7 +16,9 @@
 package com.google.cloud.bigtable.mapreduce;
 
 import com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportHBaseSnapshotJob;
+import com.google.cloud.bigtable.mapreduce.validation.BigtableSyncTableJob;
 import org.apache.hadoop.classification.InterfaceStability.Evolving;
+import org.apache.hadoop.hbase.mapreduce.HashTable;
 import org.apache.hadoop.util.ProgramDriver;
 
 /** Driver for bigtable mapreduce jobs. Select which to run by passing name of job to this main. */
@@ -44,6 +46,14 @@ public static void main(String[] args) {
           "import-snapshot",
           ImportHBaseSnapshotJob.class,
           "A map/reduce program that imports an hbase snapshot to a table.");
+      programDriver.addClass(
+          "hash-table",
+          HashTable.class,
+          "A map/reduce program that computes hashes on source and outputs to filesystem (or cloud storage).");
+      programDriver.addClass(
+          "sync-table",
+          BigtableSyncTableJob.class,
+          "A map/reduce program that computes hashes on target and compares with hashes from source.");
       programDriver.driver(args);
       exitCode = programDriver.run(args);
     } catch (Throwable e) {
 
@@ -184,6 +184,9 @@ protected static int setConfFromArgs(Configuration conf, String[] args) {
         conf.get(BigtableOptionsFactory.INSTANCE_ID_KEY),
         conf.get(BigtableOptionsFactory.APP_PROFILE_ID_KEY, ""));
 
+    // Set user agent
+    conf.set(BigtableOptionsFactory.CUSTOM_USER_AGENT_KEY, "HBaseMRImport");
+
     // implicit table outputformat configs that are used in the job to write map output to a table
     conf.set(TableOutputFormat.OUTPUT_TABLE, conf.get(TABLENAME_KEY));
     conf.setStrings(