datacommonsorg
diff --git a/tools/import_differ/README.md b/tools/import_differ/README.md
Lines changed: 31 additions & 17 deletions
diff --git a/tools/import_differ/differ_utils.py b/tools/import_differ/differ_utils.py
Lines changed: 15 additions & 71 deletions
diff --git a/tools/import_differ/import_differ.py b/tools/import_differ/import_differ.py
Lines changed: 50 additions & 30 deletions
@@ -6,15 +6,16 @@ This utility generates a diff of two versions of a dataset for import analysis.
 
 ***Prerequisites***
 - Python/Pandas is installed for native runner mode.
+- Java JRE/JDK is installed for direct runner mode.
 - gcloud ADC is configured for cloud runner mode.
 
 ```bash
-python import_differ.py \
+python3 import_differ.py \
   --current_data=<path> \
   --previous_data=<path> \
   --output_location=<path> \
   --file_format=<mcf/tfrecord> \
-  --runner_mode=<local/cloud> \
+  --runner_mode=<native/direct/cloud> \
   --project_id=<id> \
   --job_name=<name>
 ```
@@ -31,18 +32,31 @@ python import_differ.py \
 
 ***Output***
 
-Summary output generated is of the form below showing counts of differences for each variable.
-
-| variableMeasured | ADDED | DELETED | MODIFIED |
-| :--- | :--- | :--- | :--- |
-| dcid:var1 | 1 | 0 | 0 |
-| dcid:var2 | 0 | 2 | 1 |
-| dcid:var3 | 0 | 0 | 1 |
-| dcid:var4 | 0 | 2 | 0 |
-
-Detailed diff output is written to files for further analysis. Sample result files can be found under folder 'test/results'.
-- obs\_diff\_summary.csv: diff summary for observation analysis
-- obs\_diff\_samples.csv: sample diff for observation analysis
-- obs\_diff\_log.csv: diff log for observations
-- schema\_diff\_summary.csv: diff summary for schema analysis
-- schema\_diff\_log.csv: diff log for schema nodes 
+The utility generates a summary of the differences and detailed MCF files.
+
+**Summary Output**
+A summary is printed to the logs and also written to `differ_summary.json` in the output directory:
+```json
+{
+    "current_version": "path/to/current",
+    "previous_version": "path/to/previous",
+    "current_obs_count": 1000,
+    "previous_obs_count": 950,
+    "current_schema_count": 100,
+    "previous_schema_count": 95,
+    "added_obs_count": 50,
+    "deleted_obs_count": 0,
+    "modified_obs_count": 10,
+    "added_schema_count": 5,
+    "deleted_schema_count": 0,
+    "modified_schema_count": 0,
+    "obs_diff_count": 60,
+    "schema_diff_count": 5
+}
+```
+
+**Detailed Diff Files**
+Detailed diff output is written to MCF files in the output directory:
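+- nodes-added.mcf: MCF nodes added in the current version (absent from the previous version)
+- nodes-deleted.mcf: MCF nodes deleted from the current version (present only in the previous version)
+- nodes-modified.mcf: MCF nodes whose values changed between the previous and current versions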
@@ -1,19 +1,17 @@
-import glob
-import fnmatch
 import json
 import os
 import pandas as pd
 import re
 
 from absl import logging
-from google.cloud import storage
+from util.file_util import FileIO
+from util.file_util import file_get_matching
 
 
 def load_mcf_file(file: str):
     """ Reads an MCF text file and returns mcf nodes."""
-    mcf_file = open(file, 'r', encoding='utf-8')
-    mcf_contents = mcf_file.read()
-    mcf_file.close()
+    with FileIO(file, 'r', encoding='utf-8') as mcf_file:
+        mcf_contents = mcf_file.read()
     # nodes separated by a blank line
     mcf_nodes_text = mcf_contents.split('\n\n')
     # lines seprated as property: constraint
@@ -36,7 +34,7 @@ def load_mcf_files(path: str) -> pd.DataFrame:
     """ Loads all sharded mcf files in the given directory and 
     returns a combined MCF node list."""
     node_list = []
-    filenames = glob.glob(path)
+    filenames = file_get_matching(path)
     logging.info(f'Loading {len(filenames)} files from path {path}')
     for filename in filenames:
         nodes = load_mcf_file(filename)
@@ -48,49 +46,33 @@ def load_csv_data(path: str, tmp_dir: str) -> pd.DataFrame:
     """ Loads all matched files in the given path and 
     returns a single combined dataframe."""
     df_list = []
-    pattern = path
-    if path.startswith('gs://'):
-        pattern = get_gcs_data(path, tmp_dir)
-
-    filenames = glob.glob(pattern)
+    filenames = file_get_matching(path)
     for filename in filenames:
-        df = pd.read_csv(filename)
-        df_list.append(df)
+        with FileIO(filename, mode='r') as in_file:
+            df = pd.read_csv(in_file)
+            df_list.append(df)
     result = pd.concat(df_list, ignore_index=True)
     return result
 
 
 def write_csv_data(df: pd.DataFrame, dest: str, file: str, tmp_dir: str):
     """ Writes a dataframe to a CSV file with the given path."""
-    if dest.startswith('gs://'):
-        path = os.path.join(tmp_dir, file)
-    else:
-        path = os.path.join(dest, file)
-    with open(path, mode='w', encoding='utf-8') as out_file:
+    path = os.path.join(dest, file)
+    with FileIO(path, mode='w', encoding='utf-8') as out_file:
         df.to_csv(out_file, index=False, mode='w', header=True)
-    if dest.startswith('gs://'):
-        upload_output_data(path, dest)
 
 
 def write_json_data(data, dest: str, file: str, tmp_dir: str):
     """ Writes data to a JSON file with the given path."""
-    if dest.startswith('gs://'):
-        path = os.path.join(tmp_dir, file)
-    else:
-        path = os.path.join(dest, file)
-    with open(path, mode='w', encoding='utf-8') as out_file:
+    path = os.path.join(dest, file)
+    with FileIO(path, mode='w', encoding='utf-8') as out_file:
         json.dump(data, out_file, indent=4)
-    if dest.startswith('gs://'):
-        upload_output_data(path, dest)
 
 
 def write_mcf_nodes(nodes: list, dest: str, file: str, tmp_dir: str):
     """ Writes mcf nodes to a file with the given path."""
-    if dest.startswith('gs://'):
-        path = os.path.join(tmp_dir, file)
-    else:
-        path = os.path.join(dest, file)
-    with open(path, mode='w', encoding='utf-8') as out_file:
+    path = os.path.join(dest, file)
+    with FileIO(path, mode='w', encoding='utf-8') as out_file:
         for node in nodes:
             if 'Node' in node:
                 out_file.write(f'Node: {node["Node"]}\n')
@@ -102,40 +84,6 @@ def write_mcf_nodes(nodes: list, dest: str, file: str, tmp_dir: str):
                     continue
                 out_file.write(f'{key}: {value}\n')
             out_file.write('\n')
-    if dest.startswith('gs://'):
-        upload_output_data(path, dest)
-
-
-def upload_output_data(src: str, dest: str):
-    client = storage.Client()
-    bucket_name = dest.split('/')[2]
-    bucket = client.get_bucket(bucket_name)
-    for filepath in glob.iglob(src):
-        filename = os.path.basename(filepath)
-        logging.info('Uploading %s to %s', filename, dest)
-        blobname = dest[len('gs://' + bucket_name + '/'):] + '/' + filename
-        blob = bucket.blob(blobname)
-        blob.upload_from_filename(filepath)
-
-
-def get_gcs_data(uri: str, dest_dir: str) -> str:
-    """ Downloads files from GCS and copies them to local.
-    Args:
-      uri: single file path or wildcard format 
-      dest_dir: destination folder
-    Returns:
-      path to the output file/folder
-    """
-    client = storage.Client()
-    bucket = client.get_bucket(uri.split('/')[2])
-    file_pat = uri.split(bucket.name, 1)[1][1:]
-    dirname = os.path.dirname(file_pat)
-    for blob in bucket.list_blobs(prefix=dirname):
-        if fnmatch.fnmatch(blob.name, file_pat):
-            dest_file = os.path.join(dest_dir, blob.name)
-            os.makedirs(os.path.dirname(dest_file), exist_ok=True)
-            blob.download_to_filename(dest_file)
-    return os.path.join(dest_dir, file_pat)
 
 
 def load_data(path: str, tmp_dir: str) -> list:
@@ -146,9 +94,5 @@ def load_data(path: str, tmp_dir: str) -> list:
     Returns:
       combined list of mcf nodes
     """
-    if path.startswith('gs://'):
-        os.makedirs(tmp_dir, exist_ok=True)
-        path = get_gcs_data(path, tmp_dir)
-
     mcf_nodes = load_mcf_files(path)
     return mcf_nodes
@@ -37,6 +37,11 @@
 
 _DATAFLOW_TEMPLATE_URL = 'gs://datcom-templates/templates/flex/differ.json'
 
+_GROUPBY_KEYS = [
+    'variableMeasured', 'observationAbout', 'observationDate',
+    'observationPeriod', 'measurementMethod', 'unit', 'scalingFactor'
+]
+
 Diff = Enum('Diff', [
     ('ADDED', 1),
     ('DELETED', 2),
@@ -72,6 +77,14 @@
 flags.DEFINE_string('project_id', '', 'GCP project id for the dataflow job.')
 
 
+def val_str(value) -> str:
+    if isinstance(value, list):
+        return ",".join([val_str(v) for v in value])
+    if value and isinstance(value, str) and " " in value and value[0].isalpha():
+        return '"' + value + '"'
+    return str(value)
+
+
 class ImportDiffer:
     """
   Utility to generate a diff of two versions of a dataset for import analysis. 
@@ -80,6 +93,11 @@ class ImportDiffer:
   $ python import_differ.py --current_data=<path> --previous_data=<path> --output_location=<path> \
     --file_format=<mcf/tfrecord> --runner_mode=<native/direct/cloud> --project_id=<id> --job_name=<name> 
 
+  Runner Modes:
+  - native: Runs the differ using native Python (Pandas) locally.
+  - direct: Runs the differ using the Apache Beam DirectRunner (Java jar) locally.
+  - cloud: Runs the differ as a Dataflow job in GCP.
+
   Summary output generated is of the form below showing 
   counts of differences for each variable.  
 
@@ -90,7 +108,9 @@ class ImportDiffer:
   3   dcid:var4       0      2       0
 
   Detailed diff output is written to files for further analysis.
-  - import-diff.mcf: combined MCF diff for observations and schema
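+  - nodes-added.mcf: MCF nodes added in the current version (absent from the previous version)
+  - nodes-deleted.mcf: MCF nodes deleted from the current version (present only in the previous version)
+  - nodes-modified.mcf: MCF nodes whose values changed between the previous and current versions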
   - differ_summary.json: consolidated diff statistics 
 
   """
@@ -139,7 +159,7 @@ def generate_diff(self, previous_df: pd.DataFrame,
         elif previous_df.empty and current_df.empty:
             column_list = [
                 Column.key_combined.name, Column.value_combined.name + '_x',
-                Column.value_combined.name + '_y' + Column.diff_type.name
+                Column.value_combined.name + '_y', Column.diff_type.name
             ]
             return pd.DataFrame(columns=column_list)
         result = pd.merge(previous_df,
@@ -160,7 +180,7 @@ def generate_diff(self, previous_df: pd.DataFrame,
         if result.empty:
             column_list = [
                 Column.key_combined.name, Column.value_combined.name + '_x',
-                Column.value_combined.name + '_y' + Column.diff_type.name
+                Column.value_combined.name + '_y', Column.diff_type.name
             ]
             return pd.DataFrame(columns=column_list)
 
@@ -179,13 +199,8 @@ def split_data(self, mcf_nodes: list) -> (pd.DataFrame, pd.DataFrame):
             if 'StatVarObservation' in node.get(Column.typeOf.name):
                 values_to_combine = []
                 keys_to_combine = []
-                groupby_keys = [
-                    'variableMeasured', 'observationAbout', 'observationDate',
-                    'observationPeriod', 'measurementMethod', 'unit',
-                    'scalingFactor'
-                ]
                 value_keys = [Column.value.name]
-                for key in groupby_keys:
+                for key in _GROUPBY_KEYS:
                     keys_to_combine.append(str(node.get(key, "")))
                 for key in value_keys:
                     values_to_combine.append(str(node.get(key, "")))
@@ -211,7 +226,8 @@ def split_data(self, mcf_nodes: list) -> (pd.DataFrame, pd.DataFrame):
                 node.pop('Node', None)
                 value_keys = sorted(node.keys())
                 for key in value_keys:
-                    values_to_combine.append(key + ":" + str(node.get(key, "")))
+                    values_to_combine.append(key + ":" +
+                                             val_str(node.get(key, "")))
                 key_combined = ";".join(keys_to_combine)
                 value_combined = ";".join(values_to_combine)
                 schema_list.append({
@@ -224,16 +240,19 @@ def split_data(self, mcf_nodes: list) -> (pd.DataFrame, pd.DataFrame):
         obs_df = pd.DataFrame(obs_list)
         return obs_df, schema_df
 
-    def convert_diff_to_mcf_nodes(self, diff_df: pd.DataFrame,
-                                  is_obs: bool) -> list:
+    def convert_diff_to_mcf_nodes(self,
+                                  diff_df: pd.DataFrame,
+                                  is_obs: bool,
+                                  diff_type: str = None) -> list:
         """
         Converts the diff dataframe back to MCF format nodes.
         """
         all_nodes = []
-        for diff_type in [
-                Diff.ADDED.name, Diff.DELETED.name, Diff.MODIFIED.name
-        ]:
-            df_type = diff_df[diff_df[Column.diff_type.name] == diff_type]
+        diff_types = [diff_type] if diff_type else [
+            Diff.ADDED.name, Diff.DELETED.name, Diff.MODIFIED.name
+        ]
+        for d_type in diff_types:
+            df_type = diff_df[diff_df[Column.diff_type.name] == d_type]
             if df_type.empty:
                 continue
 
@@ -242,7 +261,7 @@ def convert_diff_to_mcf_nodes(self, diff_df: pd.DataFrame,
                 key_combined = str(row[Column.key_combined.name])
 
                 # Determine which column to use for values and node IDs
-                suffix = '_x' if diff_type == Diff.DELETED.name else '_y'
+                suffix = '_x' if d_type == Diff.DELETED.name else '_y'
 
                 # Helper to get value from row, handles cases with or without suffix
                 def get_val(base_name):
@@ -262,13 +281,8 @@ def get_val(base_name):
                         node['dcid'] = dcid_id
 
                     # Reconstruct observation node
-                    groupby_keys = [
-                        'variableMeasured', 'observationAbout',
-                        'observationDate', 'observationPeriod',
-                        'measurementMethod', 'unit', 'scalingFactor'
-                    ]
                     keys = key_combined.split(';')
-                    for i, key in enumerate(groupby_keys):
+                    for i, key in enumerate(_GROUPBY_KEYS):
                         if i < len(keys) and keys[i] and keys[i] != "nan":
                             node[key] = keys[i]
 
@@ -290,7 +304,6 @@ def get_val(base_name):
                             k, v = kv.split(':', 1)
                             node[k] = v
 
-                node['diffType'] = diff_type
                 all_nodes.append(node)
         return all_nodes
 
@@ -422,12 +435,19 @@ def run_differ(self):
                                              current_df_schema)
 
             logging.info('Writing diff to MCF files...')
-            obs_nodes = self.convert_diff_to_mcf_nodes(obs_diff, True)
-            schema_nodes = self.convert_diff_to_mcf_nodes(schema_diff, False)
-            all_nodes = obs_nodes + schema_nodes
-            if all_nodes:
-                differ_utils.write_mcf_nodes(all_nodes, self.output_path,
-                                             'import_diff.mcf', tmp_path)
+            for d_type, filename in [
+                (Diff.ADDED.name, 'nodes-added.mcf'),
+                (Diff.DELETED.name, 'nodes-deleted.mcf'),
+                (Diff.MODIFIED.name, 'nodes-modified.mcf'),
+            ]:
+                obs_nodes = self.convert_diff_to_mcf_nodes(
+                    obs_diff, True, d_type)
+                schema_nodes = self.convert_diff_to_mcf_nodes(
+                    schema_diff, False, d_type)
+                type_nodes = obs_nodes + schema_nodes
+                if type_nodes:
+                    differ_utils.write_mcf_nodes(type_nodes, self.output_path,
+                                                 filename, tmp_path)
 
             obs_stats = obs_diff[Column.diff_type.name].value_counts().to_dict()
             schema_stats = schema_diff[