3737
3838_DATAFLOW_TEMPLATE_URL = 'gs://datcom-templates/templates/flex/differ.json'
3939
40+ _GROUPBY_KEYS = [
41+ 'variableMeasured' , 'observationAbout' , 'observationDate' ,
42+ 'observationPeriod' , 'measurementMethod' , 'unit' , 'scalingFactor'
43+ ]
44+
4045Diff = Enum ('Diff' , [
4146 ('ADDED' , 1 ),
4247 ('DELETED' , 2 ),
7277flags .DEFINE_string ('project_id' , '' , 'GCP project id for the dataflow job.' )
7378
7479
def val_str(value) -> str:
    """Render a property value as text.

    Lists are rendered element by element (recursively) and joined with
    commas. A non-empty string that contains a space and whose first
    character is alphabetic is wrapped in double quotes. Every other value
    is returned via ``str()``.
    """
    if isinstance(value, list):
        return ",".join(val_str(item) for item in value)

    # Falsy values and non-strings are never quoted.
    if not value or not isinstance(value, str):
        return str(value)

    # Only text containing a space and starting with a letter gets quotes;
    # strings that already start with a quote or a digit are left untouched.
    if " " not in value or not value[0].isalpha():
        return str(value)

    return f'"{value}"'
86+
87+
7588class ImportDiffer :
7689 """
7790 Utility to generate a diff of two versions of a dataset for import analysis.
@@ -80,6 +93,11 @@ class ImportDiffer:
8093 $ python import_differ.py --current_data=<path> --previous_data=<path> --output_location=<path> \
8194 --file_format=<mcf/tfrecord> --runner_mode=<native/direct/cloud> --project_id=<id> --job_name=<name>
8295
96+ Runner Modes:
97+ - native: Runs the differ using native Python (Pandas) locally.
98+ - direct: Runs the differ using the Apache Beam DirectRunner (Java jar) locally.
99+ - cloud: Runs the differ as a Dataflow job in GCP.
100+
83101 Summary output generated is of the form below showing
84102 counts of differences for each variable.
85103
@@ -90,7 +108,9 @@ class ImportDiffer:
90108 3 dcid:var4 0 2 0
91109
92110 Detailed diff output is written to files for further analysis.
93- - import-diff.mcf: combined MCF diff for observations and schema
111+ - nodes-added.mcf: MCF nodes added in the current version
112+ - nodes-deleted.mcf: MCF nodes deleted in the current version
113+ - nodes-modified.mcf: MCF nodes modified in the current version
94114 - differ_summary.json: consolidated diff statistics
95115
96116 """
@@ -139,7 +159,7 @@ def generate_diff(self, previous_df: pd.DataFrame,
139159 elif previous_df .empty and current_df .empty :
140160 column_list = [
141161 Column .key_combined .name , Column .value_combined .name + '_x' ,
142- Column .value_combined .name + '_y' + Column .diff_type .name
162+ Column .value_combined .name + '_y' , Column .diff_type .name
143163 ]
144164 return pd .DataFrame (columns = column_list )
145165 result = pd .merge (previous_df ,
@@ -160,7 +180,7 @@ def generate_diff(self, previous_df: pd.DataFrame,
160180 if result .empty :
161181 column_list = [
162182 Column .key_combined .name , Column .value_combined .name + '_x' ,
163- Column .value_combined .name + '_y' + Column .diff_type .name
183+ Column .value_combined .name + '_y' , Column .diff_type .name
164184 ]
165185 return pd .DataFrame (columns = column_list )
166186
@@ -179,13 +199,8 @@ def split_data(self, mcf_nodes: list) -> (pd.DataFrame, pd.DataFrame):
179199 if 'StatVarObservation' in node .get (Column .typeOf .name ):
180200 values_to_combine = []
181201 keys_to_combine = []
182- groupby_keys = [
183- 'variableMeasured' , 'observationAbout' , 'observationDate' ,
184- 'observationPeriod' , 'measurementMethod' , 'unit' ,
185- 'scalingFactor'
186- ]
187202 value_keys = [Column .value .name ]
188- for key in groupby_keys :
203+ for key in _GROUPBY_KEYS :
189204 keys_to_combine .append (str (node .get (key , "" )))
190205 for key in value_keys :
191206 values_to_combine .append (str (node .get (key , "" )))
@@ -211,7 +226,8 @@ def split_data(self, mcf_nodes: list) -> (pd.DataFrame, pd.DataFrame):
211226 node .pop ('Node' , None )
212227 value_keys = sorted (node .keys ())
213228 for key in value_keys :
214- values_to_combine .append (key + ":" + str (node .get (key , "" )))
229+ values_to_combine .append (key + ":" +
230+ val_str (node .get (key , "" )))
215231 key_combined = ";" .join (keys_to_combine )
216232 value_combined = ";" .join (values_to_combine )
217233 schema_list .append ({
@@ -224,16 +240,19 @@ def split_data(self, mcf_nodes: list) -> (pd.DataFrame, pd.DataFrame):
224240 obs_df = pd .DataFrame (obs_list )
225241 return obs_df , schema_df
226242
227- def convert_diff_to_mcf_nodes (self , diff_df : pd .DataFrame ,
228- is_obs : bool ) -> list :
243+ def convert_diff_to_mcf_nodes (self ,
244+ diff_df : pd .DataFrame ,
245+ is_obs : bool ,
246+ diff_type : str = None ) -> list :
229247 """
230248 Converts the diff dataframe back to MCF format nodes.
231249 """
232250 all_nodes = []
233- for diff_type in [
234- Diff .ADDED .name , Diff .DELETED .name , Diff .MODIFIED .name
235- ]:
236- df_type = diff_df [diff_df [Column .diff_type .name ] == diff_type ]
251+ diff_types = [diff_type ] if diff_type else [
252+ Diff .ADDED .name , Diff .DELETED .name , Diff .MODIFIED .name
253+ ]
254+ for d_type in diff_types :
255+ df_type = diff_df [diff_df [Column .diff_type .name ] == d_type ]
237256 if df_type .empty :
238257 continue
239258
@@ -242,7 +261,7 @@ def convert_diff_to_mcf_nodes(self, diff_df: pd.DataFrame,
242261 key_combined = str (row [Column .key_combined .name ])
243262
244263 # Determine which column to use for values and node IDs
245- suffix = '_x' if diff_type == Diff .DELETED .name else '_y'
264+ suffix = '_x' if d_type == Diff .DELETED .name else '_y'
246265
247266 # Helper to get value from row, handles cases with or without suffix
248267 def get_val (base_name ):
@@ -262,13 +281,8 @@ def get_val(base_name):
262281 node ['dcid' ] = dcid_id
263282
264283 # Reconstruct observation node
265- groupby_keys = [
266- 'variableMeasured' , 'observationAbout' ,
267- 'observationDate' , 'observationPeriod' ,
268- 'measurementMethod' , 'unit' , 'scalingFactor'
269- ]
270284 keys = key_combined .split (';' )
271- for i , key in enumerate (groupby_keys ):
285+ for i , key in enumerate (_GROUPBY_KEYS ):
272286 if i < len (keys ) and keys [i ] and keys [i ] != "nan" :
273287 node [key ] = keys [i ]
274288
@@ -290,7 +304,6 @@ def get_val(base_name):
290304 k , v = kv .split (':' , 1 )
291305 node [k ] = v
292306
293- node ['diffType' ] = diff_type
294307 all_nodes .append (node )
295308 return all_nodes
296309
@@ -422,12 +435,19 @@ def run_differ(self):
422435 current_df_schema )
423436
424437 logging .info ('Writing diff to MCF files...' )
425- obs_nodes = self .convert_diff_to_mcf_nodes (obs_diff , True )
426- schema_nodes = self .convert_diff_to_mcf_nodes (schema_diff , False )
427- all_nodes = obs_nodes + schema_nodes
428- if all_nodes :
429- differ_utils .write_mcf_nodes (all_nodes , self .output_path ,
430- 'import_diff.mcf' , tmp_path )
438+ for d_type , filename in [
439+ (Diff .ADDED .name , 'nodes-added.mcf' ),
440+ (Diff .DELETED .name , 'nodes-deleted.mcf' ),
441+ (Diff .MODIFIED .name , 'nodes-modified.mcf' ),
442+ ]:
443+ obs_nodes = self .convert_diff_to_mcf_nodes (
444+ obs_diff , True , d_type )
445+ schema_nodes = self .convert_diff_to_mcf_nodes (
446+ schema_diff , False , d_type )
447+ type_nodes = obs_nodes + schema_nodes
448+ if type_nodes :
449+ differ_utils .write_mcf_nodes (type_nodes , self .output_path ,
450+ filename , tmp_path )
431451
432452 obs_stats = obs_diff [Column .diff_type .name ].value_counts ().to_dict ()
433453 schema_stats = schema_diff [
0 commit comments