@@ -37,9 +37,11 @@ def __init__(self, model: "DataModel"):
3737 def parse_xml (
3838 self ,
3939 xml_file : Union [str , BytesIO ],
40+ metadata : dict = None ,
4041 skip_validation : bool = True ,
4142 iterparse : bool = True ,
4243 recover : bool = False ,
44+ flat_data : dict = None ,
4345 ) -> None :
4446 """Parse an XML document and apply transformation corresponding to the target data model
4547
@@ -50,9 +52,13 @@ def parse_xml(
5052
5153 Args:
5254 xml_file: The path or the file object of an XML file to parse
55+ metadata: A dict of metadata values to add to the root table (a value for each key defined in
56+ `metadata_columns` passed to model config)
5357 skip_validation: Should we skip validating the document against the schema before parsing?
5458 iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory
5559 recover: Should we try to parse incorrect XML? (argument passed to lxml parser)
60+ flat_data: An existing dict of flat data to add the parsed records to, instead of creating
61+ a new one
5662 """
5763 self .xml_file_path = xml_file [:255 ] if isinstance (xml_file , str ) else "<stream>"
5864
@@ -69,7 +75,11 @@ def parse_xml(
6975 document_tree = self .model .model_config ["document_tree_hook" ](document_tree )
7076
7177 logger .info (f"Adding records to data model for { self .xml_file_path } " )
72- self .data = self .doc_tree_to_flat_data (document_tree )
78+ self .data = self .doc_tree_to_flat_data (
79+ document_tree ,
80+ metadata = metadata ,
81+ flat_data = flat_data ,
82+ )
7383
7484 logger .debug (self .__repr__ ())
7585
@@ -90,11 +100,16 @@ def to_xml(
90100 converter .document_tree = self .flat_data_to_doc_tree ()
91101 return converter .to_xml (out_file = out_file , nsmap = nsmap , indent = indent )
92102
93- def doc_tree_to_flat_data (self , document_tree : tuple ) -> dict :
103+ def doc_tree_to_flat_data (
104+ self , document_tree : tuple , metadata : dict = None , flat_data : dict = None
105+ ) -> dict :
94106 """Convert document tree (nested dict) to flat tables data model to prepare database import
95107
96108 Args:
97109 document_tree: A tuple (node_type, content, hash) containing the document tree
110+ metadata: A dict of metadata values to add to the root table (a value for each key defined in
111+ `metadata_columns` passed to model config)
112+ flat_data: A dict to store the flat data into
98113
99114 Returns:
100115 A dict containing flat tables
@@ -108,6 +123,7 @@ def _extract_node(
108123 Args:
109124 node: A tuple (node_type, content, hash) containing a node of the document tree
110125 pk_parent_node: The primary key of its parent node
126+ row_number: The row number of the record
111127 data_model: The dict to write output to
112128
113129 Returns:
@@ -196,6 +212,12 @@ def _extract_node(
196212 else :
197213 record [f"temp_{ rel .field_name } " ] = None
198214
215+ # write metadata if it is the root table
216+ if pk_parent_node == 0 and isinstance (metadata , dict ):
217+ for meta_col in self .model .model_config .get ("metadata_columns" , []):
218+ if meta_col ["name" ] in metadata :
219+ record [meta_col ["name" ]] = metadata [meta_col ["name" ]]
220+
199221 record [self .model .model_config ["record_hash_column_name" ]] = node_hash
200222
201223 # add n-n relationship data for reused children nodes
@@ -231,7 +253,7 @@ def _extract_node(
231253
232254 return record_pk
233255
234- flat_tables = {}
256+ flat_tables = flat_data if flat_data else {}
235257 _extract_node (document_tree , 0 , 0 , flat_tables )
236258
237259 return flat_tables
@@ -346,17 +368,13 @@ def _build_node(node_type: str, node_pk: int) -> tuple:
346368 int (list (data_index [self .model .root_table ]["records" ].keys ())[0 ]),
347369 )
348370
349- def insert_into_temp_tables (
350- self , max_lines : int = - 1 , metadata : dict = None
351- ) -> None :
371+ def insert_into_temp_tables (self , max_lines : int = - 1 ) -> None :
352372 """Insert data into temporary tables
353373
354374 (Re)creates temp tables before inserting data.
355375
356376 Args:
357377 max_lines: The maximum number of lines to insert in a single statement
358- metadata: A dict of metadata values to add to the root table (a value for each key defined in
359- `metadata_columns` passed to model config)
360378 """
361379 logger .info (f"Dropping temp tables if exist for { self .xml_file_path } " )
362380 self .model .drop_all_temp_tables ()
@@ -365,11 +383,6 @@ def insert_into_temp_tables(
365383 self .model .create_all_tables (temp = True )
366384
367385 logger .info (f"Inserting data into temporary tables from { self .xml_file_path } " )
368- # write metadata into the root table data
369- root_data = self .data [self .model .root_table ]["records" ][0 ]
370- for meta_col in self .model .model_config .get ("metadata_columns" , []):
371- if meta_col ["name" ] in metadata :
372- root_data [meta_col ["name" ]] = metadata [meta_col ["name" ]]
373386 # insert data (order does not really matter)
374387 for tb in self .model .fk_ordered_tables :
375388 for query , data in tb .get_insert_temp_records_statements (
@@ -418,7 +431,6 @@ def insert_into_target_tables(
418431 self ,
419432 single_transaction : bool = True ,
420433 max_lines : int = - 1 ,
421- metadata : dict = None ,
422434 ) -> int :
423435 """Insert and merge data into the database
424436
@@ -429,8 +441,6 @@ def insert_into_target_tables(
429441 scope required to ensure database consistency?
430442 max_lines: The maximum number of lines to insert in a single statement when loading data to the temporary
431443 tables
432- metadata: A dict of metadata values to add to the root table (a value for each key defined in
433- `metadata_columns` passed to model config)
434444
435445 Returns:
436446 The number of inserted rows
@@ -444,7 +454,7 @@ def insert_into_target_tables(
444454 logger .error (e )
445455 raise
446456 try :
447- self .insert_into_temp_tables (max_lines , metadata )
457+ self .insert_into_temp_tables (max_lines )
448458 except Exception as e :
449459 logger .error (
450460 f"Error while importing into temporary tables from { self .xml_file_path } "
0 commit comments