Enable loading several documents at the same time (#7)

cre-os · web-flow · commit f97eb2fcb4fd · 2024-07-23T13:58:37.000+02:00
diff --git a/.github/workflows/integration-tests-mssql.yml b/.github/workflows/integration-tests-mssql.yml
@@ -23,10 +23,10 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
       
-      - name: Set up Python 3.11
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: 3.11
+          python-version: 3.12
           
       - name: Install dependencies
         run: |
diff --git a/.github/workflows/integration-tests-mysql.yml b/.github/workflows/integration-tests-mysql.yml
@@ -32,10 +32,10 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
         
-      - name: Set up Python 3.11
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: 3.11
+          python-version: 3.12
           
       - name: Install dependencies
         run: |
diff --git a/.github/workflows/integration-tests-postgres.yml b/.github/workflows/integration-tests-postgres.yml
@@ -10,7 +10,7 @@ on:
 jobs:
   integration-tests:
     runs-on: ubuntu-latest
-    container: python:3.11-bookworm
+    container: python:3.12-bookworm
     services:
       postgres:
         image: postgres
@@ -29,10 +29,10 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
       
-      - name: Set up Python 3.11
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: 3.11
+          python-version: 3.12
           
       - name: Install dependencies
         run: |
diff --git a/.github/workflows/publish-to-gh-pages.yml b/.github/workflows/publish-to-gh-pages.yml
@@ -19,7 +19,7 @@ jobs:
           git config user.email 41898282+github-actions[bot]@users.noreply.github.com
       - uses: actions/setup-python@v5
         with:
-          python-version: 3.x
+          python-version: 3.12
       - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
       - uses: actions/cache@v4
         with:
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
@@ -15,7 +15,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v5
       with:
-        python-version: "3.11"
+        python-version: "3.12"
     - name: Install pypa/build
       run: >-
         python3 -m
diff --git a/docs/configuring.md b/docs/configuring.md
@@ -64,7 +64,7 @@ clustered columnstore indexes. The default value is `False` (disabled).
 useful for instance to add the name of the file which has been parsed, or a timestamp, etc. Columns should be specified
 as dicts, the only required keys are `name` and `type` (a SQLAlchemy type object); other keys will be passed directly
 as keyword arguments to `sqlalchemy.Column`. Actual values need to be passed to 
-[`Document.insert_into_target_tables`](api/document.md#xml2db.document.Document.insert_into_target_tables) for each 
+[`DataModel.parse_xml`](api/data_model.md#xml2db.model.DataModel.parse_xml) for each 
 parsed document, as a `dict`, using the `metadata` argument.
 * `record_hash_column_name`: the column name to use to store records hash data (defaults to `xml2db_record_hash`).
 * `record_hash_constructor`: a function used to build a hash, with a signature similar to `hashlib` constructor 
diff --git a/docs/getting_started.md b/docs/getting_started.md
@@ -121,6 +121,25 @@ troubleshooting if need be.
     [`metadata_columns` option](configuring.md#model-configuration) and create additional columns in the root table.
     It can be used for instance to save file name or loading timestamp.
 
+    Actual values need to be passed to [`DataModel.parse_xml`](api/data_model.md#xml2db.model.DataModel.parse_xml) for 
+    each parsed document, as a `dict`, using the `metadata` argument.
+
+!!! note
+    You can also load multiple documents into the database at the same time, which can make the process faster if 
+    you have a lot of small XML files to load:
+    ``` py
+    data = None
+    for xml_file in files:
+        document = data_model.parse_xml(
+            xml_file=xml_file,
+            flat_data=data,
+        )
+        data = document.data
+    document.insert_into_target_tables()
+    ```
+
+
+
 ## Getting back the data into XML
 
 You can extract the data from the database into XML files. This was implemented primarily to be able to test the package
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
@@ -6,4 +6,8 @@
   --md-accent-fg-color:         #116baa;
   --md-accent-fg-color--light:  #2a8cd0;
   --md-accent-fg-color--dark:   #116baa;
+}
+
+.md-typeset .admonition, .md-typeset details {
+  font-size: .75rem;
 }
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "xml2db"
-version = "0.11.0"
+version = "0.12.0"
 authors = [
   { name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
 ]
diff --git a/src/xml2db/document.py b/src/xml2db/document.py
@@ -37,9 +37,11 @@ def __init__(self, model: "DataModel"):
     def parse_xml(
         self,
         xml_file: Union[str, BytesIO],
+        metadata: dict = None,
         skip_validation: bool = True,
         iterparse: bool = True,
         recover: bool = False,
+        flat_data: dict = None,
     ) -> None:
         """Parse an XML document and apply transformation corresponding to the target data model
 
@@ -50,9 +52,13 @@ def parse_xml(
 
         Args:
             xml_file: The path or the file object of an XML file to parse
+            metadata: A dict of metadata values to add to the root table (a value for each key defined in
+                `metadata_columns` passed to model config)
             skip_validation: Should we validate the document against the schema first?
             iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory
             recover: Should we try to parse incorrect XML? (argument passed to lxml parser)
+            flat_data: A dict containing flat data if we want to add data to another dataset instead of creating
+                a new one
         """
         self.xml_file_path = xml_file[:255] if isinstance(xml_file, str) else "<stream>"
 
@@ -69,7 +75,11 @@ def parse_xml(
             document_tree = self.model.model_config["document_tree_hook"](document_tree)
 
         logger.info(f"Adding records to data model for {self.xml_file_path}")
-        self.data = self.doc_tree_to_flat_data(document_tree)
+        self.data = self.doc_tree_to_flat_data(
+            document_tree,
+            metadata=metadata,
+            flat_data=flat_data,
+        )
 
         logger.debug(self.__repr__())
 
@@ -90,11 +100,16 @@ def to_xml(
         converter.document_tree = self.flat_data_to_doc_tree()
         return converter.to_xml(out_file=out_file, nsmap=nsmap, indent=indent)
 
-    def doc_tree_to_flat_data(self, document_tree: tuple) -> dict:
+    def doc_tree_to_flat_data(
+        self, document_tree: tuple, metadata: dict = None, flat_data: dict = None
+    ) -> dict:
         """Convert document tree (nested dict) to flat tables data model to prepare database import
 
         Args:
             document_tree: A tuple (node_type, content, hash) containing the document tree
+            metadata: A dict of metadata values to add to the root table (a value for each key defined in
+                `metadata_columns` passed to model config)
+            flat_data: A dict to store the flat data into
 
         Returns:
             A dict containing flat tables
@@ -108,6 +123,7 @@ def _extract_node(
             Args:
                 node: A tuple (node_type, content, hash) containing a node of the document tree
                 pk_parent_node: The primary key of its parent node
+                row_number: The row number of the record
                 data_model: The dict to write output to
 
             Returns:
@@ -196,6 +212,12 @@ def _extract_node(
                     else:
                         record[f"temp_{rel.field_name}"] = None
 
+            # write metadata if it is the root table
+            if pk_parent_node == 0 and isinstance(metadata, dict):
+                for meta_col in self.model.model_config.get("metadata_columns", []):
+                    if meta_col["name"] in metadata:
+                        record[meta_col["name"]] = metadata[meta_col["name"]]
+
             record[self.model.model_config["record_hash_column_name"]] = node_hash
 
             # add n-n relationship data for reused children nodes
@@ -231,7 +253,7 @@ def _extract_node(
 
             return record_pk
 
-        flat_tables = {}
+        flat_tables = flat_data if flat_data else {}
         _extract_node(document_tree, 0, 0, flat_tables)
 
         return flat_tables
@@ -346,17 +368,13 @@ def _build_node(node_type: str, node_pk: int) -> tuple:
             int(list(data_index[self.model.root_table]["records"].keys())[0]),
         )
 
-    def insert_into_temp_tables(
-        self, max_lines: int = -1, metadata: dict = None
-    ) -> None:
+    def insert_into_temp_tables(self, max_lines: int = -1) -> None:
         """Insert data into temporary tables
 
         (Re)creates temp tables before inserting data.
 
         Args:
             max_lines: The maximum number of lines to insert in a single statement
-            metadata: A dict of metadata values to add to the root table (a value for each key defined in
-                `metadata_columns` passed to model config)
         """
         logger.info(f"Dropping temp tables if exist for {self.xml_file_path}")
         self.model.drop_all_temp_tables()
@@ -365,11 +383,6 @@ def insert_into_temp_tables(
         self.model.create_all_tables(temp=True)
 
         logger.info(f"Inserting data into temporary tables from {self.xml_file_path}")
-        # write metadata into the root table data
-        root_data = self.data[self.model.root_table]["records"][0]
-        for meta_col in self.model.model_config.get("metadata_columns", []):
-            if meta_col["name"] in metadata:
-                root_data[meta_col["name"]] = metadata[meta_col["name"]]
         # insert data (order does not really matter)
         for tb in self.model.fk_ordered_tables:
             for query, data in tb.get_insert_temp_records_statements(
@@ -418,7 +431,6 @@ def insert_into_target_tables(
         self,
         single_transaction: bool = True,
         max_lines: int = -1,
-        metadata: dict = None,
     ) -> int:
         """Insert and merge data into the database
 
@@ -429,8 +441,6 @@ def insert_into_target_tables(
                 scope required to ensure database consistency?
             max_lines: The maximum number of lines to insert in a single statement when loading data to the temporary
                 tables
-            metadata: A dict of metadata values to add to the root table (a value for each key defined in
-                `metadata_columns` passed to model config)
 
         Returns:
             The number of inserted rows
@@ -444,7 +454,7 @@ def insert_into_target_tables(
             logger.error(e)
             raise
         try:
-            self.insert_into_temp_tables(max_lines, metadata)
+            self.insert_into_temp_tables(max_lines)
         except Exception as e:
             logger.error(
                 f"Error while importing into temporary tables from {self.xml_file_path}"
diff --git a/src/xml2db/model.py b/src/xml2db/model.py
@@ -676,29 +676,37 @@ def drop_all_temp_tables(self):
     def parse_xml(
         self,
         xml_file: Union[str, BytesIO],
+        metadata: dict = None,
         skip_validation: bool = True,
         iterparse: bool = True,
         recover: bool = False,
+        flat_data: dict = None,
     ) -> Document:
         """Parse an XML document based on this data model
 
         This method is just a wrapper around the parse_xml method of the Document class.
 
         Args:
             xml_file: The path or the file object of an XML file to parse
+            metadata: A dict of metadata values to add to the root table (a value for each key defined in
+                `metadata_columns` passed to model config)
             skip_validation: Should we validate the documents against the schema first?
             iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory
             recover: Should we try to parse incorrect XML? (argument passed to lxml parser)
+            flat_data: A dict containing flat data if we want to add data to another dataset instead of creating
+                a new one
 
         Returns:
             A parsed [`Document`](document.md) object
         """
         doc = Document(self)
         doc.parse_xml(
             xml_file=xml_file,
+            metadata=metadata,
             skip_validation=skip_validation,
             iterparse=iterparse,
             recover=recover,
+            flat_data=flat_data,
         )
         return doc
 
diff --git a/tests/sample_models/models.py b/tests/sample_models/models.py
@@ -6,9 +6,9 @@
 def make_sample_index(table_name):
     def wrapped():
         yield sqlalchemy.Index(
-            f"{table_name}_fk_parent_REMITTable1_idx",
-            "fk_parent_REMITTable1"
+            f"{table_name}_fk_parent_REMITTable1_idx", "fk_parent_REMITTable1"
         )
+
     return wrapped
 
 
@@ -219,4 +219,3 @@ def _generate_models_output():
 
 if __name__ == "__main__":
     _generate_models_output()
-
diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py

Original file line number	Diff line number	Diff line change
`@@ -6,4 +6,8 @@`
`6`	`6`	`--md-accent-fg-color: #116baa;`
`7`	`7`	`--md-accent-fg-color--light: #2a8cd0;`
`8`	`8`	`--md-accent-fg-color--dark: #116baa;`
	`9`	`+}`
	`10`	`+`
	`11`	`+.md-typeset .admonition, .md-typeset details {`
	`12`	`+ font-size: .75rem;`
`9`	`13`	`}`
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"`
`4`	`4`
`5`	`5`	`[project]`
`6`	`6`	`name = "xml2db"`
`7`		`-version = "0.11.0"`
	`7`	`+version = "0.12.0"`
`8`	`8`	`authors = [`
`9`	`9`	`{ name="Commission de régulation de l'énergie", email="opensource@cre.fr" },`
`10`	`10`	`]`