Skip to content

Commit f4e6fd5

Browse files
authored
Switch to lxml XSD validator (#12)
* Add imported XSD schema to test cases * Switch XML validator to lxml * Improve docs
1 parent 1572758 commit f4e6fd5

7 files changed

Lines changed: 57 additions & 53 deletions

File tree

docs/index.md

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,15 @@ title: "Loading XML into a relational database"
44

55
# Loading XML into a relational database
66

7-
`xml2db` is a Python package which allows parsing and loading XML files into a relational database. It builds
8-
automatically a mapping between an XML schema and a set of tables in the database. It means that it can handle complex
9-
XML files which cannot be denormalized into flat tables. It works out of the box, without any custom mapping rules.
7+
`xml2db` is a Python package which allows parsing and loading XML files into a relational database:
108

11-
`xml2db` fits well within an [Extract, Load, Transform](https://docs.getdbt.com/terms/elt) data pipeline pattern as it
12-
allows loading XML files into a relational data model which is very close to the source data, yet easy to work with,
13-
being flat database tables.
9+
* it automatically maps an XSD schema to a set of tables in the database
10+
* it can handle complex XML files which cannot be denormalized into flat tables
11+
* it works out of the box, without any custom mapping rules.
1412

15-
Starting from an XSD schema which represents a given XML structure, `xml2db` builds a data model, i.e. a set of database
16-
tables linked to each other by foreign keys relationships. Then, it allows parsing and loading XML files into the
17-
database, and getting them back from the database into XML format if needed.
13+
`xml2db` fits well within an [Extract, Load, Transform](https://docs.getdbt.com/terms/elt) data pipeline pattern: it
14+
loads XML files into a relational data model which is very close to the source data, yet easy to work with, being flat
15+
database tables.
1816

1917
## How to load XML files into a database
2018

@@ -130,4 +128,4 @@ XML data.
130128
Contributions are welcome, as well as bug reports, starting on the project's
131129
[issue page](https://github.com/cre-dev/xml2db/issues).
132130

133-
131+
If you find this package useful, you can give it a star on [`xml2db`'s GitHub repo](https://github.com/cre-dev/xml2db)!

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "xml2db"
7-
version = "0.12.1"
7+
version = "0.12.2"
88
authors = [
99
{ name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
1010
]

src/xml2db/model.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import xmlschema
1010
import sqlalchemy
11+
from lxml import etree
1112
from sqlalchemy import MetaData, create_engine, inspect
1213
from sqlalchemy.sql.ddl import CreateIndex, CreateTable
1314
from sqlalchemy.exc import ProgrammingError
@@ -50,6 +51,7 @@ class DataModel:
5051
5152
Attributes:
5253
xml_schema: The `xmlschema.XMLSchema` object associated with this data model
54+
lxml_schema: The `lxml.etree.XMLSchema` object associated with this data model
5355
data_flow_name: A short identifier used for the data model (`short_name` argument value)
5456
data_flow_long_name: A longer name for the data model (`long_name` argument value)
5557
db_schema: A database schema name to store the database tables
@@ -73,22 +75,22 @@ def __init__(
7375
base_url: str = None,
7476
model_config: dict = None,
7577
connection_string: str = None,
76-
db_engine: str = None,
78+
db_engine: sqlalchemy.Engine = None,
7779
db_type: str = None,
7880
db_schema: str = None,
7981
temp_prefix: str = None,
8082
):
8183
self.model_config = self._validate_config(model_config)
8284
self.tables_config = model_config.get("tables", {}) if model_config else {}
8385

84-
self.xml_schema = xmlschema.XMLSchema(
85-
os.path.basename(xsd_file) if base_url is None else xsd_file,
86-
base_url=(
87-
base_url
88-
if base_url is not None
89-
else os.path.normpath(os.path.dirname(xsd_file))
90-
),
91-
)
86+
xsd_file_name = xsd_file
87+
if base_url is None:
88+
base_url = os.path.normpath(os.path.dirname(xsd_file))
89+
xsd_file_name = os.path.basename(xsd_file)
90+
91+
self.xml_schema = xmlschema.XMLSchema(xsd_file_name, base_url=base_url)
92+
self.lxml_schema = etree.XMLSchema(etree.parse(xsd_file))
93+
9294
self.xml_converter = XMLConverter(data_model=self)
9395
self.data_flow_name = short_name
9496
self.data_flow_long_name = long_name

src/xml2db/xml_converter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def parse_xml(
7474
logger.info("Skipping XML file validation")
7575
else:
7676
logger.info("Validating XML file against the schema")
77-
if not self.model.xml_schema.is_valid(xt if xt else xml_file):
77+
if not self.model.lxml_schema.validate(xt if xt else etree.parse(xml_file)):
7878
logger.error(f"XML file {file_path} does not conform with the schema")
7979
raise ValueError(
8080
f"XML file {file_path} does not conform with the schema"
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<?xml version="1.0" encoding="UTF-8" ?>
2+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
3+
targetNamespace="http://www.cre.fr/xml2db/base_types.xsd"
4+
elementFormDefault="qualified">
5+
<xs:simpleType name="stringtype">
6+
<xs:restriction base="xs:string"/>
7+
</xs:simpleType>
8+
<xs:simpleType name="inttype">
9+
<xs:restriction base="xs:integer"/>
10+
</xs:simpleType>
11+
<xs:simpleType name="dectype">
12+
<xs:restriction base="xs:decimal"/>
13+
</xs:simpleType>
14+
</xs:schema>

tests/sample_models/orders/orders.xsd

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,18 @@
11
<?xml version="1.0" encoding="UTF-8" ?>
2-
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
3-
4-
<xs:simpleType name="stringtype">
5-
<xs:restriction base="xs:string"/>
6-
</xs:simpleType>
7-
8-
<xs:simpleType name="inttype">
9-
<xs:restriction base="xs:integer"/>
10-
</xs:simpleType>
11-
12-
<xs:simpleType name="dectype">
13-
<xs:restriction base="xs:decimal"/>
14-
</xs:simpleType>
15-
2+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
3+
xmlns:bt="http://www.cre.fr/xml2db/base_types.xsd">
4+
<xs:import schemaLocation="base_types.xsd"
5+
namespace="http://www.cre.fr/xml2db/base_types.xsd" />
166
<xs:complexType name="companyIdType">
177
<xs:choice>
18-
<xs:element name="ace" type="stringtype"/>
19-
<xs:element name="bic" type="stringtype"/>
20-
<xs:element name="lei" type="stringtype"/>
8+
<xs:element name="ace" type="bt:stringtype"/>
9+
<xs:element name="bic" type="bt:stringtype"/>
10+
<xs:element name="lei" type="bt:stringtype"/>
2111
</xs:choice>
2212
</xs:complexType>
2313

2414
<xs:simpleType name="ZipCodeType-base">
25-
<xs:restriction base="stringtype">
15+
<xs:restriction base="bt:stringtype">
2616
<xs:maxLength value="10"/>
2717
</xs:restriction>
2818
</xs:simpleType>
@@ -37,29 +27,29 @@
3727

3828
<xs:complexType name="contacttype">
3929
<xs:sequence>
40-
<xs:element name="name" type="stringtype"/>
41-
<xs:element name="address" type="stringtype"/>
42-
<xs:element name="city" type="stringtype"/>
30+
<xs:element name="name" type="bt:stringtype"/>
31+
<xs:element name="address" type="bt:stringtype"/>
32+
<xs:element name="city" type="bt:stringtype"/>
4333
<xs:element name="zip" type="ZipCodeType"/>
44-
<xs:element name="country" type="stringtype"/>
45-
<xs:element name="phoneNumber" type="stringtype" minOccurs="0" maxOccurs="unbounded"/>
34+
<xs:element name="country" type="bt:stringtype"/>
35+
<xs:element name="phoneNumber" type="bt:stringtype" minOccurs="0" maxOccurs="unbounded"/>
4636
<xs:element name="companyId" type="companyIdType" minOccurs="0" maxOccurs="1"/>
4737
</xs:sequence>
4838
</xs:complexType>
4939

5040
<xs:complexType name="producttype">
5141
<xs:sequence>
52-
<xs:element name="name" type="stringtype"/>
53-
<xs:element name="version" type="stringtype"/>
42+
<xs:element name="name" type="bt:stringtype"/>
43+
<xs:element name="version" type="bt:stringtype"/>
5444
</xs:sequence>
5545
</xs:complexType>
5646

5747
<xs:complexType name="itemtype">
5848
<xs:sequence>
5949
<xs:element name="product" type="producttype" minOccurs="1" maxOccurs="1"/>
60-
<xs:element name="note" type="stringtype" minOccurs="0"/>
61-
<xs:element name="quantity" type="inttype"/>
62-
<xs:element name="price" type="dectype"/>
50+
<xs:element name="note" type="bt:stringtype" minOccurs="0"/>
51+
<xs:element name="quantity" type="bt:inttype"/>
52+
<xs:element name="price" type="bt:dectype"/>
6353
</xs:sequence>
6454
</xs:complexType>
6555

@@ -69,15 +59,15 @@
6959
<xs:element name="shipto" type="contacttype" minOccurs="0" maxOccurs="1"/>
7060
<xs:element name="item" maxOccurs="unbounded" type="itemtype"/>
7161
</xs:sequence>
72-
<xs:attribute name="orderid" type="stringtype" use="required" />
62+
<xs:attribute name="orderid" type="bt:stringtype" use="required" />
7363
<xs:attribute name="processed_at" type="xs:dateTime" />
7464
</xs:complexType>
7565

7666
<xs:complexType name="orderstype">
7767
<xs:sequence>
7868
<xs:element name="shiporder" type="shipordertype" minOccurs="0" maxOccurs="unbounded" />
7969
</xs:sequence>
80-
<xs:attribute name="batch_id" type="stringtype" />
70+
<xs:attribute name="batch_id" type="bt:stringtype" />
8171
</xs:complexType>
8272

8373
<xs:element name="orders" type="orderstype"/>

tests/test_validation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@
1414
("invalid", True, True, ValueError),
1515
("invalid", False, False, ValueError),
1616
("invalid", False, True, ValueError),
17-
("malformed_recover", True, False, xml.etree.ElementTree.ParseError),
17+
("malformed_recover", True, False, lxml.etree.XMLSyntaxError),
1818
("malformed_recover", True, True, None),
1919
("malformed_recover", False, False, lxml.etree.XMLSyntaxError),
2020
("malformed_recover", False, True, None),
21-
("malformed_no_recover", True, False, xml.etree.ElementTree.ParseError),
21+
("malformed_no_recover", True, False, lxml.etree.XMLSyntaxError),
2222
("malformed_no_recover", True, True, ValueError),
2323
("malformed_no_recover", False, False, lxml.etree.XMLSyntaxError),
2424
("malformed_no_recover", False, True, ValueError),

0 commit comments

Comments
 (0)