From 76df13322c3544bc50b987b308751a4b704ed40e Mon Sep 17 00:00:00 2001 From: hpoeche Date: Wed, 15 Apr 2026 16:27:52 +0200 Subject: [PATCH 1/3] sdk: fix langtag constraints in LangStringSet Previously, the LangStringSet checked that added language tags matched the format xx or xx-XX. This did not follow the documented behavior, which requires all IETF BCP 47 conformant language tags to be accepted. This change replaces the previous constraint check with a regular-expression check that follows the IETF BCP 47 standard. Fixes #157 --- sdk/basyx/aas/model/base.py | 35 ++++++++++++++++++++++++++++++----- sdk/test/model/test_base.py | 30 +++++++++++++++++++----------- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/sdk/basyx/aas/model/base.py b/sdk/basyx/aas/model/base.py index f6b55fa8..ea613fc8 100644 --- a/sdk/basyx/aas/model/base.py +++ b/sdk/basyx/aas/model/base.py @@ -301,11 +301,36 @@ def __init__(self, dict_: Dict[str, str]): @classmethod def _check_language_tag_constraints(cls, ltag: str): - split = ltag.split("-", 1) - lang_code = split[0] - if len(lang_code) != 2 or not lang_code.isalpha() or not lang_code.islower(): - raise ValueError(f"The language code of the language tag must consist of exactly two lower-case letters! 
" - f"Given language tag and language code: '{ltag}', '{lang_code}'") + alphanum = "[a-zA-Z0-9]" + singleton = "[0-9A-WY-Za-wy-z]" + extension = f"{singleton}(-({alphanum}){{2,8}})+" + extlang = "[a-zA-Z]{3}(-[a-zA-Z]{3}){2}" + irregular = ( + "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|" + "i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|" + "i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)" + ) + regular = ( + "(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|" + "zh-min|zh-min-nan|zh-xiang)" + ) + grandfathered = f"({irregular}|{regular})" + language = f"([a-zA-Z]{{2,3}}(-{extlang})?|[a-zA-Z]{{4}}|[a-zA-Z]{{5,8}})" + script = "[a-zA-Z]{4}" + region = "([a-zA-Z]{2}|[0-9]{3})" + variant = f"(({alphanum}){{5,8}}|[0-9]({alphanum}){{3}})" + privateuse = f"[xX](-({alphanum}){{1,8}})+" + langtag = ( + f"{language}(-{script})?(-{region})?(-{variant})*(-{extension})*(-" + f"{privateuse})?" + ) + language_tag = f"({langtag}|{privateuse}|{grandfathered})" + + pattern = f"^{language_tag}$" + + if re.match(pattern, ltag) is None: + raise ValueError(f"The language tag must follow the format defined in BCP 47. " + f"Given language tag: {ltag}") def __getitem__(self, item: str) -> str: return self._dict[item] diff --git a/sdk/test/model/test_base.py b/sdk/test/model/test_base.py index 98c6cfb8..b40174b5 100644 --- a/sdk/test/model/test_base.py +++ b/sdk/test/model/test_base.py @@ -1230,20 +1230,28 @@ def hook(itm: int, _list: List[int]) -> None: class LangStringSetTest(unittest.TestCase): def test_language_tag_constraints(self) -> None: + with self.assertRaises(ValueError): + model.LangStringSet({"": "bar"}) + with self.assertRaises(ValueError) as cm: - model.LangStringSet({"foo": "bar"}) - self.assertEqual("The language code of the language tag must consist of exactly two lower-case letters! 
" - "Given language tag and language code: 'foo', 'foo'", str(cm.exception)) + model.LangStringSet({"x": "bar"}) + self.assertEqual(f"The language tag must follow the format defined in BCP 47. " + f"Given language tag: x", cm.exception.args[0]) - lss = model.LangStringSet({"fo-OO": "bar"}) with self.assertRaises(ValueError) as cm: - lss["foo"] = "bar" - self.assertEqual("The language code of the language tag must consist of exactly two lower-case letters! " - "Given language tag and language code: 'foo', 'foo'", str(cm.exception)) - self.assertNotIn("foo", lss) - self.assertNotIn("fo", lss) - lss["fo"] = "bar" - self.assertIn("fo", lss) + model.LangStringSet({"foo-oo1": "bar"}) + self.assertEqual(f"The language tag must follow the format defined in BCP 47. " + f"Given language tag: foo-oo1", cm.exception.args[0]) + + lss = model.LangStringSet({"fo-OO": "bar"}) + self.assertIn("fo-OO", lss) + with self.assertRaises(ValueError): + lss["foo-oo1"] = "bar" + self.assertNotIn("foo-oo1", lss) + + self.assertNotIn("foo-ASDF-OO", lss) + lss["foo-ASDF-OO"] = "bar" + self.assertIn("foo-ASDF-OO", lss) def test_empty(self) -> None: lss = model.LangStringSet({"fo": "bar", "fo-OO": "baz"}) From da7a24f92229392cb96cedbcbae2e1ad159af388 Mon Sep 17 00:00:00 2001 From: hpoeche Date: Wed, 15 Apr 2026 16:50:19 +0200 Subject: [PATCH 2/3] sdk: correct langtag check to follow RFC 5646 The langtag check of ITEF BCP 47 format from the aas-core project was not following the RFC 5646 precisely in two minor important points. These deviations are now corrected, so the check strictly follows the syntax described in the standard. 
--- sdk/basyx/aas/model/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/basyx/aas/model/base.py b/sdk/basyx/aas/model/base.py index ea613fc8..6c9cff93 100644 --- a/sdk/basyx/aas/model/base.py +++ b/sdk/basyx/aas/model/base.py @@ -304,7 +304,7 @@ def _check_language_tag_constraints(cls, ltag: str): alphanum = "[a-zA-Z0-9]" singleton = "[0-9A-WY-Za-wy-z]" extension = f"{singleton}(-({alphanum}){{2,8}})+" - extlang = "[a-zA-Z]{3}(-[a-zA-Z]{3}){2}" + extlang = "[a-zA-Z]{3}(-[a-zA-Z]{3}){0,2}" irregular = ( "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|" "i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|" @@ -319,7 +319,7 @@ def _check_language_tag_constraints(cls, ltag: str): script = "[a-zA-Z]{4}" region = "([a-zA-Z]{2}|[0-9]{3})" variant = f"(({alphanum}){{5,8}}|[0-9]({alphanum}){{3}})" - privateuse = f"[xX](-({alphanum}){{1,8}})+" + privateuse = f"[x](-({alphanum}){{1,8}})+" langtag = ( f"{language}(-{script})?(-{region})?(-{variant})*(-{extension})*(-" f"{privateuse})?" From 2c97d0a725291a506dbb24e00bdb9f266d7a5880 Mon Sep 17 00:00:00 2001 From: hpoeche Date: Fri, 17 Apr 2026 11:56:45 +0200 Subject: [PATCH 3/3] sdk: align langtag check with aas-core-work bcp47 The previous change ignored the case-insensitivity of the ABNF grammar of the BCP 47 standard. This change aligns the regex check with the one generated from https://github.com/aas-core-works/abnf-to-regexp/blob/main/test_data/nested-python/bcp47/grammar.abnf. 
--- sdk/basyx/aas/model/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/basyx/aas/model/base.py b/sdk/basyx/aas/model/base.py index 6c9cff93..3edaac6e 100644 --- a/sdk/basyx/aas/model/base.py +++ b/sdk/basyx/aas/model/base.py @@ -319,7 +319,7 @@ def _check_language_tag_constraints(cls, ltag: str): script = "[a-zA-Z]{4}" region = "([a-zA-Z]{2}|[0-9]{3})" variant = f"(({alphanum}){{5,8}}|[0-9]({alphanum}){{3}})" - privateuse = f"[x](-({alphanum}){{1,8}})+" + privateuse = f"[xX](-({alphanum}){{1,8}})+" langtag = ( f"{language}(-{script})?(-{region})?(-{variant})*(-{extension})*(-" f"{privateuse})?"