From 76df13322c3544bc50b987b308751a4b704ed40e Mon Sep 17 00:00:00 2001
From: hpoeche <h.poeche@iat.rwth-aachen.de>
Date: Wed, 15 Apr 2026 16:27:52 +0200
Subject: [PATCH 1/3] sdk: fix langtag constraints in LangStringSet

Previously, LangStringSet required added language tags to match
the format xx or xx-XX. This did not follow the documented
behavior, which requires all language tags conforming to
IETF BCP 47 to be accepted.

This change replaces the previous constraint check with a
regular-expression check that follows the IETF BCP 47 standard.

Fixes #157
---
 sdk/basyx/aas/model/base.py | 35 ++++++++++++++++++++++++++++++-----
 sdk/test/model/test_base.py | 30 +++++++++++++++++++-----------
 2 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/sdk/basyx/aas/model/base.py b/sdk/basyx/aas/model/base.py
index f6b55fa8..ea613fc8 100644
--- a/sdk/basyx/aas/model/base.py
+++ b/sdk/basyx/aas/model/base.py
@@ -301,11 +301,36 @@ def __init__(self, dict_: Dict[str, str]):
 
     @classmethod
     def _check_language_tag_constraints(cls, ltag: str):
-        split = ltag.split("-", 1)
-        lang_code = split[0]
-        if len(lang_code) != 2 or not lang_code.isalpha() or not lang_code.islower():
-            raise ValueError(f"The language code of the language tag must consist of exactly two lower-case letters! "
-                             f"Given language tag and language code: '{ltag}', '{lang_code}'")
+        alphanum = "[a-zA-Z0-9]"
+        singleton = "[0-9A-WY-Za-wy-z]"
+        extension = f"{singleton}(-({alphanum}){{2,8}})+"
+        extlang = "[a-zA-Z]{3}(-[a-zA-Z]{3}){2}"
+        irregular = (
+            "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|"
+            "i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|"
+            "i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)"
+        )
+        regular = (
+            "(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|"
+            "zh-min|zh-min-nan|zh-xiang)"
+        )
+        grandfathered = f"({irregular}|{regular})"
+        language = f"([a-zA-Z]{{2,3}}(-{extlang})?|[a-zA-Z]{{4}}|[a-zA-Z]{{5,8}})"
+        script = "[a-zA-Z]{4}"
+        region = "([a-zA-Z]{2}|[0-9]{3})"
+        variant = f"(({alphanum}){{5,8}}|[0-9]({alphanum}){{3}})"
+        privateuse = f"[xX](-({alphanum}){{1,8}})+"
+        langtag = (
+            f"{language}(-{script})?(-{region})?(-{variant})*(-{extension})*(-"
+            f"{privateuse})?"
+        )
+        language_tag = f"({langtag}|{privateuse}|{grandfathered})"
+
+        pattern = f"^{language_tag}$"
+
+        if re.match(pattern, ltag) is None:
+            raise ValueError(f"The language tag must follow the format defined in BCP 47. "
+                             f"Given language tag: {ltag}")
 
     def __getitem__(self, item: str) -> str:
         return self._dict[item]
diff --git a/sdk/test/model/test_base.py b/sdk/test/model/test_base.py
index 98c6cfb8..b40174b5 100644
--- a/sdk/test/model/test_base.py
+++ b/sdk/test/model/test_base.py
@@ -1230,20 +1230,28 @@ def hook(itm: int, _list: List[int]) -> None:
 
 class LangStringSetTest(unittest.TestCase):
     def test_language_tag_constraints(self) -> None:
+        with self.assertRaises(ValueError):
+            model.LangStringSet({"": "bar"})
+
         with self.assertRaises(ValueError) as cm:
-            model.LangStringSet({"foo": "bar"})
-        self.assertEqual("The language code of the language tag must consist of exactly two lower-case letters! "
-                         "Given language tag and language code: 'foo', 'foo'", str(cm.exception))
+            model.LangStringSet({"x": "bar"})
+        self.assertEqual(f"The language tag must follow the format defined in BCP 47. "
+                         f"Given language tag: x", cm.exception.args[0])
 
-        lss = model.LangStringSet({"fo-OO": "bar"})
         with self.assertRaises(ValueError) as cm:
-            lss["foo"] = "bar"
-        self.assertEqual("The language code of the language tag must consist of exactly two lower-case letters! "
-                         "Given language tag and language code: 'foo', 'foo'", str(cm.exception))
-        self.assertNotIn("foo", lss)
-        self.assertNotIn("fo", lss)
-        lss["fo"] = "bar"
-        self.assertIn("fo", lss)
+            model.LangStringSet({"foo-oo1": "bar"})
+        self.assertEqual(f"The language tag must follow the format defined in BCP 47. "
+                         f"Given language tag: foo-oo1", cm.exception.args[0])
+
+        lss = model.LangStringSet({"fo-OO": "bar"})
+        self.assertIn("fo-OO", lss)
+        with self.assertRaises(ValueError):
+            lss["foo-oo1"] = "bar"
+        self.assertNotIn("foo-oo1", lss)
+
+        self.assertNotIn("foo-ASDF-OO", lss)
+        lss["foo-ASDF-OO"] = "bar"
+        self.assertIn("foo-ASDF-OO", lss)
 
     def test_empty(self) -> None:
         lss = model.LangStringSet({"fo": "bar", "fo-OO": "baz"})

From da7a24f92229392cb96cedbcbae2e1ad159af388 Mon Sep 17 00:00:00 2001
From: hpoeche <h.poeche@iat.rwth-aachen.de>
Date: Wed, 15 Apr 2026 16:50:19 +0200
Subject: [PATCH 2/3] sdk: correct langtag check to follow RFC 5646

The langtag check for the IETF BCP 47 format, taken from the
aas-core project, did not follow RFC 5646 precisely in two minor
but important points.

These deviations are now corrected, so the check strictly follows
the syntax described in the standard.
---
 sdk/basyx/aas/model/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/basyx/aas/model/base.py b/sdk/basyx/aas/model/base.py
index ea613fc8..6c9cff93 100644
--- a/sdk/basyx/aas/model/base.py
+++ b/sdk/basyx/aas/model/base.py
@@ -304,7 +304,7 @@ def _check_language_tag_constraints(cls, ltag: str):
         alphanum = "[a-zA-Z0-9]"
         singleton = "[0-9A-WY-Za-wy-z]"
         extension = f"{singleton}(-({alphanum}){{2,8}})+"
-        extlang = "[a-zA-Z]{3}(-[a-zA-Z]{3}){2}"
+        extlang = "[a-zA-Z]{3}(-[a-zA-Z]{3}){0,2}"
         irregular = (
             "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|"
             "i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|"
@@ -319,7 +319,7 @@ def _check_language_tag_constraints(cls, ltag: str):
         script = "[a-zA-Z]{4}"
         region = "([a-zA-Z]{2}|[0-9]{3})"
         variant = f"(({alphanum}){{5,8}}|[0-9]({alphanum}){{3}})"
-        privateuse = f"[xX](-({alphanum}){{1,8}})+"
+        privateuse = f"[x](-({alphanum}){{1,8}})+"
         langtag = (
             f"{language}(-{script})?(-{region})?(-{variant})*(-{extension})*(-"
             f"{privateuse})?"

From 2c97d0a725291a506dbb24e00bdb9f266d7a5880 Mon Sep 17 00:00:00 2001
From: hpoeche <h.poeche@iat.rwth-aachen.de>
Date: Fri, 17 Apr 2026 11:56:45 +0200
Subject: [PATCH 3/3] sdk: align langtag check with aas-core-works bcp47

The previous change ignored the case-insensitivity of the ABNF
grammar of the BCP 47 standard.

This change aligns the regex check with the one generated
from https://github.com/aas-core-works/abnf-to-regexp/blob/main/test_data/nested-python/bcp47/grammar.abnf.
---
 sdk/basyx/aas/model/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/basyx/aas/model/base.py b/sdk/basyx/aas/model/base.py
index 6c9cff93..3edaac6e 100644
--- a/sdk/basyx/aas/model/base.py
+++ b/sdk/basyx/aas/model/base.py
@@ -319,7 +319,7 @@ def _check_language_tag_constraints(cls, ltag: str):
         script = "[a-zA-Z]{4}"
         region = "([a-zA-Z]{2}|[0-9]{3})"
         variant = f"(({alphanum}){{5,8}}|[0-9]({alphanum}){{3}})"
-        privateuse = f"[x](-({alphanum}){{1,8}})+"
+        privateuse = f"[xX](-({alphanum}){{1,8}})+"
         langtag = (
             f"{language}(-{script})?(-{region})?(-{variant})*(-{extension})*(-"
             f"{privateuse})?"