Skip to content

Commit b1e078a

Browse files
committed
avoid overwriting language codes for well-known ISO codes
1 parent 3e519c6 commit b1e078a

2 files changed

Lines changed: 35 additions & 14 deletions

File tree

src/main/python/opensextant/__init__.py

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,10 +1142,11 @@ class Language:
11421142
In some situations there are competing 2-char codes in code books, such as Library of Congress (LOC)
11431143
"""
11441144

1145-
def __init__(self, iso3, iso2, nmlist: list):
1145+
def __init__(self, iso3, iso2, nmlist: list, locale=None):
11461146
self.code_iso3 = iso3
11471147
self.code = iso2
11481148
self.names = nmlist
1149+
self.locale = locale
11491150
if nmlist:
11501151
if not isinstance(nmlist, list):
11511152
raise Exception("Name list is a list of names for the language. The first one is the default.")
@@ -1171,6 +1172,7 @@ def __str__(self):
11711172
def list_languages():
11721173
"""
11731174
List out a flattened list of languages, de-duplicated by ISO2 language ID.
1175+
11741176
TODO: alternatively list out every language
11751177
:return:
11761178
"""
@@ -1191,15 +1193,26 @@ def list_languages():
11911193

11921194

11931195
def add_language(lg: Language, override=False):
1196+
"""
1197+
The language map for ISO 2-alpha and 3-alpha codes should be protected from being overwritten by language IDs that are dialect- or locale-specific.
1198+
1199+
"en" ==> en-au, en-gb, en-us, etc.? This is ambiguous
1200+
The reverse is unambiguous -- "en-gb" is at least "en" or "eng" (English).
1201+
1202+
:param lg:
1203+
:param override:
1204+
:return:
1205+
"""
11941206
if not lg:
11951207
return
11961208

11971209
codes = []
11981210
if lg.code:
11991211
codes.append(lg.code.lower())
1200-
12011212
if lg.code_iso3:
12021213
codes.append(lg.code_iso3.lower())
1214+
if lg.locale:
1215+
codes.append(lg.locale.lower())
12031216

12041217
if lg.names:
12051218
for nm in lg.names:
@@ -1211,7 +1224,13 @@ def add_language(lg: Language, override=False):
12111224
override = True
12121225

12131226
for k in set(codes):
1214-
if k in language_map and not override:
1227+
exists = k in language_map
1228+
1229+
# coding rule: 2 or 3 char alpha codes for ISO or Biblio code books are not overridden.
1230+
if len(k) <= 3 and exists:
1231+
continue
1232+
1233+
if exists and not override:
12151234
raise Exception(f"Forcibly remap language code? {k}")
12161235

12171236
language_map[k] = lg
@@ -1331,31 +1350,34 @@ def load_languages():
13311350
if iso3 in IGNORE_LANGUAGES:
13321351
continue
13331352

1334-
L = Language(lang[0], lang[2], lang_names)
1353+
iso2=lang[2]
1354+
L = Language(iso3, iso2, lang_names)
13351355
add_language(L)
13361356
if bib3:
1337-
L = Language(bib3, lang[2], lang_names)
1357+
L = Language(bib3, iso2, lang_names)
13381358
add_language(L, override=True)
13391359

13401360
# Some odd additions -- Bibliographic vs. Terminologic codes may vary.
13411361
# FRE vs. FRA is valid for French, for example.
13421362
#
13431363
for lg in [Language("fra", "fr", ["French"]),
13441364

1345-
Language("zh-cn", "zh", ["Chinese"]),
1365+
Language("zho", "zh", ["Chinese"], locale="zh-cn"),
13461366

13471367
Language(None, "zt", ["Traditionl Chinese"]),
1348-
Language("zh-tw", "zt", ["Traditionl Chinese/Taiwain"]),
1368+
Language(None, "zt", ["Traditionl Chinese/Taiwain"], locale="zh-tw"),
13491369

1350-
Language("fa-AF", "dr", ["Dari", "Afghan Persian"]),
1370+
Language("prs", "dr", ["Dari", "Afghan Persian"], locale="fa-AF"),
13511371
Language("prs", "dr", ["Dari", "Afghan Persian"]),
13521372

13531373
Language("eng", "en", ["English"]),
1354-
Language("en-gb", "en", ["English"]),
1355-
Language("en-us", "en", ["English"]),
1356-
Language("en-uk", "en", ["English"]),
1357-
Language("en-ca", "en", ["English"]),
1358-
Language("en-au", "en", ["English"])]:
1374+
1375+
Language("eng", "en", ["English/British"], locale="en-gb"),
1376+
Language("eng", "en", ["English/USA"], locale="en-us"),
1377+
Language("eng", "en", ["English/United Kingdom"], locale="en-uk"),
1378+
Language("eng", "en", ["English/Canadian"], locale="en-ca"),
1379+
Language("eng", "en", ["English/Australian"], locale="en-au")]:
1380+
13591381
add_language(lg, override=True)
13601382

13611383
__language_map_init = True

src/main/python/opensextant/utility.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ def is_ascii(s):
8080
pass
8181
return False
8282

83-
8483
def get_text(t):
8584
""" Default is to return Unicode string from raw data"""
8685
if isinstance(t, str):

0 commit comments

Comments
 (0)