Skip to content

Commit 92e818d

Browse files
committed
chore: update version to 0.5.1, add support for reading xlsx files, and include Motorola datasets
1 parent b33427c commit 92e818d

7 files changed

Lines changed: 78 additions & 10 deletions

File tree

mtdata/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# Created: 4/4/20
55

66

7-
__version__ = '0.5.0'
7+
__version__ = '0.5.1'
88
__description__ = 'mtdata is a tool to download datasets for machine translation'
99
__author__ = 'Thamme Gowda'
1010

mtdata/cache.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def get_entry(self, entry: Entry, fix_missing=True) -> Union[Path, List[Path]]:
4545
else:
4646
assert isinstance(entry.url, str)
4747
local = self.get_local_path(entry.url, filename=entry.filename, fix_missing=fix_missing, entry=entry)
48-
if isinstance(local, Path) and (zipfile.is_zipfile(local) or tarfile.is_tarfile(local)):
48+
if isinstance(local, Path) and entry.is_archive and (zipfile.is_zipfile(local) or tarfile.is_tarfile(local)):
4949
# look inside the archives and get the desired files
5050
local = self.get_local_in_paths(path=local, entry=entry)
5151
return local
@@ -261,7 +261,13 @@ def download(self, url: str, save_at: Path, timeout=(5, 10), entry=None):
261261
if valid_flag.exists() and save_at.exists():
262262
return save_at
263263
log.debug(f"GET {url}{save_at}")
264-
resp = requests.get(url=url, allow_redirects=True, headers=headers, stream=True, timeout=timeout)
264+
try:
265+
resp = requests.get(url=url, allow_redirects=True, headers=headers, stream=True,
266+
timeout=timeout)
267+
except requests.exceptions.SSLError as e:
268+
log.warning(f"SSL verification failed for {url}: {e}; retrying without verification")
269+
resp = requests.get(url=url, allow_redirects=True, headers=headers, stream=True,
270+
timeout=timeout, verify=False)
265271
assert resp.status_code == 200, resp.status_code
266272
buf_size = 2 ** 14
267273
tot_bytes = int(resp.headers.get('Content-Length', '0'))

mtdata/index/other.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,22 @@ def load_all(index: Index):
164164
("2", "*/mono-bho-corpus/monolingual-v0.2.bho")]:
165165
index += Entry(did=DatasetId(group='BHLTR', name=f'mono', version=version, langs=('bho',)),
166166
url=url, filename=filename, ext='zip', in_ext='txt', in_paths=[f1], cite=cite)
167+
168+
### Motorola Endangered Indigenous Languages ###
169+
_moto_base = "https://wpa_supplier.motorola.com/"
170+
_moto_datasets = [
171+
# (filename, src_lang, tgt_lang, version-date)
172+
("MotorolaMobility_All-en_US-lld_IT-2024-06-04.xlsx", "eng", "lld", "20240604"),
173+
("MotorolaMobility_All-en_US-chr-2022-03-08.xlsx", "eng", "chr", "20220308"),
174+
("MotorolaMobility_All-en_US-xnr_IN-2023-02-10.xlsx", "eng", "xnr", "20230210"),
175+
("MotorolaMobility_All-en_US-mi_NZ-2023-08-08.xlsx", "eng", "mri", "20230808"),
176+
("MotorolaMobility-en_US-yrl-2022-12-08.xlsx", "eng", "yrl", "20221208"),
177+
("MotorolaMobility-en_US-kgp-2022-12-08.xlsx", "eng", "kgp", "20221208"),
178+
("MotorolaMobility_All-pt_BR-yrl_BR-2022-07-20.xlsx", "por", "yrl", "20220720"),
179+
("MotorolaMobility_All-pt_BR-kgp_BR-2022-07-20.xlsx", "por", "kgp", "20220720"),
180+
]
181+
for fname, src, tgt, version in _moto_datasets:
182+
url = _moto_base + fname
183+
index += Entry(did=DatasetId(group='Motorola', name='lang_revitalization', version=version,
184+
langs=(src, tgt)),
185+
url=url, filename=fname, ext='xlsx', in_ext='xlsx')

mtdata/parser.py

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ def read_segs(self, show_pbar=True):
9191
readers.append(read_wmt21_xml(p))
9292
elif HF_EXT in self.ext:
9393
readers.append(self.read_hfds(p))
94+
elif 'xlsx' in self.ext:
95+
readers.append(self.read_xlsx(p))
9496
else:
9597
raise Exception(f'Not supported {self.ext} : {p}')
9698

@@ -152,6 +154,26 @@ def read_tsv(self, path, delim='\t', cols=None, skipheader=False, meta_fields=No
152154
out_row.append(metadata)
153155
yield out_row
154156

157+
def read_xlsx(self, path, cols=None):
    """Read text rows from the active sheet of an Excel .xlsx file.

    The first row is treated as a header and skipped. Each remaining row is
    reduced to the requested columns; cell values are converted with ``str``
    and stripped, and empty (``None``) or missing cells become ``''``.
    Rows whose selected cells are all empty are dropped.

    :param path: path to .xlsx file
    :param cols: column indices to extract; when None, uses ``self.ent.cols``
        if it is set, otherwise ``(0, 1)``
    :return: generator yielding one list of strings per non-empty row
    :raises ImportError: if openpyxl is not installed
    """
    try:
        from openpyxl import load_workbook
    except ImportError as e:
        raise ImportError("openpyxl is required to read .xlsx files. Run: pip install openpyxl") from e
    if cols is None:
        cols = self.ent.cols if (self.ent and self.ent.cols) else (0, 1)
    wb = load_workbook(path, read_only=True, data_only=True)
    try:
        ws = wb.active
        for row in ws.iter_rows(min_row=2, values_only=True):  # skip header
            width = len(row)
            out = []
            for c in cols:
                # A row may carry fewer cells than requested (e.g. trailing cells
                # absent); treat a missing cell like an empty one instead of failing.
                val = row[c] if -width <= c < width else None
                out.append('' if val is None else str(val).strip())
            if not any(out):
                continue
            yield out
    finally:
        # This is a generator: the consumer may stop early or a row may raise.
        # Closing in `finally` releases the workbook's file handle in every case.
        wb.close()
176+
155177
@staticmethod
156178
def _nested_get(row, field):
157179
"""Get a value from a dict using dot-separated path for nested access.
@@ -176,11 +198,24 @@ def read_hfds(self, ds):
176198
# For now, retain all fields so we can see which ones exist, and rename only the subset
177199
# of fields given in the dict; unmapped keys keep their original name via rev_map.get(orig, orig)
178200
for row in ds:
179-
out_row = [self._nested_get(row, src_field)]
180-
if tgt_field is not None:
181-
out_row.append(self._nested_get(row, tgt_field))
182-
# remap meta fields if necessary
201+
src_val = self._nested_get(row, src_field)
202+
tgt_val = self._nested_get(row, tgt_field) if tgt_field else None
183203
top_keys = {f.split('.')[0] for f in [src_field] + ([tgt_field] if tgt_field else [])}
184204
metadata = {rev_map.get(k, k): v for k, v in row.items() if k not in top_keys}
185-
out_row.append(metadata)
186-
yield out_row
205+
206+
src_is_list = isinstance(src_val, list)
207+
tgt_is_list = isinstance(tgt_val, list)
208+
if src_is_list and tgt_is_list:
209+
# Both lists (e.g. SmolDoc srcs/trgs): zip and yield each pair
210+
for s, t in zip(src_val, tgt_val):
211+
yield [s, t, metadata]
212+
elif not src_is_list and tgt_is_list:
213+
# Source is scalar, target is list (e.g. GATITOS src/trgs): expand
214+
for t in tgt_val:
215+
yield [src_val, t, metadata]
216+
else:
217+
out_row = [src_val]
218+
if tgt_val is not None:
219+
out_row.append(tgt_val)
220+
out_row.append(metadata)
221+
yield out_row

mtdata/resource/huggingface-datasets.jsonl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,5 @@
1212
{"id": "sfrontull/autonomia-lld_valbadia-ita", "configs": [{"name": null, "langs": ["ita", "lld"], "fields": {"source": "translation.ita", "target": "translation.lld_valbadia"}}]}
1313
{"id": "sfrontull/stiftungsparkasse-lld_valbadia-ita", "configs": [{"name": null, "langs": ["ita", "lld"], "fields": {"source": "translation.ita", "target": "translation.lld_valbadia"}}]}
1414
{"id": "sfrontull/pinocchio-lld_valbadia-ita", "configs": [{"name": null, "langs": ["ita", "lld"], "fields": {"source": "translation.ita", "target": "translation.lld_valbadia"}}]}
15+
# === SMOL: Professional translations for low-resource languages (google/smol) ===
16+
{"id": "google/smol", "configs": [{"name": "smolsent__en_lij", "ds_name": "smol_sent", "langs": ["eng", "lij_Latn"], "fields": {"source": "src", "target": "trg"}}, {"name": "smoldoc__en_lij", "ds_name": "smol_doc", "langs": ["eng", "lij_Latn"], "fields": {"source": "srcs", "target": "trgs"}}, {"name": "gatitos__en_lij", "ds_name": "smol_gatitos", "langs": ["eng", "lij_Latn"], "fields": {"source": "src", "target": "trgs"}}]}

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ exclude = ["tests*", "tmp*", "build*", "dist*", "crawler*", "*.egg-info*"]
5252

5353
[project.optional-dependencies]
5454
hf = ["datasets>=4.0.0"]
55+
xlsx = ["openpyxl>=3.1.0"]
5556
test = [ "pytest", "pytest-cov[all]", "black", "isort", "mypy"]
5657

5758

recipes/mtdata.recipes.wmt26-constrained.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1078,6 +1078,7 @@
10781078
- OPUS-wikimedia-v20230407-eng-lld
10791079
- OPUS-translatewiki-v20250101-eng-lld
10801080
- OPUS-translatewiki-v20250101-eng_CA-lld
1081+
- Motorola-lang_revitalization-20240604-eng-lld
10811082
mono_train:
10821083
- Sfrontull-la_usc_valbadia_loresmt24-1-lld
10831084
- Sfrontull-south_tyrol_weather_lld-1-lld
@@ -1086,9 +1087,13 @@
10861087
- id: wmt26-eng-lij_Latn
10871088
langs: eng-lij_Latn
10881089
train:
1089-
- AllenAi-nllb-1-eng-lij_Latn
1090+
#- AllenAi-nllb-1-eng-lij_Latn # we heard it's noisy -- use it at your own risk
10901091
- Conseggioligure-zenamt_eng_train-1-eng-lij_Latn
10911092
- Openlanguagedata-oldi_seed-1-eng-lij_Latn
1093+
- Google-smol_sent-1-eng-lij_Latn
1094+
- Google-smol_doc-1-eng-lij_Latn
1095+
- Google-smol_gatitos-1-eng-lij_Latn
1096+
- OPUS-tatoeba-v20230412-eng-lij
10921097
mono_train:
10931098
- Conseggioligure-linc-1-lij_Latn
10941099

0 commit comments

Comments
 (0)