Skip to content

Commit 3222ac4

Browse files
committed
add completed test cases
1 parent 3ab45ff commit 3222ac4

2 files changed

Lines changed: 136 additions & 34 deletions

File tree

mokelumne/util/ldc.py

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -40,28 +40,36 @@ def scrape_corpus_metadata(tag: Tag) -> dict[str, str]:
4040
"""
4141
cells = tag.find_all("td")
4242
if len(cells) > 0:
43-
catalog_id = cells[0].get_text(strip=True)
44-
corpus_name = cells[1].get_text(strip=True)
45-
invoice_date = cells[2].get_text(strip=True)
46-
download_link = cells[3].a["href"] # pyright: ignore[reportOptionalSubscript]
47-
# the technical metadata is not broken up into distinct cells, so
48-
# we have to parse it more. the "file" metadata is also not
49-
# the true filename as returned by LDC.
50-
techmd = cells[4].get_text(strip=True, separator="\n").splitlines()
51-
file, filesize, checksum = [
52-
re.sub(r"^\s*(File Size|MD5 Checksum): ", "", t) for t in techmd
53-
]
54-
return {
55-
"catalog_id": catalog_id,
56-
"corpus_name": corpus_name,
57-
"download_link": download_link, # pyright: ignore[reportReturnType]
58-
"invoice_date": invoice_date,
59-
"file": file,
60-
"filesize": filesize,
61-
"checksum": checksum
62-
}
43+
try:
44+
catalog_id = cells[0].get_text(strip=True)
45+
corpus_name = cells[1].get_text(strip=True)
46+
47+
# note: there is a unique download link for each distinct
48+
# invoice date that is associated with a file
49+
invoice_date = cells[2].get_text(strip=True)
50+
download_link = cells[3].a["href"] # pyright: ignore[reportOptionalSubscript]
51+
52+
# the file-level metadata is not broken up into distinct cells, so
53+
# we have to parse it more. the "file" metadata is also not
54+
# the true filename as returned by LDC.
55+
techmd = cells[4].get_text(strip=True, separator="\n").splitlines()
56+
file, filesize, checksum = [
57+
re.sub(r"^\s*(File Size|MD5 Checksum): ", "", t) for t in techmd
58+
]
59+
return {
60+
"catalog_id": catalog_id,
61+
"corpus_name": corpus_name,
62+
"download_link": download_link, # pyright: ignore[reportReturnType]
63+
"invoice_date": invoice_date,
64+
"file": file,
65+
"filesize": filesize,
66+
"checksum": checksum
67+
}
68+
except (IndexError, KeyError):
69+
return {}
6370
return {}
6471

72+
6573
def get_latest_invoice_date(
6674
corpora: list[dict[str, str]] = [], corpus_id: str = ""
6775
) -> Optional[str]:

test/unit/test_ldc.py

Lines changed: 108 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,119 @@
33
import json
44
import pytest
55

6-
from .. import fixtures
6+
from bs4 import BeautifulSoup
77

88
from mokelumne.util.ldc import (
99
get_csrf_token, get_latest_invoice_date, scrape_corpus_metadata,
1010
)
11+
from .. import fixtures
12+
1113

1214
class TestLDC:
13-
def test_get_csrf_token(self) -> None:
14-
markup = '<form><input name="authenticity_token" value="foo"/><form>'
15-
markup_alt = '<form><input name="something_else" value="bar"/><form>'
16-
assert {"authenticity_token": "foo"} == get_csrf_token(markup=markup)
17-
assert {"something_else": "bar"} == get_csrf_token(
18-
markup=markup_alt, param_name="something_else"
15+
with (
16+
importlib.resources.path(fixtures, "ldc-treebank-3.json") as test_json,
17+
open(test_json) as fh
18+
):
19+
duplicate_invoice_data = json.loads(fh.read())
20+
21+
@pytest.mark.parametrize(
22+
"markup,param_name,expected", [
23+
pytest.param(
24+
'<form><input name="authenticity_token" value="foo"/></form>',
25+
"authenticity_token",
26+
{"authenticity_token": "foo"},
27+
id="with_default_param_name"
28+
),
29+
pytest.param(
30+
'<form><input name="something_else" value="bar"/><input name="authenticity_token" value="baz"/></form>',
31+
"something_else",
32+
{"something_else": "bar"},
33+
id="with_param_name"
34+
),
35+
pytest.param(
36+
'<form><input name="csrf-token" value="quux"/></form',
37+
"doesnt_exist",
38+
{},
39+
id="without_matching_tag"
1940
)
41+
])
42+
def test_get_csrf_token(self, markup, param_name, expected) -> None:
43+
"""Ensure we can gather the CSRF token from the LDC login form."""
44+
assert get_csrf_token(markup=markup, param_name=param_name) == expected
45+
46+
47+
@pytest.mark.parametrize(
48+
"corpora,corpus_id,expected", [
49+
pytest.param(
50+
[{ "catalog_id": "LDC99T42", "corpus_name": "Treebank-3", "download_link": "/download/4c0512a1451377eb2790d557fc76a690fa11693ad846df02f3ee59d12788", "invoice_date": "2025-01-01", "file": "treebank_3_LDC99T42", "filesize": "51.6 MB", "checksum": "98c74f99f6ca17dc88efb4077fcd9539" }],
51+
"LDC99T42",
52+
"2025-01-01",
53+
id="with_single_item_list"
54+
),
55+
pytest.param(duplicate_invoice_data, "LDC99T42", "2020-08-22", id="with_dupes"),
56+
pytest.param([], "bogus", None, id="with_empty_corpora_list")
57+
]
58+
)
59+
def test_get_latest_invoice_date(
60+
self, corpora, corpus_id, expected
61+
) -> None:
62+
"""Ensure latest invoice date is fetched for a single corpus."""
63+
assert get_latest_invoice_date(corpora=corpora, corpus_id=corpus_id) == expected
64+
65+
66+
@pytest.mark.parametrize(
67+
"tag,expected", [
68+
pytest.param(BeautifulSoup("""
69+
<tr class="odd">
70+
<td class="">LDC2026S04</td>
71+
<td>CALLHOME Spanish Second Edition</td>
72+
<td class="">2026-03-16</td>
73+
<td class="download-counter-cell">
74+
<span class='download-counter-counter'>(2)</span> <a class='button download-counter-button' href='/download/6223e1ba26b43ce2787aa7303fd0329c1955d225fa30f8688e85abb019e8' title='Download Corpus'><span class='glyphicon glyphicon-download-alt'></span></a>
75+
</td>
76+
<td>
77+
CALLHOME_Spanish_Second_Edition.zip<br/>
78+
File Size: 1.46 GB
79+
MD5 Checksum: d57395eacde73a80ca6e2abcd7ddde52
80+
</td>
81+
</tr>""", "html.parser"),
82+
{
83+
"catalog_id": "LDC2026S04",
84+
"corpus_name": "CALLHOME Spanish Second Edition",
85+
"invoice_date": "2026-03-16",
86+
"download_link": "/download/6223e1ba26b43ce2787aa7303fd0329c1955d225fa30f8688e85abb019e8",
87+
"file": "CALLHOME_Spanish_Second_Edition.zip",
88+
"filesize": "1.46 GB",
89+
"checksum": "d57395eacde73a80ca6e2abcd7ddde52"
90+
},
91+
id="with_single_row"
92+
),
93+
pytest.param(
94+
BeautifulSoup("<tr></tr>", "html.parser"),
95+
{},
96+
id="with_empty_row"
97+
),
98+
pytest.param(
99+
BeautifulSoup("<tr><td>boop</td></tr>", "html.parser"),
100+
{},
101+
id="with_malformed_row"
102+
),
103+
pytest.param(
104+
BeautifulSoup("""
105+
<tr class="odd">
106+
<td class="">LDC2026S04</td>
107+
<td>CALLHOME Spanish Second Edition</td>
108+
<td class="">2026-03-16</td>
109+
<td class="download-counter-cell">
110+
<span class='download-counter-counter'>(2)</span> <a class='button download-counter-button' fake='/download/blah' title='Download Corpus'><span class='glyphicon glyphicon-download-alt'></span></a>
111+
</td>""", "html.parser"),
112+
{},
113+
id="with_malformed_download_link"
114+
)
115+
]
116+
)
117+
def test_scrape_corpus_metadata(self, tag, expected) -> None:
118+
"""Ensure we can parse the LDC organization downloads page into a
119+
JSON-based structure."""
120+
assert scrape_corpus_metadata(tag) == expected
20121

21-
def test_get_latest_invoice_date(self) -> None:
22-
with (
23-
importlib.resources.path(fixtures, "treebank-3.json") as test_json,
24-
open(test_json) as fh
25-
):
26-
data = json.loads(fh.read())
27-
assert "2020-08-22" == get_latest_invoice_date(data)

0 commit comments

Comments
 (0)