add completed test cases

anarchivist · anarchivist · commit 3222ac488f4d · 2026-05-14T23:31:59.000-07:00
diff --git a/mokelumne/util/ldc.py b/mokelumne/util/ldc.py
@@ -40,28 +40,36 @@ def scrape_corpus_metadata(tag: Tag) -> dict[str, str]:
     """
     cells = tag.find_all("td")
     if len(cells) > 0:
-        catalog_id = cells[0].get_text(strip=True)
-        corpus_name = cells[1].get_text(strip=True)
-        invoice_date = cells[2].get_text(strip=True)
-        download_link = cells[3].a["href"]   # pyright: ignore[reportOptionalSubscript]
-        # the technical metadata is not broken up into distinct cells, so
-        # we have to parse it more. the "file" metadata is also not 
-        # the true filename as returned by LDC. 
-        techmd = cells[4].get_text(strip=True, separator="\n").splitlines()
-        file, filesize, checksum = [
-            re.sub(r"^\s*(File Size|MD5 Checksum): ", "", t) for t in techmd
-        ]
-        return {
-            "catalog_id": catalog_id,
-            "corpus_name": corpus_name,
-            "download_link": download_link,  # pyright: ignore[reportReturnType]
-            "invoice_date": invoice_date,
-            "file": file,
-            "filesize": filesize,
-            "checksum": checksum
-        }
+        try:
+            catalog_id = cells[0].get_text(strip=True)
+            corpus_name = cells[1].get_text(strip=True)
+
+            # note: there is a unique download link for each distinct 
+            # invoice date that is associated with a file
+            invoice_date = cells[2].get_text(strip=True)
+            download_link = cells[3].a["href"]   # pyright: ignore[reportOptionalSubscript]
+
+            # the file-level metadata is not broken up into distinct cells, so
+            # we have to parse it more. the "file" metadata is also not 
+            # the true filename as returned by LDC. 
+            techmd = cells[4].get_text(strip=True, separator="\n").splitlines()
+            file, filesize, checksum = [
+                re.sub(r"^\s*(File Size|MD5 Checksum): ", "", t) for t in techmd
+            ]
+            return {
+                "catalog_id": catalog_id,
+                "corpus_name": corpus_name,
+                "download_link": download_link,  # pyright: ignore[reportReturnType]
+                "invoice_date": invoice_date,
+                "file": file,
+                "filesize": filesize,
+                "checksum": checksum
+            }
+        except (IndexError, KeyError):
+            return {}
     return {}
 
+
 def get_latest_invoice_date(
     corpora: list[dict[str, str]] = [], corpus_id: str = ""
 ) -> Optional[str]:
diff --git a/test/unit/test_ldc.py b/test/unit/test_ldc.py
@@ -3,25 +3,119 @@
 import json
 import pytest
 
-from .. import fixtures
+from bs4 import BeautifulSoup
 
 from mokelumne.util.ldc import (
     get_csrf_token, get_latest_invoice_date, scrape_corpus_metadata,
 )
+from .. import fixtures
+
 
 class TestLDC:
-    def test_get_csrf_token(self) -> None:
-        markup = '<form><input name="authenticity_token" value="foo"/><form>'
-        markup_alt = '<form><input name="something_else" value="bar"/><form>'
-        assert {"authenticity_token": "foo"} == get_csrf_token(markup=markup)
-        assert {"something_else": "bar"} == get_csrf_token(
-            markup=markup_alt, param_name="something_else"
+    with (
+        importlib.resources.path(fixtures, "ldc-treebank-3.json") as test_json,
+        open(test_json) as fh
+    ):
+        duplicate_invoice_data = json.loads(fh.read())
+
+    @pytest.mark.parametrize(
+        "markup,param_name,expected", [
+        pytest.param(
+            '<form><input name="authenticity_token" value="foo"/></form>',
+            "authenticity_token",
+            {"authenticity_token": "foo"},
+            id="with_default_param_name"
+        ),
+        pytest.param(
+            '<form><input name="something_else" value="bar"/><input name="authenticity_token" value="baz"/></form>',
+            "something_else",
+            {"something_else": "bar"},
+            id="with_param_name"
+        ),
+        pytest.param(
+            '<form><input name="csrf-token" value="quux"/></form',
+            "doesnt_exist",
+            {},
+            id="without_matching_tag"
         )
+    ])
+    def test_get_csrf_token(self, markup, param_name, expected) -> None:
+        """Ensure we can gather the CSRF token from the LDC login form."""
+        assert get_csrf_token(markup=markup, param_name=param_name) == expected
+
+
+    @pytest.mark.parametrize(
+        "corpora,corpus_id,expected", [
+        pytest.param(
+            [{ "catalog_id": "LDC99T42", "corpus_name": "Treebank-3", "download_link": "/download/4c0512a1451377eb2790d557fc76a690fa11693ad846df02f3ee59d12788", "invoice_date": "2025-01-01", "file": "treebank_3_LDC99T42", "filesize": "51.6 MB", "checksum": "98c74f99f6ca17dc88efb4077fcd9539" }],
+            "LDC99T42",
+            "2025-01-01",
+            id="with_single_item_list"
+        ),
+        pytest.param(duplicate_invoice_data, "LDC99T42", "2020-08-22", id="with_dupes"),
+        pytest.param([], "bogus", None, id="with_empty_corpora_list")
+        ]
+    )
+    def test_get_latest_invoice_date(
+        self, corpora, corpus_id, expected
+    ) -> None:
+        """Ensure latest invoice date is fetched for a single corpus."""
+        assert get_latest_invoice_date(corpora=corpora, corpus_id=corpus_id) == expected
+
+
+    @pytest.mark.parametrize(
+        "tag,expected", [
+            pytest.param(BeautifulSoup("""
+                <tr class="odd">
+                <td class="">LDC2026S04</td>
+                <td>CALLHOME Spanish Second Edition</td>
+                <td class="">2026-03-16</td>
+                <td class="download-counter-cell">
+                    <span class='download-counter-counter'>(2)</span> <a class='button download-counter-button' href='/download/6223e1ba26b43ce2787aa7303fd0329c1955d225fa30f8688e85abb019e8' title='Download Corpus'><span class='glyphicon glyphicon-download-alt'></span></a>
+                </td>
+                <td>
+                    CALLHOME_Spanish_Second_Edition.zip<br/>
+                        File Size: 1.46 GB
+                        MD5 Checksum: d57395eacde73a80ca6e2abcd7ddde52
+                </td>
+                </tr>""", "html.parser"),
+                {
+                    "catalog_id": "LDC2026S04",
+                    "corpus_name": "CALLHOME Spanish Second Edition",
+                    "invoice_date": "2026-03-16",
+                    "download_link": "/download/6223e1ba26b43ce2787aa7303fd0329c1955d225fa30f8688e85abb019e8",
+                    "file": "CALLHOME_Spanish_Second_Edition.zip",
+                    "filesize": "1.46 GB",
+                    "checksum": "d57395eacde73a80ca6e2abcd7ddde52"
+                },
+                id="with_single_row"
+            ),
+            pytest.param(
+                BeautifulSoup("<tr></tr>", "html.parser"),
+                {},
+                id="with_empty_row"
+            ),
+            pytest.param(
+                BeautifulSoup("<tr><td>boop</td></tr>", "html.parser"),
+                {},
+                id="with_malformed_row"
+            ),
+            pytest.param(
+                BeautifulSoup("""
+                <tr class="odd">
+                <td class="">LDC2026S04</td>
+                <td>CALLHOME Spanish Second Edition</td>
+                <td class="">2026-03-16</td>
+                <td class="download-counter-cell">
+                    <span class='download-counter-counter'>(2)</span> <a class='button download-counter-button' fake='/download/blah' title='Download Corpus'><span class='glyphicon glyphicon-download-alt'></span></a>
+                </td>""", "html.parser"),
+                {},
+                id="with_malformed_download_link"
+            )
+        ]
+    )
+    def test_scrape_corpus_metadata(self, tag, expected) -> None:
+        """Ensure we can parse the LDC organization downloads page into a
+        JSON-based structure."""
+        assert scrape_corpus_metadata(tag) == expected
 
-    def test_get_latest_invoice_date(self) -> None:
-        with (
-            importlib.resources.path(fixtures, "treebank-3.json") as test_json,
-            open(test_json) as fh
-        ):
-            data = json.loads(fh.read())
-        assert "2020-08-22" == get_latest_invoice_date(data)