33import json
44import pytest
55
6- from .. import fixtures
6+ from bs4 import BeautifulSoup
77
88from mokelumne .util .ldc import (
99 get_csrf_token , get_latest_invoice_date , scrape_corpus_metadata ,
1010)
11+ from .. import fixtures
12+
1113
class TestLDC:
    """Tests for the LDC helpers imported from ``mokelumne.util.ldc``."""

    # Invoice records for the "with_dupes" case of
    # ``test_get_latest_invoice_date``. They are loaded while the class body
    # runs because the ``parametrize`` decorator below references them at
    # that point. Reading through ``files()`` avoids leaving a path and a
    # closed file handle behind as class attributes, and pins the encoding
    # instead of relying on the locale default.
    # NOTE(review): presumably the fixture holds several invoices for
    # LDC99T42 -- confirm against the fixture file.
    duplicate_invoice_data = json.loads(
        importlib.resources.files(fixtures)
        .joinpath("ldc-treebank-3.json")
        .read_text(encoding="utf-8")
    )

    # NOTE(review): every case below passes ``param_name`` explicitly, so a
    # default value for that argument (if ``get_csrf_token`` has one) is not
    # exercised, despite the id of the first case.
    @pytest.mark.parametrize(
        "markup,param_name,expected",
        [
            pytest.param(
                '<form><input name="authenticity_token" value="foo"/></form>',
                "authenticity_token",
                {"authenticity_token": "foo"},
                id="with_default_param_name",
            ),
            pytest.param(
                (
                    '<form><input name="something_else" value="bar"/>'
                    '<input name="authenticity_token" value="baz"/></form>'
                ),
                "something_else",
                {"something_else": "bar"},
                id="with_param_name",
            ),
            pytest.param(
                '<form><input name="csrf-token" value="quux"/></form>',
                "doesnt_exist",
                {},
                id="without_matching_tag",
            ),
        ],
    )
    def test_get_csrf_token(self, markup, param_name, expected) -> None:
        """Ensure we can gather the CSRF token from the LDC login form.

        Each case supplies form markup and the input name to look for; the
        result must equal the expected ``{name: value}`` mapping, or an empty
        mapping when no input carries that name.
        """
        assert get_csrf_token(markup=markup, param_name=param_name) == expected

    @pytest.mark.parametrize(
        "corpora,corpus_id,expected",
        [
            pytest.param(
                [
                    {
                        "catalog_id": "LDC99T42",
                        "corpus_name": "Treebank-3",
                        "download_link": "/download/4c0512a1451377eb2790d557fc76a690fa11693ad846df02f3ee59d12788",
                        "invoice_date": "2025-01-01",
                        "file": "treebank_3_LDC99T42",
                        "filesize": "51.6 MB",
                        "checksum": "98c74f99f6ca17dc88efb4077fcd9539",
                    },
                ],
                "LDC99T42",
                "2025-01-01",
                id="with_single_item_list",
            ),
            pytest.param(
                duplicate_invoice_data,
                "LDC99T42",
                "2020-08-22",
                id="with_dupes",
            ),
            pytest.param([], "bogus", None, id="with_empty_corpora_list"),
        ],
    )
    def test_get_latest_invoice_date(
        self, corpora, corpus_id, expected
    ) -> None:
        """Ensure latest invoice date is fetched for a single corpus.

        An empty corpora list is expected to yield ``None``.
        """
        assert (
            get_latest_invoice_date(corpora=corpora, corpus_id=corpus_id)
            == expected
        )

    @pytest.mark.parametrize(
        "tag,expected",
        [
            pytest.param(
                BeautifulSoup("""
                    <tr class="odd">
                    <td class="">LDC2026S04</td>
                    <td>CALLHOME Spanish Second Edition</td>
                    <td class="">2026-03-16</td>
                    <td class="download-counter-cell">
                    <span class='download-counter-counter'>(2)</span> <a class='button download-counter-button' href='/download/6223e1ba26b43ce2787aa7303fd0329c1955d225fa30f8688e85abb019e8' title='Download Corpus'><span class='glyphicon glyphicon-download-alt'></span></a>
                    </td>
                    <td>
                    CALLHOME_Spanish_Second_Edition.zip<br/>
                    File Size: 1.46 GB
                    MD5 Checksum: d57395eacde73a80ca6e2abcd7ddde52
                    </td>
                    </tr>""", "html.parser"),
                {
                    "catalog_id": "LDC2026S04",
                    "corpus_name": "CALLHOME Spanish Second Edition",
                    "invoice_date": "2026-03-16",
                    "download_link": "/download/6223e1ba26b43ce2787aa7303fd0329c1955d225fa30f8688e85abb019e8",
                    "file": "CALLHOME_Spanish_Second_Edition.zip",
                    "filesize": "1.46 GB",
                    "checksum": "d57395eacde73a80ca6e2abcd7ddde52",
                },
                id="with_single_row",
            ),
            pytest.param(
                BeautifulSoup("<tr></tr>", "html.parser"),
                {},
                id="with_empty_row",
            ),
            pytest.param(
                BeautifulSoup("<tr><td>boop</td></tr>", "html.parser"),
                {},
                id="with_malformed_row",
            ),
            pytest.param(
                # The anchor carries ``fake=`` instead of ``href=``, so the
                # row has no usable download link.
                BeautifulSoup("""
                    <tr class="odd">
                    <td class="">LDC2026S04</td>
                    <td>CALLHOME Spanish Second Edition</td>
                    <td class="">2026-03-16</td>
                    <td class="download-counter-cell">
                    <span class='download-counter-counter'>(2)</span> <a class='button download-counter-button' fake='/download/blah' title='Download Corpus'><span class='glyphicon glyphicon-download-alt'></span></a>
                    </td>""", "html.parser"),
                {},
                id="with_malformed_download_link",
            ),
        ],
    )
    def test_scrape_corpus_metadata(self, tag, expected) -> None:
        """Ensure a downloads-page table row parses into a metadata mapping.

        A complete row must produce every expected key; empty or malformed
        rows, including one whose link has no ``href``, must produce an empty
        mapping.
        """
        assert scrape_corpus_metadata(tag) == expected
0 commit comments