BerkeleyLibrary
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mokelumne/dags/fetch_ldc_corpus.py‎
Lines changed: 43 additions & 104 deletions b/‎mokelumne/dags/fetch_ldc_corpus.py‎
Lines changed: 43 additions & 104 deletions
diff --git a/‎mokelumne/providers/ldc/__init__.py‎ b/‎mokelumne/providers/ldc/__init__.py‎
diff --git a/‎mokelumne/providers/ldc/get_provider_info.py‎
Lines changed: 34 additions & 0 deletions b/‎mokelumne/providers/ldc/get_provider_info.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎mokelumne/providers/ldc/hooks/__init__.py‎ b/‎mokelumne/providers/ldc/hooks/__init__.py‎
@@ -15,4 +15,5 @@ airflow.cfg
 *.egg-info
 .coverage
 build/
+public/
 uv.lock
@@ -13,15 +13,13 @@
 import logging
 import re
 
-from http.cookiejar import MozillaCookieJar
 from mmap import mmap, ACCESS_READ
 
-import requests
-
-from airflow.sdk import Connection, Param, chain, dag, get_current_context, task
+from airflow.sdk import Param, chain, dag, get_current_context, task
 from bs4 import BeautifulSoup
 
-from mokelumne.util.ldc import get_csrf_token, scrape_corpus_metadata, get_latest_invoice_date
+from mokelumne.providers.ldc.hooks.ldc import LDCHook
+from mokelumne.util.ldc import filter_corpora, scrape_corpus_metadata, get_latest_invoice_date
 from mokelumne.util.storage import run_dir
 
 logger = logging.getLogger(__name__)
@@ -62,78 +60,38 @@ def fetch_ldc_corpus():
     .. _ldcdl: https://github.com/jonmay/ldcdl
     """
 
-    @task
-    def authenticate_session() -> str:
-        """
-        Authenticate a browser session and persist the session's cookies to a
-        cookiejar file.
-
-        The LDC catalog checks for the presence of the `_xiexie` cookie,
-        which is a session cookie and thus not automatically included in
-        serialized cookie jars. If not present, the LDC Catalog automatically
-        redirects to the login page.
-
-        :returns: Location of the cookiejar file.
-        :rtype: str
-        """
-        ctx = get_current_context() 
-        cj = run_dir(ctx["run_id"]) / "cookies.txt"
-        conn = Connection.get("ldc")
-        login_url = f"{conn.host}/login"
-        session = requests.Session()
-        login_page = session.get(login_url)
-        form_data = get_csrf_token(login_page.text)
-        form_data["spree_user[login]"] = conn.login
-        form_data["spree_user[password]"] = conn.password
-        form_data["utf8"] = "✓"
-
-        login_request = requests.Request("POST", url=login_url, data=form_data)
-        prepped = session.prepare_request(login_request)
-        _ = session.send(prepped)
-        cookies = MozillaCookieJar(filename=cj)
-        for c in session.cookies:
-            cookies.set_cookie(c)
-        cookies.save(ignore_discard=True)
-        return str(cj)
-
+    hook = LDCHook()
 
     @task
-    def get_available_ldc_corpora(cookiejar) -> str:
+    def get_available_ldc_corpora() -> str:
         """
         Fetch the page listing the corpora available for download from the LDC
         catalog. This is an HTML page cached locally for further parsing.
 
-        :param cookiejar: The path to a cookiejar file for LDC
         :returns: Path to the HTML file fetched from LDC.
         :rtype: str
         """
-        cookies = MozillaCookieJar(filename=cookiejar)
-        cookies.load(ignore_discard=True)
         ctx = get_current_context()
-        conn = Connection.get("ldc")
-        output = run_dir(ctx["run_id"]) / "corpora.html"
-        datasets_url = f"{conn.host}/organization/downloads"
-
-        with (requests.Session() as session, open(output, "wb") as outfile):
-            session.cookies = cookies  # pyright: ignore[reportAttributeAccessIssue]
-            request = requests.Request('GET', datasets_url)
-            prepped = session.prepare_request(request)
-            resp = session.send(prepped, stream=True)
-            for chunk in resp.iter_content(chunk_size=(8*1024)):
-                outfile.write(chunk)
+        dest_dir = run_dir(ctx["run_id"])
+        corpora_html_path = dest_dir / "corpora.html"
+        
+        response = hook.get_corpora_response()
+        with open(corpora_html_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
 
-        return str(output)
+        return str(corpora_html_path)
 
 
     @task
-    def parse_corpora_metadata(corpora_file) -> str:
+    def parse_corpora_metadata(corpora_file) -> list[dict[str, str]]:
         """
         Parse the HTML of the catalog to create a structured representation for
         further use. There is no API for LDC, so we are forced to screenscrape.
 
         :param corpora_file: Location of the fetched downloads page.
-        :returns: Path to JSON-serialized LDC metadata.
-        :rtype:
+        :returns: Parsed LDC metadata as a list of dicts.
+        :rtype: list[dict[str, str]]
         """
         ctx = get_current_context()
         corpora_json = run_dir(ctx["run_id"]) / "corpora.json"
@@ -148,80 +106,59 @@ def parse_corpora_metadata(corpora_file) -> str:
         with open(corpora_json, "w") as corpora_out:
             corpora_out.write(json.dumps(corpora))
 
-        return str(corpora_json)
+        return corpora
 
 
     @task.short_circuit
-    def corpus_is_available(corpora_json) -> list[dict[str, str]]:
+    def corpus_is_available(corpora_list) -> list[dict[str, str]]:
         """
         Check to see if the requested corpus is listed in the set of corpora
-        avilable for download. Used to shortcircuit the ``fetch_ldc_corpus()``
+        available for download. Used to shortcircuit the ``fetch_ldc_corpus()``
         task if the dataset is not available.
 
         Since the LDC catalog provides multiple duplicate downloads based on
-        invoice date, this also filters the downloads availalble to 
+        invoice date, this also filters the downloads available to
         those with the latest invoice date.
 
-        :param corpora_json: Path to JSON-serialized LDC catalog metadata.
+        :param corpora_list: Parsed LDC metadata list.
         :returns: A list of LDC downloads for fetching or an empty list.
         :rtype: list[dict[str, str]]
         """
         ctx = get_current_context()
-        id_ = ctx["params"].get("ldc_corpus")
-        with open(corpora_json) as corpora_fp:
-            corpora = json.load(corpora_fp)
-        
-        latest = get_latest_invoice_date(corpora=corpora, corpus_id=id_)  # pyright: ignore[reportArgumentType]
-        if latest:
-            logger.debug("Latest invoice date for %s is %s" % (id_, latest))
-            fnregex = re.compile(ctx["params"].get("filename_regex", ".*"))
-            return [c for c in corpora if (
-                c["catalog_id"] == id_
-                and c["invoice_date"] == latest
-                and re.search(fnregex, c.get("file"))
-            )]
-        return []
+        id_ = ctx["params"].get("ldc_corpus", "")
+        filename_regex = ctx["params"].get("filename_regex")
+
+        return filter_corpora(corpora=corpora_list, corpus_id=id_, filename_regex=filename_regex)
 
 
     @task
-    def download_corpus_from_ldc(filedict, cookiejar) -> str:
+    def download_corpus_from_ldc(filedict) -> str:
         """
-        Download a corpus from the LDC catalog and verify that the MD5
-        checksum reported by LDC matches that of the downloaded file.
+        Download a corpus from the LDC catalog using the authenticated hook and
+        verify that the MD5 checksum reported by LDC matches the downloaded file.
 
         :param filedict: A dict representing the file metadata
-        :param cookiejar: Location of the cookiejar file.
-        :param available_corpora: The metadata for the available corpora.
         :returns: The location of the downloaded dataset file.
         :rtype: str
         """
-        cookies = MozillaCookieJar(cookiejar)
-        cookies.load(ignore_discard=True)
-        conn = Connection.get("ldc")
         ctx = get_current_context()
-        corpus = ctx["params"].get("ldc_corpus")
-
-        # corpus_metadata = available_corpora.get(corpus)
-        dl_uri = f"{conn.host}/{filedict.get("download_link")}"
-        logger.info("Fetching corpus %s: %s" % (corpus, filedict))
-        session = requests.session()
-        session.cookies = cookies   # pyright: ignore[reportAttributeAccessIssue]
-        request = requests.Request('GET', dl_uri)
-        prepped = session.prepare_request(request)
-        resp = session.send(prepped, stream=True)
+        dest_dir = run_dir(ctx["run_id"])
+        resp = hook.get_corpus_file(filedict["download_link"])
+
         match = re.match(
-            r'^attachment; filename="(.*)"$', resp.headers['Content-Disposition']
+            r'^attachment; filename="(.*)"$', resp.headers.get("Content-Disposition", "")
         )
 
         if match:
-            dest = run_dir(ctx["run_id"]) / match.group(1)
+            dest = dest_dir / match.group(1)
         else:
             logger.warning("No Content-Disposition header; falling back to catalog filename")
-            dest = run_dir(ctx["run_id"]) / filedict["filename"]
+            dest = dest_dir / filedict["filename"]
 
         with open(dest, "wb") as out:
             for chunk in resp.iter_content(chunk_size=(8*1024)):
-                out.write(chunk)
+                if chunk:
+                    out.write(chunk)
 
         with (
             open(dest, "rb") as f,
@@ -230,18 +167,20 @@ def download_corpus_from_ldc(filedict, cookiejar) -> str:
             dl_checksum = hashlib.md5(f).hexdigest()
 
         if dl_checksum != filedict["checksum"]:
-            logger.warning("Downloaded file's checksum %s does not match LDC checksum %s" % (dl_checksum, filedict["checksum"]))
-
+            logger.warning(
+                "Downloaded file's checksum %s does not match LDC checksum %s" % (
+                    dl_checksum, filedict["checksum"]
+                )
+            )
         return str(dest)
 
 
-    cookiejar = authenticate_session()
-    corpora_file = get_available_ldc_corpora(cookiejar)
+    corpora_file = get_available_ldc_corpora()
     available_corpora = parse_corpora_metadata(corpora_file)
     files_to_download = corpus_is_available(available_corpora)
     chain(
         files_to_download,
-        download_corpus_from_ldc.partial(cookiejar=cookiejar).expand(filedict=files_to_download)
+        download_corpus_from_ldc.expand(filedict=files_to_download)
     )
 
 
 
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from pathlib import Path
+import yaml
+
+# Top-level provider.yaml keys that get_provider_info() passes through; any
+# key not listed here is dropped from the returned dict.
+# NOTE(review): presumably these are the fields Airflow reads at runtime (as
+# opposed to packaging/docs-only metadata) — confirm against the provider-info
+# schema of the targeted Airflow version.
+_RUNTIME_FIELDS = {
+    "package-name",
+    "name",
+    "description",
+    "hook-class-names",
+    "connection-types",
+    "hooks",
+    "operators",
+    "sensors",
+    "transfers",
+    "triggers",
+    "bundles",
+    "integrations",
+    "filesystems",
+    "asset-uris",
+    "dialects",
+    "extra-links",
+    "auth-backends",
+    "auth-managers",
+    "notifications",
+    "executors",
+    "config",
+}
+
+
+def get_provider_info() -> dict:
+    """
+    Load this provider's metadata from the ``provider.yaml`` file located
+    next to this module and return only the keys listed in
+    ``_RUNTIME_FIELDS``.
+
+    :returns: The filtered provider metadata.
+    :rtype: dict
+    :raises ValueError: If ``provider.yaml`` does not contain a mapping at
+        its top level (for example, when the file is empty).
+    """
+    provider_yaml = Path(__file__).parent / "provider.yaml"
+    # Decode explicitly as UTF-8 so parsing does not depend on the locale's
+    # default encoding.
+    raw = yaml.safe_load(provider_yaml.read_text(encoding="utf-8"))
+    # safe_load() returns None for an empty document; fail with a clear
+    # message instead of an AttributeError on ``raw.items()``.
+    if not isinstance(raw, dict):
+        raise ValueError(f"{provider_yaml} must contain a YAML mapping")
+    return {k: v for k, v in raw.items() if k in _RUNTIME_FIELDS}