BerkeleyLibrary
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mokelumne/dags/fetch_ldc_corpus_files.py‎
Lines changed: 184 additions & 0 deletions b/‎mokelumne/dags/fetch_ldc_corpus_files.py‎
Lines changed: 184 additions & 0 deletions
diff --git a/‎mokelumne/providers/ldc/__init__.py‎ b/‎mokelumne/providers/ldc/__init__.py‎
diff --git a/‎mokelumne/providers/ldc/get_provider_info.py‎
Lines changed: 34 additions & 0 deletions b/‎mokelumne/providers/ldc/get_provider_info.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎mokelumne/providers/ldc/hooks/__init__.py‎ b/‎mokelumne/providers/ldc/hooks/__init__.py‎
diff --git a/‎mokelumne/providers/ldc/hooks/ldc.py‎
Lines changed: 125 additions & 0 deletions b/‎mokelumne/providers/ldc/hooks/ldc.py‎
Lines changed: 125 additions & 0 deletions
diff --git a/‎mokelumne/providers/ldc/provider.yaml‎
Lines changed: 20 additions & 0 deletions b/‎mokelumne/providers/ldc/provider.yaml‎
Lines changed: 20 additions & 0 deletions
@@ -15,4 +15,5 @@ airflow.cfg
 *.egg-info
 .coverage
 build/
+public/
 uv.lock
@@ -0,0 +1,184 @@
+"""
+Fetches files for a given corpus from the Linguistic Data Consortium catalog.
+"""
+
+# pyright: reportTypedDictNotRequiredAccess=false
+
+from __future__ import annotations
+import hashlib
+import json
+import logging
+import re
+
+from mmap import mmap, ACCESS_READ
+
+from airflow.sdk import Param, chain, dag, get_current_context, task
+from bs4 import BeautifulSoup
+
+from mokelumne.providers.ldc.hooks.ldc import LDCHook
+from mokelumne.util.ldc import filter_corpora, scrape_corpus_metadata
+from mokelumne.util.storage import run_dir
+
+logger = logging.getLogger(__name__)
+
+@dag(
+    description="Fetches files for a given corpus from the Linguistic Data Consortium catalog",
+    schedule=None,
+    catchup=False,
+    params={
+        "ldc_corpus": Param(
+            default="",
+            title="LDC Catalog ID",
+            description="The catalog ID for the desired LDC corpus",
+            type="string",
+            ),
+        "filename_regex": Param(
+            default="",
+            title="Filename regular expression",
+            description="""Regular expression to match file metadata in the
+LDC catalog. Note that this is not necessarily the same as the downloaded
+filename reported by LDC's webserver.""",
+            type=["string", "null"],
+            format="regex",
+            ),
+    },
+    tags=["ldsp"],
+)
+def fetch_ldc_corpus_files():
+    """
+    Fetch a corpus from the `Linguistic Data Consortium catalog`_.  LDC
+    does not provide an API, so we have to screenscrape into an authorized
+    session to fetch the list of available datasets.
+
+    This is effectively a reimplementation of `ldcdl`_ by Jonathan May and
+    Alex Hedges.
+
+    .. _Linguistic Data Consortium catalog: https://catalog.ldc.upenn.edu/
+    .. _ldcdl: https://github.com/jonmay/ldcdl
+    """
+
+    hook = LDCHook()
+
+    @task
+    def get_available_ldc_corpora() -> str:
+        """
+        Fetch the page listing the corpora available for download from the LDC
+        catalog. This is an HTML page cached locally for further parsing.
+
+        The response body is streamed into ``corpora.html`` inside the
+        directory returned by ``run_dir`` for the current run. The response is
+        closed as soon as the body has been written so that its connection is
+        released.
+
+        :returns: Path to the HTML file fetched from LDC.
+        :rtype: str
+        """
+        ctx = get_current_context()
+        corpora_html_path = run_dir(ctx["run_id"]) / "corpora.html"
+
+        # The hook requests the page with ``stream=True``; the ``with`` block
+        # closes the response even if writing fails part-way through.
+        with (
+            hook.get_corpora_response() as response,
+            open(corpora_html_path, "wb") as f,
+        ):
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+
+        return str(corpora_html_path)
+
+
+    @task
+    def parse_corpora_metadata(corpora_file: str) -> list[dict[str, str]]:
+        """
+        Parse the HTML of the catalog to create a structured representation for
+        further use. There is no API for LDC, so we are forced to screenscrape.
+
+        Every row of the ``#user-corpora-download-table`` table body is passed
+        to ``scrape_corpus_metadata``. The resulting list is also written as
+        JSON to ``corpora.json`` in the current run's directory.
+
+        :param corpora_file: Location of the fetched downloads page.
+        :returns: Parsed LDC metadata as a list of dicts.
+        :rtype: list[dict[str, str]]
+        """
+        ctx = get_current_context()
+        corpora_json = run_dir(ctx["run_id"]) / "corpora.json"
+
+        # The page was saved as raw bytes from the server. Hand those bytes to
+        # Beautiful Soup so it detects the document encoding itself, instead of
+        # decoding with whatever the worker's locale default happens to be.
+        with open(corpora_file, "rb") as page:
+            data = BeautifulSoup(page, "html.parser")
+
+        rows = data.select("#user-corpora-download-table > tbody > tr")
+        corpora = [scrape_corpus_metadata(row) for row in rows]
+
+        with open(corpora_json, "w", encoding="utf-8") as corpora_out:
+            json.dump(corpora, corpora_out)
+
+        return corpora
+
+
+    @task.short_circuit
+    def corpus_is_available(corpora_list) -> list[dict[str, str]]:
+        """
+        Select the downloads that belong to the requested corpus.
+
+        The catalog ID and the optional filename pattern are read from the
+        run's ``ldc_corpus`` and ``filename_regex`` params and handed to
+        ``filter_corpora`` together with the parsed metadata. Because this is a
+        short-circuit task, an empty result skips the downstream
+        ``download_corpus_from_ldc`` tasks.
+
+        The LDC catalog lists duplicate downloads per invoice date;
+        ``filter_corpora`` (defined elsewhere) is expected to keep only the
+        entries with the latest invoice date.
+
+        :param corpora_list: Parsed LDC metadata list.
+        :returns: A list of LDC downloads for fetching or an empty list.
+        :rtype: list[dict[str, str]]
+        """
+        run_params = get_current_context()["params"]
+
+        return filter_corpora(
+            corpora=corpora_list,
+            corpus_id=run_params.get("ldc_corpus", ""),
+            filename_regex=run_params.get("filename_regex"),
+        )
+
+
+    @task
+    def download_corpus_from_ldc(filedict) -> str:
+        """
+        Download a corpus from the LDC catalog using the authenticated hook and
+        check the MD5 checksum reported by LDC against the downloaded bytes.
+
+        The target filename is taken from the ``Content-Disposition`` header
+        when it has the ``attachment; filename="..."`` form, otherwise from the
+        catalog metadata. Only the final path component of that name is used,
+        so neither the server nor the scraped metadata can place the file
+        outside the run directory. A checksum mismatch is logged as a warning;
+        it does not fail the task.
+
+        :param filedict: A dict representing the file metadata; the keys
+            ``download_link``, ``filename`` and ``checksum`` are read.
+        :returns: The location of the downloaded dataset file.
+        :rtype: str
+        :raises ValueError: If no usable filename can be derived.
+        """
+        ctx = get_current_context()
+        dest_dir = run_dir(ctx["run_id"])
+
+        # The hook returns a streamed response; the ``with`` block closes it
+        # (and releases its connection) even when writing fails.
+        with hook.get_corpus_file(filedict["download_link"]) as resp:
+            match = re.match(
+                r'^attachment; filename="(.*)"$',
+                resp.headers.get("Content-Disposition", ""),
+            )
+
+            if match:
+                reported_name = match.group(1)
+            else:
+                logger.warning("No Content-Disposition header; falling back to catalog filename")
+                reported_name = filedict["filename"]
+
+            # Drop any directory part (POSIX or Windows separators) so that a
+            # name such as ``../../x`` cannot escape ``dest_dir``.
+            leaf = re.split(r"[\\/]", reported_name)[-1]
+            if leaf in ("", ".", ".."):
+                raise ValueError(f"Unusable download filename: {reported_name!r}")
+            dest = dest_dir / leaf
+
+            # Hash while streaming: avoids a second pass over a potentially
+            # large file and also works for a zero-length body, which cannot
+            # be memory-mapped. MD5 is only used as a transfer checksum here.
+            digest = hashlib.md5(usedforsecurity=False)
+            with open(dest, "wb") as out:
+                for chunk in resp.iter_content(chunk_size=8 * 1024):
+                    if chunk:
+                        out.write(chunk)
+                        digest.update(chunk)
+
+        dl_checksum = digest.hexdigest()
+
+        if dl_checksum != filedict["checksum"]:
+            logger.warning(
+                "Downloaded file's checksum %s does not match LDC checksum %s",
+                dl_checksum,
+                filedict["checksum"],
+            )
+        return str(dest)
+
+
+    corpora_file = get_available_ldc_corpora()
+    available_corpora = parse_corpora_metadata(corpora_file)
+    files_to_download = corpus_is_available(available_corpora)
+    chain(
+        files_to_download,
+        download_corpus_from_ldc.expand(filedict=files_to_download)
+    )
+
+
+fetch_ldc_corpus_files()  # pyright: ignore[reportUnusedExpression]
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from pathlib import Path
+import yaml
+
+# Top-level ``provider.yaml`` keys that ``get_provider_info`` passes through;
+# any other key found in the YAML file is dropped from the returned dict.
+_RUNTIME_FIELDS = {
+    "package-name",
+    "name",
+    "description",
+    "hook-class-names",
+    "connection-types",
+    "hooks",
+    "operators",
+    "sensors",
+    "transfers",
+    "triggers",
+    "bundles",
+    "integrations",
+    "filesystems",
+    "asset-uris",
+    "dialects",
+    "extra-links",
+    "auth-backends",
+    "auth-managers",
+    "notifications",
+    "executors",
+    "config",
+}
+
+
+def get_provider_info() -> dict:
+    """Return the provider metadata stored in ``provider.yaml`` next to this module.
+
+    Only the top-level keys listed in ``_RUNTIME_FIELDS`` are kept.
+
+    :returns: The filtered provider metadata; an empty dict when the YAML
+        document is empty.
+    """
+    data = Path(__file__).with_name("provider.yaml").read_text(encoding="utf-8")
+    # ``safe_load`` yields ``None`` for an empty document; treat that as
+    # "no metadata" instead of failing on ``None.items()``.
+    raw = yaml.safe_load(data) or {}
+    return {k: v for k, v in raw.items() if k in _RUNTIME_FIELDS}
@@ -0,0 +1,125 @@
+"""Provides a hook for authenticating and fetching files from LDC."""
+
+from __future__ import annotations
+
+import logging
+from functools import cached_property
+from urllib.parse import urljoin
+
+import requests
+from airflow.sdk.exceptions import AirflowException
+from airflow.sdk import BaseHook
+
+from mokelumne.util.ldc import get_csrf_token
+
+logger = logging.getLogger(__name__)
+
+
+class LDCHook(BaseHook):
+    """Interact with the LDC catalog using an authenticated requests session.
+
+    Logging in means fetching the login page, extracting its CSRF form data
+    and posting the credentials of the configured Airflow connection. The
+    fetch helpers reuse one cached session (``conn``) and log in again only
+    when the catalog answers with HTTP 401.
+
+    :param conn_id: ID of the Airflow connection holding the catalog URL
+        (host), login and password.
+    :param timeout: Timeout in seconds passed to every HTTP request, either a
+        single number or a ``(connect, read)`` tuple; ``None`` waits forever.
+    """
+
+    conn_type = "ldc"
+    conn_name_attr = "conn_id"
+    default_conn_name = "ldc_default"
+    hook_name = "LDC"
+
+    def __init__(
+        self,
+        conn_id: str = default_conn_name,
+        timeout: float | tuple[float, float] | None = 60.0,
+    ) -> None:
+        super().__init__()
+        self.conn_id = conn_id
+        self.timeout = timeout
+
+    def get_conn(self) -> requests.Session:
+        """Return a new authenticated requests session for the LDC catalog.
+
+        Every call performs a fresh login; the caller owns the returned
+        session and should close it when done.
+        """
+        return self._get_session()
+
+    @cached_property
+    def conn(self) -> requests.Session:
+        """Return a cached authenticated requests session."""
+        return self._get_session()
+
+    def _base_url(self) -> str:
+        """Return the catalog URL from the connection, failing if it is unset."""
+        host = self.get_connection(self.conn_id).host
+        if not host:
+            raise AirflowException("LDC connection host is not configured")
+        return host
+
+    def _get_session(self) -> requests.Session:
+        """Log in to the catalog and return the authenticated session.
+
+        :raises AirflowException: If the host is missing, the login page
+            cannot be fetched, no CSRF form data is found, or the login POST
+            returns an unexpected status.
+        """
+        connection = self.get_connection(self.conn_id)
+        if not connection.host:
+            raise AirflowException("LDC connection host is not configured")
+
+        login_url = urljoin(connection.host, "login")
+        session = requests.Session()
+
+        try:
+            response = session.get(login_url, timeout=self.timeout)
+            if response.status_code != 200:
+                raise AirflowException(
+                    f"LDC login page request failed: {response.status_code}"
+                )
+
+            form_data = get_csrf_token(response.text)
+            if not form_data:
+                raise AirflowException(
+                    "Unable to extract CSRF token from LDC login page"
+                )
+
+            form_data["spree_user[login]"] = connection.login or ""
+            form_data["spree_user[password]"] = connection.password or ""
+            form_data["utf8"] = "✓"
+
+            # ``session.post`` merges environment settings (proxies, CA
+            # bundle) exactly like the GET above; sending a hand-prepared
+            # request through ``session.send`` would skip them.
+            login_response = session.post(
+                login_url, data=form_data, timeout=self.timeout
+            )
+            # NOTE(review): redirects are followed, so a rejected login that
+            # re-renders the form with status 200 is not detected here.
+            if login_response.status_code not in (200, 302):
+                raise AirflowException(
+                    f"LDC authentication failed: {login_response.status_code}"
+                )
+        except Exception:
+            # Do not leak the half-initialised session's connections.
+            session.close()
+            raise
+
+        return session
+
+    def refresh_session(self) -> None:
+        """Close and drop the cached session so it is recreated on next access."""
+        stale = self.__dict__.pop("conn", None)
+        if stale is not None:
+            stale.close()
+
+    def test_connection(self) -> tuple[bool, str]:
+        """Try a fresh login and report the outcome as ``(ok, message)``."""
+        try:
+            # Only the login attempt matters; release the session right away.
+            self.get_conn().close()
+        except Exception as exc:
+            return False, str(exc)
+        return True, "Connection successful"
+
+    def _get_with_reauth(self, url: str, label: str) -> requests.Response:
+        """GET ``url`` as a stream, logging in again once if it returns 401.
+
+        :param url: Absolute URL to fetch.
+        :param label: Short description of the request used in the log line.
+        :returns: The (possibly retried) response; its status is not checked.
+        """
+        response = self.conn.get(url, stream=True, timeout=self.timeout)
+        if response.status_code == 401:
+            logger.warning("%s received 401, refreshing session", label)
+            response.close()
+            self.refresh_session()
+            response = self.conn.get(url, stream=True, timeout=self.timeout)
+        return response
+
+    def get_corpora_response(self) -> requests.Response:
+        """Fetch the LDC corpora downloads page response.
+
+        :returns: A streamed response with status 200; the caller is
+            responsible for closing it.
+        :raises AirflowException: If the final status is not 200.
+        """
+        datasets_url = urljoin(self._base_url(), "organization/downloads")
+
+        response = self._get_with_reauth(datasets_url, "LDC corpora fetch")
+        if response.status_code != 200:
+            response.close()
+            raise AirflowException(
+                f"Failed to fetch LDC corpora page: {response.status_code}"
+            )
+
+        return response
+
+    def get_corpus_file(self, download_link: str) -> requests.Response:
+        """Fetch a corpus download response for the given download link.
+
+        :param download_link: Link to the file, resolved against the catalog
+            URL of the connection.
+        :returns: A streamed response; the caller is responsible for closing
+            it.
+        :raises AirflowException: If the link is empty or the request fails.
+        """
+        if not download_link:
+            raise AirflowException("Download link is missing")
+
+        dl_uri = urljoin(self._base_url(), download_link)
+        response = self._get_with_reauth(dl_uri, "LDC download request")
+
+        try:
+            response.raise_for_status()
+        except requests.HTTPError as exc:
+            response.close()
+            raise AirflowException(f"LDC download failed: {exc}") from exc
+
+        return response
@@ -0,0 +1,20 @@
+package-name: mokelumne
+name: LDC
+description: LDC Airflow provider.
+connection-types:
+  - connection-type: ldc
+    hook-class-name: mokelumne.providers.ldc.hooks.ldc.LDCHook
+    hook-name: Linguistic Data Consortium
+    ui-field-behaviour:
+      hidden-fields:
+        - port
+        - schema
+        - extra
+      relabeling:
+        host: LDC Catalog URL
+        login: LDC Login Username
+        password: LDC Password
+      placeholders:
+        host: "https://catalog.ldc.upenn.edu"
+        login: "your-ldc-username"
+        password: "your-ldc-password"