PubInv
diff --git a/‎.github/workflows/refresh-project-workbook.yml‎
Lines changed: 45 additions & 0 deletions b/‎.github/workflows/refresh-project-workbook.yml‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎Copy of Public Invention Projects.xlsx - Projects.pdf‎
-120 KB b/‎Copy of Public Invention Projects.xlsx - Projects.pdf‎
-120 KB
diff --git a/‎pipeline/activity.py‎
Lines changed: 48 additions & 0 deletions b/‎pipeline/activity.py‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎pipeline/fetch.py‎
Lines changed: 67 additions & 0 deletions b/‎pipeline/fetch.py‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎pipeline/github_client.py‎
Lines changed: 64 additions & 0 deletions b/‎pipeline/github_client.py‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎pipeline/ooxml.py‎
Lines changed: 148 additions & 0 deletions b/‎pipeline/ooxml.py‎
Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,45 @@
+# Workflow: recompute the workbook's GitHub-derived columns and commit the result.
+name: Refresh project workbook
+
+on:
+  # Manual trigger.
+  workflow_dispatch:
+  # Run on pushes that touch the workbook sources, the pipeline code, or this workflow file.
+  push:
+    paths:
+      - "sources/**"
+      - "pipeline/**"
+      - ".github/workflows/refresh-project-workbook.yml"
+  # Weekly run: minute 0, hour 13, every Monday (GitHub evaluates cron schedules in UTC).
+  schedule:
+    - cron: "0 13 * * 1"
+
+# The last step commits and pushes, so the job token needs write access to repo contents.
+permissions:
+  contents: write
+
+# All runs share one concurrency group; a newly started run cancels one still in progress.
+concurrency:
+  group: refresh-project-workbook
+  cancel-in-progress: true
+
+jobs:
+  refresh:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      # pipeline/github_client.py exits with an error unless GITHUB_TOKEN is set.
+      - name: Refresh workbook computed columns
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: python3 pipeline/fetch.py
+
+      # Stage only the workbook; commit and push only when the staged diff is non-empty.
+      - name: Commit workbook changes
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add sources/Public\ Invention\ Projects.xlsx
+          if git diff --staged --quiet; then
+            echo "No workbook changes."
+          else
+            git commit -m "chore: refresh project workbook"
+            git push
+          fi
@@ -0,0 +1,48 @@
+"""Activity status policy for project links."""
+from datetime import datetime, timezone
+
+# A project is considered active if its last commit is within about 6 months.
+ACTIVE_DAYS = 182
+
+# A project is considered stale from 6-12 months, then dormant after 12 months.
+STALE_DAYS = 365
+
+
+def classify_activity(timestamp: str, now: datetime | None = None) -> str:
+    """Convert a GitHub timestamp into Active/Stale/Dormant/Unknown.
+
+    Args:
+        timestamp: UTC timestamp shaped like ``2026-05-27T00:39:03Z``.
+        now: Timezone-aware reference time to measure age against. Defaults
+            to the current UTC time; pass a fixed value for reproducible
+            results.
+
+    Returns:
+        "Unknown" when the timestamp is empty, not a string, or not in the
+        expected format; otherwise "Active" (age <= ACTIVE_DAYS), "Stale"
+        (age <= STALE_DAYS), or "Dormant".
+    """
+    # No timestamp means the script could not determine activity.
+    if not timestamp:
+        return "Unknown"
+
+    # strptime raises ValueError for a malformed string and TypeError for a
+    # non-string value; both mean the activity cannot be determined.
+    try:
+        last_commit = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+    except (TypeError, ValueError):
+        return "Unknown"
+
+    # Compare in UTC so timezone differences do not matter.
+    reference = datetime.now(timezone.utc) if now is None else now
+    age_days = (reference - last_commit).days
+
+    # Fresh projects are actively maintained.
+    if age_days <= ACTIVE_DAYS:
+        return "Active"
+
+    # Older-but-not-abandoned projects are stale.
+    if age_days <= STALE_DAYS:
+        return "Stale"
+
+    # Anything older than one year is dormant.
+    return "Dormant"
+
+
+def format_commit_month(timestamp: str) -> str:
+    """Render a GitHub timestamp as a "Month Year" label for the workbook.
+
+    Returns an empty string when the timestamp is empty or does not match
+    the ``YYYY-MM-DDTHH:MM:SSZ`` format.
+    """
+    label = ""
+    if timestamp:
+        try:
+            parsed = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
+        except ValueError:
+            # Unparseable input keeps the blank label.
+            pass
+        else:
+            label = parsed.strftime("%B %Y")
+    return label
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""Refresh computed GitHub columns in the project workbook.
+
+This script is designed to run in GitHub Actions. It only updates the Excel
+workbook; it does not parse skills, generate JSON, run the demo page, or
+perform project matching.
+"""
+from github_client import GitHubClient
+from project_links import enrich_project_row
+from workbook_reader import DEFAULT_SHEETS, DEFAULT_XLSX, read_project_rows
+from workbook_writer import write_computed_columns
+
+# Columns this script recomputes; main() passes this mapping to
+# write_computed_columns together with the enriched rows.
+COMPUTED_COLUMNS = {
+    # Excel header -> field name in the enriched row dictionary.
+    "Status": "activity_status",
+    "Last Commit": "last_commit",
+    "Description Source": "description_source",
+}
+
+
+def main() -> None:
+    """Recompute the GitHub-derived columns and write them into the workbook.
+
+    Raises:
+        SystemExit: when there are GitHub-linked rows and every one of them
+            resolved to "Unknown", so a failed refresh never overwrites
+            existing workbook data.
+    """
+    # The Action always targets the repo workbook and its standard tabs.
+    workbook_path = DEFAULT_XLSX
+    sheet_names = DEFAULT_SHEETS
+
+    # Load every project row up front, before any network traffic.
+    records = read_project_rows(workbook_path, sheets=sheet_names)
+    total = len(records)
+    print(f"Read {total} project rows from {workbook_path}.")
+
+    # API client authenticated with the Action-provided token.
+    github = GitHubClient()
+
+    # One cache is shared by all rows so links that repeat across rows do
+    # not hit the same GitHub endpoints again.
+    lookup_cache = {}
+    enriched = []
+    for position, record in enumerate(records, start=1):
+        # Adds activity_status, last_commit, and description_source.
+        row = enrich_project_row(record, github, lookup_cache)
+        enriched.append(row)
+
+        # One log line per row keeps the Actions output auditable.
+        project_label = row["project"][:58]
+        commit_label = row["last_commit"] or "-"
+        print(
+            f"[{position}/{total}] {project_label:58} "
+            f"{row['activity_status']:8} "
+            f"{commit_label:20} "
+            f"{row['description_source']}"
+        )
+
+    # Safety guard: abort when every GitHub-linked row came back Unknown.
+    github_rows = [row for row in enriched if "github.com" in (row.get("link") or "")]
+    all_unknown = all(row["activity_status"] == "Unknown" for row in github_rows)
+    if github_rows and all_unknown:
+        raise SystemExit(
+            f"ERROR: {len(github_rows)} GitHub-linked rows all resolved to Unknown. "
+            "Refusing to overwrite workbook data."
+        )
+
+    # Only the computed columns are written back.
+    write_computed_columns(workbook_path, enriched, COMPUTED_COLUMNS)
+    print(f"Updated computed columns in {workbook_path}.")
+
+
+if __name__ == "__main__":
+    # Run the refresh only when executed as a script.
+    main()
@@ -0,0 +1,64 @@
+"""Small GitHub API client used by the workbook refresh."""
+import json
+import os
+from urllib.error import HTTPError, URLError
+from urllib.parse import quote
+from urllib.request import Request, urlopen
+
+
+class GitHubClient:
+    """Minimal authenticated client for the GitHub REST API.
+
+    Lookups are best-effort: network errors, malformed JSON, and unexpected
+    payload shapes yield an empty string rather than aborting the refresh.
+    """
+
+    def __init__(self) -> None:
+        """Build request headers from the GITHUB_TOKEN environment variable.
+
+        Raises:
+            SystemExit: when GITHUB_TOKEN is missing or empty.
+        """
+        # This pipeline is intended to run in GitHub Actions, where GITHUB_TOKEN
+        # is provided to the job.
+        token = os.environ.get("GITHUB_TOKEN")
+        if not token:
+            raise SystemExit("ERROR: GITHUB_TOKEN is required. Run this workflow in GitHub Actions.")
+
+        # The User-Agent is required by GitHub API etiquette.
+        self.headers = {"User-Agent": "project-chatbot-workbook-refresh/1.0"}
+
+        # API requests ask for GitHub's JSON format and carry the token.
+        self.api_headers = {
+            **self.headers,
+            "Accept": "application/vnd.github+json",
+            "Authorization": f"Bearer {token}",
+        }
+
+    def latest_commit(self, owner: str, repo: str, path: str = "", ref: str = "") -> str:
+        """Return the latest commit timestamp for a repo or one file path.
+
+        Args:
+            owner: Repository owner (user or organization).
+            repo: Repository name.
+            path: Optional file path; restricts the lookup to commits that
+                touched that path.
+            ref: Optional branch/ref to start listing commits from.
+
+        Returns:
+            The commit date string, or "" when it cannot be determined. For
+            repo-level lookups (no ``path``) that fail or return no commits,
+            the repository's ``pushed_at`` value is used instead.
+        """
+        # The commits endpoint returns newest commits first, so one is enough.
+        params = ["per_page=1"]
+
+        # Blob links include a branch/ref and a file path. Both matter for
+        # repositories whose default branch differs from the blob link branch.
+        if ref:
+            params.append(f"sha={quote(ref)}")
+        if path:
+            params.append(f"path={quote(path)}")
+
+        # Escape owner/repo so a stray character cannot alter the URL path.
+        url = (
+            f"https://api.github.com/repos/{quote(owner, safe='')}/{quote(repo, safe='')}"
+            f"/commits?{'&'.join(params)}"
+        )
+
+        try:
+            # commits is a list; the first item is the newest matching commit.
+            commits = self.get_json(url)
+            if commits:
+                commit = commits[0].get("commit") or {}
+                who = commit.get("committer") or commit.get("author") or {}
+                # A null date must not leak None through a str-typed API.
+                return who.get("date") or ""
+        except (HTTPError, URLError, OSError, ValueError, KeyError, IndexError, TypeError, AttributeError):
+            # Deliberate best-effort: ValueError covers malformed JSON; the
+            # lookup errors cover payloads that are not the expected list.
+            pass
+
+        # If a file-specific lookup fails, do not substitute repo activity.
+        # That would make old project idea files look fresh.
+        if path:
+            return ""
+        return self.repo_pushed_at(owner, repo)
+
+    def repo_pushed_at(self, owner: str, repo: str) -> str:
+        """Return repository pushed_at as a fallback activity timestamp.
+
+        Returns "" when the request fails, the body is not valid JSON, the
+        payload is not an object, or ``pushed_at`` is missing or null.
+        """
+        url = f"https://api.github.com/repos/{quote(owner, safe='')}/{quote(repo, safe='')}"
+        try:
+            return self.get_json(url).get("pushed_at") or ""
+        except (HTTPError, URLError, OSError, ValueError, AttributeError):
+            return ""
+
+    def get_json(self, url: str):
+        """GET a GitHub API URL and decode the JSON body.
+
+        Raises:
+            HTTPError, URLError, OSError: on request failures.
+            ValueError: when the body is not valid JSON.
+        """
+        with urlopen(Request(url, headers=self.api_headers), timeout=12) as response:
+            return json.loads(response.read().decode("utf-8", errors="ignore"))
@@ -0,0 +1,148 @@
+"""Minimal OOXML helpers for .xlsx files.
+
+The pipeline avoids third-party dependencies, so it edits the workbook as XML
+inside the .xlsx zip. Higher-level workbook code should own row/column meaning.
+"""
+import zipfile
+import xml.etree.ElementTree as ET
+
+# SpreadsheetML namespace used by worksheet XML files.
+M_NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
+
+# Package relationship namespace used to map workbook sheets to XML files.
+REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
+
+# Office relationship namespace used for sheet relationship ids (the r:id
+# attribute on each <sheet> element in workbook.xml).
+OFFICE_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+
+# Prefix map used by ElementTree XPath calls ("m:" resolves to SpreadsheetML).
+NS = {"m": M_NS}
+
+# Preserve the default SpreadsheetML namespace when writing XML back.
+# NOTE: this runs at import time and changes ElementTree's global prefix
+# registry, so it affects serialization everywhere in the process.
+ET.register_namespace("", M_NS)
+
+
+def sheet_paths(z: zipfile.ZipFile) -> dict[str, str]:
+    """Return worksheet display name -> XML path inside the workbook zip.
+
+    Args:
+        z: An open .xlsx archive.
+
+    Returns:
+        Mapping of visible tab name to the zip member holding that sheet.
+
+    Raises:
+        KeyError: when a required part is missing from the archive or a
+            sheet refers to a relationship id that is not declared.
+    """
+    # workbook.xml lists sheets by name and relationship id.
+    workbook = ET.fromstring(z.read("xl/workbook.xml"))
+
+    # workbook.xml.rels maps relationship ids to actual worksheet XML paths.
+    rels = ET.fromstring(z.read("xl/_rels/workbook.xml.rels"))
+    rel_targets = {
+        rel.attrib["Id"]: rel.attrib["Target"]
+        for rel in rels.findall(f"{{{REL_NS}}}Relationship")
+    }
+
+    # Build a direct lookup so callers can read a sheet by visible tab name.
+    paths = {}
+    for sheet in workbook.findall("m:sheets/m:sheet", NS):
+        target = rel_targets[sheet.attrib[f"{{{OFFICE_REL}}}id"]]
+
+        if target.startswith("/"):
+            # Absolute part name (e.g. "/xl/worksheets/sheet1.xml"): zip
+            # member names carry no leading slash, so strip it rather than
+            # prefixing "xl/" and producing "xl//xl/...".
+            path = target.lstrip("/")
+        elif target.startswith("xl/"):
+            # Already rooted at the package level.
+            path = target
+        else:
+            # Relationship targets are usually relative to xl/.
+            path = "xl/" + target
+        paths[sheet.attrib["name"]] = path
+    return paths
+
+
+def shared_strings(z: zipfile.ZipFile) -> list[str]:
+    """Load the workbook's shared string table as a list of plain strings.
+
+    Returns an empty list when the archive has no xl/sharedStrings.xml.
+    """
+    # Workbooks that only use inline strings may omit the table entirely.
+    if "xl/sharedStrings.xml" not in z.namelist():
+        return []
+
+    table = []
+    for entry in ET.fromstring(z.read("xl/sharedStrings.xml")).findall("m:si", NS):
+        # An entry may be split into several text runs; concatenate them all.
+        pieces = [(node.text or "") for node in entry.findall(".//m:t", NS)]
+        table.append("".join(pieces))
+    return table
+
+
+def cell_text(cell: ET.Element, strings: list[str]) -> str:
+    """Resolve a worksheet cell element to its text.
+
+    Args:
+        cell: A <c> element.
+        strings: The shared string table, indexed by the cell's <v> value
+            when the cell type is "s".
+
+    Returns:
+        The cell text, or "" when the cell holds no value.
+    """
+    kind = cell.attrib.get("t")
+
+    # Most cell types keep their value in a <v> child.
+    value_node = cell.find("m:v", NS)
+    raw = value_node.text if value_node is not None else None
+
+    # Shared-string cells store an index into sharedStrings.xml.
+    if kind == "s" and raw is not None:
+        return strings[int(raw)]
+
+    # Inline-string cells carry their text inside an <is> child.
+    if kind == "inlineStr":
+        inline = cell.find("m:is", NS)
+        if inline is None:
+            return ""
+        return "".join((node.text or "") for node in inline.findall(".//m:t", NS))
+
+    # Everything else is returned as the literal <v> text.
+    return "" if raw is None else raw
+
+
+def split_ref(ref: str) -> tuple[int | None, int | None]:
+    """Split a cell reference such as "C12" into (column, row) numbers.
+
+    Returns (None, None) when the reference lacks letters or digits.
+    """
+    # Pull the alphabetic and numeric characters apart, keeping their order.
+    letters = [ch for ch in ref if ch.isalpha()]
+    digits = [ch for ch in ref if ch.isdigit()]
+    if not (letters and digits):
+        return None, None
+
+    # Letters form a bijective base-26 number: A=1 ... Z=26, AA=27.
+    column = 0
+    for ch in letters:
+        column = column * 26 + (ord(ch.upper()) - ord("A") + 1)
+    return column, int("".join(digits))
+
+
+def col_name(idx: int) -> str:
+    """Convert a 1-based column number into Excel letters.
+
+    Args:
+        idx: Column number, where 1 is "A", 26 is "Z", and 27 is "AA".
+
+    Returns:
+        The column letters, or "" when ``idx`` is zero or negative.
+    """
+    letters = []
+
+    # Excel columns are base-26 but without a zero digit. Looping on
+    # ``idx > 0`` (not mere truthiness) guarantees termination: a negative
+    # idx settles at -1 under divmod and would otherwise never reach 0.
+    while idx > 0:
+        idx, rem = divmod(idx - 1, 26)
+        letters.append(chr(65 + rem))
+
+    # Digits were produced least-significant first.
+    return "".join(reversed(letters))
+
+
+def ensure_cell(row: ET.Element, col: int) -> ET.Element:
+    """Fetch the cell at a column of a row, creating it in order if absent.
+
+    Args:
+        row: A <row> element; its "r" attribute supplies the row number.
+        col: 1-based column number.
+
+    Returns:
+        The existing cell for that column, or a newly inserted empty one.
+    """
+    # Reference for a new cell, e.g. column 14 in row 2 gives "N2".
+    ref = f"{col_name(col)}{row.attrib['r']}"
+    cells = row.findall("m:c", NS)
+
+    def column_of(cell: ET.Element) -> int | None:
+        # Column number from the cell's own reference; None if it has none.
+        return split_ref(cell.attrib.get("r", ""))[0]
+
+    # First pass: hand back a cell that already occupies the column.
+    found = next((cell for cell in cells if column_of(cell) == col), None)
+    if found is not None:
+        return found
+
+    new_cell = ET.Element(f"{{{M_NS}}}c", {"r": ref})
+
+    # Second pass: the first cell further right marks the insertion point,
+    # which keeps the row's cells in ascending column order.
+    insert_at = next(
+        (pos for pos, cell in enumerate(cells) if (number := column_of(cell)) and number > col),
+        None,
+    )
+    if insert_at is None:
+        # Nothing lies to the right, so the new cell goes last.
+        row.append(new_cell)
+    else:
+        row.insert(insert_at, new_cell)
+    return new_cell
+
+
+def set_text(cell: ET.Element, value: str) -> None:
+    """Overwrite a cell with inline text, or blank it for an empty value.
+
+    Args:
+        cell: The <c> element to rewrite in place.
+        value: Text to store; a falsy value leaves the cell empty.
+    """
+    # Drop every existing child (<v>, <is>, formula) and the old cell type.
+    del cell[:]
+    cell.attrib.pop("t", None)
+
+    if value:
+        # Inline strings avoid having to edit sharedStrings.xml.
+        cell.set("t", "inlineStr")
+        inline = ET.SubElement(cell, f"{{{M_NS}}}is")
+        text_node = ET.SubElement(inline, f"{{{M_NS}}}t")
+        text_node.text = value