Skip to content

Commit c3dccaa

Browse files
Merge pull request #8 from hassan1brahim/chore/workbook-refresh-action
Add GitHub Actions workbook status refresh
2 parents e83d9d7 + 20579cc commit c3dccaa

11 files changed

Lines changed: 643 additions & 0 deletions
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Refreshes the computed GitHub columns of the project workbook and commits
# the updated workbook back to the repository.
name: Refresh project workbook

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  # Re-run whenever the workbook sources, the pipeline code, or this workflow
  # file itself change.
  push:
    paths:
      - "sources/**"
      - "pipeline/**"
      - ".github/workflows/refresh-project-workbook.yml"
  # Weekly refresh: minute 0, hour 13, every Monday.
  schedule:
    - cron: "0 13 * * 1"

# The last step pushes a commit, so the job token needs write access to
# repository contents.
permissions:
  contents: write

# Keep a single refresh in flight; a newer run cancels one still in progress.
concurrency:
  group: refresh-project-workbook
  cancel-in-progress: true

jobs:
  refresh:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      # pipeline/fetch.py reads GITHUB_TOKEN from the environment and exits
      # with an error when it is missing.
      - name: Refresh workbook computed columns
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: python3 pipeline/fetch.py

      # Commit and push only when the workbook actually changed.
      - name: Commit workbook changes
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add sources/Public\ Invention\ Projects.xlsx
          if git diff --staged --quiet; then
            echo "No workbook changes."
          else
            git commit -m "chore: refresh project workbook"
            git push
          fi
-120 KB
Binary file not shown.

pipeline/activity.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Activity status policy for project links."""
2+
from datetime import datetime, timezone
3+
4+
# A project is considered active if its last commit is within about 6 months.
ACTIVE_DAYS = 182

# A project is considered stale from 6-12 months, then dormant after 12 months.
STALE_DAYS = 365


def classify_activity(timestamp: str, now: "datetime | None" = None) -> str:
    """Convert a GitHub timestamp into Active/Stale/Dormant/Unknown.

    Args:
        timestamp: UTC ISO timestamp as returned by the GitHub API, for
            example ``2026-05-27T00:39:03Z``. Empty or unparseable values
            yield ``"Unknown"``.
        now: Reference time the commit age is measured against. Defaults to
            the current UTC time. A naive datetime is interpreted as UTC.
            Passing it explicitly makes the classification reproducible.

    Returns:
        ``"Active"`` when the commit is at most ``ACTIVE_DAYS`` old,
        ``"Stale"`` when at most ``STALE_DAYS`` old, ``"Dormant"`` when older,
        and ``"Unknown"`` when the timestamp is missing or malformed.
    """
    # No timestamp means the script could not determine activity.
    if not timestamp:
        return "Unknown"

    try:
        last_commit = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    except (TypeError, ValueError):
        # TypeError covers a truthy non-string value (e.g. a number read from
        # a workbook cell); ValueError covers a string in another format.
        return "Unknown"

    # Compare in UTC so the local timezone of the runner does not matter.
    reference = datetime.now(timezone.utc) if now is None else now
    if reference.tzinfo is None:
        reference = reference.replace(tzinfo=timezone.utc)
    age_days = (reference - last_commit).days

    # Fresh projects are actively maintained.
    if age_days <= ACTIVE_DAYS:
        return "Active"

    # Older-but-not-abandoned projects are stale.
    if age_days <= STALE_DAYS:
        return "Stale"

    # Anything older than one year is dormant.
    return "Dormant"
36+
37+
38+
def format_commit_month(timestamp: str) -> str:
    """Render a GitHub timestamp as a "Month Year" label for the workbook.

    Returns an empty string when the timestamp is empty or does not match the
    ``YYYY-MM-DDTHH:MM:SSZ`` layout.
    """
    if not timestamp:
        return ""

    try:
        parsed = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        label = ""
    else:
        # Full month name followed by the four-digit year, e.g. "May 2026".
        label = parsed.strftime("%B %Y")
    return label

pipeline/fetch.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/usr/bin/env python3
2+
"""Refresh computed GitHub columns in the project workbook.
3+
4+
This script is designed to run in GitHub Actions. It only updates the Excel
5+
workbook; it does not parse skills, generate JSON, run the demo page, or
6+
perform project matching.
7+
"""
8+
from github_client import GitHubClient
9+
from project_links import enrich_project_row
10+
from workbook_reader import DEFAULT_SHEETS, DEFAULT_XLSX, read_project_rows
11+
from workbook_writer import write_computed_columns
12+
13+
# Columns this script owns in the workbook. Passed to write_computed_columns()
# so that only these headers are written back.
COMPUTED_COLUMNS = {
    # Excel header -> field name in the enriched row dictionary.
    "Status": "activity_status",
    "Last Commit": "last_commit",
    "Description Source": "description_source",
}
19+
20+
21+
def main() -> None:
    """Refresh the computed GitHub columns of the default project workbook.

    Reads the project rows, enriches each one through ``enrich_project_row``,
    prints one log line per row, and writes the computed columns back.

    Raises:
        SystemExit: when at least one GitHub-linked row exists and every such
            row resolved to "Unknown"; the workbook is left untouched.
    """
    # The Action always refreshes the repo workbook's standard project tabs.
    workbook, sheets = DEFAULT_XLSX, DEFAULT_SHEETS

    # Rows are loaded before the client is built, i.e. before any network use.
    records = read_project_rows(workbook, sheets=sheets)
    total = len(records)
    print(f"Read {total} project rows from {workbook}.")

    # The client picks up the Action-provided token from the environment.
    github = GitHubClient()

    # One cache is shared by all rows so repeated links to the same repository
    # or idea file do not hit the same GitHub endpoints again.
    cache = {}
    enriched = []
    for position, record in enumerate(records, 1):
        row = enrich_project_row(record, github, cache)
        enriched.append(row)

        # Fixed-width columns keep the Actions log readable.
        log_line = (
            f"[{position}/{total}] {row['project'][:58]:58} "
            f"{row['activity_status']:8} "
            f"{row['last_commit'] or '-':20} "
            f"{row['description_source']}"
        )
        print(log_line)

    # Safety guard: a refresh in which every GitHub row came back Unknown is
    # treated as failed, so existing workbook data is not overwritten.
    github_rows = [row for row in enriched if "github.com" in (row.get("link") or "")]
    if github_rows and all(row["activity_status"] == "Unknown" for row in github_rows):
        raise SystemExit(
            f"ERROR: {len(github_rows)} GitHub-linked rows all resolved to Unknown. "
            "Refusing to overwrite workbook data."
        )

    # Only the computed columns are written back into the workbook.
    write_computed_columns(workbook, enriched, COMPUTED_COLUMNS)
    print(f"Updated computed columns in {workbook}.")
63+
64+
65+
if __name__ == "__main__":
    # Run the refresh only when executed as a script (as the workflow does
    # with `python3 pipeline/fetch.py`), not when the module is imported.
    main()

pipeline/github_client.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""Small GitHub API client used by the workbook refresh."""
2+
import json
3+
import os
4+
from urllib.error import HTTPError, URLError
5+
from urllib.parse import quote
6+
from urllib.request import Request, urlopen
7+
8+
9+
class GitHubClient:
    """Small authenticated client for the GitHub REST API.

    It exposes the two lookups the workbook refresh needs: the latest commit
    timestamp of a repository (or of one file in it) and the repository's
    ``pushed_at`` timestamp. Lookup failures are reported as an empty string
    so one bad link cannot abort the whole refresh.
    """

    # Failures that mean "this lookup produced no usable answer":
    # network/HTTP errors, a body that is not valid JSON (ValueError), and a
    # JSON document with an unexpected shape (lookup/type/attribute errors).
    _LOOKUP_ERRORS = (HTTPError, URLError, OSError, ValueError, LookupError, TypeError, AttributeError)

    def __init__(self) -> None:
        """Build the request headers from the ``GITHUB_TOKEN`` environment variable.

        Raises:
            SystemExit: if ``GITHUB_TOKEN`` is unset or empty.
        """
        # This pipeline is intended to run in GitHub Actions, where GITHUB_TOKEN
        # is provided to the job.
        token = os.environ.get("GITHUB_TOKEN")
        if not token:
            raise SystemExit("ERROR: GITHUB_TOKEN is required. Run this workflow in GitHub Actions.")

        # Base headers; the User-Agent identifies this script to GitHub.
        self.headers = {"User-Agent": "project-chatbot-workbook-refresh/1.0"}

        # API requests additionally ask for GitHub's JSON format and carry the
        # workflow token.
        self.api_headers = {
            **self.headers,
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {token}",
        }

    def latest_commit(self, owner: str, repo: str, path: str = "", ref: str = "") -> str:
        """Return the latest commit timestamp for a repo or one file path.

        Args:
            owner: Repository owner (user or organisation).
            repo: Repository name.
            path: Optional file path; restricts the lookup to commits touching it.
            ref: Optional branch, tag, or SHA to start listing commits from.

        Returns:
            The committer (or, failing that, author) date of the newest
            matching commit, or ``""`` when it cannot be determined. Without
            ``path`` the repository's ``pushed_at`` is used as a fallback.
        """
        # per_page=1 is enough: only the first listed commit is used.
        params = ["per_page=1"]

        # Blob links include a branch/ref and a file path. Both matter for
        # repositories whose default branch differs from the blob link branch.
        if ref:
            params.append(f"sha={quote(ref)}")
        if path:
            params.append(f"path={quote(path)}")

        # owner/repo come from workbook links, so escape them fully to keep
        # stray characters from changing the request path or query.
        url = (
            f"https://api.github.com/repos/{quote(owner, safe='')}/{quote(repo, safe='')}"
            f"/commits?{'&'.join(params)}"
        )

        try:
            # commits is a list; the first item is the newest matching commit.
            commits = self.get_json(url)
            if commits:
                commit = commits[0].get("commit", {})
                # `or ""` keeps the declared str return type when date is null.
                return (commit.get("committer") or commit.get("author") or {}).get("date", "") or ""
        except self._LOOKUP_ERRORS:
            pass

        # If a file-specific lookup fails, do not substitute repo activity.
        # That would make old project idea files look fresh.
        if path:
            return ""
        return self.repo_pushed_at(owner, repo)

    def repo_pushed_at(self, owner: str, repo: str) -> str:
        """Return repository ``pushed_at`` as a fallback activity timestamp.

        Returns ``""`` when the request fails, the body is not a JSON object,
        or ``pushed_at`` is missing or null.
        """
        url = f"https://api.github.com/repos/{quote(owner, safe='')}/{quote(repo, safe='')}"
        try:
            return self.get_json(url).get("pushed_at", "") or ""
        except self._LOOKUP_ERRORS:
            return ""

    def get_json(self, url: str):
        """GET a GitHub API URL and decode the JSON body.

        Raises:
            urllib.error.HTTPError / URLError / OSError: on transport failures.
            ValueError: when the response body is not valid JSON.
        """
        with urlopen(Request(url, headers=self.api_headers), timeout=12) as response:
            return json.loads(response.read().decode("utf-8", errors="ignore"))

pipeline/ooxml.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""Minimal OOXML helpers for .xlsx files.
2+
3+
The pipeline avoids third-party dependencies, so it edits the workbook as XML
4+
inside the .xlsx zip. Higher-level workbook code should own row/column meaning.
5+
"""
6+
import zipfile
7+
import xml.etree.ElementTree as ET
8+
9+
# SpreadsheetML namespace used by worksheet XML files.
M_NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"

# Package relationship namespace used to map workbook sheets to XML files.
REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships"

# Office relationship namespace used for sheet relationship ids.
OFFICE_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"

# Prefix map used by ElementTree XPath calls.
NS = {"m": M_NS}

# Preserve the default SpreadsheetML namespace when writing XML back.
ET.register_namespace("", M_NS)


def sheet_paths(z: zipfile.ZipFile) -> dict[str, str]:
    """Return worksheet display name -> XML path inside the workbook zip.

    Args:
        z: An open ``.xlsx`` archive.

    Returns:
        A dict mapping each visible tab name to its zip member name, for
        example ``{"Projects": "xl/worksheets/sheet1.xml"}``.

    Raises:
        KeyError: if a required workbook part is missing from the archive, or
            a sheet references a relationship id that is not defined.
    """
    # workbook.xml lists sheets by name and relationship id.
    workbook = ET.fromstring(z.read("xl/workbook.xml"))

    # workbook.xml.rels maps relationship ids to actual worksheet XML paths.
    rels = ET.fromstring(z.read("xl/_rels/workbook.xml.rels"))
    rel_targets = {
        rel.attrib["Id"]: rel.attrib["Target"]
        for rel in rels.findall(f"{{{REL_NS}}}Relationship")
    }

    # Build a direct lookup so callers can read a sheet by visible tab name.
    paths = {}
    for sheet in workbook.findall("m:sheets/m:sheet", NS):
        target = rel_targets[sheet.attrib[f"{{{OFFICE_REL}}}id"]]

        if target.startswith("/"):
            # Absolute package path ("/xl/worksheets/sheet1.xml"), as written
            # by some producers; zip member names carry no leading slash.
            member = target.lstrip("/")
        elif target.startswith("xl/"):
            # Already spelled from the package root; keep as-is.
            member = target
        else:
            # Relative to the xl/ folder that holds workbook.xml.
            member = "xl/" + target
        paths[sheet.attrib["name"]] = member
    return paths
45+
46+
47+
def shared_strings(z: zipfile.ZipFile) -> list[str]:
    """Read Excel's shared string table.

    Args:
        z: An open ``.xlsx`` archive.

    Returns:
        The displayed text of every ``<si>`` entry, in table order, so that a
        cell's shared-string index can be used directly as a list index.
        Empty when the workbook has no ``xl/sharedStrings.xml``.
    """
    # Workbooks with only inline strings may not have sharedStrings.xml.
    if "xl/sharedStrings.xml" not in z.namelist():
        return []

    root = ET.fromstring(z.read("xl/sharedStrings.xml"))
    strings = []
    for item in root.findall("m:si", NS):
        # An entry is either one plain <t> or a series of rich-text runs
        # <r><t>..</t></r>. Phonetic runs (<rPh><t>) are reading hints, not
        # displayed text, so they are deliberately not collected.
        pieces = item.findall("m:t", NS) + item.findall("m:r/m:t", NS)
        strings.append("".join(piece.text or "" for piece in pieces))
    return strings
59+
60+
61+
def cell_text(cell: ET.Element, strings: list[str]) -> str:
    """Return a cell's displayed text value.

    Args:
        cell: A worksheet ``<c>`` element.
        strings: The shared string table, indexed by shared-string id.

    Returns:
        The shared string for ``t="s"`` cells, the inline text for
        ``t="inlineStr"`` cells, otherwise the raw ``<v>`` content. Cells
        without a value yield ``""``.

    Raises:
        IndexError, ValueError: if a shared-string cell holds an index that is
            out of range or not an integer.
    """
    cell_type = cell.attrib.get("t")

    # Normal cells store their value under <v>.
    value = cell.find("m:v", NS)
    raw = value.text if value is not None else None

    # t="s" means <v> is an index into sharedStrings.xml.
    if cell_type == "s" and raw is not None:
        return strings[int(raw)]

    # t="inlineStr" means the text lives directly inside the cell.
    if cell_type == "inlineStr":
        inline = cell.find("m:is", NS)
        if inline is None:
            return ""
        # Plain <t> plus rich-text runs <r><t>; phonetic runs (<rPh><t>) are
        # reading hints rather than displayed text and are left out.
        pieces = inline.findall("m:t", NS) + inline.findall("m:r/m:t", NS)
        return "".join(piece.text or "" for piece in pieces)

    # Numeric/plain cells can be returned directly from <v>.
    return raw if raw is not None else ""
77+
78+
79+
def split_ref(ref: str) -> tuple[int | None, int | None]:
80+
"""Convert an Excel cell reference like C12 into (3, 12)."""
81+
# Separate column letters from row digits.
82+
letters = "".join(ch for ch in ref if ch.isalpha())
83+
digits = "".join(ch for ch in ref if ch.isdigit())
84+
if not letters or not digits:
85+
return None, None
86+
87+
# Convert base-26 letters into a 1-based column number.
88+
col = 0
89+
for ch in letters:
90+
col = col * 26 + ord(ch.upper()) - 64
91+
return col, int(digits)
92+
93+
94+
def col_name(idx: int) -> str:
    """Convert a 1-based column number into Excel letters.

    Args:
        idx: Column number, where 1 is ``A``, 26 is ``Z`` and 27 is ``AA``.

    Returns:
        The column letters, or ``""`` when ``idx`` is zero or negative.
    """
    letters = ""

    # Excel columns are base-26 but without a zero digit, hence the idx - 1.
    # The `> 0` test matters: with a plain truthiness check a negative idx
    # never reaches zero (divmod(-2, 26) gives -1 again) and loops forever.
    while idx > 0:
        idx, rem = divmod(idx - 1, 26)
        letters = chr(65 + rem) + letters
    return letters
103+
104+
105+
def ensure_cell(row: ET.Element, col: int) -> ET.Element:
    """Return the cell at column ``col`` of ``row``, creating it if absent.

    A newly created cell is blank and is inserted before the first existing
    cell that sits in a higher column, or appended when there is none.
    """
    # Reference for a new cell, e.g. column 14 in row 2 is N2.
    ref = f"{col_name(col)}{row.attrib['r']}"

    # Single pass: look for the cell itself while remembering the first cell
    # that lies to the right of the wanted column.
    insert_at = None
    for index, candidate in enumerate(row.findall("m:c", NS)):
        candidate_col, _ = split_ref(candidate.attrib.get("r", ""))
        if candidate_col == col:
            return candidate
        if insert_at is None and candidate_col and candidate_col > col:
            insert_at = index

    created = ET.Element(f"{{{M_NS}}}c", {"r": ref})
    if insert_at is None:
        # Nothing to the right: this becomes the rightmost cell in the row.
        row.append(created)
    else:
        # Keep cells in ascending column order, as Excel expects.
        # NOTE(review): the index counts <c> elements only; this assumes no
        # other child element precedes them in the row -- confirm.
        row.insert(insert_at, created)
    return created
130+
131+
132+
def set_text(cell: ET.Element, value: str) -> None:
    """Replace a cell's contents with inline text.

    Args:
        cell: The worksheet ``<c>`` element to overwrite, modified in place.
        value: The text to store. An empty value leaves the cell blank.
    """
    # Remove any old <v>, <is>, or formula children.
    for child in list(cell):
        cell.remove(child)

    # Remove the previous cell type before setting the new representation.
    cell.attrib.pop("t", None)

    # Empty string means leave the cell blank.
    if not value:
        return

    # Inline strings keep this writer simple and avoid editing sharedStrings.xml.
    cell.attrib["t"] = "inlineStr"
    inline = ET.SubElement(cell, f"{{{M_NS}}}is")
    text = ET.SubElement(inline, f"{{{M_NS}}}t")
    text.text = value

    # Without xml:space="preserve", leading/trailing whitespace in <t> may be
    # dropped when the workbook is opened.
    if value != value.strip():
        text.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")

0 commit comments

Comments
 (0)