-
Notifications
You must be signed in to change notification settings - Fork 2
127 lines (109 loc) · 5.5 KB
/
sites_data_pipeline.yml
File metadata and controls
127 lines (109 loc) · 5.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
name: Sites Data Pipeline (7-step)

# ── Trigger ───────────────────────────────────────────────────────────────────
# Started manually only (workflow_dispatch); each run takes exactly one
# required string input, the URL of the site to process.
on:
  workflow_dispatch:
    inputs:
      site_url:
        type: string
        required: true
        description: 'CKAN site URL to collect metadata for (e.g. https://data.gov.au)'
# ── Job ───────────────────────────────────────────────────────────────────────
# A single job that runs the numbered Python scripts in order. Judging by the
# file names used below, the seed is 0.csv and each step is expected to leave
# an N.csv behind, ending with 7.csv which is uploaded as the run artifact.
jobs:
  pipeline:
    name: Run Sites Data Pipeline
    runs-on: ubuntu-latest
    # 360 minutes matches GitHub's default job timeout; kept explicit.
    timeout-minutes: 360
    # NOTE(review): no step visible in this workflow commits or pushes to the
    # repository. Confirm whether the Python scripts need write access; if not,
    # this can be narrowed to `contents: read`.
    permissions:
      contents: write
    # Applies to `run:` steps only. Steps that use an action (`uses:`) resolve
    # paths from the repository root, which is why they spell out the
    # `sites-data-fetch/` prefix themselves.
    defaults:
      run:
        working-directory: sites-data-fetch
    steps:
      # ── Setup ──────────────────────────────────────────────────────────────
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: sites-data-fetch/requirements.txt

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # ── Build seed CSV from the single URL input ───────────────────────────
      # The user-supplied URL is handed to the shell through an environment
      # variable instead of being expanded into the script text. Expanding
      # `${{ inputs.site_url }}` inline would let quotes, `$(...)` or backticks
      # in the input run as shell code.
      - name: Create seed 0.csv
        env:
          SITE_URL: ${{ inputs.site_url }}
        run: |
          # printf writes the value verbatim even if it starts with "-n"/"-e".
          printf '%s\n' "url" > 0.csv
          printf '%s\n' "$SITE_URL" >> 0.csv
          echo "✅ Created 0.csv for: $SITE_URL"

      # ── Step 1: Name Processing ────────────────────────────────────────────
      - name: "Step 1 — Name Processing"
        run: |
          echo "▶ Step 1: Extracting instance name"
          python 1-nameProcess.py
          echo "✅ Step 1 complete"

      # ── Step 2: CKAN API Stats ─────────────────────────────────────────────
      - name: "Step 2 — CKAN API Stats"
        run: |
          echo "▶ Step 2: Fetching stats via CKAN Action API"
          python 2-CKANActionAPI.py
          echo "✅ Step 2 complete"

      # ── Step 3: Site Type Classification ──────────────────────────────────
      - name: "Step 3 — Site Type Classification"
        run: |
          echo "▶ Step 3: Classifying site type"
          python 3-siteType.py
          echo "✅ Step 3 complete"

      # ── Step 4: About Page Descriptions ───────────────────────────────────
      - name: "Step 4 — About Page Description"
        run: |
          echo "▶ Step 4: Scraping About page"
          python 4-description.py
          echo "✅ Step 4 complete"

      # ── Step 5: Location Analysis (LLM) ───────────────────────────────────
      # Fails fast with a readable message when the API key secret is missing,
      # before the script is started.
      - name: "Step 5 — Location Analysis (LLM)"
        env:
          OPEN_ROUTER_KEY: ${{ secrets.OPEN_ROUTER_KEY }}
        run: |
          if [ -z "$OPEN_ROUTER_KEY" ]; then
            echo "❌ OPEN_ROUTER_KEY secret is not set."
            exit 1
          fi
          echo "▶ Step 5: Inferring geographic location via LLM"
          python 5-locationAnalyser.py
          echo "✅ Step 5 complete"

      # ── Step 6: Geocoding ──────────────────────────────────────────────────
      - name: "Step 6 — Geocoding"
        run: |
          echo "▶ Step 6: Geocoding location to lat/lon"
          python 6-geocode.py
          echo "✅ Step 6 complete"

      # ── Step 7: Timestamp ──────────────────────────────────────────────────
      # The only script given explicit arguments: 6.csv and 7.csv
      # (presumably input and output, in that order — confirm in 7-tstamp.py).
      - name: "Step 7 — Add Timestamp"
        run: |
          echo "▶ Step 7: Stamping output with UTC date"
          python 7-tstamp.py 6.csv 7.csv
          echo "✅ Step 7 complete"

      # ── Upload final output as artifact ───────────────────────────────────
      # Runs even after a failed step. When 7.csv was never produced the
      # upload only warns (the action's default, stated explicitly here) so it
      # does not mask the original failure with a second one.
      - name: Upload final output (7.csv)
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: sites-metadata-${{ github.run_id }}
          path: sites-data-fetch/7.csv
          if-no-files-found: warn
          retention-days: 30

      # ── Summary ────────────────────────────────────────────────────────────
      # Always runs; reports the job status and which intermediate CSV files
      # exist. The site URL is again passed via the environment rather than
      # expanded into the script.
      - name: Workflow summary
        if: always()
        env:
          SITE_URL: ${{ inputs.site_url }}
        run: |
          echo "=== Pipeline Summary ==="
          echo "Status: ${{ job.status }}"
          echo "Site URL: $SITE_URL"
          echo "Timestamp: $(date -u)"
          echo ""
          echo "Output files:"
          for f in 1.csv 2.csv 3.csv 4.csv 5.csv 6.csv 7.csv; do
            if [ -f "$f" ]; then
              echo " $f — $(wc -l < "$f") lines"
            else
              echo " $f — not produced"
            fi
          done