name: Update CIA CSV Data
# Refreshes every CSV already tracked under data/cia/** and cia-data/** from the
# upstream Hack23/cia sample-data tree:
# https://github.com/Hack23/cia/tree/master/service.data.impl/sample-data
#
# Upstream layout is NOT flat — root-level files coexist with sub-folders
# (distinct_values/, framework-validation/, risk-rule-tests/). The workflow
# therefore builds a basename→upstream-path index via the GitHub Tree API
# (recursive) before downloading, so every tracked local CSV is resolved to
# its correct upstream path regardless of sub-folder.
#
# The workflow is intentionally update-only: it never pulls in CSV files that
# are not already tracked in this repository.
#
# After the CSV refresh it runs the production-stats pipeline
# (scripts/load-cia-stats.ts + scripts/update-stats-from-cia.ts) so the
# homepage statistics (index*.html) stay in sync with
# cia-data/production-stats.json, which is derived from extraction_summary_report.csv.
#
# When either stage produces changes, a single pull request is opened.
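#
# For illustration, the index maps each CSV basename to its upstream-relative
# path; the file names below are hypothetical, but the sub-folders are the
# real upstream ones:
#   extraction_summary_report.csv  ->  extraction_summary_report.csv
#   some_view.csv                  ->  distinct_values/some_view.csv
#   some_check.csv                 ->  framework-validation/some_check.csv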
on:
schedule:
# Daily at 03:30 UTC (~04:30 CET / 05:30 CEST), safely after the upstream
# CIA extraction job, which completes around 02:57 UTC. The 30-minute margin
# absorbs upstream runtime variance and ensures we refresh from the latest
# extraction rather than the previous day's snapshot.
- cron: '30 3 * * *'
workflow_dispatch:
inputs:
ref:
description: 'Upstream Hack23/cia git ref (branch, tag, or SHA) to download from'
required: false
type: string
default: 'master'
permissions:
contents: read
jobs:
refresh:
name: Refresh CIA CSV data and production stats
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
env:
UPSTREAM_REF: ${{ github.event.inputs.ref || 'master' }}
UPSTREAM_REPO: Hack23/cia
UPSTREAM_PATH: service.data.impl/sample-data
steps:
- name: Harden Runner
uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
with:
egress-policy: audit
allowed-endpoints: >
api.github.com:443
github.com:443
raw.githubusercontent.com:443
objects.githubusercontent.com:443
codeload.github.com:443
registry.npmjs.org:443
nodejs.org:443
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.GITHUB_TOKEN }}
fetch-depth: 1
- name: Setup Node.js
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
with:
node-version: '26'
cache: 'npm'
cache-dependency-path: |
package-lock.json
.github/workflows/update-cia-csv-data.yml
- name: Install dependencies
run: npm ci
- name: Resolve upstream ref → SHA
id: resolve
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
set -euo pipefail
# URL-encode the ref so branch names that legitimately contain '/'
# (e.g. "feature/foo") survive interpolation into the GitHub API
# path. jq is used as a dependency-free URL-encoder.
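# Illustrative: printf '%s' 'feature/foo' | jq -rR @uri  # -> feature%2Ffoo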
encoded_ref="$(printf '%s' "$UPSTREAM_REF" | jq -rR @uri)"
echo "🔎 Resolving ${UPSTREAM_REPO}@${UPSTREAM_REF} (encoded: ${encoded_ref})"
# Ask the commits endpoint for the fully-resolved SHA. This accepts
# branches, tags, and raw SHAs uniformly and returns the tip commit
# SHA. All downstream URLs (tree API + raw downloads) use that SHA,
# so there's no risk of a mid-run upstream update producing an
# inconsistent CSV / stats snapshot, and no further URL-encoding
# is required because a SHA is always [0-9a-f].
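# (The response is a full commit object; only its top-level "sha" field is
# read here, e.g. {"sha": "0a1b2c...", "commit": {...}}; shape illustrative.)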
sha="$(curl --silent --show-error --fail --location --retry 3 --max-time 30 \
-H "Accept: application/vnd.github.v3+json" \
-H "Authorization: Bearer ${GH_TOKEN}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"https://api.github.com/repos/${UPSTREAM_REPO}/commits/${encoded_ref}" \
| jq -r '.sha')"
if [ -z "$sha" ] || [ "$sha" = "null" ]; then
echo "::error::Failed to resolve ${UPSTREAM_REPO}@${UPSTREAM_REF} to a commit SHA"
exit 1
fi
echo "✅ Resolved SHA: ${sha}"
echo "sha=${sha}" >> "$GITHUB_OUTPUT"
- name: Build upstream basename → path index
id: index
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }}
run: |
set -euo pipefail
echo "📡 Fetching recursive tree for ${UPSTREAM_REPO}@${UPSTREAM_SHA}"
tree_json="$(mktemp)"
curl --silent --show-error --fail --location --retry 3 --max-time 60 \
-H "Accept: application/vnd.github.v3+json" \
-H "Authorization: Bearer ${GH_TOKEN}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"https://api.github.com/repos/${UPSTREAM_REPO}/git/trees/${UPSTREAM_SHA}?recursive=1" \
-o "$tree_json"
# The GitHub "get a tree" endpoint returns truncated=true when the
# tree exceeds ~100k entries or 7MB. The upstream Hack23/cia tree is
# currently well under those limits (<10k entries), so a single
# recursive call is sufficient. If the repo ever grows past the
# threshold, this step fails fast rather than silently producing an
# incomplete index — at which point the workflow should be updated
# to fall back to per-subdirectory tree fetches.
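# A hedged sketch of that fallback (not active code): walk the tree
# non-recursively from the root to obtain the SHA of each sample-data
# sub-tree, then issue one recursive call per sub-tree and concatenate:
#   curl ".../repos/${UPSTREAM_REPO}/git/trees/<subtree_sha>?recursive=1"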
truncated="$(jq -r '.truncated' "$tree_json")"
if [ "$truncated" = "true" ]; then
echo "::error::GitHub tree response is truncated; upstream repo has grown past the single-call limit. Update the workflow to paginate sub-trees."
exit 1
fi
# index.tsv: <basename>\t<upstream-relative-path> (duplicate basenames rejected)
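# Illustrative row (tab-separated; the file name is hypothetical):
#   some_view.csv<TAB>distinct_values/some_view.csv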
index_file="${RUNNER_TEMP:-/tmp}/cia_sample_data_index.tsv"
jq -r --arg prefix "${UPSTREAM_PATH}/" '
.tree[]
| select(.type == "blob")
| select(.path | startswith($prefix))
| select(.path | endswith(".csv"))
| .path | ltrimstr($prefix)
' "$tree_json" | awk -F/ '{ print $NF "\t" $0 }' | sort > "$index_file"
total_csvs="$(wc -l < "$index_file")"
dup_count="$(cut -f1 "$index_file" | sort | uniq -d | wc -l)"
if [ "$dup_count" -gt 0 ]; then
echo "::error::Upstream contains ${dup_count} duplicate CSV basenames; basename-based lookup is ambiguous:"
cut -f1 "$index_file" | sort | uniq -d
exit 1
fi
echo "📚 Upstream index: ${total_csvs} CSVs (no duplicate basenames)"
echo "index_file=$index_file" >> "$GITHUB_OUTPUT"
echo "total_upstream=$total_csvs" >> "$GITHUB_OUTPUT"
rm -f "$tree_json"
- name: Refresh tracked CSV files from upstream
id: refresh
env:
INDEX_FILE: ${{ steps.index.outputs.index_file }}
UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }}
run: |
set -euo pipefail
# Use the resolved SHA (not the raw ref) so downloads are pinned to
# the exact same commit the index was built from, and no URL-encoding
# is needed for the raw.githubusercontent.com path.
BASE_URL="https://raw.githubusercontent.com/${UPSTREAM_REPO}/${UPSTREAM_SHA}/${UPSTREAM_PATH}"
echo "📥 Source base URL: ${BASE_URL}"
echo "📚 Using index: ${INDEX_FILE}"
# Only touch CSVs that are ALREADY tracked in the repository under
# data/cia/** or cia-data/**. We never introduce new files.
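# (2>/dev/null tolerates either root directory being absent; sort -u keeps
# the list stable and deduplicated.)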
mapfile -t LOCAL_CSVS < <(
find data/cia cia-data -type f -name '*.csv' 2>/dev/null | sort -u
)
total=${#LOCAL_CSVS[@]}
updated=0
unchanged=0
missing=0
skipped=0
failed=0
# Locally-curated files with no upstream equivalent; these must not be
# overwritten by the upstream sample-data tree.
declare -a SKIP_PATTERNS=(
# Local hand-curated mini-samples used by dashboard demos; their
# schemas (name,ministry,influence / impact / riskScore / quarter)
# do not exist in upstream service.data.impl/sample-data.
'data/cia/ministry/sample_influence.csv'
'data/cia/ministry/sample_decision_impact.csv'
'data/cia/ministry/sample_risk_levels.csv'
'data/cia/ministry/sample_productivity.csv'
# Locally-generated forecast products (election scenarios) that
# are not published in the upstream CIA sample-data tree.
'cia-data/election/election_forecast.csv'
'cia-data/election/coalition_scenarios.csv'
)
is_skipped() {
local path="$1"
for pat in "${SKIP_PATTERNS[@]}"; do
if [ "$path" = "$pat" ]; then
return 0
fi
done
return 1
}
# Resolve a local basename to its upstream-relative path. Falls back
# to the "<stem>_sample.csv" alias, which the repo uses for a few
# locally-renamed canonical files (e.g. view_riksdagen_committee_decisions.csv).
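# Illustrative: a local view_riksdagen_committee_decisions.csv with no direct
# basename hit resolves via view_riksdagen_committee_decisions_sample.csv.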
resolve_upstream_path() {
local base="$1"
local hit
hit="$(awk -v b="$base" -F'\t' '$1==b {print $2; exit}' "$INDEX_FILE")"
if [ -n "$hit" ]; then
printf '%s' "$hit"
return 0
fi
local alias="${base%.csv}_sample.csv"
hit="$(awk -v b="$alias" -F'\t' '$1==b {print $2; exit}' "$INDEX_FILE")"
if [ -n "$hit" ]; then
printf '%s' "$hit"
return 0
fi
return 1
}
tmpdir="$(mktemp -d)"
trap 'rm -rf "$tmpdir"' EXIT
: > "$tmpdir/updated.txt"
: > "$tmpdir/missing.txt"
: > "$tmpdir/failed.txt"
for local_path in "${LOCAL_CSVS[@]}"; do
base="$(basename "$local_path")"
if is_skipped "$local_path"; then
echo " ⏭ skip $local_path (local-only sample, no upstream)"
skipped=$((skipped + 1))
continue
fi
if ! upstream_rel="$(resolve_upstream_path "$base")"; then
echo " ❔ miss $local_path (no upstream basename match)"
missing=$((missing + 1))
echo "$local_path" >> "$tmpdir/missing.txt"
continue
fi
url="${BASE_URL}/${upstream_rel}"
tmpfile="$tmpdir/$base"
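# The '|| echo 000' guard keeps set -e from aborting on transport-level
# curl failures (DNS, timeout); such runs fall through to the failure arm.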
http_code="$(curl --silent --show-error --location --retry 3 \
--max-time 60 --output "$tmpfile" --write-out '%{http_code}' \
"$url" || echo '000')"
case "$http_code" in
200)
if [ ! -s "$tmpfile" ]; then
echo " ⚠️ empty $local_path ($url)"
failed=$((failed + 1))
echo "$local_path" >> "$tmpdir/failed.txt"
continue
fi
if cmp --silent "$tmpfile" "$local_path"; then
unchanged=$((unchanged + 1))
else
mkdir -p "$(dirname "$local_path")"
mv "$tmpfile" "$local_path"
updated=$((updated + 1))
echo "$local_path <- ${upstream_rel}" >> "$tmpdir/updated.txt"
echo " ✅ update $local_path <- ${upstream_rel}"
fi
;;
404)
echo " ❔ miss $local_path (404 at ${upstream_rel})"
missing=$((missing + 1))
echo "$local_path" >> "$tmpdir/missing.txt"
;;
*)
echo " ❌ http=$http_code $local_path ($url)"
failed=$((failed + 1))
echo "$local_path" >> "$tmpdir/failed.txt"
;;
esac
done
{
echo "total=$total"
echo "updated=$updated"
echo "unchanged=$unchanged"
echo "missing=$missing"
echo "skipped=$skipped"
echo "failed=$failed"
} >> "$GITHUB_OUTPUT"
{
echo "## 📊 CIA CSV Refresh Summary"
echo ""
echo "| Metric | Count |"
echo "| --- | ---: |"
echo "| Total tracked CSVs | $total |"
echo "| ✅ Updated | $updated |"
echo "| 🟰 Unchanged | $unchanged |"
echo "| ❔ Missing upstream | $missing |"
echo "| ⏭ Skipped (local-only) | $skipped |"
echo "| ❌ Failed downloads | $failed |"
echo ""
echo "- Upstream ref: \`${UPSTREAM_REF}\`"
echo "- Source base: ${BASE_URL}"
echo "- Upstream index size: ${{ steps.index.outputs.total_upstream }} CSVs"
if [ "$updated" -gt 0 ]; then
echo ""
echo "### Updated files"
echo ""
sed 's/^/- /' "$tmpdir/updated.txt"
fi
if [ "$missing" -gt 0 ]; then
echo ""
echo "### Missing upstream"
echo ""
sed 's/^/- /' "$tmpdir/missing.txt"
fi
if [ "$failed" -gt 0 ]; then
echo ""
echo "### Failed downloads"
echo ""
sed 's/^/- /' "$tmpdir/failed.txt"
fi
} >> "$GITHUB_STEP_SUMMARY"
if [ "$failed" -gt 0 ]; then
echo "::error::$failed CSV downloads failed; see job summary"
exit 1
fi
- name: Refresh CIA production statistics
id: stats
env:
UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }}
# Pin the statistics fetch to the exact same commit SHA the CSV
# refresh just used. `scripts/load-cia-stats.ts` respects the
# CIA_EXTRACTION_SUMMARY_URL env var as an override of its default
# master-branch URL, so the injected counts stay consistent with the
# CSV snapshot when the workflow is dispatched with a non-master ref.
CIA_EXTRACTION_SUMMARY_URL: https://raw.githubusercontent.com/${{ env.UPSTREAM_REPO }}/${{ steps.resolve.outputs.sha }}/${{ env.UPSTREAM_PATH }}/extraction_summary_report.csv
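# Illustrative local reproduction (assumes the override behaviour noted
# above):
#   CIA_EXTRACTION_SUMMARY_URL=".../<sha>/.../extraction_summary_report.csv" \
#     npx --no-install tsx scripts/load-cia-stats.ts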
run: |
set -euo pipefail
echo "📊 Fetching CIA production statistics from ${CIA_EXTRACTION_SUMMARY_URL}"
npx --no-install tsx scripts/load-cia-stats.ts
if [ ! -f "cia-data/production-stats.json" ]; then
echo "::error::cia-data/production-stats.json was not produced"
exit 1
fi
total_persons="$(jq -r '.counts.total_persons' cia-data/production-stats.json)"
total_votes="$(jq -r '.counts.total_votes' cia-data/production-stats.json)"
last_updated="$(jq -r '.metadata.last_updated' cia-data/production-stats.json)"
{
echo "total_persons=$total_persons"
echo "total_votes=$total_votes"
echo "last_updated=$last_updated"
} >> "$GITHUB_OUTPUT"
echo "🖊 Injecting statistics into index*.html (14 language variants)"
npx --no-install tsx scripts/update-stats-from-cia.ts
{
echo ""
echo "## 📈 Production Statistics Refresh"
echo ""
echo "| Metric | Value |"
echo "| --- | ---: |"
echo "| Total Persons | ${total_persons} |"
echo "| Total Votes | ${total_votes} |"
echo "| Last Extraction | ${last_updated} |"
echo "| Pinned commit | \`${UPSTREAM_SHA}\` |"
} >> "$GITHUB_STEP_SUMMARY"
- name: Detect git changes
id: diff
run: |
set -e
# Paths touched by this workflow: CSV trees + stats JSON + injected HTML
paths=(data/cia cia-data/production-stats.json cia-data/*.csv cia-data/*/*.csv index.html index_*.html)
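# Globs that match nothing stay literal (nullglob is unset) and are handed
# to git as pathspec patterns, which git status tolerates when unmatched.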
if [ -n "$(git status --porcelain -- "${paths[@]}")" ]; then
echo "has_changes=true" >> "$GITHUB_OUTPUT"
echo "📝 Changes detected"
git status --porcelain -- "${paths[@]}" | head -30
else
echo "has_changes=false" >> "$GITHUB_OUTPUT"
echo "✅ No changes detected"
fi
- name: Create pull request with refreshed data
if: steps.diff.outputs.has_changes == 'true'
uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1 # v8.1.1
with:
token: ${{ secrets.GITHUB_TOKEN }}
add-paths: |
data/cia/**
cia-data/**
index.html
index_*.html
commit-message: |
data(cia): refresh CSV sample data + production stats from ${{ env.UPSTREAM_REPO }}@${{ steps.resolve.outputs.sha }}
- Upstream ref: ${{ env.UPSTREAM_REF }} (resolved to ${{ steps.resolve.outputs.sha }})
- CSV files: updated ${{ steps.refresh.outputs.updated }} / ${{ steps.refresh.outputs.total }} tracked
- Production stats: ${{ steps.stats.outputs.total_persons }} persons, ${{ steps.stats.outputs.total_votes }} votes
- Last extraction: ${{ steps.stats.outputs.last_updated }}
branch: data/cia-csv-refresh
delete-branch: true
title: '📊 Refresh CIA data (${{ steps.refresh.outputs.updated }} CSVs + production stats)'
body: |
## 📊 Automated CIA data refresh
Pulled the latest `service.data.impl/sample-data/` CSVs from
[`${{ env.UPSTREAM_REPO }}@${{ env.UPSTREAM_REF }}`](https://github.com/${{ env.UPSTREAM_REPO }}/tree/${{ steps.resolve.outputs.sha }}/${{ env.UPSTREAM_PATH }})
(resolved to commit [`${{ steps.resolve.outputs.sha }}`](https://github.com/${{ env.UPSTREAM_REPO }}/commit/${{ steps.resolve.outputs.sha }}))
and re-ran the production-statistics pipeline against the same commit.
### CSV refresh
| Metric | Count |
| --- | ---: |
| Total tracked CSVs | ${{ steps.refresh.outputs.total }} |
| ✅ Updated | ${{ steps.refresh.outputs.updated }} |
| 🟰 Unchanged | ${{ steps.refresh.outputs.unchanged }} |
| ❔ Missing upstream | ${{ steps.refresh.outputs.missing }} |
| ⏭ Skipped (local-only) | ${{ steps.refresh.outputs.skipped }} |
| ❌ Failed downloads | ${{ steps.refresh.outputs.failed }} |
### Production statistics
- Total persons: **${{ steps.stats.outputs.total_persons }}**
- Total votes: **${{ steps.stats.outputs.total_votes }}**
- Last extraction: **${{ steps.stats.outputs.last_updated }}**
### How resolution works
The workflow never introduces new files — it only refreshes CSVs
already tracked under `data/cia/**` and `cia-data/**`. Upstream paths
are resolved through a `basename → upstream-path` index built from a
recursive GitHub Tree API call, so files living in
sub-folders (`distinct_values/`, `framework-validation/`,
`risk-rule-tests/`) are located correctly even though local paths
are organised differently. An `<stem>_sample.csv` alias is applied
for the handful of locally-renamed canonical files.
### Review checklist
- [ ] CSV diffs look reasonable (no schema regressions)
- [ ] `production-stats.json` counts increase monotonically, or any decrease is explainable
- [ ] Dashboards still render with the refreshed data
- [ ] No unexpected file additions or removals
> Generated by `.github/workflows/update-cia-csv-data.yml`.
labels: |
automated-pipeline
data-update
cia-intelligence
# ISMS Compliance
# - ISO 27001:2022 A.5.33 Protection of records — Git audit trail + PR review gate
# - ISO 27001:2022 A.8.3 Information lifecycle management — nightly refresh + change detection
# - ISO 27001:2022 A.8.10 Information deletion — auto-delete PR branch after merge
# - ISO 27001:2022 A.8.19 Security in use — HTTPS-only downloads, SHA-pinned actions
# - NIST CSF 2.0 PR.DS-5 Data integrity — byte-level cmp vs upstream
# - NIST CSF 2.0 DE.CM-1 Network monitoring — step-security/harden-runner egress audit
# - CIS Controls v8.1 3.1 Data inventory — explicit local-file discovery
# - CIS Controls v8.1 3.14 Data integrity validation — diff + PR review before merge
# - GDPR Article 6(1)(e) — public interest processing (democratic transparency)
# - Swedish Offentlighetsprincipen — public access to government information