diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 43a8b6e3f2..0f26dbc924 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1856,3 +1856,54 @@ graph TB

+--- + +## 🏛️ Statskontoret Integration — Current Architecture + +> **Effective:** 2026-04-25 · **Classification:** Public · **Runtime:** Node.js 25 / TypeScript CLI · **MCP status:** intentionally **not** an MCP server. + +Statskontoret is now the Swedish public-administration and central-government budget-execution context layer. It complements the existing provider split: IMF remains primary for macro/fiscal projections, SCB remains Swedish official-statistics ground truth, World Bank remains governance/environment/social residue, and Statskontoret supplies agency structure plus budget outturn detail that the other providers do not expose in the same operational form. + +### Architectural placement + +```mermaid +flowchart LR + Workflow[Agentic news workflow
Node 25] --> CLI[statskontoret-fetch.ts
list-sources · discover · headcount] + CLI --> Client[StatskontoretClient
statskontoret-client.ts] + Client --> Source[www.statskontoret.se
open data pages] + Source --> XLSX[Excel workbooks] + Source --> ZIP[CSV ZIP archives] + Client --> Parser[XLSX / CSV-ZIP parsers
typed StatskontoretError] + Parser --> Derived[Derived artifacts
headcount-by-department] + Derived --> Persist[analysis/data/statskontoret/
JSON + .meta.json sidecars] + Derived --> Articles[Article and dashboard context] +``` + +### Provider responsibility matrix + +| Need | Primary provider | Riksdagsmonitor surface | +|---|---|---| +| Agency count, department grouping, leadership form and government-body headcount | **Statskontoret Myndighetsförteckning** | `scripts/statskontoret-fetch.ts headcount`, `analysis/statskontoret/` | +| Annual central-government budget outturn | **Statskontoret Årsutfall** | Download discovery and persisted raw/derived artifacts | +| Monthly central-government budget execution | **Statskontoret Månadsutfall** | Download discovery for high-frequency budget monitoring | +| Macro/fiscal projections and cross-country methodology | **IMF WEO/FM/SDMX** | `scripts/imf-*` | +| Swedish regional/monthly official statistics | **SCB PxWeb** | `scb` MCP | +| Governance/environment/social residue | **World Bank** | `world-bank` MCP | + +### Code and quality surfaces + +| Surface | Responsibility | +|---|---| +| `scripts/statskontoret-client.ts` | Typed client, source catalogue, download discovery, HTML entity decoding, XLSX parsing, CSV ZIP parsing, numeric normalisation, department headcount aggregation. | +| `scripts/statskontoret-fetch.ts` | Import-safe CLI wrapper for workflows; exported argument parsing helpers for testability; exit code `2` for CLI contract errors. | +| `analysis/statskontoret/indicators-inventory.json` | Machine-readable dataset inventory and provider decision matrix. | +| `analysis/statskontoret/data-dictionary.md` | Field families, freshness discipline, persistence layout. | +| `tests/statskontoret-*.test.ts` | Inventory consistency, download-link extraction, workbook parsing, CSV ZIP parsing, CLI parsing and parser primitive coverage. | + +### Operational characteristics + +- **Trust boundary:** one outbound HTTPS boundary to `www.statskontoret.se`; no credentials, no private data, no write-back to the source. 
+- **Persistence:** optional `--persist` writes raw or derived payloads to `analysis/data/statskontoret/{dataset}/{artifact}.json` with `.meta.json` provenance sidecars. +- **Failure mode:** optional enrichment semantics; article generation can fall back to cached artifacts or omit Statskontoret context rather than blocking publication. +- **Security posture:** Public classification, high-integrity provenance, dependency surface limited to existing npm SBOM (`jszip`) and in-repository TypeScript code. + diff --git a/DATA_MODEL.md b/DATA_MODEL.md index 0dc09a2fd4..c4cdb5d134 100644 --- a/DATA_MODEL.md +++ b/DATA_MODEL.md @@ -2592,3 +2592,27 @@ This DATA_MODEL.md complements ARCHITECTURE.md: **⏰ Next Review:** 2027-02-15 **🎯 Framework Compliance:** [![ISO 27001](https://img.shields.io/badge/ISO_27001-2022_Aligned-blue?style=flat-square&logo=iso&logoColor=white)](https://github.com/Hack23/ISMS-PUBLIC/blob/main/CLASSIFICATION.md) [![NIST CSF 2.0](https://img.shields.io/badge/NIST_CSF-2.0_Aligned-green?style=flat-square&logo=nist&logoColor=white)](https://github.com/Hack23/ISMS-PUBLIC/blob/main/CLASSIFICATION.md) [![CIS Controls](https://img.shields.io/badge/CIS_Controls-v8.1_Aligned-orange?style=flat-square&logo=cisecurity&logoColor=white)](https://github.com/Hack23/ISMS-PUBLIC/blob/main/CLASSIFICATION.md) +--- + +## 🏛️ Statskontoret Data Model Extension + +Statskontoret adds a public Swedish-administration data domain under the economic/public-administration context layer. + +### Source entities + +| Entity | Key fields | Storage / source | +|---|---|---| +| `StatskontoretSourceDefinition` | `key`, `title`, `url`, `cadence`, `coverage`, `primaryUse` | Static catalogue in `scripts/statskontoret-client.ts`; mirrored by `analysis/statskontoret/indicators-inventory.json`. 
| +| `StatskontoretDownloadLink` | `source`, `sourcePage`, `url`, `resourceType`, `documentType`, `fileType`, `fileName`, `year`, `month`, `status`, `updatedAt` | Derived from Statskontoret HTML pages by `extractStatskontoretDownloadLinks`. | +| `StatskontoretWorkbook` / `StatskontoretSheet` | sheet name and row arrays | Parsed locally from XLSX ZIP parts. | +| `StatskontoretHeadcountRow` | `year`, `department`, `headcount`, `authorityCount` | Derived from Myndighetsförteckning rows. | + +### Persisted artifact contract + +```text +analysis/data/statskontoret/{dataset}/{artifact}.json +analysis/data/statskontoret/{dataset}/{artifact}.meta.json +``` + +Sidecar metadata includes `fetchedAt`, `mcpTool: statskontoret-ts-client`, `dataset`, and `artifact`. The provider decision matrix in `analysis/statskontoret/indicators-inventory.json` maps government-body headcount and central-government budget outturn claims to Statskontoret, while macro/fiscal projections remain IMF-first. + diff --git a/FLOWCHART.md b/FLOWCHART.md index 5bb51e4a46..9355a321d0 100644 --- a/FLOWCHART.md +++ b/FLOWCHART.md @@ -969,3 +969,29 @@ flowchart LR - 24 indicators across 10 IMF dataflows (WEO / FM / IFS / BOP / DOTS / GFS_COFOG / PCPS / ER / MFS_IR / MFS_PR) catalogued in [`analysis/imf/indicators-inventory.json`](analysis/imf/indicators-inventory.json) - Vintage discipline (>6 mo → annotation) enforced by `tests/imf-inventory.test.ts` (13 assertions) and `tests/economic-context-multi-provider.test.ts` (asserts IMF queried before WB) - Egress allow-list: `www.imf.org`, `sdmxcentral.imf.org` pinned in every workflow `network:` block + +--- + +## 🏛️ Statskontoret Data Flow (Current State) + +```mermaid +flowchart TD + Start[News / analysis workflow needs agency or budget-execution context] + Decision{Context type?} + Start --> Decision + Decision -->|Agency structure / headcount| MF[Statskontoret Myndighetsförteckning] + Decision -->|Annual budget outturn| AU[Statskontoret Årsutfall] + 
 Decision -->|Monthly budget outturn| MU[Statskontoret Månadsutfall] + Decision -->|Macro projection| IMF[IMF WEO/FM] + MF --> CLI[statskontoret-fetch.ts] + AU --> CLI + MU --> CLI + CLI --> Discover[discover: extract Excel / CSV ZIP links] + CLI --> Headcount[headcount: parse XLSX and aggregate department time series] + Discover --> Persist[analysis/data/statskontoret JSON + meta] + Headcount --> Persist + Persist --> Article[Article / dashboard context with source URL and freshness] +``` + +Key gates: HTTPS-only source, source catalogue validation, parser tests, provenance sidecars, and optional-enrichment fallback. + diff --git a/MINDMAP.md b/MINDMAP.md index 0f00f16034..78f73dae3a 100644 --- a/MINDMAP.md +++ b/MINDMAP.md @@ -554,3 +554,52 @@ mindmap Regional municipal Budget execution ``` + +--- + +## 🏛️ Statskontoret Integration Branch (Current State) + +```mermaid +mindmap + root((Statskontoret Integration)) + Purpose + Swedish agency structure + Government-body headcount + Central-government budget execution + Sources + Myndighetsforteckning + Annual + XLSX + Headcount by department + Arsutfall + Annual + XLSX + CSV ZIP + Manadsutfall + Monthly + XLSX + CSV ZIP + Budget time series + Long-run state budget context + Code + statskontoret-client.ts + Discovery + XLSX parser + CSV ZIP parser + Typed StatskontoretError + statskontoret-fetch.ts + list-sources + discover + headcount + Governance + Public classification + No MCP server + No credentials + www.statskontoret.se allowlist + analysis/statskontoret inventory + Tests + client tests + CLI parsing tests + inventory tests +``` + diff --git a/README.md b/README.md index 4464065a21..39c944d0b4 100644 --- a/README.md +++ b/README.md @@ -1108,3 +1108,35 @@ Riksdagsmonitor uses a **provider-tiered** data architecture, with each provider **Why this split** — IMF uses uniform SNA 2008 / GFSM 2014 / BPM6 methodology across countries (essential for cross-country comparison), publishes T+5 projections (essential 
for look-ahead workflows), and has fresher data than World Bank's economic indicators. World Bank remains the canonical source for the classes IMF does not publish (WGI governance, environment). Authority: [`.github/aw/ECONOMIC_DATA_CONTRACT.md`](.github/aw/ECONOMIC_DATA_CONTRACT.md) v2.1 · hub: [`analysis/imf/`](analysis/imf/) · agent guide: [`AGENTS.md`](AGENTS.md) §IMF. + +--- + +## 🏛️ Statskontoret Swedish Administration Integration + +Riksdagsmonitor now includes a pure-TypeScript Statskontoret integration for Swedish government-body and central-government budget-execution context. + +| Dataset | Use | +|---|---| +| Myndighetsförteckning | Authority count, department grouping, leadership form and årsarbetskrafter/headcount over time. | +| Årsutfall för statens budget | Annual central-government revenue and expenditure outturns. | +| Månadsutfall för statens budget | Monthly budget execution from 2006 onward. | +| Tidsserier, statens budget m.m. | Long-run Swedish budget context. | + +Quick commands: + +```bash +tsx scripts/statskontoret-fetch.ts list-sources +tsx scripts/statskontoret-fetch.ts discover --source arsutfall --persist +tsx scripts/statskontoret-fetch.ts headcount --url "https://www.statskontoret.se/...xlsx" --persist +``` + +Architecture and governance references: + +- `analysis/statskontoret/README.md` — integration hub. +- `analysis/statskontoret/indicators-inventory.json` — machine-readable source catalogue. +- `analysis/statskontoret/data-dictionary.md` — field and freshness rules. +- `scripts/statskontoret-client.ts` / `scripts/statskontoret-fetch.ts` — client and workflow CLI. +- `tests/statskontoret-client.test.ts`, `tests/statskontoret-fetch.test.ts`, `tests/statskontoret-inventory.test.ts` — regression coverage. 
+ +Provider rule: IMF remains primary for macro/fiscal projections, SCB remains Swedish statistical ground truth, World Bank remains governance/environment/social residue, and Statskontoret is authoritative for Swedish agency structure and central-government budget execution. + diff --git a/SECURITY_ARCHITECTURE.md b/SECURITY_ARCHITECTURE.md index e796672eac..b0dc3fae72 100644 --- a/SECURITY_ARCHITECTURE.md +++ b/SECURITY_ARCHITECTURE.md @@ -3086,3 +3086,22 @@ flowchart LR **Egress hosts** (allow-list): `www.imf.org` (Datamapper REST · WEO/FM), `sdmxcentral.imf.org` (SDMX 3.0 REST · IFS/BOP/DOTS/GFS/PCPS/ER/MFS_IR/MFS_PR). Both HTTPS-only, anonymous, public — no credentials required. **Canonical rule.** Every economic claim in a Riksdagsmonitor article cites an IMF dataflow first; World Bank citations are reserved for governance, environment and social residue (the classes IMF does not publish). SCB is the Swedish-specific ground truth layer. See `ECONOMIC_DATA_CONTRACT.md` v2.1 for the banned-phrase list and vintage discipline (>6 mo → annotation). + +--- + +## 🏛️ Statskontoret Security Architecture + +Statskontoret is a read-only public-data integration using in-repository TypeScript code and the existing npm dependency graph. It is intentionally not configured as an MCP server; workflows invoke `tsx scripts/statskontoret-fetch.ts` via the bash tool. + +| Control area | Statskontoret control | +|---|---| +| Network egress | Allow only HTTPS to `www.statskontoret.se` for this provider. | +| Authentication | None required; no tokens or secrets transmitted. | +| Input validation | Resource classification, URL normalisation, HTML entity decoding, XLSX workbook structure checks, CSV ZIP file filtering. | +| Integrity | Persisted JSON plus `.meta.json` provenance sidecars with source/dataset/artifact/fetch timestamp. | +| Availability | 15s client timeout and optional-enrichment fallback to cached artifacts. 
| +| Supply chain | Parser code is local TypeScript; ZIP/XLSX parsing uses `jszip` under npm lock/SBOM and advisory review. | +| Privacy | Public authority and aggregate budget records only; no private-person or credential data. | + +Security classification: **PUBLIC / High Integrity / Medium-High Availability**. Mapped controls: ISO 27001 A.5.23 (cloud/service use), A.8.9 (configuration management), A.8.12 (data leakage prevention by design), A.8.20 (network security), NIST CSF 2.0 ID.IM / PR.DS / PR.PS, CIS Controls 4, 8, 12 and 16. + diff --git a/TESTING.md b/TESTING.md index d77717d77a..32df366c97 100644 --- a/TESTING.md +++ b/TESTING.md @@ -687,3 +687,24 @@ IMF_LIVE_SMOKE=1 npm test -- imf-client.live - `tests/imf-vintage-discipline.test.ts` — asserts cache filenames carry vintage tags **Canonical rule.** Every economic claim in a Riksdagsmonitor article cites an IMF dataflow first; World Bank citations are reserved for governance, environment and social residue (the classes IMF does not publish). SCB is the Swedish-specific ground truth layer. See `ECONOMIC_DATA_CONTRACT.md` v2.1 for the banned-phrase list and vintage discipline (>6 mo → annotation). + +--- + +## 🧪 Statskontoret Test Coverage + +Statskontoret coverage is split across focused Vitest suites: + +| Test file | Coverage | +|---|---| +| `tests/statskontoret-client.test.ts` | Download-link extraction, XLSX workbook parsing, CSV ZIP extraction, Swedish decimal handling, injected fetch client behavior. | +| `tests/statskontoret-fetch.test.ts` | Import-safe CLI parsing, typed CLI errors, source validation, resource classification, numeric parsing primitives. | +| `tests/statskontoret-inventory.test.ts` | Inventory metadata, dataset coverage parity with `STATSKONTORET_SOURCES`, provider-decision matrix, client/CLI/persistence declarations. 
| + +Targeted validation command: + +```bash +npx vitest run tests/statskontoret-client.test.ts tests/statskontoret-fetch.test.ts tests/statskontoret-inventory.test.ts +``` + +Quality expectation: no live network calls in tests; fixtures model Statskontoret workbook/ZIP assumptions and prevent workflow regressions without depending on upstream availability. + diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md index fdf65e4298..dc5f65a488 100644 --- a/THREAT_MODEL.md +++ b/THREAT_MODEL.md @@ -3000,3 +3000,27 @@ All mitigations are codified in: **Egress hosts** (allow-list): `www.imf.org` (Datamapper REST · WEO/FM), `sdmxcentral.imf.org` (SDMX 3.0 REST · IFS/BOP/DOTS/GFS/PCPS/ER/MFS_IR/MFS_PR). Both HTTPS-only, anonymous, public — no credentials required. **Canonical rule.** Every economic claim in a Riksdagsmonitor article cites an IMF dataflow first; World Bank citations are reserved for governance, environment and social residue (the classes IMF does not publish). SCB is the Swedish-specific ground truth layer. See `ECONOMIC_DATA_CONTRACT.md` v2.1 for the banned-phrase list and vintage discipline (>6 mo → annotation). + +--- + +## 🏛️ Statskontoret Integration — STRIDE Threats + +> **Effective:** 2026-04-25 · **Classification:** Public · **Entry point:** `scripts/statskontoret-fetch.ts` · **Source:** `www.statskontoret.se`. + +Statskontoret ingestion introduces a public-data trust boundary for Swedish agency structure and budget outturn files. It is unauthenticated, read-only and optional enrichment, but the integrity of parsed figures matters for political-intelligence claims. 
+ +| ID | Asset / flow | STRIDE | Threat | Likelihood | Impact | Mitigations | +|---|---|---|---|---|---|---| +| T-STATS-01 | `www.statskontoret.se` page discovery | Spoofing | DNS/TLS interception or lookalike page returns false download links | LOW | MEDIUM | HTTPS-only egress, allow-list `www.statskontoret.se`, source URL recorded in payload and `.meta.json`, PR review of persisted diffs. | +| T-STATS-02 | Excel / CSV ZIP payload | Tampering | Workbook or archive content modified upstream or in transit | LOW | HIGH | TLS transport, local parser contract checks, typed `StatskontoretError`, persisted raw/derived artifacts with provenance sidecars, reviewer diff inspection. | +| T-STATS-03 | Headcount aggregation | Information integrity | Header drift maps wrong columns to `År`, `Departement`, `Myndighet`, or `Årsarbetskrafter` | MEDIUM | MEDIUM | Header-family matching documented in `analysis/statskontoret/data-dictionary.md`, unit tests for workbook parsing and Swedish number handling, fallback to no derived output if required fields cannot be resolved. | +| T-STATS-04 | CLI invocation | Repudiation | Article cites agency headcount or budget outturn without source page/year/status | MEDIUM | MEDIUM | `discover` captures source page, URL, year/month/status and `last-modified`; persisted sidecars include `dataset`, `artifact`, `fetchedAt`, and `mcpTool: statskontoret-ts-client`. | +| T-STATS-05 | Source availability | Denial of service | Statskontoret page unavailable or workbook fetch times out | MEDIUM | LOW | 15s timeout, optional-enrichment semantics, cache-first reuse of `analysis/data/statskontoret/`, article generation can omit context rather than fail. | +| T-STATS-06 | XLSX/ZIP parsing dependency | Elevation of privilege | Malicious archive attempts parser/resource abuse | LOW | HIGH | `jszip` pinned in npm lock/SBOM, GitHub Advisory Database reviewed, no dynamic eval, no script execution from workbooks, tests exercise parser edge cases. 
| + +### Residual risk and classification + +- **Residual risk:** LOW-MEDIUM integrity risk due to upstream data or workbook-schema drift; handled by provenance, test coverage and human review. +- **Privacy:** no PII or credentials; public authority and aggregate budget data only. +- **CIA:** Public / High Integrity / Medium-High Availability for derived article context. + diff --git a/analysis/statskontoret/README.md b/analysis/statskontoret/README.md new file mode 100644 index 0000000000..9928c0f314 --- /dev/null +++ b/analysis/statskontoret/README.md @@ -0,0 +1,135 @@ +# Statskontoret Data Integration + +> **Purpose**: Statskontoret open data as the authoritative Swedish public-administration and central-government budget-execution context layer for Riksdagsmonitor. +> +> **Effective**: 2026-04-25 · **Classification**: Public + +Authoritative files in this folder: + +- [`indicators-inventory.json`](indicators-inventory.json) — machine-readable dataset catalogue and provider decision matrix. +- [`data-dictionary.md`](data-dictionary.md) — field, cadence, freshness and derived-artifact reference. +- [`use-cases.md`](use-cases.md) — canonical article and dashboard use cases. + +--- + +## 1 · Why Statskontoret + +Statskontoret fills a gap that IMF, SCB and World Bank do not cover in the same operational form: current and historical structure of Sweden's central-government agencies and budget execution in the state's own reporting structure. + +| Need | Provider | Rationale | +|---|---|---| +| Government-body headcount and authority count by department | **Statskontoret Myndighetsförteckning** | Includes årsarbetskrafter, ledningsform, särskilda organ and department grouping. | +| Annual central-government budget outturn | **Statskontoret Årsutfall** | Hermes/Riksdag/government budget execution records. | +| Monthly central-government budget outturn | **Statskontoret Månadsutfall** | Lowest-level monthly revenue/expenditure data by agency. 
| +| Long-run central-government fiscal time series (from 1995) | **Statskontoret Tidsserier** | Final outcomes for revenue, expenditure and balance since 1995. | +| Macro/fiscal projections | **IMF WEO/FM** | T+5 projection and cross-country methodology. | +| Swedish regional/monthly official statistics | **SCB** | PxWeb official-statistics ground truth. | + +--- + +## 2 · Code surface + +| File | Purpose | +|---|---| +| [`scripts/statskontoret-client.ts`](../../scripts/statskontoret-client.ts) | Public unauthenticated client for Statskontoret pages, Excel workbooks, CSV ZIP archives, headcount aggregation and budget-outturn parsing. | +| [`scripts/statskontoret-fetch.ts`](../../scripts/statskontoret-fetch.ts) | CLI wrapper for agentic workflows (`list-sources`, `discover`, `headcount`, `budget-outturn`). | +| [`analysis/statskontoret/indicators-inventory.json`](indicators-inventory.json) | Dataset inventory and provider decision matrix. | +| [`analysis/data/statskontoret/`](../data/statskontoret/) | Optional persisted raw/derived data written by `--persist`. | + +No MCP server is required. Workflows invoke the TypeScript CLI via the `bash` tool and need egress to `www.statskontoret.se`. 
+ +--- + +## 3 · CLI quick reference + +```bash +# List available Statskontoret sources +tsx scripts/statskontoret-fetch.ts list-sources + +# Discover downloadable Excel / CSV ZIP links on a source page +tsx scripts/statskontoret-fetch.ts discover --source arsutfall --persist + +# Build department headcount time series from the authority-register workbook +tsx scripts/statskontoret-fetch.ts headcount --url "https://www.statskontoret.se/...xlsx" --persist + +# Parse budget-outturn rows from årsutfall / månadsutfall / budget-time-series +tsx scripts/statskontoret-fetch.ts budget-outturn \ + --source arsutfall \ + --url "https://www.statskontoret.se/...xlsx" \ + --doc-type Inkomst \ + --persist + +# Omit --doc-type to let the parser infer from sheet names +tsx scripts/statskontoret-fetch.ts budget-outturn \ + --source budget-time-series \ + --url "https://www.statskontoret.se/...xlsx" \ + --persist +``` + +--- + +## 4 · Derived headcount artifact + +The client converts the workbook sheet matching `förteckning` / `forteckning` into records and aggregates: + +```json +{ + "year": 2025, + "department": "Finansdepartementet", + "headcount": 1234.5, + "authorityCount": 12 +} +``` + +Aggregation rules: + +1. Locate header fields equivalent to `År`, `Departement`, `Myndighet` and `Årsarbetskrafter`. +2. Parse Swedish decimal comma values as numbers. +3. Sum årsarbetskrafter by `(year, department)`. +4. Count distinct authority names in the same group. +5. Persist raw/derived payloads with `.meta.json` provenance sidecars. + +--- + +## 5 · Derived budget-outturn artifact + +The `budget-outturn` command parses årsutfall, månadsutfall and budget-time-series workbooks into typed `StatskontoretBudgetRow` objects (amounts in MSEK): + +```json +{ + "year": 2024, + "documentType": "Inkomst", + "title": "Skatt på inkomst", + "code": "1111", + "outturn": 500000, + "budget": 480000 +} +``` + +For monthly data the `month` field (1–12) is also present. 
Optional fields: `agency`, `status`, `code`. + +The `summarizeBudgetOutturn` helper aggregates rows into per-`(year, documentType)` totals: + +```json +{ + "year": 2024, + "documentType": "Inkomst", + "totalOutturn": 700000, + "totalBudget": 670000, + "variance": 30000, + "rowCount": 2 +} +``` + +`variance` is `totalOutturn − totalBudget`; it is omitted when any contributing row had no budget figure. + +--- + +## 6 · Security and data governance + +- **Classification**: Public / High Integrity / Medium-High Availability. +- **Privacy**: Public authority and budget data only; no private-person data. +- **Integrity**: Source URL, retrieval timestamp, dataset and artifact are persisted in sidecar metadata. +- **Supply chain**: XLSX/ZIP parsing uses `jszip@3.10.1`; GitHub Advisory Database check completed with no known vulnerabilities for that version. +- **Threat surface**: External public-data ingestion from `www.statskontoret.se`; the `assertStatskontoretFetchTarget` guard rejects non-HTTPS or off-allowlist URLs before any fetch is issued; schema/shape validation and PR diff review mitigate data-poisoning risk. + diff --git a/analysis/statskontoret/data-dictionary.md b/analysis/statskontoret/data-dictionary.md new file mode 100644 index 0000000000..9bde30b278 --- /dev/null +++ b/analysis/statskontoret/data-dictionary.md @@ -0,0 +1,70 @@ +# Statskontoret Data Dictionary + +## Sources + +| Source key | Dataset | Cadence | Format | Coverage | Primary use | +|---|---|---:|---|---|---| +| `myndighetsforteckning` | Myndighetsförteckning – öppna data | Annual | Excel | Summary 2025, time series 2007–2025, latest and full authority register | Headcount and authority count by department over time | +| `budget-time-series` | Tidsserier, statens budget m.m. 
| Annual | Publication / linked tables | Final budget outcomes generally from 1995 | Long-run fiscal context | +| `arsutfall` | Årsutfall för statens budget – öppna data | Annual | Excel, CSV ZIP | Annual revenue/expenditure outturns | Budget execution by appropriation/income title/agency | +| `manadsutfall` | Månadsutfall för statens budget – öppna data | Monthly | Excel, CSV ZIP | Monthly outcomes from January 2006 onward | High-frequency budget execution monitoring | + +## Myndighetsförteckning fields + +| Field family | Expected labels | Normalisation | Derived use | +|---|---|---|---| +| Year | `År`, `Ar`, `Year` | integer | Time-series key | +| Authority | `Myndighet`, `Myndighetsnamn`, `Namn` | string | Distinct authority count | +| Department | `Departement`, `Departementstillhörighet` | string | Grouping dimension | +| Headcount | `Årsarbetskrafter`, `ÅA` | Swedish decimal comma → number | Sum by year and department | +| Leadership form | `Ledningsform` | string | Governance/administrative context | +| Special organs | `Särskilda organ` | string/boolean-like | Institutional context | + +## Årsutfall and Månadsutfall budget-outturn fields + +These fields apply to `arsutfall`, `manadsutfall` and `budget-time-series` workbooks parsed via `parseBudgetRows` / `buildBudgetTimeSeries`. 
+ +| Field | Expected labels (normalised) | Normalisation | Present in | +|---|---|---|---| +| Year | `År`, `Ar`, `Year`, `Kalenderår`, `Kalenderar` | integer | All three sources | +| Month | `Månad`, `Manad`, `Month`, `Månadsperiod` | integer 1–12 | månadsutfall only | +| Document type | `Dokumenttyp`, `Typ`, `Inkomst_Utgift` | string | All (or inferred from sheet name) | +| Income title name | `Inkomsttitelnamn`, `Inkomsttitelgruppsnamn` | string | Inkomst rows | +| Income title code | `Inkomsttitel`, `Inkomsttitelnummer`, `Inkomsttitelnr` | string | Inkomst rows | +| Appropriation name | `Anslagsnamn`, `Utgiftsomradesnamn`, `Utgiftsomrade` | string | Utgift rows | +| Appropriation number | `Anslagsnr`, `Anslagsnummer`, `Anslagspost`, `Utgiftsomradesnr` | string | Utgift rows | +| Outturn amount | `Utfall`, `Utfall MSEK`, `Utfallbelopp`, `Belopp` | Swedish decimal comma → MSEK | All rows | +| Budget amount | `Budget`, `Budgetvärde`, `Anvisat`, `Ramanslag` | Swedish decimal comma → MSEK | Where available | +| Agency | `Myndighet`, `Myndighetsnamn` | string | Finest granularity; optional | +| Status | `Status`, `Preliminär`, `Utfallsstatus` | string | Optional (preliminary/definitive) | + +### Sheet-name to document-type inference + +When the workbook contains multiple sheets and no explicit `--doc-type` override is given, `buildBudgetTimeSeries` infers the document type from the sheet name: + +| Sheet name contains | Inferred `documentType` | +|---|---| +| `inkomst` | `Inkomst` | +| `utgift` or `anslag` | `Utgift` | +| anything else | no override (field `Typ` etc. from each row used instead) | + +## Freshness discipline + +- Myndighetsförteckning: annual refresh; re-run discovery when source page `last-modified` changes. The client reads the page's `last-modified` HTML meta tag (or date-only variants) and copies the value to discovered link provenance. +- Månadsutfall: monthly refresh after Statskontoret publication. 
+- Årsutfall: refresh on preliminary/definitive release changes. +- Budget time series: annual official-statistics publication. + +## Persistence layout + +```text +analysis/data/statskontoret/{dataset}/{artifact}.json +analysis/data/statskontoret/{dataset}/{artifact}.meta.json +``` + +Sidecar metadata contains `fetchedAt`, `mcpTool: statskontoret-ts-client`, `dataset` and `artifact`. + +## Key normalisation rules + +All column-header matching is case-insensitive and accent-folded (`NFD` normalisation with diacritic removal), so `Årsarbetskrafter`, `arsarbetskrafter` and `ÅRSARBETSKRAFTER` all resolve to the same normalised key `arsarbetskrafter`. Swedish decimal comma notation (`1.234,5`) is parsed to `1234.5` by `parseStatskontoretSwedishNumber`. + diff --git a/analysis/statskontoret/indicators-inventory.json b/analysis/statskontoret/indicators-inventory.json new file mode 100644 index 0000000000..f059b8d4d4 --- /dev/null +++ b/analysis/statskontoret/indicators-inventory.json @@ -0,0 +1,169 @@ +{ + "version": "1.1", + "description": "Machine-readable inventory of Statskontoret open datasets used by Riksdagsmonitor for Swedish government-body and central-government budget context. Complements IMF (primary economic projections), SCB (Swedish official statistics), World Bank (non-economic global context), and Riksdag/Regering data.", + "lastUpdated": "2026-04-25", + "effectiveDate": "2026-04-25", + "source": "Statskontoret open data (www.statskontoret.se)", + "classification": "Public", + "clients": { + "cli": "tsx scripts/statskontoret-fetch.ts (commands: list-sources, discover, headcount, budget-outturn)", + "library": "scripts/statskontoret-client.ts (StatskontoretClient class)", + "persistence": "scripts/parliamentary-data/data-persistence.ts (persistStatskontoretData)" + }, + "notes": { + "firewallAllowlist": "www.statskontoret.se", + "noMcp": "Statskontoret is not an MCP server. 
Agentic workflows invoke the TypeScript CLI via the bash tool, mirroring IMF's no-MCP client pattern.", + "formats": "Myndighetsförteckningen is published as Excel. Årsutfall and Månadsutfall expose both Excel and CSV ZIP downloads. Budget time-series pages link to annual official-statistics publications and related open-data tables.", + "privacy": "Public authority/agency data and aggregate budget data only; no private-person data. Authority names and agency-level budget lines are public administrative records." + }, + "datasets": { + "myndighetsforteckning": { + "title": "Myndighetsförteckning – öppna data", + "url": "https://www.statskontoret.se/analys-och-statistik/oppna-data/myndighetsforteckning/", + "cadence": "Annual snapshot; source page metadata observed as last-modified 2026-02-06 for the 2025 workbook.", + "coverage": "Summerande statistik 2025; tidsserier 2007–2025; förteckning 2025; förteckning 2007–2025.", + "format": [ + "xlsx" + ], + "primaryUse": "Headcount of government bodies, grouped by department, leadership form and special organs; department headcount over time from 2007 onward.", + "keyFields": [ + "År", + "Myndighet", + "Departement / departementstillhörighet", + "Årsarbetskrafter", + "Ledningsform", + "Särskilda organ" + ], + "derivedArtifacts": [ + { + "id": "headcount-by-department", + "description": "Sum årsarbetskrafter by year and department, with authority count per group.", + "script": "tsx scripts/statskontoret-fetch.ts headcount --url --persist", + "storage": "analysis/data/statskontoret/myndighetsforteckning/headcount-by-department.json" + } + ], + "committees": [ + "KU", + "FiU", + "AU" + ], + "admiralty": "A1" + }, + "budget-time-series": { + "title": "Tidsserier, statens budget m.m.", + "url": "https://www.statskontoret.se/analys-och-statistik/officiell-statistik/tidsserier-statens-budget-m.m", + "cadence": "Annual official statistics release.", + "coverage": "Final outcomes for central-government revenue, expenditure, 
balance and related public-finance tables, generally from 1995.", + "format": [ + "html-publication", + "linked-open-data" + ], + "primaryUse": "Long-run Swedish central-government budget context for finance, tax and public-administration analysis.", + "committees": [ + "FiU", + "SkU", + "KU" + ], + "admiralty": "A1", + "derivedArtifacts": [ + { + "id": "budget-outturn", + "description": "Long-run central-government budget time series (revenue and expenditure) from 1995 onward parsed into StatskontoretBudgetRow objects.", + "script": "tsx scripts/statskontoret-fetch.ts budget-outturn --source budget-time-series --url --persist", + "storage": "analysis/data/statskontoret/budget-time-series/budget-outturn.json" + } + ] + }, + "arsutfall": { + "title": "Årsutfall för statens budget – öppna data", + "url": "https://www.statskontoret.se/analys-och-statistik/oppna-data/arsutfall/", + "cadence": "Annual, with preliminary and definitive releases.", + "coverage": "Annual revenue and expenditure outturns based on Hermes reporting, Riksdag budget decisions and government disposition rights.", + "format": [ + "xlsx", + "csv-zip" + ], + "primaryUse": "Annual budget execution by appropriation, income title and agency; definitive vs preliminary status tracking.", + "queryParameters": [ + "documentType", + "fileType", + "fileName", + "Year", + "month", + "status" + ], + "committees": [ + "FiU", + "SkU" + ], + "admiralty": "A1", + "derivedArtifacts": [ + { + "id": "budget-outturn-inkomst", + "description": "Annual central-government revenue outturn rows (documentType=Inkomst) parsed into StatskontoretBudgetRow objects.", + "script": "tsx scripts/statskontoret-fetch.ts budget-outturn --source arsutfall --url --doc-type Inkomst --persist", + "storage": "analysis/data/statskontoret/arsutfall/budget-outturn-inkomst.json" + }, + { + "id": "budget-outturn-utgift", + "description": "Annual central-government expenditure outturn rows (documentType=Utgift) parsed into StatskontoretBudgetRow 
objects.", + "script": "tsx scripts/statskontoret-fetch.ts budget-outturn --source arsutfall --url --doc-type Utgift --persist", + "storage": "analysis/data/statskontoret/arsutfall/budget-outturn-utgift.json" + } + ] + }, + "manadsutfall": { + "title": "MĂ„nadsutfall för statens budget – öppna data", + "url": "https://www.statskontoret.se/analys-och-statistik/oppna-data/manadsutfall/", + "cadence": "Monthly.", + "coverage": "Monthly revenue and expenditure outcomes from January 2006 onward, specified at income-subtitle / appropriation-item / agency granularity.", + "format": [ + "xlsx", + "csv-zip" + ], + "primaryUse": "High-frequency budget execution monitoring and agency-level spending/revenue context.", + "queryParameters": [ + "documentType", + "fileType", + "fileName", + "Year", + "month", + "status" + ], + "committees": [ + "FiU", + "SkU", + "KU" + ], + "admiralty": "A1", + "derivedArtifacts": [ + { + "id": "budget-outturn-inkomst", + "description": "Monthly central-government revenue outturn rows (documentType=Inkomst) parsed into StatskontoretBudgetRow objects.", + "script": "tsx scripts/statskontoret-fetch.ts budget-outturn --source manadsutfall --url --doc-type Inkomst --persist", + "storage": "analysis/data/statskontoret/manadsutfall/budget-outturn-inkomst.json" + }, + { + "id": "budget-outturn-utgift", + "description": "Monthly central-government expenditure outturn rows (documentType=Utgift) parsed into StatskontoretBudgetRow objects.", + "script": "tsx scripts/statskontoret-fetch.ts budget-outturn --source manadsutfall --url --doc-type Utgift --persist", + "storage": "analysis/data/statskontoret/manadsutfall/budget-outturn-utgift.json" + } + ] + } + }, + "providerDecisionMatrix": { + "governmentBodiesHeadcount": "statskontoret:myndighetsforteckning", + "agencyLeadershipForm": "statskontoret:myndighetsforteckning", + "centralGovernmentBudgetAnnualOutturn": "statskontoret:arsutfall", + "centralGovernmentBudgetMonthlyOutturn": 
"statskontoret:manadsutfall", + "longRunBudgetTimeSeries": "statskontoret:budget-time-series", + "macroFiscalProjection": "imf:WEO/FM", + "swedishOfficialRegionalStats": "scb:pxweb" + }, + "updateDiscipline": { + "myndighetsforteckning": "Check annually and whenever the source page last-modified value changes.", + "budgetOutturn": "Check monthly for Månadsutfall and on each preliminary/definitive release for Årsutfall.", + "integrity": "Persist raw source payload plus .meta.json provenance; review derived headcount diffs in PRs." + } +} diff --git a/analysis/statskontoret/use-cases.md b/analysis/statskontoret/use-cases.md new file mode 100644 index 0000000000..aeb64a066f --- /dev/null +++ b/analysis/statskontoret/use-cases.md @@ -0,0 +1,42 @@ +# Statskontoret Use Cases + +## 1 · Department headcount dashboard + +Use `myndighetsforteckning` to calculate annual `årsarbetskrafter` grouped by department. This provides context for articles about government reorganisation, budget pressure, administrative capacity and committee oversight. + +Evidence standard: cite Statskontoret source URL, workbook year, department name and derived headcount value. + +## 2 · Agency-level budget execution context + +Use `arsutfall` for annual and `manadsutfall` for monthly budget execution. Pair with Riksdag budget documents and committee reports to show whether parliamentary appropriations translate into agency-level spending patterns. + +Evidence standard: cite Statskontoret source URL, document type (`Inkomst`/`Utgift`), year/month/status and budget line. + +## 3 · Long-run central-government fiscal context + +Use `budget-time-series` to provide long-run historical framing for Swedish state-budget revenue, expenditure and balance. IMF remains primary for macro/fiscal projection and cross-country methodology; Statskontoret is the Swedish budget-execution layer. + +Evidence standard: cite Statskontoret official-statistics publication year and table label. 
+ +## 4 · Annual budget outturn summary (income vs. expenditure) + +Use `summarizeBudgetOutturn` to aggregate individual `StatskontoretBudgetRow` records from `arsutfall` or `manadsutfall` into per-year, per-documentType totals. This is the standard pattern for producing summary tables in articles and committee-report context. + +```ts +import { parseBudgetRows, summarizeBudgetOutturn } from '../../scripts/statskontoret-client.js'; + +const rows = parseBudgetRows(records, { documentType: 'Inkomst' }); +const summary = summarizeBudgetOutturn(rows); +// summary[0] → { year: 2024, documentType: 'Inkomst', totalOutturn: 700000, totalBudget: 670000, variance: 30000, rowCount: 2 } +``` + +`variance` = `totalOutturn − totalBudget` (positive = outturn above budget, i.e. revenue above plan or expenditure above appropriation; negative = outturn below budget, i.e. income undershot or expenditure below appropriation). `totalBudget` and `variance` are both omitted for a year/documentType group when any of its source rows had no budget figure. + +Evidence standard: cite Statskontoret source URL, year, document type, outturn and variance; note preliminary vs. definitive `status`. + +## 5 · High-frequency monitoring with månadsutfall + +Use `manadsutfall` to monitor budget execution monthly for specific agencies or income categories. Combine with IMF SDMX monthly fiscal data (`sdmxcentral.imf.org`) for cross-validation. + +Evidence standard: cite Statskontoret månadsutfall URL, year/month, agency name and outturn amount. 
+ diff --git a/package-lock.json b/package-lock.json index 7cd6e8c7f0..f1795cac21 100644 --- a/package-lock.json +++ b/package-lock.json @@ -27,6 +27,7 @@ "htmlhint": "1.9.2", "js-yaml": "4.1.1", "json-schema-to-typescript": "15.0.4", + "jszip": "3.10.1", "knip": "6.6.3", "papaparse": "5.5.3", "playwright": "1.59.1", @@ -3771,8 +3772,8 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", "integrity": "sha512-3lqz5YjWTYnW6dlDa5TLaTCcShfar1e40rmcJVwCBJC6mWlFuj0eCHIElmG1g5kyuJ/GD+8Wn4FFCcz4gJPfaQ==", - "license": "MIT", - "optional": true + "devOptional": true, + "license": "MIT" }, "node_modules/cors": { "version": "2.8.6", @@ -6046,6 +6047,13 @@ "node": ">= 4" } }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==", + "dev": true, + "license": "MIT" + }, "node_modules/imurmurhash": { "version": "0.1.4", "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", @@ -6236,6 +6244,13 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "dev": true, + "license": "MIT" + }, "node_modules/isexe": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", @@ -6463,6 +6478,19 @@ "verror": "1.10.0" } }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "dev": true, + "license": "(MIT OR GPL-3.0-or-later)", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", + 
"readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, "node_modules/keyv": { "version": "4.5.4", "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", @@ -6560,6 +6588,16 @@ "node": ">= 0.8.0" } }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lightningcss": { "version": "1.32.0", "resolved": "https://registry.npmjs.org/lightningcss/-/lightningcss-1.32.0.tgz", @@ -8319,6 +8357,13 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", + "dev": true, + "license": "(MIT AND Zlib)" + }, "node_modules/papaparse": { "version": "5.5.3", "resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.5.3.tgz", @@ -8596,6 +8641,13 @@ "node": ">= 0.6.0" } }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", + "dev": true, + "license": "MIT" + }, "node_modules/property-information": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz", @@ -8718,6 +8770,29 @@ "url": "https://opencollective.com/express" } }, + "node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "dev": 
true, + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/readable-stream/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "dev": true, + "license": "MIT" + }, "node_modules/rehype-autolink-headings": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/rehype-autolink-headings/-/rehype-autolink-headings-7.1.0.tgz", @@ -9127,6 +9202,13 @@ "url": "https://opencollective.com/express" } }, + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", + "dev": true, + "license": "MIT" + }, "node_modules/setprototypeof": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", @@ -9440,6 +9522,23 @@ "dev": true, "license": "MIT" }, + "node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "dev": true, + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, + "node_modules/string_decoder/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "dev": true, + "license": "MIT" + }, "node_modules/string-width": { "version": 
"4.2.3", "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", @@ -10098,6 +10197,13 @@ "punycode": "^2.1.0" } }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "dev": true, + "license": "MIT" + }, "node_modules/uuid": { "version": "8.3.2", "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz", diff --git a/package.json b/package.json index d80b959e76..cd31682c95 100644 --- a/package.json +++ b/package.json @@ -178,6 +178,7 @@ "htmlhint": "1.9.2", "js-yaml": "4.1.1", "json-schema-to-typescript": "15.0.4", + "jszip": "3.10.1", "knip": "6.6.3", "papaparse": "5.5.3", "playwright": "1.59.1", diff --git a/scripts/parliamentary-data/data-persistence.ts b/scripts/parliamentary-data/data-persistence.ts index 8986fb520a..ff39e126fc 100644 --- a/scripts/parliamentary-data/data-persistence.ts +++ b/scripts/parliamentary-data/data-persistence.ts @@ -82,6 +82,7 @@ export type PersistenceDocumentType = | 'government' | 'worldbank' | 'imf' + | 'statskontoret' | 'scb' | string; // extensible for generic MCP servers @@ -528,6 +529,53 @@ export function persistIMFData( return path.join(dir, filename); } +/** + * Persist Statskontoret open-data responses and derived datasets. + * + * Stored under `analysis/data/statskontoret/{dataset}/{artifact}.json`. + * Statskontoret data is public and unauthenticated; provenance sidecars record + * the source dataset and the TypeScript client/CLI used to retrieve or derive + * the artifact. + * + * @param dataset - Statskontoret source key (e.g. 'myndighetsforteckning'). + * @param artifact - Logical artifact name (e.g. 'downloads', + * 'headcount-by-department'). + * @param response - Raw or derived Statskontoret payload. + * @param dataRoot - Override for the data root directory (for testing). 
+ * @returns Absolute path to the persisted data file. + */ +export function persistStatskontoretData( + dataset: string, + artifact: string, + response: unknown, + dataRoot: string = DATA_ROOT, +): string { + const dir = path.join(dataRoot, 'statskontoret', sanitizeDokId(dataset)); + ensureDir(dir); + + const sanitizedArtifact = sanitizeDokId(artifact); + const filename = `${sanitizedArtifact}.json`; + fs.writeFileSync( + path.join(dir, filename), + JSON.stringify(response, null, 2), + 'utf8', + ); + + const metaFilename = `${sanitizedArtifact}.meta.json`; + fs.writeFileSync( + path.join(dir, metaFilename), + JSON.stringify({ + fetchedAt: new Date().toISOString(), + mcpTool: 'statskontoret-ts-client', + dataset, + artifact, + }, null, 2), + 'utf8', + ); + + return path.join(dir, filename); +} + /** * Persist SCB (Statistics Sweden) table data. * Stored under `analysis/data/scb/{tableId}.json` diff --git a/scripts/statskontoret-client.ts b/scripts/statskontoret-client.ts new file mode 100644 index 0000000000..26b54ce62d --- /dev/null +++ b/scripts/statskontoret-client.ts @@ -0,0 +1,808 @@ +/** + * @module Statskontoret/Client + * @description TypeScript client for Statskontoret public open-data pages. + * + * Covers the Statskontoret datasets that complement IMF, SCB and World Bank + * context for Riksdagsmonitor: the authority register (myndighetsförteckning), + * budget time series, annual budget outturn and monthly budget outturn. Data is + * public and unauthenticated. Excel workbooks and CSV ZIP archives are parsed + * locally so workflows can persist source data and derived headcount series. 
+ * + * @author Hack23 AB + * @license Apache-2.0 + */ + +import JSZip from 'jszip'; + +import { decodeHtmlEntities } from './html-utils.js'; + +export type StatskontoretSourceKey = + | 'myndighetsforteckning' + | 'budget-time-series' + | 'arsutfall' + | 'manadsutfall'; + +export type StatskontoretResourceType = 'excel' | 'csv-zip' | 'zip' | 'document' | 'unknown'; + +export interface StatskontoretSourceDefinition { + readonly key: StatskontoretSourceKey; + readonly title: string; + readonly url: string; + readonly cadence: string; + readonly coverage: string; + readonly primaryUse: string; +} + +export interface StatskontoretDownloadLink { + readonly source: StatskontoretSourceKey; + readonly sourcePage: string; + readonly href: string; + readonly url: string; + readonly text: string; + readonly resourceType: StatskontoretResourceType; + readonly documentType?: string; + readonly fileType?: string; + readonly fileName?: string; + readonly year?: number; + readonly month?: number; + readonly status?: string; + readonly updatedAt?: string; +} + +export interface StatskontoretClientConfig { + readonly baseURL?: string; + readonly timeout?: number; + readonly fetchFn?: typeof fetch; +} + +export interface StatskontoretWorkbook { + readonly sheets: readonly StatskontoretSheet[]; +} + +export interface StatskontoretSheet { + readonly name: string; + readonly rows: readonly (readonly string[])[]; +} + +export interface StatskontoretHeadcountRow { + readonly year: number; + readonly department: string; + readonly headcount: number; + readonly authorityCount: number; +} + +export interface StatskontoretHeadcountOptions { + readonly sheetNamePattern?: RegExp; + readonly fallbackYear?: number; +} + +/** + * A single budget-outturn row derived from an Ă„rsutfall, mĂ„nadsutfall or + * budget-time-series workbook. Amounts are in MSEK (millions of Swedish + * kronor) as published by Statskontoret. 
+ */ +export interface StatskontoretBudgetRow { + readonly year: number; + /** Present only for mĂ„nadsutfall (1–12). */ + readonly month?: number; + /** 'Inkomst' | 'Utgift' or the raw documentType string from the download. */ + readonly documentType: string; + /** Human-readable title: income title name or appropriation/expenditure-area name. */ + readonly title: string; + /** Numeric code of the income title or appropriation, when present. */ + readonly code?: string; + /** Outturn amount in MSEK. */ + readonly outturn: number; + /** Budget amount in MSEK; may be absent in older series. */ + readonly budget?: number; + /** Agency or authority name, when present (finest granularity). */ + readonly agency?: string; + /** Preliminary / definitive / forecast status label. */ + readonly status?: string; +} + +export interface StatskontoretBudgetOptions { + /** Override the documentType label (e.g. when fetching a single-type workbook). */ + readonly documentType?: string; + /** Hint for the year when the workbook has no year column (e.g. a single-year file). */ + readonly fallbackYear?: number; + /** Hint for the month when the workbook has no month column. */ + readonly fallbackMonth?: number; +} + +/** + * Aggregated totals derived from one or more `StatskontoretBudgetRow` rows. + * + * `totalOutturn` and `totalBudget` are the sums of the individual row amounts + * (in MSEK) within the selected grouping. `variance` is `totalOutturn - + * totalBudget`; it is `undefined` when any contributing row had no budget + * figure. `rowCount` records how many source rows were included. + */ +export interface StatskontoretBudgetSummary { + readonly year: number; + readonly documentType: string; + readonly totalOutturn: number; + readonly totalBudget?: number; + readonly variance?: number; + readonly rowCount: number; +} + +/** + * Typed error thrown by the Statskontoret client and parsers. 
+ * + * `kind` lets callers distinguish transport, parsing and contract failures + * without brittle message matching. + */ +export class StatskontoretError extends Error { + readonly kind: 'http' | 'workbook' | 'contract' | 'cli'; + + constructor(message: string, kind: StatskontoretError['kind'] = 'contract', options?: ErrorOptions) { + super(message, options); + this.name = 'StatskontoretError'; + this.kind = kind; + } +} + +export const STATSKONTORET_BASE_URL = 'https://www.statskontoret.se'; + +export const STATSKONTORET_SOURCES: readonly StatskontoretSourceDefinition[] = Object.freeze([ + { + key: 'myndighetsforteckning', + title: 'Myndighetsförteckning – öppna data', + url: '/analys-och-statistik/oppna-data/myndighetsforteckning/', + cadence: 'Annual snapshot; Statskontoret page metadata currently indicates 2026-02-06 update for the 2025 workbook.', + coverage: 'Summary statistics, 2007–2025 time series, latest authority list and full 2007–2025 authority register.', + primaryUse: 'Government-body headcount, authority count, leadership form and department grouping over time.', + }, + { + key: 'budget-time-series', + title: 'Tidsserier, statens budget m.m.', + url: '/analys-och-statistik/officiell-statistik/tidsserier-statens-budget-m.m', + cadence: 'Annual official statistics release.', + coverage: 'Final outcomes for central-government revenue, expenditure, balance and related public-finance tables, generally from 1995.', + primaryUse: 'Long-run fiscal context for committee and budget-cycle analysis.', + }, + { + key: 'arsutfall', + title: 'Årsutfall för statens budget – öppna data', + url: '/analys-och-statistik/oppna-data/arsutfall/', + cadence: 'Annual, with preliminary and definitive releases.', + coverage: 'Annual central-government revenue and expenditure outturns based on Hermes reporting and Riksdag/government budget decisions.', + primaryUse: 'Yearly budget execution context by appropriation, income title and agency.', + }, + { + key: 'manadsutfall', 
+ title: 'MĂ„nadsutfall för statens budget – öppna data', + url: '/analys-och-statistik/oppna-data/manadsutfall/', + cadence: 'Monthly.', + coverage: 'Monthly central-government revenue and expenditure outcomes from January 2006 onward at low-level agency/account granularity.', + primaryUse: 'High-frequency budget execution context and agency-level fiscal monitoring.', + }, +]); + +const DEFAULT_TIMEOUT = 15_000; +const FILE_EXTENSION_RE = /\.(xlsx|xls|csv|zip|docx|pdf)(?:$|[?#])/i; +const HREF_RE = /]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi; +const TAG_RE = /<[^>]+>/g; + +export class StatskontoretClient { + readonly baseURL: string; + readonly timeout: number; + private readonly fetchFn: typeof fetch; + + constructor(config: StatskontoretClientConfig = {}) { + this.baseURL = trimTrailingSlash(config.baseURL ?? STATSKONTORET_BASE_URL); + this.timeout = config.timeout ?? DEFAULT_TIMEOUT; + this.fetchFn = config.fetchFn ?? fetch; + } + + async discoverDownloads(sourceKey: StatskontoretSourceKey): Promise { + const source = getStatskontoretSource(sourceKey); + const pageUrl = resolveStatskontoretUrl(source.url, this.baseURL); + const html = await this.fetchText(pageUrl); + return extractStatskontoretDownloadLinks(html, sourceKey, pageUrl, this.baseURL); + } + + async fetchWorkbook(url: string): Promise { + const buffer = await this.fetchArrayBuffer(url); + return parseStatskontoretXlsx(buffer); + } + + async fetchCsvZip(url: string): Promise> { + const buffer = await this.fetchArrayBuffer(url); + return parseStatskontoretCsvZip(buffer); + } + + async fetchText(url: string): Promise { + const response = await this.fetchWithTimeout(url); + return response.text(); + } + + async fetchArrayBuffer(url: string): Promise { + const response = await this.fetchWithTimeout(url); + return response.arrayBuffer(); + } + + private async fetchWithTimeout(url: string): Promise { + const resolved = resolveStatskontoretUrl(url, this.baseURL); + 
assertStatskontoretFetchTarget(resolved, this.baseURL); + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), this.timeout); + let response: Response; + try { + response = await this.fetchFn(resolved, { + signal: controller.signal, + headers: { + Accept: 'text/html,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/zip,text/csv,*/*', + }, + }); + } catch (error) { + const detail = error instanceof Error ? error.message : String(error); + throw new StatskontoretError(`Statskontoret fetch failed for ${resolved}: ${detail}`, 'http', { cause: error }); + } finally { + clearTimeout(timeoutId); + } + if (!response.ok) { + throw new StatskontoretError(`Statskontoret API error: ${response.status} ${response.statusText} for ${response.url}`, 'http'); + } + return response; + } +} + +export function getStatskontoretSource(key: StatskontoretSourceKey): StatskontoretSourceDefinition { + const source = STATSKONTORET_SOURCES.find((candidate) => candidate.key === key); + if (!source) throw new StatskontoretError(`Unknown Statskontoret source: ${key}`); + return source; +} + +export function extractStatskontoretDownloadLinks( + html: string, + source: StatskontoretSourceKey, + sourcePage: string, + baseURL: string = STATSKONTORET_BASE_URL, +): StatskontoretDownloadLink[] { + const links: StatskontoretDownloadLink[] = []; + const pageUpdatedAt = extractPageLastModified(html); + for (const match of html.matchAll(HREF_RE)) { + const href = decodeHtml(match[1] ?? '').trim(); + const text = normalizeWhitespace(decodeHtml((match[2] ?? 
'').replace(TAG_RE, ' '))); + if (!href) continue; + const resourceType = classifyStatskontoretResource(href, text); + if (resourceType === 'unknown') continue; + const url = resolveStatskontoretUrl(href, baseURL); + const parsed = new URL(url); + const year = parseStatskontoretOptionalInt(parsed.searchParams.get('Year')); + const month = parseStatskontoretOptionalInt(parsed.searchParams.get('month')); + links.push({ + source, + sourcePage, + href, + url, + text, + resourceType, + ...(parsed.searchParams.get('documentType') ? { documentType: parsed.searchParams.get('documentType') ?? undefined } : {}), + ...(parsed.searchParams.get('fileType') ? { fileType: parsed.searchParams.get('fileType') ?? undefined } : {}), + ...(parsed.searchParams.get('fileName') ? { fileName: parsed.searchParams.get('fileName') ?? undefined } : {}), + ...(year !== undefined ? { year } : {}), + ...(month !== undefined ? { month } : {}), + ...(parsed.searchParams.get('status') ? { status: parsed.searchParams.get('status') ?? undefined } : {}), + ...(pageUpdatedAt ? { updatedAt: pageUpdatedAt } : {}), + }); + } + return deduplicateLinks(links); +} + +export async function parseStatskontoretXlsx(input: ArrayBuffer | Uint8Array): Promise { + const zip = await JSZip.loadAsync(input); + const workbookXml = await readZipText(zip, 'xl/workbook.xml'); + const workbookRelsXml = await readZipText(zip, 'xl/_rels/workbook.xml.rels'); + const sharedStringsXml = zip.file('xl/sharedStrings.xml') + ? await readZipText(zip, 'xl/sharedStrings.xml') + : ''; + const sharedStrings = parseSharedStrings(sharedStringsXml); + const rels = parseWorkbookRelationships(workbookRelsXml); + const sheets: StatskontoretSheet[] = []; + + for (const sheet of parseWorkbookSheets(workbookXml)) { + const target = rels.get(sheet.relationshipId); + if (!target) continue; + const sheetPath = target.startsWith('/') ? 
target.slice(1) : `xl/${target}`; + const sheetXml = await readZipText(zip, sheetPath.replace(/\/\.\//g, '/')); + sheets.push({ name: sheet.name, rows: parseWorksheetRows(sheetXml, sharedStrings) }); + } + + return { sheets }; +} + +export async function parseStatskontoretCsvZip(input: ArrayBuffer | Uint8Array): Promise> { + const zip = await JSZip.loadAsync(input); + const out: Record = {}; + for (const [name, entry] of Object.entries(zip.files)) { + if (entry.dir) continue; + if (!/\.csv$/i.test(name)) continue; + out[name] = await entry.async('string'); + } + return out; +} + +export function rowsToRecords(rows: readonly (readonly string[])[], headerRowIndex?: number): Record[] { + const resolvedHeaderIndex = headerRowIndex ?? findLikelyHeaderRow(rows); + if (resolvedHeaderIndex < 0) return []; + const headers = rows[resolvedHeaderIndex].map((header, index) => header.trim() || `column_${index + 1}`); + const records: Record[] = []; + for (const row of rows.slice(resolvedHeaderIndex + 1)) { + const record: Record = {}; + let hasValue = false; + for (let i = 0; i < headers.length; i++) { + const value = row[i]?.trim() ?? ''; + if (value) hasValue = true; + record[headers[i]] = value; + } + if (hasValue) records.push(record); + } + return records; +} + +export function aggregateHeadcountByDepartment( + records: readonly Record[], + fallbackYear?: number, +): StatskontoretHeadcountRow[] { + const aggregate = new Map }>(); + for (const record of records) { + const lookup = buildRecordLookup(record); + const year = parseStatskontoretOptionalInt(findField(lookup, ['Ă„r', 'ar', 'year']) ?? '') ?? fallbackYear; + const department = findField(lookup, ['departement', 'departementstillhörighet', 'departementstillhorighet'])?.trim(); + const headcountValue = parseStatskontoretSwedishNumber(findField(lookup, ['Ă„rsarbetskrafter', 'arsarbetskrafter', 'Ă„a', 'aa']) ?? 
''); + if (!year || !department || headcountValue === undefined) continue; + const authority = findField(lookup, ['myndighet', 'myndighetsnamn', 'namn'])?.trim() ?? ''; + const key = `${year}::${department}`; + const current = aggregate.get(key) ?? { headcount: 0, authorities: new Set() }; + current.headcount += headcountValue; + if (authority) current.authorities.add(authority); + aggregate.set(key, current); + } + + return [...aggregate.entries()] + .map(([key, value]) => { + const [yearRaw, department] = key.split('::'); + return { + year: Number.parseInt(yearRaw, 10), + department, + headcount: roundOneDecimal(value.headcount), + authorityCount: value.authorities.size, + }; + }) + .sort((a, b) => a.year - b.year || a.department.localeCompare(b.department, 'sv')); +} + +export function buildHeadcountTimeSeries( + workbook: StatskontoretWorkbook, + options: StatskontoretHeadcountOptions = {}, +): StatskontoretHeadcountRow[] { + const sheet = options.sheetNamePattern + ? workbook.sheets.find((candidate) => options.sheetNamePattern?.test(candidate.name)) + : workbook.sheets.find((candidate) => /förteckning|forteckning/i.test(candidate.name)) ?? workbook.sheets[0]; + if (!sheet) return []; + return aggregateHeadcountByDepartment(rowsToRecords(sheet.rows), options.fallbackYear); +} + +/** + * Parse budget-outturn records into typed `StatskontoretBudgetRow` rows. + * + * Covers both `arsutfall` (annual, no month) and `manadsutfall` (monthly) as + * well as the `budget-time-series` XLSX series. Field names are normalised so + * Swedish characters and capitalisation differences are tolerated. 
+ *
+ * A record is skipped when it has neither a parsable year nor
+ * `options.fallbackYear`, or when its outturn cell is not numeric. `code` is
+ * never read from a column whose header ends in "namn"/"name", and `agency`
+ * only falls back to a column named exactly "Namn", so the descriptive title
+ * column cannot leak into either field through substring header matching.
+ *
+ * @param records Header-keyed string records, e.g. the output of `rowsToRecords`.
+ * @param options Fallback year/month and an optional forced `documentType`.
+ * @returns One row per accepted record, in input order, with `outturn` and
+ *   `budget` rounded to one decimal.
+ */
+export function parseBudgetRows(
+  records: readonly Record<string, string>[],
+  options: StatskontoretBudgetOptions = {},
+): StatskontoretBudgetRow[] {
+  const rows: StatskontoretBudgetRow[] = [];
+  for (const record of records) {
+    const lookup = buildRecordLookup(record);
+    const yearRaw = findField(lookup, ['år', 'ar', 'year', 'kalenderår', 'kalenderar']);
+    const year = parseStatskontoretOptionalInt(yearRaw ?? '') ?? options.fallbackYear;
+    if (!year) continue;
+
+    const monthRaw = findField(lookup, ['månad', 'manad', 'month', 'månadsperiod']);
+    const month = parseStatskontoretOptionalInt(monthRaw ?? '') ?? options.fallbackMonth;
+
+    // Trim the cell-derived label so padded cells do not end up as separate
+    // documentType groups downstream; a caller-forced label is used verbatim.
+    const docType =
+      options.documentType ??
+      findField(lookup, ['dokumenttyp', 'dokumenttype', 'typ', 'inkomst_utgift', 'inkomstutgift'])?.trim() ??
+      '';
+
+    const title =
+      // 'Inkomsttitelnamn' is the descriptive name; 'Inkomsttitel' is the numeric code.
+      // Check the name-specific candidates first to avoid shadowing by the code field.
+      findField(lookup, [
+        'inkomsttitelnamn', 'inkomsttitelgruppsnamn',
+        'anslagsnamn', 'utgiftsomradesnamn', 'utgiftsomrade',
+        'titel', 'name', 'namn', 'rubrik',
+      ])?.trim() ?? '';
+
+    // Lookup keys are already normalised headers. Hide the descriptive
+    // "...namn"/"...name" columns from the code search: without this, the
+    // substring fallback in findField returns 'Inkomsttitelnamn' for the
+    // candidate 'inkomsttitel' whenever the record has no real code column.
+    const codeLookup = new Map(
+      [...lookup].filter(([header]) => !header.endsWith('namn') && !header.endsWith('name')),
+    );
+    const code = findField(codeLookup, [
+      // 'inkomsttitel' is the numeric income-title code (e.g. 1111, 1211)
+      'inkomsttitel', 'inkomsttitelnummer', 'inkomsttitelnr',
+      'anslagsnr', 'anslagsnummer', 'anslagspost',
+      'utgiftsomradesnr', 'kod', 'code', 'nummer',
+    ])?.trim();
+
+    const outturnRaw = findField(lookup, [
+      'utfall', 'outturn', 'utfallmsek', 'utfallbelopp',
+      'inkomstutfall', 'utgiftsutfall', 'belopp',
+    ]);
+    const outturn = parseStatskontoretSwedishNumber(outturnRaw ?? '');
+    if (outturn === undefined) continue;
+
+    const budgetRaw = findField(lookup, [
+      'budget', 'budgetvarde', 'budgetvärde', 'anvisatbelopp',
+      'anvisat', 'statsbidrag', 'ramanslag',
+    ]);
+    const budget = parseStatskontoretSwedishNumber(budgetRaw ?? '');
+
+    // 'namn' is accepted as an exact header only: as a substring candidate it
+    // also hits 'Inkomsttitelnamn' / 'Anslagsnamn' and copies the title into
+    // `agency` for every record that has no authority column.
+    const agency = (
+      findField(lookup, ['myndighet', 'myndighetsnamn', 'authority']) ?? lookup.get('namn')
+    )?.trim();
+    const status = findField(lookup, ['status', 'utfallsstatus', 'preliminar', 'preliminär'])?.trim();
+
+    rows.push({
+      year,
+      ...(month !== undefined ? { month } : {}),
+      documentType: docType,
+      title,
+      ...(code ? { code } : {}),
+      outturn: roundOneDecimal(outturn),
+      ...(budget !== undefined ? { budget: roundOneDecimal(budget) } : {}),
+      ...(agency ? { agency } : {}),
+      ...(status ? { status } : {}),
+    });
+  }
+  return rows;
+}
+
+/**
+ * Parse all sheets in a budget-outturn workbook and return a flat array of
+ * typed rows sorted by year ascending, then month ascending (annual rows last
+ * for the same year), then documentType alphabetically. For single-type workbooks
+ * (e.g. a file explicitly downloaded as "Inkomst"), pass
+ * `options.documentType` to set the label uniformly.
+ *
+ * @param workbook Parsed workbook; every sheet is converted with `rowsToRecords` and parsed.
+ * @param options Forwarded to `parseBudgetRows`; a forced `documentType` wins over the sheet-name hint.
+ * @returns A newly built, sorted array of rows from all sheets.
+ */
+export function buildBudgetTimeSeries(
+  workbook: StatskontoretWorkbook,
+  options: StatskontoretBudgetOptions = {},
+): StatskontoretBudgetRow[] {
+  const rows: StatskontoretBudgetRow[] = [];
+  for (const sheet of workbook.sheets) {
+    // Derive a document-type hint from the sheet name when not forced by options
+    const sheetDocType = options.documentType ?? inferDocTypeFromSheetName(sheet.name);
+    const sheetOptions: StatskontoretBudgetOptions = {
+      ...options,
+      ...(sheetDocType ? { documentType: sheetDocType } : {}),
+    };
+    // Append row by row: spreading a whole sheet into push() passes every row
+    // as a call argument and can exceed the engine's argument limit.
+    for (const row of parseBudgetRows(rowsToRecords(sheet.rows), sheetOptions)) {
+      rows.push(row);
+    }
+  }
+  // Rows without a month compare as MAX_SAFE_INTEGER, which places annual rows
+  // after the monthly rows of the same year.
+  return rows.sort(
+    (a, b) =>
+      a.year - b.year ||
+      (a.month ?? Number.MAX_SAFE_INTEGER) - (b.month ?? Number.MAX_SAFE_INTEGER) ||
+      a.documentType.localeCompare(b.documentType, 'sv'),
+  );
+}
+
+/**
+ * Aggregate `StatskontoretBudgetRow` rows into per-year/documentType totals.
+ *
+ * Rows are grouped by `(year, documentType)`. `totalBudget` and `variance`
+ * are included only when every row in the group has a `budget` value.
+ *
+ * Returns results sorted by year ascending, then documentType alphabetically.
+ *
+ * @param rows Parsed budget rows; `outturn` is always read, `budget` only when present.
+ * @returns One summary per `(year, documentType)` group with the summed outturn and the row count.
+ */
+export function summarizeBudgetOutturn(
+  rows: readonly StatskontoretBudgetRow[],
+): StatskontoretBudgetSummary[] {
+  const groups = new Map<
+    string,
+    {
+      year: number;
+      documentType: string;
+      totalOutturn: number;
+      totalBudget: number;
+      allHaveBudget: boolean;
+      rowCount: number;
+    }
+  >();
+
+  for (const row of rows) {
+    const key = `${row.year}::${row.documentType}`;
+    const existing = groups.get(key);
+    if (existing) {
+      // Re-round after every addition so binary floating-point noise does not accumulate.
+      existing.totalOutturn = roundOneDecimal(existing.totalOutturn + row.outturn);
+      if (row.budget !== undefined) {
+        existing.totalBudget = roundOneDecimal(existing.totalBudget + row.budget);
+      } else {
+        // One row without a budget makes the group's budget total unusable.
+        existing.allHaveBudget = false;
+      }
+      existing.rowCount++;
+    } else {
+      groups.set(key, {
+        year: row.year,
+        documentType: row.documentType,
+        totalOutturn: row.outturn,
+        totalBudget: row.budget ?? 0,
+        allHaveBudget: row.budget !== undefined,
+        rowCount: 1,
+      });
+    }
+  }
+
+  return [...groups.values()]
+    .map((g): StatskontoretBudgetSummary => ({
+      year: g.year,
+      documentType: g.documentType,
+      totalOutturn: g.totalOutturn,
+      ...(g.allHaveBudget ? {
+        totalBudget: g.totalBudget,
+        variance: roundOneDecimal(g.totalOutturn - g.totalBudget),
+      } : {}),
+      rowCount: g.rowCount,
+    }))
+    .sort(
+      (a, b) => a.year - b.year || a.documentType.localeCompare(b.documentType, 'sv'),
+    );
+}
+
+/** Infer 'Inkomst' / 'Utgift' from common Swedish sheet-name patterns. */
+function inferDocTypeFromSheetName(name: string): string | undefined {
+  const n = name.toLowerCase();
+  if (n.includes('inkomst')) return 'Inkomst';
+  if (n.includes('utgift') || n.includes('anslag')) return 'Utgift';
+  return undefined;
+}
+
+// NOTE(review): the `<element` openers of the XML patterns in the functions
+// below were missing from this copy of the patch and are reconstructed from
+// their closing tags, the attributes read and the function names — confirm
+// them against the upstream file.
+
+/**
+ * List the sheets declared in `xl/workbook.xml` as (name, relationship id)
+ * pairs. Entries lacking either attribute are dropped.
+ */
+function parseWorkbookSheets(xml: string): Array<{ name: string; relationshipId: string }> {
+  const sheets: Array<{ name: string; relationshipId: string }> = [];
+  const sheetRe = /<sheet\b([^>]*)\/>/gi;
+  for (const match of xml.matchAll(sheetRe)) {
+    const attrs = parseXmlAttributes(match[1] ?? '');
+    const name = attrs.get('name');
+    const relationshipId = attrs.get('r:id') ?? attrs.get('id');
+    // parseXmlAttributes already entity-decodes attribute values; decoding the
+    // name a second time would turn an escaped ampersand sequence such as
+    // "&amp;lt;" into "<" instead of the literal text "&lt;".
+    if (name && relationshipId) sheets.push({ name, relationshipId });
+  }
+  return sheets;
+}
+
+/** Map relationship `Id` to `Target` for every self-closing `Relationship` element. */
+function parseWorkbookRelationships(xml: string): Map<string, string> {
+  const rels = new Map<string, string>();
+  const relRe = /<Relationship\b([^>]*)\/>/gi;
+  for (const match of xml.matchAll(relRe)) {
+    const attrs = parseXmlAttributes(match[1] ?? '');
+    const id = attrs.get('Id');
+    const target = attrs.get('Target');
+    if (id && target) rels.set(id, target);
+  }
+  return rels;
+}
+
+/** Return the text of every `si` entry in document order; empty input yields `[]`. */
+function parseSharedStrings(xml: string): string[] {
+  if (!xml) return [];
+  const strings: string[] = [];
+  const siRe = /<si\b[^>]*>([\s\S]*?)<\/si>/gi;
+  for (const match of xml.matchAll(siRe)) {
+    strings.push(extractTextNodes(match[1] ?? ''));
+  }
+  return strings;
+}
+
+/**
+ * Convert worksheet XML into dense rows of cell strings.
+ *
+ * @param xml Worksheet XML.
+ * @param sharedStrings Shared-string table used to resolve cells of type `s`.
+ * @returns One array per `row` element; gaps left by skipped columns hold `''`.
+ */
+function parseWorksheetRows(xml: string, sharedStrings: readonly string[]): string[][] {
+  const rows: string[][] = [];
+  const rowRe = /<row\b[^>]*>([\s\S]*?)<\/row>/gi;
+  // A self-closing cell (`<c r="B2" s="1"/>`) is matched by the first
+  // alternative. Treating it as an opening tag would make the lazy body run on
+  // to the next `</c>` and assign the following cell's value to this cell.
+  const cellRe = /<c\b([^>]*?)(?:\/>|>([\s\S]*?)<\/c>)/gi;
+  for (const rowMatch of xml.matchAll(rowRe)) {
+    const row: string[] = [];
+    for (const cellMatch of (rowMatch[1] ?? '').matchAll(cellRe)) {
+      const attrs = parseXmlAttributes(cellMatch[1] ?? '');
+      const ref = attrs.get('r') ?? '';
+      // Cells without a usable reference are appended after the last filled column.
+      const cellIndex = cellRefToColumnIndex(ref) ?? row.length;
+      row[cellIndex] = parseCellValue(cellMatch[2] ?? '', attrs.get('t'), sharedStrings);
+    }
+    // Densify the sparse row: cells with explicit refs (e.g. C5) can leave
+    // holes when intermediate columns are absent; `Array.prototype.map` skips
+    // those holes, so downstream `rowsToRecords` would receive misaligned
+    // columns. Iterate every index up to the max set position to fill gaps.
+    rows.push(Array.from({ length: row.length }, (_, i) => row[i] ?? ''));
+  }
+  return rows;
+}
+
+/**
+ * Resolve one cell's display string from its inner XML.
+ *
+ * @param xml Inner XML of the cell element.
+ * @param type The cell's `t` attribute (`inlineStr`, `s`, or anything else).
+ * @param sharedStrings Table indexed by the `v` value when `type` is `s`.
+ * @returns The cell text, or `''` when there is no value or the index is out of range.
+ */
+function parseCellValue(xml: string, type: string | undefined, sharedStrings: readonly string[]): string {
+  if (type === 'inlineStr') return extractTextNodes(xml);
+  const value = firstXmlTagValue(xml, 'v');
+  if (value === undefined) return '';
+  if (type === 's') return sharedStrings[Number.parseInt(value, 10)] ?? '';
+  // firstXmlTagValue has already entity-decoded the text; decoding it again
+  // would corrupt values that legitimately contain an escaped entity.
+  return value;
+}
+
+/**
+ * Find the index of the row that most likely holds column headers.
+ *
+ * A row qualifies when at least two of the headcount signals, or at least two
+ * of the budget signals, appear among its normalised cells. Failing that, the
+ * first row with two or more non-blank cells is used.
+ *
+ * @returns The row index, or `-1` when no row qualifies.
+ */
+function findLikelyHeaderRow(rows: readonly (readonly string[])[]): number {
+  for (let i = 0; i < rows.length; i++) {
+    const normalized = rows[i].map(normalizeKey);
+    // Headcount (myndighetsförteckning) signals
+    const headcountScore = [
+      normalized.some((cell) => cell.includes('myndighet')),
+      normalized.some((cell) => cell.includes('departement')),
+      normalized.some((cell) => cell.includes('arsarbetskrafter') || cell === 'aa'),
+      normalized.some((cell) => cell === 'ar' || cell === 'year'),
+    ].filter(Boolean).length;
+    if (headcountScore >= 2) return i;
+    // Budget-outturn (årsutfall / månadsutfall / budget-time-series) signals
+    const budgetScore = [
+      normalized.some((cell) => cell.includes('utfall') || cell.includes('outturn')),
+      normalized.some((cell) =>
+        cell.includes('inkomst') || cell.includes('utgift') || cell.includes('anslag'),
+      ),
+      normalized.some((cell) => cell === 'ar' || cell.includes('kalenderar') || cell === 'year'),
+      normalized.some((cell) => cell.includes('budget') || cell.includes('belopp')),
+    ].filter(Boolean).length;
+    if (budgetScore >= 2) return i;
+  }
+  return rows.findIndex((row) => row.filter((cell) => cell.trim()).length >= 2);
+}
+
+/** Index a record by its normalised header so lookups ignore case, accents and punctuation. */
+function buildRecordLookup(record: Record<string, string>): Map<string, string> {
+  const lookup = new Map<string, string>();
+  for (const [key, value] of Object.entries(record)) {
+    lookup.set(normalizeKey(key), value);
+  }
+  return lookup;
+}
+
+/**
+ * Look a value up by a priority-ordered list of header candidates.
+ *
+ * Pass 1 returns the first candidate that equals a header exactly. Pass 2
+ * falls back to substring matching and also walks the candidates in priority
+ * order, so an earlier (more specific) candidate wins over a later generic
+ * one regardless of column order.
+ *
+ * @param lookup Values keyed by normalised header.
+ * @param candidates Header names, most specific first; normalised before use.
+ * @returns The matched value, or `undefined` when nothing matches.
+ */
+function findField(lookup: ReadonlyMap<string, string>, candidates: readonly string[]): string | undefined {
+  const normalizedCandidates = candidates.map((candidate) => normalizeKey(candidate));
+  for (const candidate of normalizedCandidates) {
+    const exact = lookup.get(candidate);
+    if (exact !== undefined) return exact;
+  }
+  for (const candidate of normalizedCandidates) {
+    // A candidate that normalises to '' is a substring of every header; skip it.
+    if (!candidate) continue;
+    for (const [key, value] of lookup) {
+      if (key.includes(candidate)) return value;
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Parse a Swedish-formatted number such as `1 234,5`, `1.234,5` or `−12,5`.
+ *
+ * When a decimal comma is present, dots are treated as thousands separators;
+ * without a comma the text is handed to `parseFloat` as is.
+ *
+ * @returns The parsed number, or `undefined` when the text is not numeric.
+ */
+export function parseStatskontoretSwedishNumber(value: string): number | undefined {
+  // `\s` also removes no-break spaces used as thousands separators; U+2212 is
+  // the typographic minus sign, which parseFloat does not understand.
+  const compact = value.replace(/\s/g, '').replace(/\u2212/g, '-');
+  const normalized = compact.includes(',')
+    ? compact.replace(/\./g, '').replace(',', '.')
+    : compact;
+  const parsed = Number.parseFloat(normalized);
+  return Number.isFinite(parsed) ? parsed : undefined;
+}
+
+/** Parse a base-10 integer; `null`, `''` and non-numeric text yield `undefined`. */
+export function parseStatskontoretOptionalInt(value: string | null): number | undefined {
+  if (!value) return undefined;
+  const parsed = Number.parseInt(value, 10);
+  return Number.isFinite(parsed) ? parsed : undefined;
+}
+
+/**
+ * Classify a download link from its href and its link text. Checks run in
+ * order and the first hit wins: Excel, CSV-in-ZIP, plain ZIP, document,
+ * otherwise `'unknown'`.
+ */
+export function classifyStatskontoretResource(href: string, text: string): StatskontoretResourceType {
+  const haystack = `${href} ${text}`.toLowerCase();
+  if (haystack.includes('filetype=excel') || /\.xlsx(?:$|[?#])/i.test(href) || /\bexcel\b/i.test(text)) return 'excel';
+  if (haystack.includes('filetype=zip') && /\bcsv\b/i.test(text)) return 'csv-zip';
+  if (/\.zip(?:$|[?#])/i.test(href)) return /\bcsv\b/i.test(text) ? 'csv-zip' : 'zip';
+  if (/\b(csv|zip)\b/i.test(text) && href.includes('GetFile')) return 'csv-zip';
+  if (/\.(docx|pdf)(?:$|[?#])/i.test(href)) return 'document';
+  // NOTE(review): this branch and the final return both yield 'unknown', so the
+  // condition currently has no effect on the result.
+  if (FILE_EXTENSION_RE.test(href) || href.includes('GetFile')) return 'unknown';
+  return 'unknown';
+}
+
+/** Keep the first link seen for each `url`, preserving input order. */
+function deduplicateLinks(links: readonly StatskontoretDownloadLink[]): StatskontoretDownloadLink[] {
+  const seen = new Set<string>();
+  const out: StatskontoretDownloadLink[] = [];
+  for (const link of links) {
+    if (seen.has(link.url)) continue;
+    seen.add(link.url);
+    out.push(link);
+  }
+  return out;
+}
+
+/** Resolve a possibly relative, HTML-escaped href against `baseURL` into an absolute URL string. */
+function resolveStatskontoretUrl(url: string, baseURL: string): string {
+  return new URL(decodeHtml(url), `${trimTrailingSlash(baseURL)}/`).toString();
+}
+
+/**
+ * Validate that an outbound URL targets the Statskontoret allowlisted host
+ * over HTTPS before issuing a fetch. Mirrors the firewall allowlist documented
+ * in `analysis/statskontoret/indicators-inventory.json` so absolute URLs from
+ * untrusted callers cannot redirect the client to arbitrary hosts.
+ *
+ * Only the hostname is compared with `baseURL`; the port is not checked.
+ *
+ * @param url Absolute URL about to be fetched.
+ * @param baseURL Base URL whose hostname is the single permitted host; defaults to `STATSKONTORET_BASE_URL`.
+ * @returns The parsed `URL` when it passes every check.
+ * @throws StatskontoretError of kind `http` when `url` cannot be parsed, is
+ *   not `https:`, or has a hostname other than that of `baseURL`.
+ */
+export function assertStatskontoretFetchTarget(url: string, baseURL: string = STATSKONTORET_BASE_URL): URL {
+  let parsed: URL;
+  try {
+    parsed = new URL(url);
+  } catch {
+    throw new StatskontoretError(`Invalid Statskontoret URL: ${url}`, 'http');
+  }
+  if (parsed.protocol !== 'https:') {
+    throw new StatskontoretError(`Statskontoret fetch must use https: ${url}`, 'http');
+  }
+  const allowedHost = new URL(baseURL).hostname;
+  if (parsed.hostname !== allowedHost) {
+    throw new StatskontoretError(
+      `Statskontoret fetch host ${parsed.hostname} not in allowlist (${allowedHost})`,
+      'http',
+    );
+  }
+  return parsed;
+}
+
+/** Remove every trailing `/` from `value`. */
+function trimTrailingSlash(value: string): string {
+  return value.replace(/\/+$/, '');
+}
+
+/** Collapse each whitespace run to a single space and trim both ends. */
+function normalizeWhitespace(value: string): string {
+  return value.replace(/\s+/g, ' ').trim();
+}
+
+/**
+ * Reduce a header to a comparison key: lower-case, decompose accented letters
+ * (NFD) and drop the combining marks, then remove everything that is not
+ * `a-z` or `0-9` (so "År" becomes "ar").
+ */
+function normalizeKey(value: string): string {
+  return value
+    .toLowerCase()
+    .normalize('NFD')
+    .replace(/[\u0300-\u036f]/g, '')
+    .replace(/[^a-z0-9]+/g, '');
+}
+
+/** Round to one decimal place. */
+function roundOneDecimal(value: number): number {
+  return Math.round(value * 10) / 10;
+}
+
+/**
+ * Convert the column letters of a cell reference (e.g. `C5`) to a zero-based
+ * column index; returns `undefined` when the reference has no leading letters.
+ */
+function cellRefToColumnIndex(ref: string): number | undefined {
+  const letters = ref.match(/^[A-Z]+/i)?.[0];
+  if (!letters) return undefined;
+  let index = 0;
+  for (const char of letters.toUpperCase()) {
+    // Excel columns are bijective base-26 labels; keep a one-based accumulator
+    // (A=1, Z=26, AA=27) and convert to a zero-based array index below.
+    index = index * 26 + (char.charCodeAt(0) - 65 + 1);
+  }
+  return index - 1;
+}
+
+/**
+ * Collect `name="value"` / `name='value'` pairs from the inside of a tag,
+ * entity-decoding each value with `decodeXml`.
+ *
+ * NOTE(review): the value pattern excludes both quote characters, so a value
+ * that contains the other kind of quote (e.g. `name="O'Brien"`) is cut at that
+ * quote. The type arguments of `Map` also appear to be missing in this copy.
+ */
+function parseXmlAttributes(input: string): Map {
+  const attrs = new Map();
+  const attrRe = /([\w:-]+)=["']([^"']*)["']/g;
+  for (const match of input.matchAll(attrRe)) {
+    attrs.set(match[1], decodeXml(match[2] ?? ''));
+  }
+  return attrs;
+}
+
+/**
+ * Return the entity-decoded inner text of the first `tag` element found in
+ * `xml` (case-insensitive), or `undefined` when there is none. `tag` is
+ * interpolated into the pattern unescaped.
+ */
+function firstXmlTagValue(xml: string, tag: string): string | undefined {
+  const match = new RegExp(`<${tag}[^>]*>([\\s\\S]*?)<\\/${tag}>`, 'i').exec(xml);
+  return match ? decodeXml(match[1] ?? '') : undefined;
+}
+
+/**
+ * Concatenate the decoded contents of every text element matched by `textRe`.
+ *
+ * NOTE(review): the opening part of the `textRe` literal is missing in this
+ * copy of the patch (only the tail up to the closing `t` tag survives) —
+ * restore it from upstream.
+ */
+function extractTextNodes(xml: string): string {
+  const parts: string[] = [];
+  const textRe = /]*>([\s\S]*?)<\/t>/gi;
+  for (const match of xml.matchAll(textRe)) {
+    parts.push(decodeXml(match[1] ?? ''));
+  }
+  return parts.join('');
+}
+
+/**
+ * Read one archive entry as a string.
+ *
+ * @throws StatskontoretError of kind `workbook` when `path` is not in the archive.
+ */
+async function readZipText(zip: JSZip, path: string): Promise {
+  const file = zip.file(path);
+  if (!file) throw new StatskontoretError(`Statskontoret workbook missing ${path}`, 'workbook');
+  return file.async('string');
+}
+
+// NOTE(review): the definition below is truncated in this copy of the patch.
+// The text after `const match = /` runs straight into the tail of the usage
+// comment of scripts/statskontoret-fetch.ts; the rest of this function and the
+// start of that file's diff must be restored from upstream.
+function extractPageLastModified(html: string): string | undefined {
+  const match = / [--persist]
+ * tsx scripts/statskontoret-fetch.ts budget-outturn --url --source arsutfall [--doc-type Inkomst] [--persist]
+ */
+
+import path from 'node:path';
+import { pathToFileURL } from 'node:url';
+
+import {
+  buildBudgetTimeSeries,
+  buildHeadcountTimeSeries,
+  getStatskontoretSource,
+  STATSKONTORET_SOURCES,
+  StatskontoretClient,
+  StatskontoretError,
+  type StatskontoretSourceKey,
+} from './statskontoret-client.js';
+import { persistStatskontoretData } from './parliamentary-data/data-persistence.js';
+
+/** Result of `parseStatskontoretArgs`: the command, `--flag value` pairs, and value-less flags. */
+interface ParsedArgs {
+  readonly command: 'list-sources' | 'discover' | 'headcount' | 'budget-outturn' | 'help';
+  readonly flags: ReadonlyMap;
+  readonly booleans: ReadonlySet;
+}
+
+// Usage text printed by the `help` command and whenever no command is given.
+const HELP = `tsx scripts/statskontoret-fetch.ts [flags]
+
+Commands:
+ list-sources Print the built-in Statskontoret source catalogue
+ discover Extract downloadable Excel/CSV-ZIP links from a source page
+ headcount Fetch an authority-register workbook and aggregate headcount by department/year
+ budget-outturn Fetch a budget-outturn workbook (Ă„rsutfall / mĂ„nadsutfall / tidsserier) and parse rows
+ help Show this message
+
+Flags:
+ --source Source key: myndighetsforteckning | budget-time-series | arsutfall | manadsutfall
+ --url Direct Excel workbook URL for headcount / budget-outturn commands
+ --doc-type Override documentType label for budget-outturn (e.g. Inkomst | Utgift)
+ --persist Write raw/derived output under analysis/data/statskontoret/
+`;
+
+/**
+ * Split CLI arguments into a command, valued flags and boolean flags.
+ *
+ * The first argument is the command (default `help`). Every later token must
+ * start with `--`; a flag followed by a token that does not start with `--`
+ * takes that token as its value, otherwise it is recorded as a boolean.
+ *
+ * NOTE(review): as written, a flag value cannot itself begin with `--`, and
+ * `--key=value` is stored as a boolean named `key=value`.
+ *
+ * @param argv Arguments after the script name.
+ * @throws StatskontoretError of kind `cli` for an unknown command or a stray positional argument.
+ */
+export function parseStatskontoretArgs(argv: readonly string[]): ParsedArgs {
+  const command = (argv[0] ?? 'help') as ParsedArgs['command'];
+  const validCommands: readonly ParsedArgs['command'][] = [
+    'list-sources', 'discover', 'headcount', 'budget-outturn', 'help',
+  ];
+  if (!validCommands.includes(command)) {
+    throw new StatskontoretError(`unknown command ${command}`, 'cli');
+  }
+  const flags = new Map();
+  const booleans = new Set();
+  for (let i = 1; i < argv.length; i++) {
+    const token = argv[i];
+    if (!token.startsWith('--')) {
+      throw new StatskontoretError(`unexpected positional argument ${token}`, 'cli');
+    }
+    const key = token.slice(2);
+    const next = argv[i + 1];
+    if (next !== undefined && !next.startsWith('--')) {
+      flags.set(key, next);
+      i++; // the value token has been consumed
+    } else {
+      booleans.add(key);
+    }
+  }
+  return { command, flags, booleans };
+}
+
+/**
+ * Return the value of `--key`.
+ *
+ * @throws StatskontoretError of kind `cli` when the flag is absent or empty.
+ */
+export function requireStatskontoretFlag(flags: ReadonlyMap, key: string): string {
+  const value = flags.get(key);
+  if (!value) {
+    throw new StatskontoretError(`missing required flag --${key}`, 'cli');
+  }
+  return value;
+}
+
+/**
+ * Check `value` against the keys in `STATSKONTORET_SOURCES`.
+ *
+ * @throws StatskontoretError of kind `cli` when no source has that key.
+ */
+export function parseStatskontoretSource(value: string): StatskontoretSourceKey {
+  if (STATSKONTORET_SOURCES.some((source) => source.key === value)) return value as StatskontoretSourceKey;
+  throw new StatskontoretError(`unknown source ${value}`, 'cli');
+}
+
+/** `discover`: print the download links of `--source` as JSON and persist them when `--persist` is set. */
+async function runDiscover(flags: ReadonlyMap, booleans: ReadonlySet): Promise {
+  const source = parseStatskontoretSource(requireStatskontoretFlag(flags, 'source'));
+  const client = new StatskontoretClient();
+  const links = await client.discoverDownloads(source);
+  const payload = { source: getStatskontoretSource(source), links };
+  process.stdout.write(`${JSON.stringify(payload, null, 2)}\n`);
+  if (booleans.has('persist')) {
+    persistStatskontoretData(source, 'downloads', payload);
+  }
+}
+
+/**
+ * `headcount`: fetch the workbook at `--url`, aggregate headcount and print it
+ * as JSON; persists the payload when `--persist` is set.
+ */
+async function runHeadcount(flags: ReadonlyMap, booleans: ReadonlySet): Promise {
+  const url = requireStatskontoretFlag(flags, 'url');
+  const client = new StatskontoretClient();
+  const workbook = await client.fetchWorkbook(url);
+  // NOTE(review): passing the pattern explicitly means buildHeadcountTimeSeries
+  // does not fall back to the first sheet when no sheet name matches.
+  const headcount = buildHeadcountTimeSeries(workbook, { sheetNamePattern: /förteckning|forteckning/i });
+  const payload = { source: 'myndighetsforteckning', url, headcount };
+  process.stdout.write(`${JSON.stringify(payload, null, 2)}\n`);
+  if (booleans.has('persist')) {
+    persistStatskontoretData('myndighetsforteckning', 'headcount-by-department', payload);
+  }
+}
+
+/**
+ * `budget-outturn`: fetch the workbook at `--url`, parse its budget rows and
+ * print them as JSON. `--source` is required and must not be
+ * `myndighetsforteckning`; `--doc-type` optionally forces the row label.
+ */
+async function runBudgetOutturn(flags: ReadonlyMap, booleans: ReadonlySet): Promise {
+  const url = requireStatskontoretFlag(flags, 'url');
+  const source = parseStatskontoretSource(requireStatskontoretFlag(flags, 'source'));
+  if (source === 'myndighetsforteckning') {
+    throw new StatskontoretError(
+      'budget-outturn command is for arsutfall | manadsutfall | budget-time-series, not myndighetsforteckning',
+      'cli',
+    );
+  }
+  const docType = flags.get('doc-type');
+  const client = new StatskontoretClient();
+  const workbook = await client.fetchWorkbook(url);
+  const rows = buildBudgetTimeSeries(workbook, { ...(docType ? { documentType: docType } : {}) });
+  const payload = { source, url, ...(docType ? { documentType: docType } : {}), rows };
+  process.stdout.write(`${JSON.stringify(payload, null, 2)}\n`);
+  if (booleans.has('persist')) {
+    // NOTE(review): the user-supplied doc type becomes part of the artifact
+    // name unchanged apart from lower-casing — confirm that
+    // persistStatskontoretData rejects path separators.
+    const artifact = docType
+      ? `budget-outturn-${docType.toLowerCase()}`
+      : 'budget-outturn';
+    persistStatskontoretData(source, artifact, payload);
+  }
+}
+
+/** Parse `process.argv` and dispatch to the selected command; `help` prints the usage text. */
+async function main(): Promise {
+  const { command, flags, booleans } = parseStatskontoretArgs(process.argv.slice(2));
+  switch (command) {
+    case 'list-sources':
+      process.stdout.write(`${JSON.stringify({ sources: STATSKONTORET_SOURCES }, null, 2)}\n`);
+      return;
+    case 'discover':
+      await runDiscover(flags, booleans);
+      return;
+    case 'headcount':
+      await runHeadcount(flags, booleans);
+      return;
+    case 'budget-outturn':
+      await runBudgetOutturn(flags, booleans);
+      return;
+    case 'help':
+    default:
+      process.stdout.write(HELP);
+  }
+}
+
+/** True when this module is the script named in `process.argv[1]` rather than an import. */
+function isDirectExecution(): boolean {
+  const entry = process.argv[1];
+  if (!entry) return false;
+  try {
+    return import.meta.url === pathToFileURL(path.resolve(entry)).href;
+  } catch {
+    // `pathToFileURL` throws on malformed paths; `path.resolve` is used to
+    // normalise the entry first so most runners reach the comparison, and the
+    // catch keeps the module import-safe across exotic launchers.
+    return false;
+  }
+}
+
+// Run the CLI only on direct execution. CLI contract errors exit with code 2,
+// every other failure with code 1.
+if (isDirectExecution()) {
+  main().catch((error: unknown) => {
+    const message = error instanceof Error ? error.message : String(error);
+    process.stderr.write(`statskontoret-fetch: ${message}\n`);
+    process.exit(error instanceof StatskontoretError && error.kind === 'cli' ? 2 : 1);
+  });
+}
diff --git a/tests/statskontoret-client.test.ts b/tests/statskontoret-client.test.ts
new file mode 100644
index 0000000000..4d04084dfc
--- /dev/null
+++ b/tests/statskontoret-client.test.ts
@@ -0,0 +1,501 @@
+/**
+ * Tests for Statskontoret client and parsers.
+ *
+ * No live network calls — link discovery and XLSX/CSV ZIP parsing use local
+ * fixtures.
+ */ + +import { describe, it, expect } from 'vitest'; +import JSZip from 'jszip'; +import { + aggregateHeadcountByDepartment, + buildBudgetTimeSeries, + buildHeadcountTimeSeries, + extractStatskontoretDownloadLinks, + getStatskontoretSource, + parseStatskontoretCsvZip, + parseStatskontoretXlsx, + parseBudgetRows, + rowsToRecords, + StatskontoretClient, + StatskontoretError, + summarizeBudgetOutturn, +} from '../scripts/statskontoret-client.js'; + +describe('Statskontoret link discovery', () => { + it('extracts Excel and CSV ZIP GetFile links with provenance parameters', () => { + const html = ` + + Excel (366,1 kB) + Csv (152,3 kB) + `; + + const links = extractStatskontoretDownloadLinks( + html, + 'arsutfall', + 'https://www.statskontoret.se/analys-och-statistik/oppna-data/arsutfall/', + ); + + expect(links).toHaveLength(2); + expect(links[0]).toMatchObject({ + source: 'arsutfall', + resourceType: 'excel', + documentType: 'Inkomst', + fileType: 'Excel', + year: 2025, + month: 0, + status: 'PreliminĂ€r 1', + updatedAt: '2026-03-24', + }); + expect(links[1].resourceType).toBe('csv-zip'); + expect(links[1].url).toContain('fileType=Zip'); + }); +}); + +describe('Statskontoret workbook parsing', () => { + it('parses XLSX rows and builds department headcount time series', async () => { + const workbook = await parseStatskontoretXlsx(await createWorkbookFixture()); + expect(workbook.sheets.map((sheet) => sheet.name)).toEqual(['Förteckning 2007–2025']); + + const records = rowsToRecords(workbook.sheets[0].rows); + expect(records).toHaveLength(4); + + const headcount = buildHeadcountTimeSeries(workbook); + expect(headcount).toEqual([ + { year: 2024, department: 'Finansdepartementet', headcount: 42.5, authorityCount: 1 }, + { year: 2025, department: 'Finansdepartementet', headcount: 45.5, authorityCount: 2 }, + { year: 2025, department: 'Justitiedepartementet', headcount: 20, authorityCount: 1 }, + ]); + }); + + it('aggregates records with Swedish decimal comma values', () 
=> { + const rows = aggregateHeadcountByDepartment([ + { + År: '2025', + Myndighet: 'Myndighet A', + Departementstillhörighet: 'Klimat- och nĂ€ringslivsdepartementet', + Årsarbetskrafter: '10,5', + }, + { + År: '2025', + Myndighet: 'Myndighet B', + Departementstillhörighet: 'Klimat- och nĂ€ringslivsdepartementet', + Årsarbetskrafter: '1.234,5', + }, + ]); + + expect(rows).toEqual([ + { + year: 2025, + department: 'Klimat- och nĂ€ringslivsdepartementet', + headcount: 1245, + authorityCount: 2, + }, + ]); + }); +}); + +describe('Statskontoret CSV ZIP parsing', () => { + it('extracts CSV files from ZIP archives', async () => { + const zip = new JSZip(); + zip.file('utfall.csv', 'År;Myndighet;Utfall\n2025;A;100\n'); + zip.file('readme.txt', 'ignored'); + const content = await zip.generateAsync({ type: 'uint8array' }); + + const csv = await parseStatskontoretCsvZip(content); + expect(csv).toEqual({ 'utfall.csv': 'År;Myndighet;Utfall\n2025;A;100\n' }); + }); +}); + +describe('StatskontoretClient', () => { + it('uses injected fetch for source discovery', async () => { + const fetchFn = async () => new Response('Excel', { status: 200 }); + const client = new StatskontoretClient({ fetchFn: fetchFn as typeof fetch }); + const links = await client.discoverDownloads('myndighetsforteckning'); + expect(links[0].url).toBe('https://www.statskontoret.se/file.xlsx'); + }); + + it('allows custom HTTPS baseURL hosts through the fetch guard', async () => { + let requestedUrl = ''; + const fetchFn = async (input: RequestInfo | URL) => { + requestedUrl = String(input); + return new Response('ok', { status: 200 }); + }; + const client = new StatskontoretClient({ + baseURL: 'https://staging.statskontoret.test', + fetchFn: fetchFn as typeof fetch, + }); + + await expect(client.fetchText('/page')).resolves.toBe('ok'); + expect(requestedUrl).toBe('https://staging.statskontoret.test/page'); + }); + + it('wraps network failures in typed http errors with the original cause', async () => { + 
const cause = new Error('socket closed'); + const fetchFn = async () => { + throw cause; + }; + const client = new StatskontoretClient({ fetchFn: fetchFn as typeof fetch }); + + let caught: StatskontoretError | undefined; + try { + await client.fetchText('https://www.statskontoret.se/down'); + } catch (error) { + caught = error as StatskontoretError; + } + + expect(caught).toBeInstanceOf(StatskontoretError); + expect(caught?.kind).toBe('http'); + expect(caught?.message).toContain('socket closed'); + expect(caught?.cause).toBe(cause); + }); + + it('densifies sparse worksheet rows so column alignment is preserved', async () => { + // Worksheet with explicit cell refs that skip column B, leaving a hole at + // index 1; densification must fill the gap with '' so headers stay aligned. + const zip = new JSZip(); + zip.file('[Content_Types].xml', ''); + zip.file('xl/workbook.xml', ` + + + `); + zip.file('xl/_rels/workbook.xml.rels', ` + + + `); + zip.file('xl/worksheets/sheet1.xml', ` + + + h1h3 + + `); + const workbook = await parseStatskontoretXlsx(await zip.generateAsync({ type: 'uint8array' })); + expect(workbook.sheets[0].rows[0]).toEqual(['h1', '', 'h3']); + }); +}); + +describe('parseBudgetRows', () => { + it('parses annual income outturn records (Ă„rsutfall Inkomst)', () => { + const records = [ + { År: '2024', Inkomsttitel: '1111', Inkomsttitelnamn: 'Skatt pĂ„ inkomst', Utfall: '500000', Budget: '480000' }, + { År: '2024', Inkomsttitel: '1211', Inkomsttitelnamn: 'MervĂ€rdesskatt', Utfall: '750000', Budget: '700000' }, + ]; + const rows = parseBudgetRows(records, { documentType: 'Inkomst' }); + expect(rows).toHaveLength(2); + expect(rows[0]).toMatchObject({ + year: 2024, + documentType: 'Inkomst', + title: 'Skatt pĂ„ inkomst', + code: '1111', + outturn: 500000, + budget: 480000, + }); + expect(rows[0].month).toBeUndefined(); + }); + + it('parses annual expenditure outturn records (Ă„rsutfall Utgift)', () => { + const records = [ + { År: '2024', Anslagsnamn: 
'Riksdagen', Anslagsnr: '1:1', Utfall: '1200', Budget: '1100', Myndighet: 'Riksdagen' }, + ]; + const rows = parseBudgetRows(records, { documentType: 'Utgift' }); + expect(rows[0]).toMatchObject({ + year: 2024, + documentType: 'Utgift', + title: 'Riksdagen', + code: '1:1', + outturn: 1200, + budget: 1100, + agency: 'Riksdagen', + }); + }); + + it('parses monthly outturn records (mĂ„nadsutfall) with month column', () => { + const records = [ + { År: '2025', MĂ„nad: '3', Inkomsttitelnamn: 'Skatter', Utfall: '42000', Typ: 'Inkomst' }, + ]; + const rows = parseBudgetRows(records); + expect(rows[0]).toMatchObject({ year: 2025, month: 3, documentType: 'Inkomst', outturn: 42000 }); + }); + + it('uses fallback year when the record has no year column', () => { + const records = [{ Inkomsttitelnamn: 'Skatt', Utfall: '100' }]; + const rows = parseBudgetRows(records, { fallbackYear: 2023, documentType: 'Inkomst' }); + expect(rows[0].year).toBe(2023); + }); + + it('skips records missing an outturn value', () => { + const records = [ + { År: '2024', Inkomsttitelnamn: 'Titel', Utfall: '' }, + { År: '2024', Inkomsttitelnamn: 'Titel2', Utfall: '100' }, + ]; + expect(parseBudgetRows(records)).toHaveLength(1); + }); + + it('normalises Swedish decimal commas', () => { + const records = [{ År: '2024', Inkomsttitelnamn: 'X', Utfall: '1.234,5' }]; + expect(parseBudgetRows(records)[0].outturn).toBe(1234.5); + }); +}); + +describe('buildBudgetTimeSeries', () => { + it('derives documentType from sheet name and parses all sheets', async () => { + const zip = new JSZip(); + zip.file('[Content_Types].xml', ''); + zip.file('xl/workbook.xml', ` + + + + + + `); + zip.file('xl/_rels/workbook.xml.rels', ` + + + + `); + zip.file('xl/sharedStrings.xml', ` + + ${['Inkomsttitelnamn', 'Utfall', 'Skatt', 'Anslagsnamn', 'Utfall', 'Riksdagen'].map((v) => `${v}`).join('')} + `); + // Inkomst sheet + zip.file('xl/worksheets/sheet1.xml', ` + + + 01 + 2500 + + `); + // Utgift sheet + 
zip.file('xl/worksheets/sheet2.xml', ` + + + 34 + 51200 + + `); + const workbook = await parseStatskontoretXlsx(await zip.generateAsync({ type: 'uint8array' })); + const rows = buildBudgetTimeSeries(workbook, { fallbackYear: 2024 }); + expect(rows.find((r) => r.documentType === 'Inkomst')).toMatchObject({ title: 'Skatt', outturn: 500 }); + expect(rows.find((r) => r.documentType === 'Utgift')).toMatchObject({ title: 'Riksdagen', outturn: 1200 }); + }); + + it('sorts output by year then month then documentType', () => { + const rows = parseBudgetRows( + [ + { År: '2025', MĂ„nad: '2', Inkomsttitelnamn: 'B', Utfall: '10', Typ: 'Utgift' }, + { År: '2024', Inkomsttitelnamn: 'A', Utfall: '20', Typ: 'Inkomst' }, + { År: '2025', MĂ„nad: '1', Inkomsttitelnamn: 'C', Utfall: '30', Typ: 'Inkomst' }, + ], + ); + // Verify the sort contract that buildBudgetTimeSeries applies + const sorted = [...rows].sort( + (a, b) => + a.year - b.year || + (a.month ?? Number.MAX_SAFE_INTEGER) - (b.month ?? Number.MAX_SAFE_INTEGER) || + a.documentType.localeCompare(b.documentType, 'sv'), + ); + // Ensure the sort is stable: 2024 first, then 2025/month-1, then 2025/month-2 + expect(sorted[0].year).toBe(2024); + expect(sorted[1]).toMatchObject({ year: 2025, month: 1 }); + expect(sorted[2]).toMatchObject({ year: 2025, month: 2 }); + }); + + it('forces documentType when options.documentType overrides sheet-name inference', () => { + const rows = parseBudgetRows( + [{ År: '2025', Anslagsnamn: 'Polismyndigheten', Utfall: '55000' }], + { documentType: 'Utgift' }, + ); + expect(rows[0].documentType).toBe('Utgift'); + }); +}); + +describe('summarizeBudgetOutturn', () => { + it('aggregates rows into per-year/documentType totals with variance', () => { + const rows = parseBudgetRows([ + { År: '2024', Inkomsttitelnamn: 'Skatt', Utfall: '500000', Budget: '480000', Typ: 'Inkomst' }, + { År: '2024', Inkomsttitelnamn: 'Moms', Utfall: '200000', Budget: '190000', Typ: 'Inkomst' }, + { År: '2024', Anslagsnamn: 
'Polis', Utfall: '80000', Budget: '75000', Typ: 'Utgift' }, + ]); + const summary = summarizeBudgetOutturn(rows); + const income = summary.find((s) => s.documentType === 'Inkomst'); + expect(income).toMatchObject({ + year: 2024, + totalOutturn: 700000, + totalBudget: 670000, + variance: 30000, + rowCount: 2, + }); + const expenditure = summary.find((s) => s.documentType === 'Utgift'); + expect(expenditure).toMatchObject({ year: 2024, totalOutturn: 80000, rowCount: 1 }); + }); + + it('omits totalBudget and variance when any row lacks a budget value', () => { + const rows = parseBudgetRows([ + { År: '2024', Inkomsttitelnamn: 'Skatt', Utfall: '500', Budget: '480', Typ: 'Inkomst' }, + { År: '2024', Inkomsttitelnamn: 'Tull', Utfall: '100', Typ: 'Inkomst' }, + ]); + const [summary] = summarizeBudgetOutturn(rows); + expect(summary.totalBudget).toBeUndefined(); + expect(summary.variance).toBeUndefined(); + expect(summary.totalOutturn).toBe(600); + }); + + it('returns results sorted by year then documentType', () => { + const rows = parseBudgetRows([ + { År: '2024', Anslagsnamn: 'A', Utfall: '1', Typ: 'Utgift' }, + { År: '2023', Inkomsttitelnamn: 'B', Utfall: '2', Typ: 'Inkomst' }, + { År: '2024', Inkomsttitelnamn: 'C', Utfall: '3', Typ: 'Inkomst' }, + ]); + const summary = summarizeBudgetOutturn(rows); + expect(summary.map((s) => `${s.year}/${s.documentType}`)).toEqual([ + '2023/Inkomst', '2024/Inkomst', '2024/Utgift', + ]); + }); + + it('returns empty array for empty input', () => { + expect(summarizeBudgetOutturn([])).toEqual([]); + }); +}); + +describe('getStatskontoretSource', () => { + it('returns the source definition for a valid key', () => { + const src = getStatskontoretSource('arsutfall'); + expect(src.key).toBe('arsutfall'); + expect(src.title).toContain('Årsutfall'); + }); + + it('throws a typed StatskontoretError for an unknown key', () => { + expect(() => getStatskontoretSource('does-not-exist' as 'arsutfall')).toThrow(StatskontoretError); + }); + + 
it('exposes StatskontoretError.kind on thrown errors', () => { + let caught: StatskontoretError | undefined; + try { + getStatskontoretSource('does-not-exist' as 'arsutfall'); + } catch (err) { + caught = err as StatskontoretError; + } + expect(caught?.kind).toBe('contract'); + expect(caught?.name).toBe('StatskontoretError'); + }); +}); + +describe('buildHeadcountTimeSeries advanced options', () => { + it('uses sheetNamePattern to pick the correct sheet', async () => { + const workbook = await parseStatskontoretXlsx(await createWorkbookFixture()); + const result = buildHeadcountTimeSeries(workbook, { sheetNamePattern: /2007.+2025/ }); + expect(result.length).toBeGreaterThan(0); + }); + + it('returns empty array when sheetNamePattern matches no sheet', async () => { + const workbook = await parseStatskontoretXlsx(await createWorkbookFixture()); + const result = buildHeadcountTimeSeries(workbook, { sheetNamePattern: /nonexistent/ }); + expect(result).toEqual([]); + }); + + it('returns empty array when workbook has no sheets', () => { + const result = buildHeadcountTimeSeries({ sheets: [] }); + expect(result).toEqual([]); + }); +}); + +describe('rowsToRecords advanced options', () => { + it('uses explicit headerRowIndex to skip auto-detection', () => { + const rows = [ + ['title-row'], + ['Col A', 'Col B'], + ['val1', 'val2'], + ] as const; + const records = rowsToRecords(rows, 1); + expect(records).toEqual([{ 'Col A': 'val1', 'Col B': 'val2' }]); + }); + + it('returns empty array when rows are empty', () => { + expect(rowsToRecords([])).toEqual([]); + }); + + it('uses fallback column names for blank headers', () => { + const rows = [['', 'B'], ['x', 'y']] as const; + const [record] = rowsToRecords(rows, 0); + expect(record['column_1']).toBe('x'); + expect(record['B']).toBe('y'); + }); +}); + +describe('parseBudgetRows additional paths', () => { + it('uses fallbackMonth when the record has no month column', () => { + const records = [{ År: '2025', Inkomsttitelnamn: 
'Skatt', Utfall: '1000' }]; + const [row] = parseBudgetRows(records, { fallbackMonth: 6 }); + expect(row.month).toBe(6); + }); + + it('skips records with no year and no fallbackYear', () => { + const records = [{ Inkomsttitelnamn: 'Skatt', Utfall: '100' }]; + expect(parseBudgetRows(records)).toHaveLength(0); + }); +}); + +describe('extractStatskontoretDownloadLinks deduplication', () => { + it('deduplicates links with identical resolved URLs', () => { + const html = ` + Excel + Excel`; + const links = extractStatskontoretDownloadLinks( + html, 'arsutfall', 'https://www.statskontoret.se/arsutfall/', + ); + expect(links).toHaveLength(1); + }); + + it('keeps links with different query parameters', () => { + const html = ` + Excel 2024 + Excel 2025`; + const links = extractStatskontoretDownloadLinks( + html, 'arsutfall', 'https://www.statskontoret.se/arsutfall/', + ); + expect(links).toHaveLength(2); + }); +}); + +describe('StatskontoretClient HTTP error path', () => { + it('throws a typed http error when the server returns a non-OK response', async () => { + const fetchFn = async () => new Response('Not Found', { status: 404, statusText: 'Not Found' }); + const client = new StatskontoretClient({ fetchFn: fetchFn as typeof fetch }); + await expect(client.fetchText('https://www.statskontoret.se/missing')).rejects.toMatchObject({ + kind: 'http', + }); + }); +}); + + +async function createWorkbookFixture(): Promise { + // Minimal XLSX fixture mirroring the Statskontoret assumptions documented in + // analysis/statskontoret/data-dictionary.md: a workbook sheet whose header row + // contains År, Myndighet, Departement and Årsarbetskrafter. 
+ const zip = new JSZip(); + zip.file('[Content_Types].xml', ''); + zip.file('xl/workbook.xml', ` + + + `); + zip.file('xl/_rels/workbook.xml.rels', ` + + + `); + zip.file('xl/sharedStrings.xml', ` + + ${['År', 'Myndighet', 'Departement', 'Årsarbetskrafter', 'Myndighet A', 'Finansdepartementet', 'Myndighet B', 'Justitiedepartementet', 'Myndighet C'] + .map((value) => `${value}`).join('')} + `); + zip.file('xl/worksheets/sheet1.xml', ` + + + 0123 + 20254510.5 + 20256720 + 20248542.5 + 20258535 + + `); + return zip.generateAsync({ type: 'uint8array' }); +} diff --git a/tests/statskontoret-fetch.test.ts b/tests/statskontoret-fetch.test.ts new file mode 100644 index 0000000000..7bcd50cd91 --- /dev/null +++ b/tests/statskontoret-fetch.test.ts @@ -0,0 +1,122 @@ +/** Tests for Statskontoret CLI argument parsing helpers. */ + +import { describe, it, expect } from 'vitest'; +import { + parseStatskontoretArgs, + parseStatskontoretSource, + requireStatskontoretFlag, +} from '../scripts/statskontoret-fetch.js'; +import { + classifyStatskontoretResource, + parseStatskontoretOptionalInt, + parseStatskontoretSwedishNumber, + StatskontoretClient, + StatskontoretError, + assertStatskontoretFetchTarget, +} from '../scripts/statskontoret-client.js'; + +describe('Statskontoret CLI parsing', () => { + it('parses flags and boolean options without executing the CLI', () => { + const parsed = parseStatskontoretArgs(['discover', '--source', 'arsutfall', '--persist']); + + expect(parsed.command).toBe('discover'); + expect(requireStatskontoretFlag(parsed.flags, 'source')).toBe('arsutfall'); + expect(parsed.booleans.has('persist')).toBe(true); + }); + + it('throws typed CLI errors for invalid input', () => { + expect(() => parseStatskontoretArgs(['unknown'])).toThrow(StatskontoretError); + expect(() => requireStatskontoretFlag(new Map(), 'source')).toThrow(/missing required flag/); + expect(() => parseStatskontoretSource('bad-source')).toThrow(/unknown source/); + }); + + it('accepts built-in 
source keys', () => { + expect(parseStatskontoretSource('myndighetsforteckning')).toBe('myndighetsforteckning'); + }); +}); + +describe('Statskontoret parsing primitives', () => { + it('classifies common downloadable resources', () => { + expect(classifyStatskontoretResource('/OpenData/GetFile?fileType=Excel', 'Excel')).toBe('excel'); + expect(classifyStatskontoretResource('/OpenData/GetFile?fileType=Zip', 'Csv (10 kB)')).toBe('csv-zip'); + expect(classifyStatskontoretResource('/files/report.pdf', 'Rapport')).toBe('document'); + expect(classifyStatskontoretResource('/page', 'Webbsida')).toBe('unknown'); + }); + + it('normalises Swedish numeric and integer values defensively', () => { + expect(parseStatskontoretSwedishNumber('1 234,5')).toBe(1234.5); + expect(parseStatskontoretSwedishNumber('not-a-number')).toBeUndefined(); + expect(parseStatskontoretOptionalInt('2026')).toBe(2026); + expect(parseStatskontoretOptionalInt(null)).toBeUndefined(); + }); +}); + +describe('Statskontoret fetch target guard', () => { + it('accepts the allowlisted Statskontoret HTTPS host', () => { + expect(() => + assertStatskontoretFetchTarget('https://www.statskontoret.se/page'), + ).not.toThrow(); + }); + + it('rejects non-HTTPS schemes', () => { + expect(() => + assertStatskontoretFetchTarget('http://www.statskontoret.se/page'), + ).toThrow(StatskontoretError); + }); + + it('rejects hosts outside the allowlist', () => { + expect(() => + assertStatskontoretFetchTarget('https://example.com/path'), + ).toThrow(/not in allowlist/); + }); + + it('rejects malformed URLs with a typed error', () => { + expect(() => assertStatskontoretFetchTarget('not a url')).toThrow(StatskontoretError); + }); + + it('blocks fetchText calls that target other hosts', async () => { + const client = new StatskontoretClient(); + await expect(client.fetchText('https://evil.example.com/x')).rejects.toThrow(/allowlist/); + }); +}); + +describe('Statskontoret CLI budget-outturn command parsing', () => { + it('parses 
budget-outturn command with required flags', () => { + const parsed = parseStatskontoretArgs([ + 'budget-outturn', '--source', 'arsutfall', '--url', 'https://www.statskontoret.se/file.xlsx', + ]); + expect(parsed.command).toBe('budget-outturn'); + expect(requireStatskontoretFlag(parsed.flags, 'source')).toBe('arsutfall'); + expect(requireStatskontoretFlag(parsed.flags, 'url')).toBe('https://www.statskontoret.se/file.xlsx'); + }); + + it('parses optional --doc-type flag', () => { + const parsed = parseStatskontoretArgs([ + 'budget-outturn', '--source', 'manadsutfall', '--url', 'https://www.statskontoret.se/f.xlsx', '--doc-type', 'Inkomst', + ]); + expect(parsed.flags.get('doc-type')).toBe('Inkomst'); + }); + + it('parses --persist boolean alongside budget-outturn', () => { + const parsed = parseStatskontoretArgs([ + 'budget-outturn', '--source', 'budget-time-series', '--url', 'https://www.statskontoret.se/f.xlsx', '--persist', + ]); + expect(parsed.booleans.has('persist')).toBe(true); + }); +}); + +describe('Statskontoret CLI budget-outturn source guard', () => { + it('rejects myndighetsforteckning as a source for budget-outturn', () => { + // parseStatskontoretArgs does not validate the --source value, and + // parseStatskontoretSource only checks that a key is known, so + // myndighetsforteckning is accepted at argument-parsing time. The rejection + // itself is performed by the runtime guard inside runBudgetOutturn, which + // this test does NOT invoke: it only pins that parsing succeeds and that the + // flag values are preserved for that guard to inspect. + const parsed = parseStatskontoretArgs([ + 'budget-outturn', '--source', 'myndighetsforteckning', '--url', 'https://www.statskontoret.se/x.xlsx', + ]); + // Parsing succeeds; the rejection happens at runtime inside runBudgetOutturn. 
+ expect(parsed.command).toBe('budget-outturn'); + expect(parsed.flags.get('source')).toBe('myndighetsforteckning'); + }); +}); diff --git a/tests/statskontoret-inventory.test.ts b/tests/statskontoret-inventory.test.ts new file mode 100644 index 0000000000..95e15c805c --- /dev/null +++ b/tests/statskontoret-inventory.test.ts @@ -0,0 +1,53 @@ +/** Validation tests for the Statskontoret inventory artifacts. */ + +import { describe, it, expect } from 'vitest'; +import { readFileSync } from 'node:fs'; +import { resolve, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { STATSKONTORET_SOURCES } from '../scripts/statskontoret-client.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); +const repoRoot = resolve(__dirname, '..'); + +interface StatskontoretInventory { + version: string; + source: string; + classification: string; + clients: Record; + datasets: Record; + providerDecisionMatrix: Record; +} + +function readInventory(): StatskontoretInventory { + return JSON.parse(readFileSync(resolve(repoRoot, 'analysis/statskontoret/indicators-inventory.json'), 'utf-8')) as StatskontoretInventory; +} + +describe('analysis/statskontoret/indicators-inventory.json', () => { + const inv = readInventory(); + + it('identifies Statskontoret as the public source', () => { + expect(inv.source).toMatch(/Statskontoret/i); + expect(inv.classification).toBe('Public'); + expect(inv.version).toBeTruthy(); + }); + + it('covers every built-in TypeScript source definition', () => { + for (const source of STATSKONTORET_SOURCES) { + expect(inv.datasets[source.key], `inventory missing ${source.key}`).toBeDefined(); + expect(inv.datasets[source.key].url).toBe(`https://www.statskontoret.se${source.url}`); + } + }); + + it('declares key provider-decision routes', () => { + expect(inv.providerDecisionMatrix.governmentBodiesHeadcount).toBe('statskontoret:myndighetsforteckning'); + 
expect(inv.providerDecisionMatrix.macroFiscalProjection).toBe('imf:WEO/FM'); + expect(inv.providerDecisionMatrix.centralGovernmentBudgetMonthlyOutturn).toBe('statskontoret:manadsutfall'); + }); + + it('documents the client, CLI and persistence surfaces', () => { + expect(inv.clients.cli).toContain('scripts/statskontoret-fetch.ts'); + expect(inv.clients.library).toContain('scripts/statskontoret-client.ts'); + expect(inv.clients.persistence).toContain('persistStatskontoretData'); + }); +});