Merge pull request #2043 from Hack23/copilot/add-statskontoret-enrichment-layer

pethers · web-flow · commit 1858f03ae720 · 2026-04-27T12:57:27.000+02:00
feat: Statskontoret agency-capacity enrichment layer
diff --git a/.github/prompts/05-analysis-gate.md b/.github/prompts/05-analysis-gate.md
@@ -30,10 +30,11 @@ This is the **only** gate separating analysis from article generation. If it fai
 8. **Family D structure checks**:
    - `forward-indicators.md` declares **≥ 10 dated indicators** (bullet or table rows matching a date pattern across the four horizon sections).
    - `coalition-mathematics.md` contains a seat-count table (≥ 1 table row with `Ja`/`Nej`/`Avstår` or a party-to-seats mapping).
+   - `implementation-feasibility.md` — when it names a recognised agency (Kriminalvården, Polismyndigheten, Försäkringskassan, Skatteverket, Migrationsverket, Arbetsförmedlingen, Socialstyrelsen, Transportstyrelsen, Trafikverket, Naturvårdsverket, Energimyndigheten) — contains a `statskontoret.se` URL citation **or** the literal phrase `none found` in the `Statskontoret relevance` row. This is enforced as **Check 9b** in the implementation below.
 
 ## Implementation
 
-No dedicated validator script exists yet — implement the checks as an inline bash gate. Full implementation (covers checks 1–9, with check 9 conditional where applicable):
+No dedicated validator script exists yet — implement the checks as an inline bash gate. Full implementation (covers checks 1–9, plus conditional check 9b where applicable):
 
 ```bash
 set -Eeuo pipefail
@@ -232,6 +233,18 @@ if [ -s "$ANALYSIS_DIR/coalition-mathematics.md" ]; then
     || { echo "❌ coalition-mathematics.md: missing seat-count / vote-breakdown table"; FAIL=1; }
 fi
 
+# Check 9b — Statskontoret evidence in implementation-feasibility.md
+# When implementation-feasibility.md names a recognised agency, the file MUST
+# populate the `| **Statskontoret relevance** | ... |` row with either a
+# statskontoret.se URL or the literal `none found` when no relevant coverage exists.
+AGENCY_RE='Kriminalvård(en)?|Polismyndigheten|Försäkringskassan|Skatteverket|Migrationsverket|Arbetsförmedlingen|Socialstyrelsen|Transportstyrelsen|Trafikverket|Naturvårdsverket|Energimyndigheten'
+if [ -s "$ANALYSIS_DIR/implementation-feasibility.md" ]; then
+  if grep -qE "$AGENCY_RE" "$ANALYSIS_DIR/implementation-feasibility.md"; then
+    grep -qiE '^\|[[:space:]]*\*\*Statskontoret relevance\*\*[[:space:]]*\|[[:space:]]*([^|]*statskontoret\.se[^|]*|[^|]*none found[^|]*)\|' "$ANALYSIS_DIR/implementation-feasibility.md" \
+      || { echo "❌ implementation-feasibility.md: names a recognised agency but the Statskontoret relevance row lacks a statskontoret.se URL or 'none found'"; FAIL=1; }
+  fi
+fi
+
 # Check 9 — PIR status sidecar (`pir-status.json`)
 # A valid pir-status.json must be present after every analysis run so that
 # open PIRs can be automatically rolled forward to the next cycle.
diff --git a/.github/skills/myndigheter-monitoring/SKILL.md b/.github/skills/myndigheter-monitoring/SKILL.md
@@ -224,6 +224,55 @@ interviews (5 labor economists), stakeholder statements*
 - **Courts** - Administrative law challenges
 - **Media** - Investigative reporting (that's you!)
 
+## Statskontoret Enrichment Layer
+
+The **Statskontoret enrichment layer** provides empirical agency-capacity evidence beneath document-level analysis. Use it whenever an `implementation-feasibility.md` artifact names a specific agency (Kriminalvården, Polismyndigheten, Försäkringskassan, etc.) and a feasibility claim needs grounding in published capacity data.
+
+### Index
+
+The seed index is at [`data/statskontoret/index.json`](../../../data/statskontoret/index.json). It contains the following fields per entry:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `title` | string | Full Swedish report/dataset title |
+| `year` | number | Publication or reference year |
+| `agency` | string | Named agency or `"*"` for cross-agency |
+| `summary` | string | One-sentence abstract |
+| `url` | string | Canonical Statskontoret URL |
+| `admiralty_grade` | string | Admiralty-scale grade: source reliability (A–F) followed by information credibility (1–6), e.g. `A2` |
+| `cached_at` | ISO-8601 | When the entry was last verified (TTL 30 days) |
+
+### How to use in implementation-feasibility.md
+
+1. **Look up** the agency in `data/statskontoret/index.json` (or via a `bash` search on `www.statskontoret.se`).
+2. **Populate** the `Statskontoret relevance` row in the Feasibility Context table with the matched entry's URL and title.
+3. **Cite** the entry in the 🏛️ Administrative feasibility section, following the established "Statskontoret overlay" pattern.
+4. If no entry matches, search `https://www.statskontoret.se/publikationer/`; cite the report's URL if the search finds a relevant one, otherwise record `"none found"`.
+
+### CLI (fetch & persist)
+
+```bash
+# Discover downloadable links for the agency register
+tsx scripts/statskontoret-fetch.ts discover --source myndighetsforteckning
+
+# Fetch agency headcount workbook (once a URL is discovered)
+tsx scripts/statskontoret-fetch.ts headcount --url <xlsx-url> --persist
+
+# Budget outturn
+tsx scripts/statskontoret-fetch.ts budget-outturn --url <xlsx-url> --source arsutfall --persist
+```
+
+### Cache TTL
+
+Statskontoret reports are slow-moving, so index entries are treated as fresh for **30 days**. The `cached_at` timestamp in each entry records the last verification; re-verify an entry only once that timestamp is more than 30 days old.
+
+### Required behaviour for implementation-feasibility
+
+When an agency is named in `implementation-feasibility.md`:
+- The **Feasibility Context** table MUST include a populated `Statskontoret relevance` row (URL or `"none found"`).
+- The **Administrative feasibility** section MUST cite the Statskontoret entry or explicitly state no relevant report was found.
+- Only the `Statskontoret relevance` row is enforced by the analysis gate (`05-analysis-gate.md` Check 9b); the Administrative feasibility citation is not checked automatically.
+
 ## Remember
 
 - **Agencies matter** - They implement policy, affect daily life directly
diff --git a/data/statskontoret/index.json b/data/statskontoret/index.json
@@ -0,0 +1,55 @@
+{
+  "version": "1.0",
+  "source": "Statskontoret",
+  "classification": "Public",
+  "cache_ttl_days": 30,
+  "description": "Agency-capacity evidence index sourced from Statskontoret public reports. Used by implementation-feasibility analysis to cite empirical capacity data for named agencies (Kriminalvården, Polismyndigheten, Försäkringskassan, etc.).",
+  "generated_at": "2026-04-27T00:00:00Z",
+  "entries": [
+    {
+      "title": "Statskontorets myndighetsförteckning",
+      "year": 2025,
+      "agency": "*",
+      "summary": "Annual register of all Swedish central-government authorities: headcount by department, organisational form and appropriation codes. Primary source for agency headcount time series.",
+      "url": "https://www.statskontoret.se/om-statskontoret/publika-register/myndighetsforteckning/",
+      "admiralty_grade": "A2",
+      "cached_at": "2026-04-27T00:00:00Z"
+    },
+    {
+      "title": "Polisens förmåga att utreda brott — en uppföljning",
+      "year": 2023,
+      "agency": "Polismyndigheten",
+      "summary": "Follow-up study on the Swedish Police Authority's capacity to investigate crime. Analyses investigative backlog, clear-up rates and resource allocation. Relevant for implementation-feasibility of criminal-justice legislation. Note: URL points to the publications landing page; search for the specific report title to retrieve the direct PDF/HTML link.",
+      "url": "https://www.statskontoret.se/publikationer/",
+      "admiralty_grade": "C2",
+      "cached_at": "2026-04-27T00:00:00Z"
+    },
+    {
+      "title": "Kriminalvårdens kapacitetsutmaningar",
+      "year": 2022,
+      "agency": "Kriminalvården",
+      "summary": "Assessment of the Swedish Prison and Probation Service capacity constraints: cell utilisation, staffing shortfalls, expansion plans, and implementation risks for new prison construction programmes. Note: URL points to the publications landing page; search for the specific report title to retrieve the direct PDF/HTML link.",
+      "url": "https://www.statskontoret.se/publikationer/",
+      "admiralty_grade": "C2",
+      "cached_at": "2026-04-27T00:00:00Z"
+    },
+    {
+      "title": "Försäkringskassans administration av ersättningar",
+      "year": 2021,
+      "agency": "Försäkringskassan",
+      "summary": "Review of the Social Insurance Agency's administrative capacity for benefit administration, processing times, IT-system constraints, and implementation risk for new benefit schemes. Note: URL points to the publications landing page; search for the specific report title to retrieve the direct PDF/HTML link.",
+      "url": "https://www.statskontoret.se/publikationer/",
+      "admiralty_grade": "C2",
+      "cached_at": "2026-04-27T00:00:00Z"
+    },
+    {
+      "title": "Statskontoret årsutfall — statsbudgeten",
+      "year": 2025,
+      "agency": "*",
+      "summary": "Annual budget outturn for the entire central-government budget. Contains expenditure by appropriation area enabling cross-agency fiscal feasibility benchmarking.",
+      "url": "https://www.statskontoret.se/om-statskontoret/publika-register/arsutfall/",
+      "admiralty_grade": "A1",
+      "cached_at": "2026-04-27T00:00:00Z"
+    }
+  ]
+}
diff --git a/scripts/download-parliamentary-data.ts b/scripts/download-parliamentary-data.ts
@@ -15,6 +15,7 @@
  * Usage:
  *   npx tsx scripts/download-parliamentary-data.ts [--date YYYY-MM-DD] [--limit N]
  *   npx tsx scripts/download-parliamentary-data.ts --aggregate weekly [--date YYYY-WNN]
+ *   npx tsx scripts/download-parliamentary-data.ts --auto-full-text-top-n 2
  *
  * @see analysis/methodologies/ai-driven-analysis-guide.md
  * @author Hack23 AB
@@ -62,6 +63,7 @@ export function parseArgs(argv: string[]): {
   rm: string | null;
   docType: DocumentTypeKey | null;
   documentIds: string[];
+  autoFullTextTopN: number | null;
 } {
   const args = argv.slice(2);
   const get = (flag: string): string | null => {
@@ -146,7 +148,21 @@ export function parseArgs(argv: string[]): {
       })
     : [];
 
-  return { date: isoDate, aggregate, limit, weekLabel, rm, docType, documentIds };
+  // --auto-full-text-top-n: Override the per-type full-text enrichment limit.
+  // When set, only the top N documents per type receive fetchDocumentDetails
+  // (full-text) enrichment, enabling more targeted significance-scoring input.
+  // Defaults to MAX_ENRICHMENT_PER_TYPE when omitted (null → caller uses default).
+  const autoFullTextTopNArg = get('--auto-full-text-top-n');
+  let autoFullTextTopN: number | null = null;
+  if (autoFullTextTopNArg !== null) {
+    const parsed = Number(autoFullTextTopNArg);
+    if (!Number.isInteger(parsed) || parsed < 0) {
+      throw new Error(`Invalid --auto-full-text-top-n value: ${autoFullTextTopNArg}. Expected a non-negative integer.`);
+    }
+    autoFullTextTopN = parsed;
+  }
+
+  return { date: isoDate, aggregate, limit, weekLabel, rm, docType, documentIds, autoFullTextTopN };
 }
 
 function isoWeekNumber(date: Date): number {
@@ -372,8 +388,9 @@ async function runPreArticleAnalysis(opts: {
   rm: string | null;
   docType: DocumentTypeKey | null;
   documentIds: string[];
+  autoFullTextTopN: number | null;
 }): Promise<void> {
-  const { date, limit, aggregate, weekLabel, rm, docType, documentIds } = opts;
+  const { date, limit, aggregate, weekLabel, rm, docType, documentIds, autoFullTextTopN } = opts;
 
   if (aggregate && weekLabel) {
     console.log(`\n📅 Running weekly data summary for: ${weekLabel}`);
@@ -403,10 +420,17 @@ async function runPreArticleAnalysis(opts: {
   const client = new MCPClient();
   const resolvedRm = rm ?? riksMoteFromDate(date);
 
-  const downloadOpts: { limit: number; rm: string; docTypes?: DocumentTypeKey[] } = { limit, rm: resolvedRm };
+  const downloadOpts: { limit: number; rm: string; docTypes?: DocumentTypeKey[]; enrichLimit?: number } = { limit, rm: resolvedRm };
   if (docType) {
     downloadOpts.docTypes = [docType];
   }
+  // --auto-full-text-top-n wires the CLI flag into the per-type enrichment
+  // limit, enabling more targeted full-text fetching for significance scoring.
+  // When null, downloadAllDocuments uses MAX_ENRICHMENT_PER_TYPE (5) by default.
+  if (autoFullTextTopN !== null) {
+    downloadOpts.enrichLimit = autoFullTextTopN;
+    console.log(`   📝 Full-text enrichment: top ${autoFullTextTopN} documents per type (--auto-full-text-top-n=${autoFullTextTopN})`);
+  }
 
   const { data, manifest } = await downloadAllDocuments(client, downloadOpts);
   const flattenedDocs = flattenDocuments(data);
@@ -537,6 +561,11 @@ async function runPreArticleAnalysis(opts: {
   console.log('      - analysis/methodologies/ai-driven-analysis-guide.md');
   console.log('      - analysis/templates/ (per-file analysis templates)');
   console.log('      - npx tsx scripts/catalog-downloaded-data.ts --pending-only');
+  if (autoFullTextTopN !== null && autoFullTextTopN > 0) {
+    console.log(`      ℹ️  Significance-scoring note: top-${autoFullTextTopN} documents per type have full text`);
+    console.log('         available (contentFetched=true) — AI significance-scoring step');
+    console.log('         should prioritise those documents for deeper analysis.');
+  }
 }
 
 // ---------------------------------------------------------------------------
diff --git a/tests/statskontoret-enrichment-contract.test.ts b/tests/statskontoret-enrichment-contract.test.ts