|
| 1 | +/** |
| 2 | + * @module scripts/fetch-statskontoret |
| 3 | + * @description Cached fetch module for Statskontoret open data, providing a |
| 4 | + * 30-day TTL cache layer over {@link StatskontoretClient}. |
| 5 | + * |
| 6 | + * This module is intended for use by agentic workflows that need Statskontoret |
| 7 | + * context (authority register, budget outturn) without re-downloading large |
| 8 | + * Excel/ZIP files on every run. It follows the same no-MCP client pattern as |
| 9 | + * `imf-context.ts` and `scb-context.ts`. |
| 10 | + * |
| 11 | + * ### Cache behaviour |
| 12 | + * - Cache root: `analysis/data/statskontoret/<sourceKey>/cache/` |
| 13 | + * - TTL: 30 days (configurable via the `cacheTtlMs` option) |
| 14 | + * - On hit: returns the cached payload with provenance metadata |
| 15 | + * - On miss or stale: invokes `StatskontoretClient.discoverDownloads()` and |
| 16 | + * persists the result before returning |
| 17 | + * - On fetch error: falls back to the most recent stale cache entry (resilience) |
| 18 | + * |
| 19 | + * ### Security |
| 20 | + * Fetch calls go only to `https://www.statskontoret.se` (enforced by |
| 21 | + * `assertStatskontoretFetchTarget` inside `StatskontoretClient`). No |
| 22 | + * credentials are required; all data is PUBLIC classification. |
| 23 | + * |
| 24 | + * @see analysis/statskontoret/indicators-inventory.json |
| 25 | + * @see scripts/statskontoret-client.ts (low-level HTTP + parse) |
| 26 | + * @see scripts/statskontoret-fetch.ts (CLI entry-point) |
| 27 | + * @author Hack23 AB |
| 28 | + * @license Apache-2.0 |
| 29 | + */ |
| 30 | + |
| 31 | +import fs from 'node:fs'; |
| 32 | +import path from 'node:path'; |
| 33 | +import { fileURLToPath } from 'node:url'; |
| 34 | + |
| 35 | +import { |
| 36 | + getStatskontoretSource, |
| 37 | + STATSKONTORET_SOURCES, |
| 38 | + StatskontoretClient, |
| 39 | + StatskontoretError, |
| 40 | + type StatskontoretClientConfig, |
| 41 | + type StatskontoretDownloadLink, |
| 42 | + type StatskontoretSourceKey, |
| 43 | +} from './statskontoret-client.js'; |
| 44 | + |
| 45 | +// --------------------------------------------------------------------------- |
| 46 | +// Constants |
| 47 | +// --------------------------------------------------------------------------- |
| 48 | + |
// ESM has no built-in `__filename`; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
// The repository root is the parent of the directory that holds this module.
const REPO_ROOT = path.resolve(path.dirname(__filename), '..');

/** Default cache TTL: 30 days, expressed in milliseconds. */
export const CACHE_TTL_MS = 1000 * 60 * 60 * 24 * 30;

/** Directory under the repository root that holds all cached Statskontoret payloads. */
export const STATSKONTORET_CACHE_ROOT = path.join(REPO_ROOT, 'analysis', 'data', 'statskontoret');
| 62 | + |
| 63 | +// --------------------------------------------------------------------------- |
| 64 | +// Types |
| 65 | +// --------------------------------------------------------------------------- |
| 66 | + |
/**
 * Result of a cached Statskontoret lookup: the discovered download links plus
 * provenance describing where and when they were obtained.
 */
export interface StatskontoretCachedPayload {
  /** Source key the payload was requested for. */
  readonly sourceKey: StatskontoretSourceKey;
  /** Title of the source, as reported by `getStatskontoretSource`. */
  readonly sourceTitle: string;
  /** URL of the source, as reported by `getStatskontoretSource`. */
  readonly sourceUrl: string;
  /** Download links, either freshly discovered or read back from the cache file. */
  readonly links: ReadonlyArray<StatskontoretDownloadLink>;
  /** ISO-8601 timestamp of the cache entry; `fetchStatskontoretCached` sets it equal to `fetchedAt`. */
  readonly cachedAt: string;
  /** ISO-8601 timestamp taken right after the origin fetch completed. */
  readonly fetchedAt: string;
  /** `true` when the links came from the cache file (fresh hit or stale fallback). */
  readonly fromCache: boolean;
  /** Milliseconds elapsed since `fetchedAt`; `0` for a payload that was just fetched. */
  readonly cacheAgeMs: number;
}
| 78 | + |
/**
 * Optional overrides accepted by {@link fetchStatskontoretCached}. Every
 * field falls back to a module default when omitted.
 */
export interface FetchStatskontoretCachedOptions {
  /** Cache time-to-live in milliseconds; defaults to the 30-day `CACHE_TTL_MS`. Mainly for testing. */
  readonly cacheTtlMs?: number;
  /** Directory used as the cache root; defaults to `STATSKONTORET_CACHE_ROOT`. Mainly for testing. */
  readonly cacheRoot?: string;
  /** Configuration handed to the `StatskontoretClient` constructor (e.g. to inject a mock fetch). */
  readonly clientConfig?: StatskontoretClientConfig;
}
| 88 | + |
/** Shape of the JSON document persisted in each source's `downloads.json` cache file. */
interface CacheEntry {
  /** ISO-8601 timestamp recorded right after the origin fetch completed. */
  readonly fetchedAt: string;
  /** Source key the links were discovered for. */
  readonly sourceKey: StatskontoretSourceKey;
  /** Download links returned by `StatskontoretClient.discoverDownloads`. */
  readonly links: Array<StatskontoretDownloadLink>;
}
| 95 | + |
| 96 | +// --------------------------------------------------------------------------- |
| 97 | +// Private helpers |
| 98 | +// --------------------------------------------------------------------------- |
| 99 | + |
/**
 * Build the cache directory for one source: `<cacheRoot>/<sourceKey>/cache`.
 *
 * @param sourceKey - Source whose cache directory is wanted.
 * @param cacheRoot - Root directory under which per-source caches live.
 * @returns The joined (and normalised) directory path.
 */
function cacheDir(sourceKey: StatskontoretSourceKey, cacheRoot: string): string {
  const segments = [cacheRoot, sourceKey, 'cache'];
  return path.join(...segments);
}
| 103 | + |
/**
 * Build the path of the cache file for one source:
 * `<cacheRoot>/<sourceKey>/cache/downloads.json`.
 *
 * @param sourceKey - Source whose cache file is wanted.
 * @param cacheRoot - Root directory under which per-source caches live.
 * @returns The joined (and normalised) file path.
 */
function cacheFilePath(sourceKey: StatskontoretSourceKey, cacheRoot: string): string {
  // Same layout as `<cache dir>/downloads.json`, assembled in a single join.
  return path.join(cacheRoot, sourceKey, 'cache', 'downloads.json');
}
| 107 | + |
/**
 * Read a cache entry from disk and validate its shape.
 *
 * Any problem — missing or unreadable file, invalid JSON, or a JSON document
 * that does not look like a cache entry — is reported as `undefined` so the
 * caller treats it as a cache miss instead of crashing on a corrupt file.
 *
 * @param filePath - Path of the cache file to read.
 * @returns The parsed entry, or `undefined` when no usable entry exists.
 */
function readCacheEntry(filePath: string): CacheEntry | undefined {
  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
  } catch {
    // Missing, unreadable or syntactically invalid file: cache miss.
    return undefined;
  }

  // `JSON.parse` happily returns `null`, numbers, strings, ... — reject those.
  if (typeof parsed !== 'object' || parsed === null) {
    return undefined;
  }

  const { fetchedAt, sourceKey, links } = parsed as Record<string, unknown>;
  const looksValid =
    typeof fetchedAt === 'string' &&
    // An unparseable timestamp would make every age computation NaN.
    !Number.isNaN(Date.parse(fetchedAt)) &&
    typeof sourceKey === 'string' &&
    Array.isArray(links);

  return looksValid ? (parsed as CacheEntry) : undefined;
}
| 116 | + |
/**
 * Persist a cache entry as pretty-printed (2-space) JSON, creating the parent
 * directory tree first when it does not exist yet.
 *
 * @param filePath - Destination file.
 * @param entry - Entry to serialise.
 */
function writeCacheEntry(filePath: string, entry: CacheEntry): void {
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  const serialized = JSON.stringify(entry, null, 2);
  fs.writeFileSync(filePath, serialized, 'utf-8');
}
| 122 | + |
/**
 * Decide whether a cache timestamp is still within its time-to-live.
 *
 * An unparseable timestamp yields a `NaN` age, which compares as not fresh.
 *
 * @param fetchedAt - Timestamp string recorded when the data was fetched.
 * @param ttlMs - Maximum allowed age in milliseconds (exclusive).
 * @returns `true` when the elapsed time is strictly below `ttlMs`.
 */
function isCacheFresh(fetchedAt: string, ttlMs: number): boolean {
  const fetchedAtMs = Date.parse(fetchedAt);
  const elapsedMs = Date.now() - fetchedAtMs;
  return elapsedMs < ttlMs;
}
| 127 | + |
| 128 | +// --------------------------------------------------------------------------- |
| 129 | +// Public API |
| 130 | +// --------------------------------------------------------------------------- |
| 131 | + |
/**
 * Fetch Statskontoret download links for a given source key, using a 30-day
 * file-system cache.
 *
 * Resolution order:
 * 1. A fresh cache entry is returned as-is (`fromCache: true`).
 * 2. Otherwise the links are discovered via `StatskontoretClient` and the
 *    result is written to the cache. Persisting is best-effort: if the write
 *    fails, a warning is logged and the freshly fetched links are still
 *    returned (`fromCache: false`).
 * 3. If discovery fails and a (stale) cache entry exists, that entry is
 *    returned (`fromCache: true`).
 *
 * @param sourceKey - The Statskontoret source to fetch
 *   (`myndighetsforteckning`, `arsutfall`, `manadsutfall`, `budget-time-series`).
 * @param options - Optional TTL, cache-root and client overrides.
 * @returns A {@link StatskontoretCachedPayload} with links and provenance info.
 * @throws StatskontoretError when discovery fails and no cache entry exists.
 *
 * @example
 * ```ts
 * const payload = await fetchStatskontoretCached('myndighetsforteckning');
 * console.log(`Found ${payload.links.length} download links (fromCache=${payload.fromCache})`);
 * ```
 */
export async function fetchStatskontoretCached(
  sourceKey: StatskontoretSourceKey,
  options: FetchStatskontoretCachedOptions = {},
): Promise<StatskontoretCachedPayload> {
  const {
    cacheTtlMs = CACHE_TTL_MS,
    cacheRoot = STATSKONTORET_CACHE_ROOT,
    clientConfig = {},
  } = options;

  const source = getStatskontoretSource(sourceKey);
  const filePath = cacheFilePath(sourceKey, cacheRoot);

  // Shared by the fresh-hit and stale-fallback paths.
  const payloadFromCache = (entry: CacheEntry): StatskontoretCachedPayload => ({
    sourceKey,
    sourceTitle: source.title,
    sourceUrl: source.url,
    links: entry.links,
    cachedAt: entry.fetchedAt,
    fetchedAt: entry.fetchedAt,
    fromCache: true,
    cacheAgeMs: Date.now() - new Date(entry.fetchedAt).getTime(),
  });

  // --- Cache hit ---
  const cached = readCacheEntry(filePath);
  if (cached !== undefined && isCacheFresh(cached.fetchedAt, cacheTtlMs)) {
    return payloadFromCache(cached);
  }

  // --- Cache miss or stale: fetch from origin ---
  const client = new StatskontoretClient(clientConfig);
  let links: StatskontoretDownloadLink[];
  try {
    links = await client.discoverDownloads(sourceKey);
  } catch (error) {
    // --- Resilience: return stale cache on fetch failure ---
    if (cached !== undefined) {
      return payloadFromCache(cached);
    }
    const detail = error instanceof Error ? error.message : String(error);
    throw new StatskontoretError(
      `fetch-statskontoret: failed to fetch ${sourceKey} and no cache available: ${detail}`,
      'http',
      { cause: error },
    );
  }

  // Stamp provenance after the fetch completes so `fetchedAt` reflects when
  // the data was actually retrieved, not when the request was issued.
  const fetchedAt = new Date().toISOString();

  // Persisting is kept outside the fetch `try`: a failed cache write (e.g.
  // read-only file system) must not discard links that were fetched
  // successfully, nor be reported as a fetch failure.
  try {
    writeCacheEntry(filePath, { fetchedAt, sourceKey, links });
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    console.warn(
      `fetch-statskontoret: fetched ${sourceKey} but could not write cache file ${filePath}: ${detail}`,
    );
  }

  return {
    sourceKey,
    sourceTitle: source.title,
    sourceUrl: source.url,
    links,
    cachedAt: fetchedAt,
    fetchedAt,
    fromCache: false,
    cacheAgeMs: 0,
  };
}
| 221 | + |
/**
 * Report whether a source currently has a cache entry that is still within
 * its TTL. Only the cache file is read; no network request is made.
 *
 * @param sourceKey - The Statskontoret source to check.
 * @param options - Optional TTL and cache-root overrides.
 * @returns `true` if a readable cache entry exists and is fresh, `false` otherwise.
 */
export function isStatskontoretCacheFresh(
  sourceKey: StatskontoretSourceKey,
  options: Pick<FetchStatskontoretCachedOptions, 'cacheTtlMs' | 'cacheRoot'> = {},
): boolean {
  const { cacheTtlMs: ttlMs = CACHE_TTL_MS, cacheRoot: root = STATSKONTORET_CACHE_ROOT } = options;

  const entry = readCacheEntry(cacheFilePath(sourceKey, root));
  if (entry === undefined) {
    return false;
  }
  return isCacheFresh(entry.fetchedAt, ttlMs);
}
| 239 | + |
/**
 * List the keys of every built-in Statskontoret source, in the order they
 * appear in `STATSKONTORET_SOURCES`. Handy for workflows that loop over all
 * sources.
 *
 * @returns A newly built array of source keys.
 */
export function statskontoretSourceKeys(): readonly StatskontoretSourceKey[] {
  const keys: StatskontoretSourceKey[] = [];
  for (const source of STATSKONTORET_SOURCES) {
    keys.push(source.key);
  }
  return keys;
}
0 commit comments