Skip to content

Commit d383ccd

Browse files
committed
Add source period metadata to Arch facts
1 parent 5e851c5 commit d383ccd

7 files changed

Lines changed: 463 additions & 131 deletions

File tree

arch/core.py

Lines changed: 125 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,37 @@
1212
import re
1313
from collections import Counter
1414
from dataclasses import asdict, dataclass, field
15+
from datetime import date
1516
from decimal import Decimal
1617
from typing import Any
1718

1819
Scalar = str | int | float | bool | None
1920

2021
ALLOWED_PERIOD_TYPES = {"calendar_year", "tax_year", "fiscal_year", "month"}
22+
# Recognised values for ``PeriodDimension.basis``; a basis that is set but not
# listed here is reported as a "malformed_period" issue during validation.
ALLOWED_PERIOD_BASES = {
    "benefit_month",
    "calendar_year",
    "fiscal_year",
    "payment_date_fiscal_year",
    "payment_month",
    "projection_year",
    "reference_month",
    "state_fiscal_year",
    "statistical_annual",
    "tax_year",
    "uk_fiscal_year",
    "us_federal_fiscal_year",
}
36+
# Recognised values for ``PeriodDimension.accounting_basis``; a value that is
# set but not listed here is reported as a "malformed_period" issue.
ALLOWED_ACCOUNTING_BASES = {
    "accrual",
    "benefit_month",
    "cash",
    "cash_outlay",
    "cash_payment",
    "payment_date",
    "projection",
    "statistical_total",
}
2146
ALLOWED_GEOGRAPHY_LEVELS = {
2247
"country",
2348
"region",
@@ -70,6 +95,12 @@ class PeriodDimension:
7095

7196
type: str
7297
value: int | str
98+
start_date: str | None = None
99+
end_date: str | None = None
100+
basis: str | None = None
101+
authority: str | None = None
102+
source_label: str | None = None
103+
accounting_basis: str | None = None
73104

74105

75106
@dataclass(frozen=True)
@@ -247,7 +278,9 @@ def build_label(fact: AggregateFact) -> str:
247278
concept = _humanize(fact.measure.concept)
248279
aggregation = _humanize(fact.aggregation.method)
249280
entity = _humanize(fact.entity.name)
250-
period = f"{fact.period.value} {_humanize(fact.period.type)}"
281+
period = fact.period.source_label or (
282+
f"{fact.period.value} {_humanize(fact.period.type)}"
283+
)
251284
geography = fact.geography.name or fact.geography.id
252285
source = _source_label(fact.source)
253286

@@ -373,6 +406,7 @@ def validate_fact(fact: AggregateFact) -> tuple[ValidationIssue, ...]:
373406
errors.append(
374407
_issue("missing_period", "Period value is required", "period.value")
375408
)
409+
_validate_period_semantics(errors, fact.period)
376410

377411
if fact.geography.level not in ALLOWED_GEOGRAPHY_LEVELS:
378412
errors.append(
@@ -500,7 +534,7 @@ def fact_counts(facts: list[AggregateFact]) -> dict[str, dict[str, int]]:
500534

501535
def _canonical_key_payload(fact: AggregateFact) -> dict[str, Any]:
502536
payload = {
503-
"period": asdict(fact.period),
537+
"period": _period_key_payload(fact.period),
504538
"geography": {
505539
"level": fact.geography.level,
506540
"id": fact.geography.id,
@@ -527,6 +561,95 @@ def _canonical_key_payload(fact: AggregateFact) -> dict[str, Any]:
527561
return payload
528562

529563

564+
def _period_key_payload(period: PeriodDimension) -> dict[str, Any]:
    """Return the period's dataclass fields as a dict, leaving out unset ones.

    Only fields whose value is exactly ``None`` are dropped; other falsy
    values (``0``, ``""``) are kept. Field order follows the dataclass.
    """
    # NOTE(review): presumably this keeps the canonical key unchanged for
    # periods that carry none of the optional metadata -- confirm with callers.
    payload: dict[str, Any] = {}
    for name, item in asdict(period).items():
        if item is None:
            continue
        payload[name] = item
    return payload
566+
567+
568+
def _validate_period_semantics(
    errors: list[ValidationIssue],
    period: PeriodDimension,
) -> None:
    """Check the optional metadata on ``period`` and append any issues found.

    Checks run in a fixed order: basis, accounting basis, blank authority,
    blank source label, start/end date format, then start-before-end ordering.
    Fields left as ``None`` are treated as "not provided" and are not flagged.

    Args:
        errors: List that receives the issues; it is mutated in place.
        period: The period dimension being validated.
    """
    if not (period.basis is None or period.basis in ALLOWED_PERIOD_BASES):
        basis_issue = _issue(
            "malformed_period",
            f"Unsupported period basis: {period.basis!r}",
            "period.basis",
        )
        errors.append(basis_issue)

    if not (
        period.accounting_basis is None
        or period.accounting_basis in ALLOWED_ACCOUNTING_BASES
    ):
        accounting_issue = _issue(
            "malformed_period",
            f"Unsupported accounting basis: {period.accounting_basis!r}",
            "period.accounting_basis",
        )
        errors.append(accounting_issue)

    # Free-text fields may be omitted, but must not be blank when present.
    text_checks = (
        (
            period.authority,
            "Period authority must be nonempty when provided",
            "period.authority",
        ),
        (
            period.source_label,
            "Period source label must be nonempty when provided",
            "period.source_label",
        ),
    )
    for text, message, field_path in text_checks:
        if text is not None and not text.strip():
            errors.append(_issue("missing_period", message, field_path))

    # Both dates are always parsed so each one reports its own format issue.
    first_day = _parse_iso_date(errors, period.start_date, "period.start_date")
    last_day = _parse_iso_date(errors, period.end_date, "period.end_date")
    if first_day is None or last_day is None:
        return
    if first_day > last_day:
        errors.append(
            _issue(
                "malformed_period",
                "Period start_date must be on or before end_date",
                "period.start_date",
            )
        )
622+
623+
624+
def _parse_iso_date(
    errors: list[ValidationIssue],
    value: str | None,
    field_name: str,
) -> date | None:
    """Parse a strict ``YYYY-MM-DD`` string, recording an issue on failure.

    Args:
        errors: List that receives a "malformed_period" issue when ``value``
            cannot be parsed; it is mutated in place.
        value: The candidate date string, or ``None`` when not provided.
        field_name: Field path attached to any issue that is recorded.

    Returns:
        The parsed ``date``, or ``None`` when ``value`` is ``None`` or invalid.
        A non-string ``value`` is reported as malformed rather than raising.
    """
    if value is None:
        return None
    # Require the exact ASCII YYYY-MM-DD shape before parsing:
    # date.fromisoformat on Python 3.11+ also accepts other ISO 8601
    # spellings, and ``\d`` would additionally match non-ASCII digits.
    if isinstance(value, str) and re.fullmatch(
        r"[0-9]{4}-[0-9]{2}-[0-9]{2}", value
    ):
        try:
            return date.fromisoformat(value)
        except ValueError:
            # Right shape but not a real calendar date (e.g. month 13);
            # fall through to the shared error report below.
            pass
    errors.append(
        _issue(
            "malformed_period",
            f"Period date must use ISO YYYY-MM-DD format: {value!r}",
            field_name,
        )
    )
    return None
651+
652+
530653
def _validate_value(errors: list[ValidationIssue], value: Any) -> None:
531654
if isinstance(value, Decimal):
532655
return

arch/database.py

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
source_row_to_mapping,
3434
)
3535

36-
ARCH_DB_SCHEMA_VERSION = "arch.relational.v1"
36+
ARCH_DB_SCHEMA_VERSION = "arch.relational.v2"
3737

3838

3939
@dataclass(frozen=True)
@@ -77,9 +77,7 @@ def build_arch_db(
7777
columns = source_columns_from_source_rows(rows)
7878
source_row_values_count = sum(len(row.values) for row in rows)
7979
resolved_build_id = build_id or _build_id(facts, cells, rows)
80-
fact_constraints = [
81-
(fact, build_aggregate_constraints(fact)) for fact in facts
82-
]
80+
fact_constraints = [(fact, build_aggregate_constraints(fact)) for fact in facts]
8381
source_record_ids = {
8482
fact.source_record_id for fact in facts if fact.source_record_id is not None
8583
}
@@ -117,9 +115,7 @@ def build_arch_db(
117115
return ArchDbBuildReport(
118116
build_id=resolved_build_id,
119117
facts_count=len(facts),
120-
constraints_count=sum(
121-
len(constraints) for _, constraints in fact_constraints
122-
),
118+
constraints_count=sum(len(constraints) for _, constraints in fact_constraints),
123119
source_records_count=len(source_record_ids),
124120
source_rows_count=len(rows),
125121
source_columns_count=len(columns),
@@ -244,13 +240,25 @@ def _create_schema(connection: sqlite3.Connection) -> None:
244240
legal_vintage TEXT,
245241
period_type TEXT,
246242
period_value TEXT,
243+
period_start_date TEXT,
244+
period_end_date TEXT,
245+
period_basis TEXT,
246+
period_authority TEXT,
247+
period_source_label TEXT,
248+
period_accounting_basis TEXT,
247249
PRIMARY KEY (
248250
source_concept,
249251
canonical_concept,
250252
relation,
251253
legal_vintage,
252254
period_type,
253-
period_value
255+
period_value,
256+
period_start_date,
257+
period_end_date,
258+
period_basis,
259+
period_authority,
260+
period_source_label,
261+
period_accounting_basis
254262
)
255263
);
256264
@@ -278,6 +286,12 @@ def _create_schema(connection: sqlite3.Connection) -> None:
278286
value_numeric REAL,
279287
period_type TEXT NOT NULL,
280288
period_value TEXT NOT NULL,
289+
period_start_date TEXT,
290+
period_end_date TEXT,
291+
period_basis TEXT,
292+
period_authority TEXT,
293+
period_source_label TEXT,
294+
period_accounting_basis TEXT,
281295
geography_level TEXT NOT NULL,
282296
geography_id TEXT NOT NULL,
283297
geography_vintage TEXT,
@@ -740,7 +754,7 @@ def _insert_concept_alignments(
740754
facts: list[AggregateFact],
741755
build_id: str,
742756
) -> None:
743-
seen: set[tuple[str, str, str, str | None, str, str]] = set()
757+
seen: set[tuple[Any, ...]] = set()
744758
for fact in facts:
745759
measure = fact.measure
746760
if not measure.source_concept or not measure.concept_relation:
@@ -752,6 +766,12 @@ def _insert_concept_alignments(
752766
measure.legal_vintage,
753767
fact.period.type,
754768
str(fact.period.value),
769+
fact.period.start_date,
770+
fact.period.end_date,
771+
fact.period.basis,
772+
fact.period.authority,
773+
fact.period.source_label,
774+
fact.period.accounting_basis,
755775
)
756776
if key in seen:
757777
continue
@@ -768,9 +788,15 @@ def _insert_concept_alignments(
768788
evidence_notes,
769789
legal_vintage,
770790
period_type,
771-
period_value
791+
period_value,
792+
period_start_date,
793+
period_end_date,
794+
period_basis,
795+
period_authority,
796+
period_source_label,
797+
period_accounting_basis
772798
)
773-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
799+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
774800
""",
775801
(
776802
measure.source_concept,
@@ -783,6 +809,12 @@ def _insert_concept_alignments(
783809
measure.legal_vintage,
784810
fact.period.type,
785811
str(fact.period.value),
812+
fact.period.start_date,
813+
fact.period.end_date,
814+
fact.period.basis,
815+
fact.period.authority,
816+
fact.period.source_label,
817+
fact.period.accounting_basis,
786818
),
787819
)
788820

@@ -820,6 +852,12 @@ def _insert_aggregate_fact(
820852
value_numeric,
821853
period_type,
822854
period_value,
855+
period_start_date,
856+
period_end_date,
857+
period_basis,
858+
period_authority,
859+
period_source_label,
860+
period_accounting_basis,
823861
geography_level,
824862
geography_id,
825863
geography_vintage,
@@ -849,7 +887,7 @@ def _insert_aggregate_fact(
849887
source_extraction_method,
850888
source_method_notes
851889
)
852-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
890+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
853891
""",
854892
(
855893
fact_key,
@@ -875,6 +913,12 @@ def _insert_aggregate_fact(
875913
_numeric_value(fact.value),
876914
fact.period.type,
877915
str(fact.period.value),
916+
fact.period.start_date,
917+
fact.period.end_date,
918+
fact.period.basis,
919+
fact.period.authority,
920+
fact.period.source_label,
921+
fact.period.accounting_basis,
878922
fact.geography.level,
879923
fact.geography.id,
880924
fact.geography.vintage,

0 commit comments

Comments
 (0)