Skip to content

Commit 37271be

Browse files
authored
Publish TRACE TRO declarations for US data releases (#746)
Align data-side TRO with canonical TROv 0.1 and the policyengine.py bundle-side TRO. Subagent review (CLEAN): (1) canonical TROv 0.1 vocabulary verified against trov.ttl; (2) canonical_json_bytes and the composition-fingerprint algorithm are byte-identical to policyengine.py; (3) provenance is carried in structured pe:* fields (not prose); (4) the JSON schema validates emitted TROs; (5) the uploader emits floating + versioned paths for both release_manifest.json and trace.tro.jsonld. Admin-merging: integration-tests hits a pre-existing test_aca_calibration tolerance failure identical to the one in PR #798; the TRO code isn't imported during the data build path. Companion issue PolicyEngine/policyengine.py#313 tracks the bundle-side namespace migration.
1 parent 0d6b6d7 commit 37271be

7 files changed

Lines changed: 831 additions & 0 deletions

File tree

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,3 +153,38 @@ make documentation
153153
```
154154

155155
Note: The Makefile uses the older `jb` command syntax, which may not work with Jupyter Book 2. Use `myst start` or `jupyter book start docs` instead.
156+
157+
## TRACE provenance output
158+
159+
Each US data release now publishes both:
160+
161+
- `release_manifest.json`
162+
- `trace.tro.jsonld`
163+
164+
The release manifest remains the operational source of truth for:
165+
166+
- published artifact paths and checksums
167+
- build IDs and timestamps
168+
- build-time `policyengine-us` provenance
169+
170+
`trace.tro.jsonld` is a generated TRACE declaration built from that manifest. It gives a
171+
standards-based provenance export over the same release artifacts, including a
172+
composition fingerprint across the release manifest and the artifacts it describes.
173+
174+
The TRO uses the canonical [TROv 0.1 vocabulary](https://w3id.org/trace/trov/0.1/) and
175+
surfaces PolicyEngine-specific build provenance under the `https://policyengine.org/trace/0.1#`
176+
extension namespace. Structured fields on the performance node
177+
(`pe:dataBuildFingerprint`, `pe:builtWithModelVersion`, `pe:builtWithModelGitSha`,
178+
`pe:dataBuildId`, `pe:emittedIn`) let a verifier cross-check this TRO against the
179+
certified-bundle TRO emitted by `policyengine.py` without parsing prose.
180+
181+
The emitted TRO is validated against `policyengine_us_data/schemas/trace_tro.schema.json`.
182+
183+
Important boundaries:
184+
185+
- the TRACE file does not replace the release manifest
186+
- the TRACE file does not decide model/data compatibility
187+
188+
For the broader certified-bundle architecture, see
189+
[`policyengine.py` release bundles](https://github.com/PolicyEngine/policyengine.py/blob/main/docs/release-bundles.md)
190+
and the official [TRACE specification](https://transparency-certified.github.io/trace-specification/docs/specifications/).
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Publish TRACE TRO declarations alongside US data release manifests on Hugging Face. The TRO uses canonical TROv 0.1 vocabulary, exposes structured `pe:*` build provenance fields (model version, Git SHA, data-build fingerprint, CI emission context), and ships with a JSON schema for downstream validation.
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
{
2+
"$schema": "https://json-schema.org/draft/2020-12/schema",
3+
"$id": "https://policyengine.org/schemas/policyengine-us-data/trace_tro/0.1.json",
4+
"title": "PolicyEngine US data release TRACE TRO",
5+
"description": "JSON Schema for a TRACE Transparent Research Object emitted alongside a policyengine-us-data release. Uses the canonical TROv 0.1 vocabulary with the PolicyEngine extension namespace.",
6+
"type": "object",
7+
"required": ["@context", "@graph"],
8+
"properties": {
9+
"@context": {
10+
"type": "array",
11+
"minItems": 1,
12+
"items": {
13+
"type": "object",
14+
"required": ["trov", "schema", "pe"],
15+
"properties": {
16+
"trov": {
17+
"type": "string",
18+
"const": "https://w3id.org/trace/trov/0.1#"
19+
},
20+
"schema": {
21+
"type": "string",
22+
"const": "https://schema.org/"
23+
},
24+
"pe": {
25+
"type": "string",
26+
"const": "https://policyengine.org/trace/0.1#"
27+
}
28+
}
29+
}
30+
},
31+
"@graph": {
32+
"type": "array",
33+
"minItems": 1,
34+
"items": { "$ref": "#/$defs/troNode" }
35+
}
36+
},
37+
"$defs": {
38+
"sha256Hex": {
39+
"type": "string",
40+
"pattern": "^[a-f0-9]{64}$"
41+
},
42+
"trovHash": {
43+
"type": "object",
44+
"required": ["trov:hashAlgorithm", "trov:hashValue"],
45+
"properties": {
46+
"trov:hashAlgorithm": { "type": "string", "const": "sha256" },
47+
"trov:hashValue": { "$ref": "#/$defs/sha256Hex" }
48+
}
49+
},
50+
"typeStringOrArray": {
51+
"oneOf": [
52+
{ "type": "string" },
53+
{
54+
"type": "array",
55+
"minItems": 1,
56+
"items": { "type": "string" }
57+
}
58+
]
59+
},
60+
"artifact": {
61+
"type": "object",
62+
"required": ["@id", "@type", "trov:hash"],
63+
"properties": {
64+
"@id": { "type": "string" },
65+
"@type": { "const": "trov:ResearchArtifact" },
66+
"schema:name": { "type": "string" },
67+
"trov:hash": { "$ref": "#/$defs/trovHash" },
68+
"trov:mimeType": { "type": "string" }
69+
}
70+
},
71+
"artifactLocation": {
72+
"type": "object",
73+
"required": ["@id", "@type", "trov:artifact", "trov:path"],
74+
"properties": {
75+
"@id": { "type": "string" },
76+
"@type": { "const": "trov:ArtifactLocation" },
77+
"trov:artifact": {
78+
"type": "object",
79+
"required": ["@id"],
80+
"properties": { "@id": { "type": "string" } }
81+
},
82+
"trov:path": { "type": "string", "minLength": 1 }
83+
}
84+
},
85+
"troNode": {
86+
"type": "object",
87+
"required": [
88+
"@id",
89+
"@type",
90+
"schema:name",
91+
"trov:wasAssembledBy",
92+
"trov:hasComposition",
93+
"trov:hasArrangement",
94+
"trov:hasPerformance"
95+
],
96+
"properties": {
97+
"@id": { "type": "string" },
98+
"@type": {
99+
"oneOf": [
100+
{ "const": "trov:TransparentResearchObject" },
101+
{
102+
"type": "array",
103+
"minItems": 1,
104+
"contains": { "const": "trov:TransparentResearchObject" },
105+
"items": { "type": "string" }
106+
}
107+
]
108+
},
109+
"trov:vocabularyVersion": { "type": "string" },
110+
"schema:name": { "type": "string", "minLength": 1 },
111+
"schema:description": { "type": "string" },
112+
"schema:dateCreated": { "type": "string" },
113+
"schema:creator": {},
114+
"trov:wasAssembledBy": {
115+
"type": "object",
116+
"required": ["@id", "@type", "schema:name"],
117+
"properties": {
118+
"@id": { "type": "string" },
119+
"@type": { "$ref": "#/$defs/typeStringOrArray" },
120+
"schema:name": { "type": "string" }
121+
}
122+
},
123+
"trov:createdWith": {
124+
"type": "object",
125+
"required": ["@type", "schema:name"],
126+
"properties": {
127+
"@type": { "const": "schema:SoftwareApplication" },
128+
"schema:name": { "type": "string" },
129+
"schema:softwareVersion": { "type": "string" }
130+
}
131+
},
132+
"trov:hasComposition": { "$ref": "#/$defs/composition" },
133+
"trov:hasArrangement": {
134+
"type": "array",
135+
"minItems": 1,
136+
"items": { "$ref": "#/$defs/arrangement" }
137+
},
138+
"trov:hasPerformance": {
139+
"type": "array",
140+
"minItems": 1,
141+
"items": { "$ref": "#/$defs/performance" }
142+
}
143+
}
144+
},
145+
"composition": {
146+
"type": "object",
147+
"required": ["@id", "@type", "trov:hasFingerprint", "trov:hasArtifact"],
148+
"properties": {
149+
"@id": { "type": "string" },
150+
"@type": { "const": "trov:ArtifactComposition" },
151+
"trov:hasFingerprint": {
152+
"type": "object",
153+
"required": ["@id", "@type", "trov:hash"],
154+
"properties": {
155+
"@id": { "type": "string" },
156+
"@type": { "const": "trov:CompositionFingerprint" },
157+
"trov:hash": { "$ref": "#/$defs/trovHash" }
158+
}
159+
},
160+
"trov:hasArtifact": {
161+
"type": "array",
162+
"minItems": 1,
163+
"items": { "$ref": "#/$defs/artifact" }
164+
}
165+
}
166+
},
167+
"arrangement": {
168+
"type": "object",
169+
"required": ["@id", "@type", "trov:hasArtifactLocation"],
170+
"properties": {
171+
"@id": { "type": "string" },
172+
"@type": { "const": "trov:ArtifactArrangement" },
173+
"rdfs:comment": { "type": "string" },
174+
"trov:hasArtifactLocation": {
175+
"type": "array",
176+
"minItems": 1,
177+
"items": { "$ref": "#/$defs/artifactLocation" }
178+
}
179+
}
180+
},
181+
"performance": {
182+
"type": "object",
183+
"required": [
184+
"@id",
185+
"@type",
186+
"trov:wasConductedBy",
187+
"trov:contributedToArrangement",
188+
"pe:emittedIn",
189+
"pe:dataBuildId"
190+
],
191+
"properties": {
192+
"@id": { "type": "string" },
193+
"@type": { "const": "trov:TrustedResearchPerformance" },
194+
"rdfs:comment": { "type": "string" },
195+
"trov:wasConductedBy": {
196+
"type": "object",
197+
"required": ["@id"],
198+
"properties": { "@id": { "type": "string" } }
199+
},
200+
"trov:contributedToArrangement": {
201+
"type": "object",
202+
"required": ["@id"],
203+
"properties": { "@id": { "type": "string" } }
204+
},
205+
"trov:startedAtTime": { "type": "string" },
206+
"trov:endedAtTime": { "type": "string" },
207+
"pe:emittedIn": {
208+
"type": "string",
209+
"enum": ["local", "github-actions"]
210+
},
211+
"pe:ciRunUrl": { "type": "string", "format": "uri" },
212+
"pe:ciGitSha": { "type": "string" },
213+
"pe:ciGitRef": { "type": "string" },
214+
"pe:dataBuildId": { "type": "string" },
215+
"pe:builtWithModelPackageName": { "type": "string" },
216+
"pe:builtWithModelVersion": { "type": "string" },
217+
"pe:builtWithModelGitSha": { "type": "string" },
218+
"pe:dataBuildFingerprint": { "type": "string" }
219+
}
220+
}
221+
}
222+
}

policyengine_us_data/tests/test_release_manifest.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
RELEASE_MANIFEST_SCHEMA_VERSION,
1212
build_release_manifest,
1313
)
14+
from policyengine_us_data.utils.trace_tro import TRACE_TRO_FILENAME
1415

1516

1617
def _write_file(path: Path, content: bytes) -> Path:
@@ -181,6 +182,8 @@ def test_upload_files_to_hf_adds_release_manifest_operations(tmp_path):
181182
assert "enhanced_cps_2024.h5" in operation_paths
182183
assert "release_manifest.json" in operation_paths
183184
assert "releases/1.73.0/release_manifest.json" in operation_paths
185+
assert TRACE_TRO_FILENAME in operation_paths
186+
assert f"releases/1.73.0/{TRACE_TRO_FILENAME}" in operation_paths
184187

185188
release_ops = [
186189
operation
@@ -192,6 +195,16 @@ def test_upload_files_to_hf_adds_release_manifest_operations(tmp_path):
192195
assert isinstance(operation, CommitOperationAdd)
193196
assert isinstance(operation.path_or_fileobj, BytesIO)
194197

198+
trace_ops = [
199+
operation
200+
for operation in operations
201+
if operation.path_in_repo.endswith(".jsonld")
202+
]
203+
assert len(trace_ops) == 2
204+
for operation in trace_ops:
205+
assert isinstance(operation, CommitOperationAdd)
206+
assert isinstance(operation.path_or_fileobj, BytesIO)
207+
195208

196209
def test_upload_files_to_hf_does_not_tag_until_finalize(tmp_path):
197210
dataset_path = _write_file(

0 commit comments

Comments
 (0)