{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "cr": "http://mlcommons.org/croissant/",
    "rai": "http://mlcommons.org/croissant/RAI/",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "dct": "http://purl.org/dc/terms/",
    "equivalentProperty": "cr:equivalentProperty",
    "examples": {
      "@id": "cr:examples",
      "@type": "@json"
    },
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:regex",
    "repeated": "cr:repeated",
    "replace": "cr:replace",
    "samplingRate": "cr:samplingRate",
    "sc": "https://schema.org/",
    "separator": "cr:separator",
    "source": "cr:source",
    "subField": "cr:subField",
    "transform": "cr:transform",
    "prov": "http://www.w3.org/ns/prov#"
  },
  "@type": "sc:Dataset",
  "name": "probing-concepts",
  "version": "1.0.0",
  "datePublished": "2024-05-01",
  "license": "https://www.apache.org/licenses/LICENSE-2.0",
  "citeAs": "Probing Concepts Dataset",
  "conformsTo": "http://mlcommons.org/croissant/1.1",
  "rai:dataBiases": [
    "Selection bias toward Western scientific traditions: domain selection and ontology choices reflect Eurocentric biomedical and natural-science taxonomies. Traditional medicine, indigenous knowledge systems, and non-canonical scientific frameworks are not represented.",
    "Source-ontology bias: concepts inherit any biases of their source ontologies (e.g. ICD-11 reflects WHO classification decisions; HGNC/Ensembl reflects predominantly human-genome research priorities, which historically over-sample populations of European ancestry).",
    "Domain-size imbalance: the seven domains do not contain equal numbers of concepts (~56 in physics to ~89 in chemistry), so aggregate metrics computed across domains weight some areas more heavily than others if not stratified.",
    "Curator bias: concepts and referents were selected by a single curator per concept following a written proof-reading rubric; no inter-annotator agreement was measured. Topic emphasis within each domain reflects the curator's choice of which branches of each ontology to sample.",
    "Evaluation bias: response scoring uses an LLM-as-judge approach (Anthropic Claude Sonnet 4.6 by default). The judge is given the authoritative referent list and asked to perform overlap detection rather than open-ended evaluation, which reduces but does not eliminate scorer-self-preference bias."
  ],
  "rai:dataLimitations": [
    "Domain coverage is restricted to seven scientific domains (biology, botany, chemistry, geology, medicine, musicology, physics) chosen because they have well-maintained, publicly accessible authoritative ontologies; non-scientific domains (law, history, social sciences, humanities) and non-Western knowledge systems are not represented.",
    "All concepts, definitions, selection criteria, and referents are in English. Latin scientific names appear in biology, botany and geology, but no multilingual coverage is provided and the benchmark has not been validated in other languages.",
    "Concept and referent coverage is bounded by the granularity and completeness of the source ontologies (e.g. GO, CHEBI, WFO/POWO, IMA/RRUFF, Hornbostel-Sachs/MIMO, WHO ICD-11, PDG Review of Particle Physics). Rare, recently described, or non-classical entities may be under-represented; ICD-11-derived medicine concepts are bounded to a maximum tree depth of 6.",
    "Referents are drawn from canonical naming conventions in each source (e.g. HGNC gene symbols, IMA-approved mineral names, ICD-11 codes); colloquial names, regional synonyms, and alternative naming conventions are not systematically captured.",
    "The benchmark is NOT recommended for: clinical decision support, medical diagnosis, treatment selection, or any safety-critical application; comprehensive assessment of domain expertise (each domain samples ~60-90 concepts and is not exhaustive); fine-tuning models for production deployment without domain-specific revalidation; or evaluating model knowledge in languages other than English."
  ],
  "rai:dataSocialImpact": "Positive impact: the benchmark provides a reproducible, ontology-grounded measurement of conceptual understanding that can be used to compare LLMs across scientific domains, support public reporting of model capabilities and limitations, and aid researchers in identifying knowledge gaps in widely deployed models. Risks: model scores on this benchmark could be misinterpreted as evidence of broader scientific competence (e.g. clinical or safety competence) for which the benchmark has not been validated; results could be over-generalised to languages, domains, or knowledge systems not covered. Mitigations: the data card explicitly enumerates uses for which the benchmark is NOT recommended, the scoring pipeline is open-source and deterministic given a scoring model, and per-concept and per-domain scores are released alongside aggregate numbers to discourage single-number summaries.",
  "rai:dataUseCases": [
    "Construct: the benchmark is intended to measure conceptual understanding in large language models, operationalised as the ability to map between (a) a canonical concept name, (b) a natural-language definition, (c) an ordered set of necessary-and-sufficient selection criteria, and (d) a curated set of referents (positive instances) drawn from authoritative scientific ontologies.",
    "Established use cases: (1) probing conceptual knowledge of LLMs across model families, sizes, and training cutoffs; (2) comparing models via decide-concept, decide-concept-from-selection-criteria, list-referents, decide-referents, and semantic-field-size tasks defined in tests.json; (3) computing per-domain and aggregate scores using the LLM-as-judge scoring pipeline in src/probing_concepts/score_responses.py; (4) qualitative failure-mode analysis via src/probing_concepts/failure_modes.py.",
    "Use cases for which validity has NOT been established: clinical decision support or medical diagnosis (the medicine concepts are scientific classifications, not diagnostic tools); training-data quality auditing; safety evaluation; fairness auditing for protected demographic groups (the dataset does not encode demographic attributes); evaluation of multilingual or non-English models; and direct fine-tuning for downstream production tasks without additional domain-specific validation."
  ],
  "rai:personalSensitiveInformation": "The dataset contains no personal or sensitive information. Concepts and referents are scientific terms, gene symbols, chemical entities, mineral names, musical instrument classifications, particle physics terms, and ICD-11 disease categories. No patient data, no individual identifiers, no personal health information, and no demographic attributes (gender, age, race, ethnicity, geography, socio-economic status, religion, or political affiliation) are associated with any concept or referent.",
  "url": "https://github.com/pfizer-opensource/probing-concepts-release",
  "distribution": [
    {
      "@type": "cr:FileObject",
      "@id": "repository-archive",
      "name": "repository-archive",
      "contentUrl": "https://raw.githubusercontent.com/pfizer-opensource/probing-concepts-release/main/concepts.zip",
      "encodingFormat": "application/zip",
      "md5": "2ac2c72cd316e76a0274a7fa1cdb06d3"
    },
    {
      "@type": "cr:FileObject",
      "@id": "concept-records",
      "name": "concept-records",
      "contentUrl": "https://raw.githubusercontent.com/pfizer-opensource/probing-concepts-release/main/concepts_records.jsonl",
      "encodingFormat": "application/jsonlines",
      "md5": "6428b782a26b2577590e5db642d62d99"
    },
    {
      "@type": "cr:FileSet",
      "@id": "concept-files",
      "name": "concept-files",
      "containedIn": {
        "@id": "repository-archive"
      },
      "encodingFormat": "application/json",
      "includes": "concepts/*/*.json"
    }
  ],
  "recordSet": [
    {
      "@type": "cr:RecordSet",
      "@id": "concepts",
      "name": "concepts",
      "field": [
        {
          "@type": "cr:Field",
          "@id": "concepts/domain",
          "name": "domain",
          "dataType": "sc:Text",
          "source": {
            "fileObject": {
              "@id": "concept-records"
            },
            "extract": {
              "column": "domain"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "concepts/concept",
          "name": "concept",
          "dataType": "sc:Text",
          "source": {
            "fileObject": {
              "@id": "concept-records"
            },
            "extract": {
              "column": "concept"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "concepts/definition",
          "name": "definition",
          "dataType": "sc:Text",
          "source": {
            "fileObject": {
              "@id": "concept-records"
            },
            "extract": {
              "column": "definition"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "concepts/selection_criteria_json",
          "name": "selection_criteria_json",
          "dataType": "sc:Text",
          "source": {
            "fileObject": {
              "@id": "concept-records"
            },
            "extract": {
              "column": "selection_criteria_json"
            }
          }
        }
      ]
    }
  ],
  "rai:hasSyntheticData": false,
  "prov:wasDerivedFrom": [
    {
      "@id": "https://icd.who.int/browse11",
      "prov:label": "WHO ICD-11 Browser"
    },
    {
      "@id": "https://id.who.int/icd/release/11/2024-01/mms",
      "prov:label": "WHO ICD-11 MMS API"
    },
    {
      "@id": "https://www.ebi.ac.uk/ols4/",
      "prov:label": "EBI Ontology Lookup Service (OLS4)"
    },
    {
      "@id": "http://geneontology.org/",
      "prov:label": "Gene Ontology"
    },
    {
      "@id": "https://obofoundry.org/",
      "prov:label": "OBO Foundry"
    },
    {
      "@id": "https://reactome.org/",
      "prov:label": "Reactome"
    },
    {
      "@id": "https://www.ebi.ac.uk/chebi/",
      "prov:label": "ChEBI"
    },
    {
      "@id": "https://www.worldfloraonline.org/",
      "prov:label": "World Flora Online"
    },
    {
      "@id": "https://powo.science.kew.org/",
      "prov:label": "Plants of the World Online"
    },
    {
      "@id": "https://planteome.org/",
      "prov:label": "Planteome"
    },
    {
      "@id": "https://www.ima-mineralogy.org/",
      "prov:label": "International Mineralogical Association"
    },
    {
      "@id": "https://rruff.info/ima/",
      "prov:label": "RRUFF IMA Database"
    },
    {
      "@id": "https://www.mindat.org/",
      "prov:label": "Mindat"
    },
    {
      "@id": "https://www.usgs.gov/",
      "prov:label": "USGS"
    },
    {
      "@id": "https://mimo-international.com/MIMO/",
      "prov:label": "MIMO"
    },
    {
      "@id": "https://pdg.lbl.gov/2024/",
      "prov:label": "Particle Data Group Review of Particle Physics"
    }
  ],
  "prov:wasGeneratedBy": [
    {
      "@type": "prov:Activity",
      "prov:type": {
        "@id": "https://www.wikidata.org/wiki/Q4929239"
      },
      "prov:label": "Concept annotation and Croissant export",
      "sc:description": "Human curators sampled concepts from the authoritative public ontologies and registries listed in prov:wasDerivedFrom and recorded each concept as a JSON file with a canonical name, definition, ordered selection criteria, optional ontology/ontology_id reference, and curated referent set, following the proof-reading rubric in concepts/readme/rules_to_proof_read_concept_definitions.md and the per-domain annotation guidance in concepts/readme/. The 27 ICD-11 medicine concepts added in February 2025 were extracted programmatically from the WHO ICD-11 MMS API; all other concepts were manually curated. Every concept is validated against the schema enforced by src/probing_concepts/validate_concepts.py (probing-validate) prior to release. For Croissant publication, the raw concept archive is distributed as concepts.zip and a derived flat JSONL export, concepts_records.jsonl, is generated from the concept files to provide scalar fields for Croissant record generation; nested referent data remains available in the raw archive rather than the Croissant record set. No LLM-generated content is included in the concept inventory; LLMs are used only downstream to produce model responses (probing-test) and as judges during scoring (probing-score)."
    }
  ]
}