Skip to content

Commit d0cf22c

Browse files
committed
feat(gks): add support for vrs alleles
close #20 * In addition to Categorical Variants, VRS Alleles will be recognized
1 parent 3ac0bba commit d0cf22c

5 files changed

Lines changed: 123 additions & 42 deletions

File tree

clinvar_this/io/gks_json/base.py

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
VariantOncogenicityProposition,
2727
)
2828
from ga4gh.va_spec.ccv_2022 import VariantOncogenicityStatement
29-
from ga4gh.vrs.models import Expression, Syntax
29+
from ga4gh.vrs.models import Allele, Expression, MolecularVariation, Syntax
3030
from pydantic import BaseModel, ConfigDict
3131

3232

@@ -185,19 +185,21 @@ def _read_file(
185185

186186
@staticmethod
187187
def _get_variant_hgvs(
188-
variant: CategoricalVariant,
188+
variant: CategoricalVariant | Allele,
189189
) -> str | None:
190190
"""Retrieve a HGVS expression for a variant
191191
192-
Checks the first constraint for an expression. Only support
193-
DefiningAlleleConstraints at the moment.
192+
For Categorical Variants, checks the first constraint for an expression. Only
193+
support extracting from DefiningAlleleConstraints at the moment.
194+
If no constraints found, then checks the expressions extension. These are cases
195+
where an HGVS expression is unable to be represented using VRS.
194196
195-
If no constraints found, then checks the expressions extension.
197+
For VRS Alleles, checks the `expressions` field.
196198
197199
Order matters: the first matching expression is returned. cDNA RefSeq HGVS
198200
expressions are prioritized over genomic RefSeq HGVS expressions.
199201
200-
:param variant: Categorical Variant
202+
:param variant: Variant associated to statement
201203
:return: cDNA RefSeq HGVS expression or genomic RefSeq HGVS expression for a
202204
variant, if provided.
203205
"""
@@ -225,20 +227,24 @@ def get_hgvs(
225227
return None
226228

227229
expressions = None
228-
if getattr(variant, "constraints", None) and variant.constraints:
229-
constraint = variant.constraints[0]
230-
if isinstance(constraint.root, DefiningAlleleConstraint):
231-
expressions = constraint.root.allele.expressions
230+
231+
if isinstance(variant, CategoricalVariant):
232+
if getattr(variant, "constraints", None) and variant.constraints:
233+
constraint = variant.constraints[0]
234+
if isinstance(constraint.root, DefiningAlleleConstraint):
235+
expressions = constraint.root.allele.expressions
236+
else:
237+
# Case where a VRS Allele is unable to be represented, can store
238+
# expressions as an extension named 'expressions'
239+
try:
240+
expressions_ext = next(
241+
ext for ext in variant.extensions if ext.name == "expressions"
242+
).value
243+
expressions = [Expression(**ext) for ext in expressions_ext]
244+
except (StopIteration, TypeError):
245+
return None
232246
else:
233-
# Case where a VRS Allele is unable to be represented, can store
234-
# expressions as an extension named 'expressions'
235-
try:
236-
expressions_ext = next(
237-
ext for ext in variant.extensions if ext.name == "expressions"
238-
).value
239-
expressions = [Expression(**ext) for ext in expressions_ext]
240-
except (StopIteration, TypeError):
241-
return None
247+
expressions = variant.expressions
242248

243249
return get_hgvs(expressions, Syntax.HGVS_C) or get_hgvs(
244250
expressions, Syntax.HGVS_G
@@ -518,29 +524,25 @@ def _get_clinvar_accession_and_record_status(statement: GksStatementT) -> dict:
518524

519525
def _get_variant_set(
520526
self,
521-
proposition: VariantTherapeuticResponseProposition
522-
| VariantDiagnosticProposition
523-
| VariantPrognosticProposition,
527+
gene_context: MappableConcept,
528+
variant: CategoricalVariant | Allele,
524529
variant_hgvs: str | None = None,
525530
) -> SubmissionVariantSet:
526531
"""Get variant set
527532
528533
This assumes only a single submission variant.
529534
530-
:param proposition: Proposition for a given statement.
535+
:param gene_context: Gene associated to statement
536+
:param variant: Variant associated to statement
531537
:param variant_hgvs: The HGVS expression for a variant, if found.
532538
:return: Variant set for a proposition
533539
"""
534540
return SubmissionVariantSet(
535541
variant=[
536542
SubmissionVariant(
537543
hgvs=variant_hgvs,
538-
gene=[
539-
SubmissionVariantGene(
540-
symbol=proposition.geneContextQualifier.name
541-
)
542-
],
543-
alternate_designations=proposition.subjectVariant.aliases,
544+
gene=[SubmissionVariantGene(symbol=gene_context.name)],
545+
alternate_designations=variant.aliases,
544546
)
545547
]
546548
)
@@ -573,6 +575,7 @@ def _get_method_type(
573575
def _get_submission(
574576
self,
575577
statement: GksStatementT,
578+
variant: CategoricalVariant | Allele,
576579
observed_in: list[SubmissionObservedInSomatic],
577580
variant_hgvs: str | None = None,
578581
submitted_assembly: Assembly | None = None,
@@ -587,6 +590,7 @@ def _get_submission(
587590
have record status as `update` rather than `novel`.
588591
589592
:param statement: GKS statement instance to transform
593+
:param variant: Variant associated to statement
590594
:param observed_in: List of distinct ClinVar somatic observations associated
591595
with the statement
592596
:param variant_hgvs: The HGVS expression for a variant, if found
@@ -616,17 +620,31 @@ def records_to_submission_container(
616620
submissions = []
617621

618622
for statement in statements:
623+
statement_id = statement.id
619624
variant = statement.proposition.subjectVariant
620625

621-
if not isinstance(variant, CategoricalVariant):
626+
if isinstance(variant, MolecularVariation):
627+
if not isinstance(variant.root, Allele):
628+
logger.warning(
629+
"Skipping statement. Molecular Variation is not an Allele for statement ID: %s",
630+
statement_id,
631+
)
632+
continue
633+
634+
variant = variant.root
635+
elif not isinstance(variant, CategoricalVariant):
636+
logger.warning(
637+
"Skipping statement. Variant is not a Categorical Variant or MolecularVariation for statement ID: %s",
638+
statement_id,
639+
)
622640
continue
623641

624642
variant_hgvs = self._get_variant_hgvs(variant)
625643

626644
if not variant_hgvs:
627645
logger.warning(
628646
"Skipping statement. No HGVS found for statement ID: %s",
629-
statement.id,
647+
statement_id,
630648
)
631649
continue
632650

@@ -641,19 +659,19 @@ def records_to_submission_container(
641659
if not observed_in:
642660
logger.warning(
643661
"Skipping statement. No observed_in found for statement ID: %s",
644-
statement.id,
662+
statement_id,
645663
)
646664
continue
647665

648666
submissions.append(
649667
self._get_submission(
650668
statement,
669+
variant,
651670
observed_in,
652671
variant_hgvs=variant_hgvs,
653672
submitted_assembly=batch_metadata.submitted_assembly,
654673
)
655674
)
656-
657675
return SubmissionContainer(
658676
assertion_criteria=self.assertion_criteria,
659677
**{self.submission_container_attribute: submissions},

clinvar_this/io/gks_json/clinical_impact_transformer.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616
PrognosticPredicate,
1717
TherapeuticResponsePredicate,
1818
)
19+
from ga4gh.vrs.models import Allele
20+
from ga4gh.cat_vrs.models import CategoricalVariant
1921

2022
from clinvar_api.models import (
2123
Assembly,
2224
CitationDb,
23-
RecordStatus,
2425
SubmissionAssertionCriteria,
2526
SubmissionClinicalImpactSubmission,
2627
)
@@ -84,6 +85,7 @@ class ClinicalImpactTransformer(
8485
def _get_submission(
8586
self,
8687
statement: VariantClinicalSignificanceStatement,
88+
variant: CategoricalVariant | Allele,
8789
observed_in: list[SubmissionObservedInSomatic],
8890
variant_hgvs: str | None = None,
8991
submitted_assembly: Assembly | None = None,
@@ -103,6 +105,7 @@ def _get_submission(
103105
have record status as `update` rather than `novel`.
104106
105107
:param statement: GKS statement (therapeutic, diagnostic, or prognostic) to transform
108+
:param variant: Variant associated to statement
106109
:param observed_in: List of distinct observations
107110
:param variant_hgvs: The HGVS expression for a variant, if found
108111
:param submitted_assembly: The genome assembly used to call the variant.
@@ -119,12 +122,14 @@ def _get_submission(
119122
drug_for_therapeutic_assertion = None
120123

121124
return SubmissionClinicalImpactSubmission(
122-
local_id=proposition.subjectVariant.id or proposition.subjectVariant.name,
125+
local_id=variant.id or variant.name,
123126
submitted_assembly=submitted_assembly,
124127
local_key=statement.id,
125128
observed_in=observed_in,
126129
condition_set=self._get_condition_set(proposition),
127-
variant_set=self._get_variant_set(proposition, variant_hgvs=variant_hgvs),
130+
variant_set=self._get_variant_set(
131+
proposition.geneContextQualifier, variant, variant_hgvs=variant_hgvs
132+
),
128133
clinical_impact_classification=SomaticClinicalImpactClassification(
129134
clinical_impact_classification_description=_IMPACT_CLASS_MAPPING[
130135
statement.classification.primaryCoding.code.root

clinvar_this/io/gks_json/oncogenicity_transformer.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@
99
from clinvar_api.models import (
1010
Assembly,
1111
CitationDb,
12-
RecordStatus,
1312
SubmissionAssertionCriteria,
1413
SubmissionOncogenicitySubmission,
1514
SomaticOncogenicityClassification,
1615
)
16+
from ga4gh.vrs.models import Allele
17+
from ga4gh.cat_vrs.models import CategoricalVariant
1718
from ga4gh.va_spec.ccv_2022 import VariantOncogenicityStatement
1819
from clinvar_api.models.sub_payload import (
1920
SubmissionObservedInSomatic,
@@ -33,6 +34,7 @@ class OncogenicityTransformer(GksJsonTransformer[VariantOncogenicityStatement]):
3334
def _get_submission(
3435
self,
3536
statement: VariantOncogenicityStatement,
37+
variant: CategoricalVariant | Allele,
3638
observed_in: list[SubmissionObservedInSomatic],
3739
variant_hgvs: str | None = None,
3840
submitted_assembly: Assembly | None = None,
@@ -49,6 +51,7 @@ def _get_submission(
4951
have record status as `update` rather than `novel`.
5052
5153
:param statement: GKS statement (oncogenicity) to transform
54+
:param variant: Variant associated to statement
5255
:param observed_in: List of distinct observations
5356
:param variant_hgvs: The HGVS expression for a variant, if found
5457
:param submitted_assembly: The genome assembly used to call the variant.
@@ -59,12 +62,14 @@ def _get_submission(
5962
proposition = statement.proposition
6063

6164
return SubmissionOncogenicitySubmission(
62-
local_id=proposition.subjectVariant.id or proposition.subjectVariant.name,
65+
local_id=variant.id or variant.name,
6366
submitted_assembly=submitted_assembly,
6467
local_key=statement.id,
6568
observed_in=observed_in,
6669
condition_set=self._get_condition_set(proposition),
67-
variant_set=self._get_variant_set(proposition, variant_hgvs=variant_hgvs),
70+
variant_set=self._get_variant_set(
71+
proposition.geneContextQualifier, variant, variant_hgvs=variant_hgvs
72+
),
6873
oncogenicity_classification=SomaticOncogenicityClassification(
6974
oncogenicity_classification_description=statement.classification.primaryCoding.code.root.capitalize(),
7075
comment=self._get_comment(statement),

docs/file_formats.rst

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,9 +164,17 @@ The following information is required or must be derivable from each statement:
164164

165165
- ``proposition.subjectVariant``
166166

167-
- Expects a GA4GH VA-Spec Categorical Variant.
168-
- HGVS expressions are extracted from the first ``DefiningAlleleConstraint`` in ``proposition.subjectVariant.constraints``.
169-
- If no supported constraint is present, HGVS expressions may be provided in an ``expressions`` extension on ``proposition.subjectVariant``.
167+
- Expects a GA4GH Cat-VRS Categorical Variant or VRS Allele.
168+
169+
- For Categorical Variants:
170+
171+
- HGVS expressions are extracted from the ``expressions`` field of the allele in the first ``DefiningAlleleConstraint`` in ``proposition.subjectVariant.constraints``.
172+
- If no supported constraint is present, HGVS expressions may be provided in an ``expressions`` extension on ``proposition.subjectVariant``.
173+
174+
- For Alleles:
175+
176+
- HGVS expressions are extracted from ``proposition.subjectVariant.expressions``.
177+
170178
- RefSeq transcript HGVS expressions are preferred over RefSeq genomic HGVS expressions.
171179
- The selected HGVS expression is used for the ClinVar variant description.
172180
- ``proposition.subjectVariant.id`` or ``proposition.subjectVariant.name`` is used as the ClinVar local ID.

tests/clinvar_this/io/gks_json/test_clinical_impact_transformer.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Module for testing GKS Clinical Impact Transformer"""
22

3+
from copy import deepcopy
34
import re
45

56
from deepdiff import DeepDiff
@@ -75,6 +76,38 @@ def civic_aid7(clinical_impact_gks_json_data):
7576
return VariantClinicalSignificanceStatement(**clinical_impact_gks_json_data[1])
7677

7778

79+
@pytest.fixture(scope="module")
80+
def civic_aid7_allele(clinical_impact_gks_json_data):
81+
"""Create test fixture for CIViC AID7 where a VRS Allele is used instead"""
82+
params = deepcopy(clinical_impact_gks_json_data[1])
83+
subject_variant = {
84+
"id": "ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R",
85+
"type": "Allele",
86+
"name": "NM_004333.6:c.1799T>A",
87+
"digest": "W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R",
88+
"expressions": [{"syntax": "hgvs.c", "value": "NM_004333.6:c.1799T>A"}],
89+
"location": {
90+
"id": "ga4gh:SL.8HBKs9fzlT3tKWlM03REjkg_0Om6Y33U",
91+
"type": "SequenceLocation",
92+
"digest": "8HBKs9fzlT3tKWlM03REjkg_0Om6Y33U",
93+
"sequenceReference": {
94+
"type": "SequenceReference",
95+
"refgetAccession": "SQ.aKMPEJgmlZXt_F6gRY5cUG3THH2n-GUa",
96+
"moleculeType": "RNA",
97+
},
98+
"start": 2024,
99+
"end": 2025,
100+
},
101+
"state": {"type": "LiteralSequenceExpression", "sequence": "A"},
102+
}
103+
104+
for el in params["hasEvidenceLines"]:
105+
el["targetProposition"]["subjectVariant"] = subject_variant
106+
107+
params["proposition"]["subjectVariant"] = subject_variant
108+
return VariantClinicalSignificanceStatement(**params)
109+
110+
78111
@pytest.fixture(scope="module")
79112
def civic_aid20(clinical_impact_gks_json_data):
80113
"""Create test fixture for CIViC AID20"""
@@ -527,3 +560,15 @@ def test_contributions(clinical_impact_transformer, civic_aid200, civic_metadata
527560
].clinical_impact_classification.date_last_evaluated
528561
== "2026-04-16"
529562
)
563+
564+
565+
def test_vrs_allele(clinical_impact_transformer, civic_aid7_allele, civic_metadata):
566+
"""Test that VRS Alleles work correctly"""
567+
actual = clinical_impact_transformer.records_to_submission_container(
568+
[civic_aid7_allele], civic_metadata
569+
)
570+
assert len(actual.clinical_impact_submission) == 1
571+
assert len(actual.clinical_impact_submission[0].variant_set.variant) == 1
572+
assert actual.clinical_impact_submission[0].variant_set.variant[0].model_dump(
573+
exclude_none=True
574+
) == {"hgvs": "NM_004333.6:c.1799T>A", "gene": [{"symbol": "BRAF"}]}

0 commit comments

Comments
 (0)