Skip to content

Commit 987fe18

Browse files
maverbiestanna-parkeractions-user
authored
feat!(ingest): Add host field for INSDC sequences during ingest (#6534)
resolves #6533 #6295 This PR modifies ingest to ensure host organism information is processed uniformly for INSDC-ingested sequences and direct submissions. Because of this, we can also remove INSDC-specific behaviour from preprocessing. This will be done in a separate PR once we know this ingest change is correct and stable. Concretely, ingest now collapses submissions with `hostTaxonId` and/or `hostNameScientific` into a single `host` field (removing `hostTaxonId` and `hostNameScientific`). The `hostNameCommon` field is also removed. ## Breaking changes Not strictly breaking, but when rolling out this PR, this DB surgery needs to be run to make sure existing INSDC submissions fit the new model: <details> <summary>An old command with a WHERE clause that was too strict</summary> The command to run: ```sql UPDATE sequence_entries SET unprocessed_data = jsonb_set( unprocessed_data, '{metadata}', ((unprocessed_data -> 'metadata') - 'hostTaxonId' - 'hostNameScientific' - 'hostNameCommon') || CASE WHEN COALESCE( NULLIF(unprocessed_data -> 'metadata' ->> 'hostTaxonId', ''), NULLIF(unprocessed_data -> 'metadata' ->> 'hostNameScientific', '') ) IS NOT NULL THEN jsonb_build_object('host', COALESCE( NULLIF(unprocessed_data -> 'metadata' ->> 'hostTaxonId', ''), NULLIF(unprocessed_data -> 'metadata' ->> 'hostNameScientific', ''))) ELSE '{}'::jsonb END ) WHERE unprocessed_data IS NOT NULL AND (unprocessed_data -> 'metadata' ? 'hostTaxonId' OR unprocessed_data -> 'metadata' ? 'hostNameScientific' OR unprocessed_data -> 'metadata' ? 'hostNameCommon') AND NOT (unprocessed_data -> 'metadata' ? 
'host'); ``` </details> ```sql UPDATE sequence_entries SET unprocessed_data = jsonb_set( unprocessed_data, '{metadata}', ((unprocessed_data -> 'metadata') - 'hostTaxonId' - 'hostNameScientific' - 'hostNameCommon') || CASE WHEN COALESCE( NULLIF(unprocessed_data -> 'metadata' ->> 'hostTaxonId', ''), NULLIF(unprocessed_data -> 'metadata' ->> 'hostNameScientific', '') ) IS NOT NULL THEN jsonb_build_object('host', COALESCE( NULLIF(unprocessed_data -> 'metadata' ->> 'hostTaxonId', ''), NULLIF(unprocessed_data -> 'metadata' ->> 'hostNameScientific', ''))) ELSE '{}'::jsonb END ) WHERE unprocessed_data IS NOT NULL AND (unprocessed_data -> 'metadata' ? 'hostTaxonId' OR unprocessed_data -> 'metadata' ? 'hostNameScientific' OR unprocessed_data -> 'metadata' ? 'hostNameCommon') AND NULLIF(unprocessed_data -> 'metadata' ->> 'host', '') IS NULL; ``` ### Screenshot ### PR Checklist - [x] All necessary documentation has been adapted. - [x] The implemented feature is covered by appropriate, automated tests. - [x] Any manual testing that has been done is documented (i.e. what exactly was tested?) 🚀 Preview: Add `preview` label to enable --------- Co-authored-by: anna-parker <50943381+anna-parker@users.noreply.github.com> Co-authored-by: GitHub Action <action@github.com>
1 parent cfe4f07 commit 987fe18

5 files changed

Lines changed: 39 additions & 9 deletions

File tree

ingest/scripts/heuristic_group_segments.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from typing import Final
2828

2929
import click
30+
from prepare_metadata import resolve_host_information
3031
import orjsonl
3132
import yaml
3233

@@ -267,6 +268,8 @@ def main(
267268
json.dumps(filtered_record, sort_keys=True).encode(), usedforsecurity=False
268269
).hexdigest()
269270

271+
row = resolve_host_information(row)
272+
270273
orjsonl.append(output_metadata, {"id": joint_key, "metadata": row})
271274
count += 1
272275

ingest/scripts/override_group_segments.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from typing import Any, Final
3636

3737
import click
38+
from prepare_metadata import resolve_host_information
3839
import orjsonl # type: ignore
3940
import requests
4041
import yaml
@@ -170,7 +171,7 @@ def get_metadata_of_group(
170171
json.dumps(filtered_record, sort_keys=True).encode(), usedforsecurity=False
171172
).hexdigest()
172173

173-
return grouped_metadata
174+
return resolve_host_information(grouped_metadata)
174175

175176

176177
def group_records(
@@ -236,7 +237,9 @@ def write_grouped_metadata(
236237
prev_acc = f"{acc}.{prev_ver}"
237238
if prev_acc in groups.accession_to_group:
238239
group = groups.accession_to_group[prev_acc]
239-
logger.warning(f"Matched {full_accession} to group via previous version {prev_acc}")
240+
logger.warning(
241+
f"Matched {full_accession} to group via previous version {prev_acc}"
242+
)
240243
break
241244

242245
if group is None:
@@ -334,13 +337,13 @@ def get_groups_object(groups_json_path: str) -> Groups:
334337
"--groups",
335338
required=True,
336339
type=click.Path(exists=True),
337-
help="Path to the JSON file containing the map from group names to lists of accessions."
340+
help="Path to the JSON file containing the map from group names to lists of accessions.",
338341
)
339342
@click.option(
340343
"--input-seq",
341344
required=True,
342345
type=click.Path(exists=True),
343-
help="Path to the JSONL file of input data."
346+
help="Path to the JSONL file of input data.",
344347
)
345348
@click.option("--input-metadata", required=True, type=click.Path(exists=True))
346349
@click.option("--output-seq", required=True, type=click.Path())
@@ -355,7 +358,7 @@ def get_groups_object(groups_json_path: str) -> Groups:
355358
@click.option(
356359
"--match-previous-accession-versions/--no-match-previous-accession-versions",
357360
default=False,
358-
help="Whether to match against previous versions of accessions (e.g., XX123.1 when XX123.2 is provided)"
361+
help="Whether to match against previous versions of accessions (e.g., XX123.1 when XX123.2 is provided)",
359362
)
360363
def main( # noqa: PLR0913, PLR0917
361364
config_file: str,

ingest/scripts/prepare_metadata.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,25 @@ class Config:
3636
segmented: bool
3737

3838

39+
def resolve_host_information(record: dict[str, str]) -> dict[str, str]:
    """Collapse the INSDC host fields of a record into a single ``host`` field.

    The new ``host`` value is taken from ``hostTaxonId`` if that is present
    and non-empty, otherwise from ``hostNameScientific``, otherwise it is the
    empty string. This keeps INSDC-ingested records consistent with how
    direct submissions specify the host organism. Any existing
    ``hostTaxonId``, ``hostNameScientific`` and ``hostNameCommon`` fields are
    removed from the record, and any existing ``host`` value is overwritten.

    This should be done after computing the hash for a record to not trigger
    revisions for all INSDC data.

    Args:
        record: Metadata record to update. It is modified in place.

    Returns:
        The same ``record`` object, with the legacy host fields removed and
        ``host`` set.
    """
    # Fall back to "" explicitly: without it a record lacking both source
    # fields would get host=None instead of the documented empty string.
    host = record.get("hostTaxonId") or record.get("hostNameScientific") or ""

    for legacy_field in ("hostTaxonId", "hostNameScientific", "hostNameCommon"):
        record.pop(legacy_field, None)

    record["host"] = host
    return record
56+
57+
3958
@click.command()
4059
@click.option("--config-file", required=True, type=click.Path(exists=True))
4160
@click.option("--input", required=True, type=click.Path(exists=True))
@@ -151,6 +170,11 @@ def main(
151170

152171
record["hash"] = hashlib.md5(prehash.encode(), usedforsecurity=False).hexdigest()
153172

173+
# for segmented organisms, this has to happen in `heuristic_group_segments.py`
174+
# and `override_group_segments.py`
175+
if not config.segmented:
176+
record = resolve_host_information(record)
177+
154178
orjsonl.append(output, {"id": record[fasta_id_field], "metadata": record})
155179

156180
logger.info(f"Saved metadata for {len(metadata)} sequences")
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S insdcRawReadsAccession_L insdcRawReadsAccession_M insdcRawReadsAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S id hash accession fastaIds
2-
Public Health England, Research Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B. Sairam district Kazakhstan Hyalomma anatolicum 176092 2016-04-30T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 2015 tick pool #134 2016-04-30T00:00:00Z 1 4f8a46f4b233b3d9f05d5336b8b711aa KX096703 KX096703.1 KX096703.1.S c7d098a014a36e8b8f87c73621b7d6fc LOC_0000VXA KX096703.1.S_S
1+
authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry host isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S insdcRawReadsAccession_L insdcRawReadsAccession_M insdcRawReadsAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S id hash accession fastaIds
2+
"Public Health England, Research" "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B." Sairam district Kazakhstan 176092 2016-04-30T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 2015 tick pool #134 2016-04-30T00:00:00Z 1 4f8a46f4b233b3d9f05d5336b8b711aa KX096703 KX096703.1 KX096703.1.S c7d098a014a36e8b8f87c73621b7d6fc LOC_0000VXA KX096703.1.S_S
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S insdcRawReadsAccession_L insdcRawReadsAccession_M insdcRawReadsAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S id hash fastaIds
2-
Chumakov Institute of Poliomyelitis and Viral Encephalitides Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P. Uganda Homo sapiens 9606 2016-12-07T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 1958 Nakiwogo 2016-12-07T00:00:00Z 2016-12-07T00:00:00Z 1 1 7b10a4e21daa8a2e693958761be17d53 70954bc35782b5592858ac3f1a6bbf89 KX013483 KX013485 KX013483.1 KX013485.1 KX013483.1.L/KX013485.1.S bb3a6d8df47cb2891e7b60030a40c335 KX013483.1.L/KX013485.1.S_L KX013483.1.L/KX013485.1.S_S
1+
authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry host isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S insdcRawReadsAccession_L insdcRawReadsAccession_M insdcRawReadsAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S id hash fastaIds
2+
Chumakov Institute of Poliomyelitis and Viral Encephalitides "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P." Uganda 9606 2016-12-07T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 1958 Nakiwogo 2016-12-07T00:00:00Z 2016-12-07T00:00:00Z 1 1 7b10a4e21daa8a2e693958761be17d53 70954bc35782b5592858ac3f1a6bbf89 KX013483 KX013485 KX013483.1 KX013485.1 KX013483.1.L/KX013485.1.S bb3a6d8df47cb2891e7b60030a40c335 KX013483.1.L/KX013485.1.S_L KX013483.1.L/KX013485.1.S_S

0 commit comments

Comments
 (0)