Skip to content

Commit 763998f

Browse files
feat(prepro): speed up Nextclade metadata lookup (#6453)
## Summary

This draft PR replaces `dpath.get()` in the Nextclade preprocessing metadata path with a simple direct lookup for the dot-separated paths used by Loculus metadata mappings. It also removes the now-unused `dpath` Conda dependency and adds a focused unit test for the lookup behavior.

## Why

While investigating west-nile preprocessing, the slow phase after `Nextclade results available` was dominated by per-entry metadata processing rather than upload or taxonomy calls. The hot path was `process_single -> get_output_metadata -> add_input_metadata -> add_nextclade_metadata -> dpath.get`. The Nextclade result objects can be large, and Loculus only needs simple nested dictionary paths here after the existing wildcard truncation step. Direct traversal avoids the heavy general-purpose `dpath` lookup overhead.

Full notes: https://gist.github.com/theosanderson-agent/657269613739be0d318f64a08d37bfa9

## Local timing

For a synthetic 100-entry west-nile batch using saved unprocessed data:

- Before: metadata/process phase around `26.348s`
- After this branch: `process_single_total=1.183s`, mean `0.012s`, median `0.002s`, max `0.145s`
- Same run had `nextclade_enrich=1.884s`

## Validation

- `ruff format --diff .`
- `ruff check --diff .`
- `PYTHONPATH=src /usr/bin/python3.12 -m pytest tests/test_nextclade_preprocessing.py::test_get_nested_metadata_uses_simple_dot_paths`

🚀 Preview: https://codex-fast-nextclade-meta.loculus.org
1 parent 01ec896 commit 763998f

3 files changed

Lines changed: 36 additions & 8 deletions

File tree

preprocessing/nextclade/environment.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ channels:
66
dependencies:
77
- python=3.14.4
88
- biopython=1.87
9-
- dpath=2.2.0
109
- nextclade=3.21.2
1110
- pip=25.2
1211
- uv=0.11.8

preprocessing/nextclade/src/loculus_preprocessing/prepro.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
from tempfile import TemporaryDirectory
66
from typing import Any
77

8-
import dpath
9-
108
from .backend import (
119
download_diamond_db,
1210
download_minimizer,
@@ -134,6 +132,17 @@ def truncate_after_wildcard(path: str, separator: str = ".") -> str:
134132
return path
135133

136134

135+
def get_nested_metadata(metadata: dict[str, Any], path: str, separator: str = ".") -> Any | None:
136+
value: Any = metadata
137+
for part in path.split(separator):
138+
if not isinstance(value, dict):
139+
return None
140+
value = value.get(part)
141+
if value is None:
142+
return None
143+
return value
144+
145+
137146
def add_nextclade_metadata(
138147
spec: ProcessingSpec,
139148
unprocessed: UnprocessedAfterNextclade,
@@ -169,12 +178,10 @@ def add_nextclade_metadata(
169178
):
170179
return InputData(datum=None)
171180

172-
raw: str | None = dpath.get(
181+
raw: Any | None = get_nested_metadata(
173182
unprocessed.nextcladeMetadata[sequence_name],
174183
truncate_after_wildcard(nextclade_path),
175-
separator=".",
176-
default=None,
177-
) # type: ignore[assignment]
184+
)
178185

179186
match nextclade_path:
180187
case "frameShifts":

preprocessing/nextclade/tests/test_nextclade_preprocessing.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
UnprocessedEntry,
3333
)
3434
from loculus_preprocessing.embl import create_flatfile, reformat_authors_from_loculus_to_embl_style
35-
from loculus_preprocessing.prepro import process_all
35+
from loculus_preprocessing.prepro import get_nested_metadata, process_all
3636
from loculus_preprocessing.processing_functions import (
3737
format_frameshift,
3838
format_stop_codon,
@@ -59,6 +59,28 @@
5959
LABELED_PRIVATE_MUTATIONS = "tests/labeledPrivateMutations.json"
6060

6161

62+
def test_get_nested_metadata_uses_simple_dot_paths():
63+
metadata = {
64+
"coverage": 0.98,
65+
"qc": {"stopCodons": {"totalStopCodons": 0, "stopCodons": []}},
66+
"cladeFounderInfo": {
67+
"aaMutations": [{"privateSubstitutions": ["NS1:Y35H"]}],
68+
},
69+
}
70+
71+
assert get_nested_metadata(metadata, "coverage") == 0.98
72+
assert get_nested_metadata(metadata, "qc.stopCodons.totalStopCodons") == 0
73+
assert get_nested_metadata(metadata, "qc.stopCodons.stopCodons") == []
74+
assert get_nested_metadata(metadata, "cladeFounderInfo.aaMutations") == [
75+
{"privateSubstitutions": ["NS1:Y35H"]},
76+
]
77+
assert get_nested_metadata(metadata, "qc.missing.total") is None
78+
assert get_nested_metadata(metadata, "coverage.value") is None
79+
80+
metadata_with_zero = {"qc": {"score": 0}}
81+
assert get_nested_metadata(metadata_with_zero, "qc.score") == 0
82+
83+
6284
def consensus_sequence(
6385
type: Literal["single"]
6486
| Literal["ebola-sudan"]

0 commit comments

Comments
 (0)