Skip to content

Commit fea7cbb

Browse files
authored
feat: add refget integration for unknown contig lookup (#9)
Query EBI's refget server to retrieve aliases and metadata for contigs that don't match any known reference genome. This helps users identify unknown contigs (e.g., discovering that an unmatched contig corresponds to GL000220.1 via its INSDC alias).

Changes:
- Add src/refget/ module with async enrichment logic using the refget-client crate, semaphore-limited concurrent lookups, and graceful per-contig error handling
- Replace the hand-rolled sha512t24u computation with the refget-digest crate, removing the sha2 and base64 dependencies
- Add --refget-server flag to the CLI identify subcommand
- Add --refget-server and --no-refget flags to the serve subcommand (refget is enabled by default)
- Enrich unmatched contigs in the web server's detailed comparison response, scoped to the current page for efficiency
- Display refget aliases, sha512t24u, and circularity in the web UI contig detail modal, with a "refget" badge on list items
1 parent a172acb commit fea7cbb

11 files changed

Lines changed: 1279 additions & 37 deletions

File tree

Cargo.lock

Lines changed: 680 additions & 14 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,15 @@ tokio = { version = "1.43", features = ["full"] }
3030
tower = { version = "0.5", features = ["limit"] }
3131
tower-http = { version = "0.6", features = ["fs", "limit", "timeout", "set-header"] }
3232

33+
# Refget
34+
refget-client = { version = "0.1.0", default-features = false }
35+
refget-digest = "0.1.0"
36+
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
37+
3338
# Utilities
3439
anyhow = "1.0"
3540
thiserror = "2.0"
3641
md5 = "0.7"
37-
sha2 = "0.10"
38-
base64 = "0.22"
3942
chrono = { version = "0.4", features = ["serde"] }
4043
open = "5.0"
4144
flate2 = "1.0"

src/cli/identify.rs

Lines changed: 142 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use crate::matching::engine::{MatchResult, MatchingConfig, MatchingEngine, Scori
1111
use crate::matching::hierarchical_engine::{HierarchicalMatchResult, HierarchicalMatchingEngine};
1212
use crate::matching::Suggestion;
1313
use crate::parsing;
14+
use crate::refget::{EnrichedContig, RefgetConfig, RefgetLookupResult};
1415

1516
/// How to handle references that have contigs missing from their FASTA
1617
/// (e.g., CHM13 where MT is in assembly report but uses standard rCRS mitochondria)
@@ -72,6 +73,12 @@ pub struct IdentifyArgs {
7273
/// Whether contigs appear in the same order
7374
#[arg(long, default_value = "10", value_parser = clap::value_parser!(u32).range(0..=100))]
7475
pub weight_order: u32,
76+
77+
/// Refget server URL for looking up unknown contigs.
78+
/// When set, unmatched contigs with MD5 digests are queried against this
79+
/// server to retrieve aliases and other metadata.
80+
#[arg(long)]
81+
pub refget_server: Option<String>,
7582
}
7683

7784
#[derive(Clone, Copy, Debug, clap::ValueEnum)]
@@ -166,6 +173,35 @@ fn run_flat(
166173
return Ok(());
167174
}
168175

176+
// Optionally enrich unmatched contigs via refget
177+
let enriched = if let Some(ref server_url) = args.refget_server {
178+
let config = RefgetConfig::new(server_url);
179+
// Collect all query_only contigs from the top match
180+
let unmatched_contigs: Vec<_> = matches
181+
.first()
182+
.map(|m| m.diagnosis.query_only.clone())
183+
.unwrap_or_default();
184+
185+
if unmatched_contigs.is_empty() {
186+
None
187+
} else {
188+
if verbose {
189+
eprintln!(
190+
"Querying refget server for {} unmatched contigs...",
191+
unmatched_contigs.len()
192+
);
193+
}
194+
let rt = tokio::runtime::Runtime::new()?;
195+
let results = rt.block_on(crate::refget::enrichment::enrich_contigs(
196+
&unmatched_contigs,
197+
&config,
198+
));
199+
Some(results)
200+
}
201+
} else {
202+
None
203+
};
204+
169205
// Output results
170206
match format {
171207
OutputFormat::Text => {
@@ -176,11 +212,24 @@ fn run_flat(
176212
args.missing_contig_handling,
177213
&scoring_weights,
178214
);
215+
if let Some(ref enriched) = enriched {
216+
print_refget_text_results(enriched);
217+
}
179218
}
180219
OutputFormat::Json => {
181-
print_json_results(&matches, args.missing_contig_handling, &scoring_weights)?;
220+
print_json_results(
221+
&matches,
222+
args.missing_contig_handling,
223+
&scoring_weights,
224+
enriched.as_deref(),
225+
)?;
226+
}
227+
OutputFormat::Tsv => {
228+
print_tsv_results(&matches, &scoring_weights);
229+
if let Some(ref enriched) = enriched {
230+
print_refget_tsv_results(enriched);
231+
}
182232
}
183-
OutputFormat::Tsv => print_tsv_results(&matches, &scoring_weights),
184233
}
185234

186235
Ok(())
@@ -481,10 +530,11 @@ fn print_json_results(
481530
matches: &[MatchResult],
482531
missing_handling: MissingContigHandling,
483532
weights: &ScoringWeights,
533+
enriched: Option<&[EnrichedContig]>,
484534
) -> anyhow::Result<()> {
485535
let norm = weights.normalized();
486536
// Create serializable output
487-
let output: Vec<serde_json::Value> = matches
537+
let results: Vec<serde_json::Value> = matches
488538
.iter()
489539
.map(|m| {
490540
// Calculate reference coverage
@@ -542,6 +592,12 @@ fn print_json_results(
542592
})
543593
.collect();
544594

595+
let mut output = serde_json::json!({ "matches": results });
596+
597+
if let Some(enriched) = enriched {
598+
output["refget_enrichment"] = serde_json::json!(enriched);
599+
}
600+
545601
println!("{}", serde_json::to_string_pretty(&output)?);
546602
Ok(())
547603
}
@@ -584,6 +640,89 @@ fn print_tsv_results(matches: &[MatchResult], weights: &ScoringWeights) {
584640
}
585641
}
586642

643+
// ============================================================================
644+
// Refget enrichment output functions
645+
// ============================================================================
646+
647+
fn print_refget_text_results(enriched: &[EnrichedContig]) {
648+
let found: Vec<_> = enriched
649+
.iter()
650+
.filter(|e| matches!(e.refget_metadata, RefgetLookupResult::Found { .. }))
651+
.collect();
652+
653+
if found.is_empty() {
654+
println!("Refget: no unmatched contigs found in refget server.");
655+
return;
656+
}
657+
658+
println!("\nRefget Aliases for Unmatched Contigs:");
659+
println!("{}", "─".repeat(60));
660+
for entry in &found {
661+
if let RefgetLookupResult::Found {
662+
aliases,
663+
sha512t24u,
664+
circular,
665+
} = &entry.refget_metadata
666+
{
667+
print!(" {} ", entry.name);
668+
if *circular {
669+
print!("(circular) ");
670+
}
671+
println!("[sha512t24u: {sha512t24u}]");
672+
if aliases.is_empty() {
673+
println!(" (no aliases)");
674+
} else {
675+
for alias in aliases {
676+
println!(" {}: {}", alias.naming_authority, alias.value);
677+
}
678+
}
679+
}
680+
}
681+
println!();
682+
}
683+
684+
fn print_refget_tsv_results(enriched: &[EnrichedContig]) {
685+
println!("\n# Refget enrichment for unmatched contigs");
686+
println!("contig\tmd5\tstatus\tsha512t24u\tcircular\taliases");
687+
for entry in enriched {
688+
match &entry.refget_metadata {
689+
RefgetLookupResult::Found {
690+
aliases,
691+
sha512t24u,
692+
circular,
693+
} => {
694+
let alias_str: Vec<String> = aliases
695+
.iter()
696+
.map(|a| format!("{}={}", a.naming_authority, a.value))
697+
.collect();
698+
println!(
699+
"{}\t{}\tfound\t{}\t{}\t{}",
700+
entry.name,
701+
entry.md5.as_deref().unwrap_or(""),
702+
sha512t24u,
703+
circular,
704+
alias_str.join(";"),
705+
);
706+
}
707+
RefgetLookupResult::NotFound => {
708+
println!(
709+
"{}\t{}\tnot_found\t\t\t",
710+
entry.name,
711+
entry.md5.as_deref().unwrap_or(""),
712+
);
713+
}
714+
RefgetLookupResult::Error { message } => {
715+
println!(
716+
"{}\t{}\terror\t\t\t{}",
717+
entry.name,
718+
entry.md5.as_deref().unwrap_or(""),
719+
message,
720+
);
721+
}
722+
}
723+
}
724+
}
725+
587726
// ============================================================================
588727
// Hierarchical catalog output functions
589728
// ============================================================================

src/cli/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,15 @@ pub struct ServeArgs {
8888
/// Open browser automatically
8989
#[arg(long)]
9090
pub open: bool,
91+
92+
/// Refget server URL for looking up unknown contigs.
93+
/// Defaults to EBI's ENA CRAM endpoint. Use --no-refget to disable.
94+
#[arg(long, default_value = crate::refget::DEFAULT_REFGET_SERVER)]
95+
pub refget_server: String,
96+
97+
/// Disable refget lookups for unmatched contigs
98+
#[arg(long)]
99+
pub no_refget: bool,
91100
}
92101

93102
#[derive(Clone, Copy, Debug, clap::ValueEnum)]

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
//! - [`core`]: Core data types for contigs, references, and headers
4848
//! - [`matching`]: Matching engine and scoring algorithms
4949
//! - [`parsing`]: Parsers for SAM/BAM/CRAM, dict, and TSV files
50+
//! - [`refget`]: Refget server integration for unknown contig lookup
5051
//! - [`cli`]: Command-line interface implementation
5152
//! - [`web`]: Web server for browser-based identification
5253
@@ -55,6 +56,7 @@ pub mod cli;
5556
pub mod core;
5657
pub mod matching;
5758
pub mod parsing;
59+
pub mod refget;
5860
pub mod utils;
5961
pub mod web;
6062

src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod cli;
66
mod core;
77
mod matching;
88
mod parsing;
9+
mod refget;
910
mod utils;
1011
mod web;
1112

0 commit comments

Comments
 (0)