Skip to content

Commit 9124f70

Browse files
committed
gene module namer
1 parent 0649e1e commit 9124f70

2 files changed

Lines changed: 258 additions & 3 deletions

File tree

src/pySingleCellNet/tools/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from .gene import (
3535
build_gene_knn,
3636
find_gene_modules,
37+
name_gene_modules_by_annotation,
3738
whoare_genes_neighbors,
3839
what_module_has_gene,
3940
score_gene_sets,
@@ -56,9 +57,10 @@
5657
"gsea_on_deg",
5758
"collect_gsea_results_from_dict",
5859
"convert_diffExp_to_dict",
59-
"deg",
60-
"build_gene_knn",
60+
"deg",
61+
"build_gene_knn",
6162
"find_gene_modules",
63+
"name_gene_modules_by_annotation",
6264
"whoare_genes_neighbors",
6365
"what_module_has_gene",
6466
"score_gene_sets",

src/pySingleCellNet/tools/gene.py

Lines changed: 254 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from __future__ import annotations
22
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, Callable
3+
from collections import defaultdict
4+
from functools import lru_cache
5+
from pathlib import Path
36
import numpy as np
47
import pandas as pd
58
import scanpy as sc
@@ -278,6 +281,257 @@ def find_gene_modules(
278281
return modules
279282

280283

284+
def _species_to_taxon_id(species: str) -> str:
285+
"""Normalize species name to NCBI taxon ID string used by STRING files."""
286+
species_norm = str(species).strip().lower().replace(" ", "_")
287+
lut = {
288+
"mouse": "10090",
289+
"mus_musculus": "10090",
290+
"m_musculus": "10090",
291+
"human": "9606",
292+
"homo_sapiens": "9606",
293+
}
294+
if species_norm in lut:
295+
return lut[species_norm]
296+
if species_norm.isdigit():
297+
return species_norm
298+
raise ValueError(f"Unsupported species '{species}'. Provide an NCBI taxon ID or one of {sorted(lut)}.")
299+
300+
301+
def _default_string_data_dir() -> Path:
302+
"""Resolve the default STRING data directory packaged with the repository."""
303+
return Path(__file__).resolve().parents[3] / "data" / "STRING"
304+
305+
306+
def _clean_term_label(term: str, max_len: int = 70) -> str:
307+
"""Tidy annotation text and keep it concise."""
308+
label = " ".join(str(term).split()).rstrip(".;")
309+
if len(label) > max_len:
310+
label = label[: max_len - 3].rstrip() + "..."
311+
return label
312+
313+
314+
def _score_terms_for_module(
315+
module_genes: Sequence[str],
316+
gene_to_terms: Mapping[str, Sequence[Tuple[str, Optional[int]]]],
317+
min_genes_for_term: int,
318+
max_cluster_size: Optional[int],
319+
banned_terms: Mapping[str, None],
320+
) -> List[Tuple[str, float, int, Optional[int]]]:
321+
"""
322+
Score annotation terms by coverage and specificity for a given module.
323+
324+
Returns a list of tuples (term, score, support, cluster_size) sorted by descending score.
325+
"""
326+
module_genes_unique = list(dict.fromkeys(str(g) for g in module_genes))
327+
module_size = len(module_genes_unique)
328+
if module_size == 0:
329+
return []
330+
331+
term_to_genes = defaultdict(set)
332+
term_to_cluster_size: Dict[str, Optional[int]] = {}
333+
334+
for g in module_genes_unique:
335+
for term, csize in gene_to_terms.get(g, []):
336+
term_to_genes[term].add(g)
337+
if csize is not None:
338+
prev = term_to_cluster_size.get(term)
339+
if prev is None or csize < prev:
340+
term_to_cluster_size[term] = int(csize)
341+
342+
scored: List[Tuple[str, float, int, Optional[int]]] = []
343+
for term, genes in term_to_genes.items():
344+
term_l = term.lower()
345+
if term_l in banned_terms:
346+
continue
347+
support = len(genes)
348+
if support < min_genes_for_term:
349+
continue
350+
cluster_size = term_to_cluster_size.get(term)
351+
if max_cluster_size is not None and cluster_size is not None and cluster_size > max_cluster_size:
352+
continue
353+
specificity = 1.0
354+
if cluster_size is not None and cluster_size > 0:
355+
specificity = 1.0 / float(np.log1p(cluster_size))
356+
coverage = support / float(module_size)
357+
score = coverage * specificity
358+
scored.append((term, score, support, cluster_size))
359+
360+
scored.sort(key=lambda x: (-x[1], -x[2], x[3] if x[3] is not None else float("inf"), x[0]))
361+
return scored
362+
363+
364+
@lru_cache(maxsize=8)
365+
def _load_string_gene_annotations(species: str, data_dir: Optional[str]) -> Dict[str, List[Tuple[str, Optional[int]]]]:
366+
"""
367+
Load STRING cluster descriptions and build a gene -> [(term, cluster_size)] mapping.
368+
"""
369+
taxon_id = _species_to_taxon_id(species)
370+
base_dir = Path(data_dir) if data_dir is not None else _default_string_data_dir()
371+
cluster_info_path = base_dir / f"{taxon_id}.clusters.info.v12.0.txt"
372+
cluster_proteins_path = base_dir / f"{taxon_id}.clusters.proteins.v12.0.txt"
373+
protein_info_path = base_dir / f"{taxon_id}.protein.info.v12.0.txt"
374+
375+
for pth in (cluster_info_path, cluster_proteins_path, protein_info_path):
376+
if not pth.exists():
377+
raise FileNotFoundError(
378+
f"Expected STRING file not found: {pth}. Provide a valid `string_data_dir` pointing to STRING exports."
379+
)
380+
381+
cluster_info = pd.read_csv(cluster_info_path, sep="\t")
382+
cluster_info = cluster_info[["cluster_id", "cluster_size", "best_described_by"]]
383+
384+
cluster_members = pd.read_csv(cluster_proteins_path, sep="\t")
385+
protein_info = pd.read_csv(protein_info_path, sep="\t")[["#string_protein_id", "preferred_name"]]
386+
387+
annotated = (
388+
cluster_members.merge(protein_info, left_on="protein_id", right_on="#string_protein_id", how="left")
389+
.merge(cluster_info, on="cluster_id", how="left")
390+
)
391+
392+
gene_to_terms: Dict[str, List[Tuple[str, Optional[int]]]] = defaultdict(list)
393+
for row in annotated.itertuples():
394+
gene = getattr(row, "preferred_name", None)
395+
term = getattr(row, "best_described_by", None)
396+
cluster_size = getattr(row, "cluster_size", None)
397+
if pd.isna(gene) or pd.isna(term):
398+
continue
399+
term_clean = str(term).strip()
400+
if not term_clean:
401+
continue
402+
cluster_size_int = None
403+
if pd.notna(cluster_size):
404+
try:
405+
cluster_size_int = int(cluster_size)
406+
except (TypeError, ValueError):
407+
cluster_size_int = None
408+
gene_to_terms[str(gene)].append((term_clean, cluster_size_int))
409+
410+
deduped: Dict[str, List[Tuple[str, Optional[int]]]] = {}
411+
for gene, entries in gene_to_terms.items():
412+
seen = set()
413+
unique_entries: List[Tuple[str, Optional[int]]] = []
414+
for term, size in entries:
415+
key = (term, size)
416+
if key in seen:
417+
continue
418+
seen.add(key)
419+
unique_entries.append((term, size))
420+
deduped[gene] = unique_entries
421+
422+
return deduped
423+
424+
425+
def name_gene_modules_by_annotation(
426+
modules: Mapping[str, Sequence[str]],
427+
annotation_source: str,
428+
*,
429+
species: str = "mouse",
430+
string_data_dir: Optional[Union[str, Path]] = None,
431+
min_genes_for_term: int = 2,
432+
top_n_terms: int = 1,
433+
include_source_in_name: bool = True,
434+
fallback_to_original: bool = True,
435+
max_cluster_size: Optional[int] = 5000,
436+
banned_terms: Optional[Sequence[str]] = None,
437+
) -> Dict[str, List[str]]:
438+
"""
439+
Rename gene modules using shared functional annotations.
440+
441+
Parameters
442+
----------
443+
modules
444+
Mapping from module name to a sequence of gene symbols.
445+
annotation_source
446+
One of {"string", "go", "reactome", "interpro"}.
447+
Currently, only "string" is supported with local STRING export files.
448+
species
449+
Species name or NCBI taxon ID. Defaults to "mouse" (10090).
450+
string_data_dir
451+
Directory containing STRING export files (e.g., data/STRING). If None, uses the
452+
repository's default data/STRING location.
453+
min_genes_for_term
454+
Minimum number of module genes that must share a term to use it for naming.
455+
top_n_terms
456+
How many top terms to combine into the module label (joined with " | ").
457+
include_source_in_name
458+
If True, append the source tag (e.g., "(STRING)") to the label.
459+
fallback_to_original
460+
If True and no annotation is found, keep the original module name; otherwise use
461+
a generic "module_{i}" placeholder.
462+
max_cluster_size
463+
Optional upper bound on STRING cluster size to consider (filters very broad/generic
464+
terms such as "Cellular_component"). Set to None to disable.
465+
banned_terms
466+
Optional list of term strings (case-insensitive exact match) to exclude from naming.
467+
468+
Returns
469+
-------
470+
Dict[str, List[str]]
471+
New mapping where keys are descriptive module names and values are the original gene lists.
472+
473+
Notes
474+
-----
475+
• The scoring favors terms that cover more module genes and come from smaller (more specific)
476+
STRING clusters via a coverage × specificity heuristic.
477+
• Extendability: hooks for GO/Reactome/InterPro can reuse the scoring helper by supplying
478+
a gene -> [(term, specificity_size)] mapping.
479+
480+
Example
481+
-------
482+
>>> modules = {"gmod_0": ["Med14", "Med15", "Med16"]}
483+
>>> name_gene_modules_by_annotation(modules, "string", species="mouse", string_data_dir="data/STRING")
484+
{'Core mediator complex (STRING)': ['Med14', 'Med15', 'Med16']}
485+
"""
486+
if not modules:
487+
raise ValueError("`modules` is empty.")
488+
if min_genes_for_term < 1:
489+
raise ValueError("`min_genes_for_term` must be >= 1.")
490+
if top_n_terms < 1:
491+
raise ValueError("`top_n_terms` must be >= 1.")
492+
493+
source_norm = annotation_source.strip().lower()
494+
if source_norm == "string":
495+
gene_to_terms = _load_string_gene_annotations(species, None if string_data_dir is None else str(string_data_dir))
496+
source_tag = "STRING"
497+
elif source_norm in {"go", "reactome", "interpro"}:
498+
raise NotImplementedError(f"Annotation source '{annotation_source}' is not yet implemented in this build.")
499+
else:
500+
raise ValueError(f"Unknown annotation_source '{annotation_source}'.")
501+
502+
banned = {t.lower() for t in (banned_terms or ["cellular_component", "biological_process", "molecular_function"])}
503+
504+
renamed: Dict[str, List[str]] = {}
505+
used_names = set()
506+
for idx, (orig_name, gene_list) in enumerate(modules.items()):
507+
genes_unique = list(dict.fromkeys(str(g) for g in gene_list))
508+
scored_terms = _score_terms_for_module(
509+
genes_unique,
510+
gene_to_terms,
511+
min_genes_for_term,
512+
max_cluster_size,
513+
banned,
514+
)
515+
516+
if not scored_terms:
517+
base_name = orig_name if fallback_to_original else f"module_{idx+1}"
518+
else:
519+
selected = [_clean_term_label(term) for term, _, _, _ in scored_terms[:top_n_terms]]
520+
base_name = " | ".join(selected)
521+
if include_source_in_name:
522+
base_name = f"{base_name} ({source_tag})"
523+
524+
candidate = base_name
525+
suffix = 2
526+
while candidate in used_names:
527+
candidate = f"{base_name} #{suffix}"
528+
suffix += 1
529+
used_names.add(candidate)
530+
renamed[candidate] = genes_unique
531+
532+
return renamed
533+
534+
281535
def whoare_genes_neighbors(
282536
adata,
283537
gene: str,
@@ -1036,4 +1290,3 @@ def correlate_module_scores_with_pcs(
10361290
result = result.sort_values("abs_correlation", ascending=False).reset_index(drop=True)
10371291

10381292
return result
1039-

0 commit comments

Comments
 (0)