Skip to content

Commit 0dbe759

Browse files
committed
Find gene modules with PPI infor from STRING
1 parent 9124f70 commit 0dbe759

2 files changed

Lines changed: 197 additions & 0 deletions

File tree

src/pySingleCellNet/tools/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
name_gene_modules_by_annotation,
3838
whoare_genes_neighbors,
3939
what_module_has_gene,
40+
find_string_physical_modules,
4041
score_gene_sets,
4142
correlate_module_scores_with_pcs
4243
)
@@ -63,6 +64,7 @@
6364
"name_gene_modules_by_annotation",
6465
"whoare_genes_neighbors",
6566
"what_module_has_gene",
67+
"find_string_physical_modules",
6668
"score_gene_sets",
6769
"correlate_module_scores_with_pcs"
6870
]

src/pySingleCellNet/tools/gene.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,201 @@ def name_gene_modules_by_annotation(
532532
return renamed
533533

534534

535+
# ---------------------------------------------------------------------------
536+
# STRING physical-interaction modules
537+
# ---------------------------------------------------------------------------
538+
539+
def _string_link_candidates(base_dir: Path, taxon_id: str):
540+
"""Yield candidate STRING link files in preference order."""
541+
names = [
542+
"protein.physical.links.full",
543+
"protein.links.detailed",
544+
"protein.links.full",
545+
"protein.links",
546+
]
547+
for stem in names:
548+
p = base_dir / f"{taxon_id}.{stem}.v12.0.txt"
549+
if p.exists():
550+
yield p
551+
p_gz = p.with_suffix(p.suffix + ".gz")
552+
if p_gz.exists():
553+
yield p_gz
554+
555+
556+
def _load_string_gene_id_map(species: str, data_dir: Optional[str]) -> pd.DataFrame:
557+
"""Load STRING protein info and return dataframe with gene->protein mapping."""
558+
taxon_id = _species_to_taxon_id(species)
559+
base_dir = Path(data_dir) if data_dir is not None else _default_string_data_dir()
560+
protein_info_path = base_dir / f"{taxon_id}.protein.info.v12.0.txt"
561+
if not protein_info_path.exists():
562+
raise FileNotFoundError(
563+
f"Expected STRING protein info file not found: {protein_info_path}. "
564+
"Provide `string_data_dir` pointing to STRING exports."
565+
)
566+
df = pd.read_csv(protein_info_path, sep="\t", usecols=["#string_protein_id", "preferred_name"])
567+
df = df.dropna(subset=["#string_protein_id", "preferred_name"])
568+
return df
569+
570+
571+
def find_string_physical_modules(
572+
genes: Sequence[str],
573+
*,
574+
species: str = "mouse",
575+
string_data_dir: Optional[Union[str, Path]] = None,
576+
min_confidence: int = 700,
577+
evidence_channels: Optional[Sequence[str]] = None,
578+
clustering: str = "leiden",
579+
resolution: float = 1.0,
580+
return_graph: bool = False,
581+
) -> Union[Dict[str, List[str]], Tuple[Dict[str, List[str]], ig.Graph]]:
582+
"""Cluster a user-supplied gene list into STRING physical-interaction modules.
583+
584+
Parameters
585+
----------
586+
genes
587+
Iterable of gene symbols (STRING preferred names).
588+
species
589+
Species name or NCBI taxon ID (default "mouse" -> 10090).
590+
string_data_dir
591+
Directory containing STRING export files. If None, uses packaged ``data/STRING``.
592+
min_confidence
593+
Minimum STRING combined score (0–1000) to keep an interaction. Default 700 (high confidence).
594+
evidence_channels
595+
Iterable of STRING evidence column names that must be >0 for an edge to be kept
596+
(e.g., ["experimental", "database"]). If None/empty, no channel requirement.
597+
clustering
598+
"leiden" (via igraph) or "components" (connected components only).
599+
resolution
600+
Leiden resolution parameter (ignored if clustering="components").
601+
return_graph
602+
If True, also return the igraph graph used for clustering.
603+
604+
Returns
605+
-------
606+
modules : Dict[str, List[str]]
607+
Mapping module name -> ordered list of gene symbols.
608+
graph (optional)
609+
igraph.Graph used for clustering.
610+
611+
Notes
612+
-----
613+
• Uses only local STRING flat files; no API calls required.
614+
• Falls back to connected components if Leiden is unavailable or the graph is tiny.
615+
"""
616+
617+
gene_list = [str(g) for g in genes]
618+
if len(gene_list) == 0:
619+
raise ValueError("No genes provided.")
620+
621+
taxon_id = _species_to_taxon_id(species)
622+
base_dir = Path(string_data_dir) if string_data_dir is not None else _default_string_data_dir()
623+
channels_req = [c.strip().lstrip("#") for c in (evidence_channels or []) if str(c).strip()]
624+
625+
# 1) Map genes -> STRING protein IDs
626+
prot_df = _load_string_gene_id_map(species, string_data_dir)
627+
lookup = prot_df.set_index("preferred_name")["#string_protein_id"]
628+
proteins = {g: lookup.get(g) for g in gene_list}
629+
proteins = {g: p for g, p in proteins.items() if p is not None}
630+
if len(proteins) == 0:
631+
raise ValueError("None of the supplied genes were found in STRING protein.info.")
632+
633+
# 2) Load the best available STRING links file (normalize headers, allow fallbacks)
634+
links = None
635+
links_path = None
636+
read_errors = []
637+
required_cols = {"protein1", "protein2", "combined_score"}
638+
639+
for cand in _string_link_candidates(base_dir, taxon_id):
640+
try:
641+
links = pd.read_csv(cand, sep="\s+", engine="python")
642+
# strip leading '#' from headers (STRING sometimes uses #protein1)
643+
links = links.rename(columns={c: c.lstrip("#") for c in links.columns})
644+
645+
# If combined_score is absent but an experimental/score column exists, reuse it
646+
cols = set(links.columns)
647+
if "combined_score" not in cols:
648+
for alt in ["experimental", "experimental_score", "score"]:
649+
if alt in cols:
650+
links = links.rename(columns={alt: "combined_score"})
651+
cols.add("combined_score")
652+
break
653+
654+
# Skip this candidate if channel requirements cannot be satisfied
655+
if channels_req and not set(channels_req).issubset(cols):
656+
read_errors.append((cand, f"missing channels {set(channels_req) - cols}"))
657+
links = None
658+
continue
659+
660+
if required_cols.issubset(set(links.columns)):
661+
links_path = cand
662+
break
663+
else:
664+
read_errors.append((cand, f"missing {required_cols - set(links.columns)}"))
665+
links = None
666+
except Exception as e:
667+
read_errors.append((cand, str(e)))
668+
links = None
669+
670+
if links is None:
671+
msg = "; ".join(f"{p}: {err}" for p, err in read_errors) or "no candidate files found"
672+
raise FileNotFoundError(
673+
f"Could not load a STRING links file for taxon {taxon_id}. Attempts: {msg}"
674+
)
675+
676+
# 3) Filter to interactions within the user genes and above confidence (and channels if set)
677+
protein_ids = set(proteins.values())
678+
mask = (
679+
links["protein1"].isin(protein_ids)
680+
& links["protein2"].isin(protein_ids)
681+
& (links["combined_score"] >= int(min_confidence))
682+
)
683+
if channels_req:
684+
for ch in channels_req:
685+
mask &= links[ch] > 0
686+
sub = links[mask]
687+
688+
if sub.empty:
689+
return ({}, None) if return_graph else {}
690+
691+
# 4) Build igraph
692+
# Map protein IDs to gene symbols for readable vertices
693+
prot_to_gene = {v: k for k, v in proteins.items()}
694+
edges = [
695+
(prot_to_gene[p1], prot_to_gene[p2], float(s))
696+
for p1, p2, s in sub[["protein1", "protein2", "combined_score"]].itertuples(index=False)
697+
]
698+
699+
g = ig.Graph()
700+
g.add_vertices(sorted(set(prot_to_gene.values())))
701+
g.add_edges([(a, b) for a, b, _ in edges])
702+
g.es["weight"] = [w for _, _, w in edges]
703+
704+
modules: Dict[str, List[str]] = {}
705+
706+
if clustering.lower() == "leiden" and g.ecount() > 0 and g.vcount() > 1:
707+
try:
708+
leiden = g.community_leiden(weights=g.es["weight"], resolution_parameter=float(resolution))
709+
groups = leiden.as_cover().partition
710+
except Exception:
711+
groups = g.components().membership
712+
else:
713+
groups = g.components().membership
714+
715+
# Collect modules
716+
group_to_genes: Dict[int, List[str]] = defaultdict(list)
717+
for gene, grp in zip(g.vs["name"], groups):
718+
group_to_genes[int(grp)].append(gene)
719+
720+
for idx, genes_in_group in sorted(group_to_genes.items(), key=lambda kv: -len(kv[1])):
721+
if len(genes_in_group) == 1:
722+
continue # skip singletons; they have no supported interaction
723+
modules[f"strmod_{idx}"] = sorted(genes_in_group)
724+
725+
if return_graph:
726+
return modules, g
727+
return modules
728+
729+
535730
def whoare_genes_neighbors(
536731
adata,
537732
gene: str,

0 commit comments

Comments
 (0)