@@ -532,6 +532,201 @@ def name_gene_modules_by_annotation(
532532 return renamed
533533
534534
535+ # ---------------------------------------------------------------------------
536+ # STRING physical-interaction modules
537+ # ---------------------------------------------------------------------------
538+
539+ def _string_link_candidates (base_dir : Path , taxon_id : str ):
540+ """Yield candidate STRING link files in preference order."""
541+ names = [
542+ "protein.physical.links.full" ,
543+ "protein.links.detailed" ,
544+ "protein.links.full" ,
545+ "protein.links" ,
546+ ]
547+ for stem in names :
548+ p = base_dir / f"{ taxon_id } .{ stem } .v12.0.txt"
549+ if p .exists ():
550+ yield p
551+ p_gz = p .with_suffix (p .suffix + ".gz" )
552+ if p_gz .exists ():
553+ yield p_gz
554+
555+
556+ def _load_string_gene_id_map (species : str , data_dir : Optional [str ]) -> pd .DataFrame :
557+ """Load STRING protein info and return dataframe with gene->protein mapping."""
558+ taxon_id = _species_to_taxon_id (species )
559+ base_dir = Path (data_dir ) if data_dir is not None else _default_string_data_dir ()
560+ protein_info_path = base_dir / f"{ taxon_id } .protein.info.v12.0.txt"
561+ if not protein_info_path .exists ():
562+ raise FileNotFoundError (
563+ f"Expected STRING protein info file not found: { protein_info_path } . "
564+ "Provide `string_data_dir` pointing to STRING exports."
565+ )
566+ df = pd .read_csv (protein_info_path , sep = "\t " , usecols = ["#string_protein_id" , "preferred_name" ])
567+ df = df .dropna (subset = ["#string_protein_id" , "preferred_name" ])
568+ return df
569+
570+
571+ def find_string_physical_modules (
572+ genes : Sequence [str ],
573+ * ,
574+ species : str = "mouse" ,
575+ string_data_dir : Optional [Union [str , Path ]] = None ,
576+ min_confidence : int = 700 ,
577+ evidence_channels : Optional [Sequence [str ]] = None ,
578+ clustering : str = "leiden" ,
579+ resolution : float = 1.0 ,
580+ return_graph : bool = False ,
581+ ) -> Union [Dict [str , List [str ]], Tuple [Dict [str , List [str ]], ig .Graph ]]:
582+ """Cluster a user-supplied gene list into STRING physical-interaction modules.
583+
584+ Parameters
585+ ----------
586+ genes
587+ Iterable of gene symbols (STRING preferred names).
588+ species
589+ Species name or NCBI taxon ID (default "mouse" -> 10090).
590+ string_data_dir
591+ Directory containing STRING export files. If None, uses packaged ``data/STRING``.
592+ min_confidence
593+ Minimum STRING combined score (0–1000) to keep an interaction. Default 700 (high confidence).
594+ evidence_channels
595+ Iterable of STRING evidence column names that must be >0 for an edge to be kept
596+ (e.g., ["experimental", "database"]). If None/empty, no channel requirement.
597+ clustering
598+ "leiden" (via igraph) or "components" (connected components only).
599+ resolution
600+ Leiden resolution parameter (ignored if clustering="components").
601+ return_graph
602+ If True, also return the igraph graph used for clustering.
603+
604+ Returns
605+ -------
606+ modules : Dict[str, List[str]]
607+ Mapping module name -> ordered list of gene symbols.
608+ graph (optional)
609+ igraph.Graph used for clustering.
610+
611+ Notes
612+ -----
613+ • Uses only local STRING flat files; no API calls required.
614+ • Falls back to connected components if Leiden is unavailable or the graph is tiny.
615+ """
616+
617+ gene_list = [str (g ) for g in genes ]
618+ if len (gene_list ) == 0 :
619+ raise ValueError ("No genes provided." )
620+
621+ taxon_id = _species_to_taxon_id (species )
622+ base_dir = Path (string_data_dir ) if string_data_dir is not None else _default_string_data_dir ()
623+ channels_req = [c .strip ().lstrip ("#" ) for c in (evidence_channels or []) if str (c ).strip ()]
624+
625+ # 1) Map genes -> STRING protein IDs
626+ prot_df = _load_string_gene_id_map (species , string_data_dir )
627+ lookup = prot_df .set_index ("preferred_name" )["#string_protein_id" ]
628+ proteins = {g : lookup .get (g ) for g in gene_list }
629+ proteins = {g : p for g , p in proteins .items () if p is not None }
630+ if len (proteins ) == 0 :
631+ raise ValueError ("None of the supplied genes were found in STRING protein.info." )
632+
633+ # 2) Load the best available STRING links file (normalize headers, allow fallbacks)
634+ links = None
635+ links_path = None
636+ read_errors = []
637+ required_cols = {"protein1" , "protein2" , "combined_score" }
638+
639+ for cand in _string_link_candidates (base_dir , taxon_id ):
640+ try :
641+ links = pd .read_csv (cand , sep = "\s+" , engine = "python" )
642+ # strip leading '#' from headers (STRING sometimes uses #protein1)
643+ links = links .rename (columns = {c : c .lstrip ("#" ) for c in links .columns })
644+
645+ # If combined_score is absent but an experimental/score column exists, reuse it
646+ cols = set (links .columns )
647+ if "combined_score" not in cols :
648+ for alt in ["experimental" , "experimental_score" , "score" ]:
649+ if alt in cols :
650+ links = links .rename (columns = {alt : "combined_score" })
651+ cols .add ("combined_score" )
652+ break
653+
654+ # Skip this candidate if channel requirements cannot be satisfied
655+ if channels_req and not set (channels_req ).issubset (cols ):
656+ read_errors .append ((cand , f"missing channels { set (channels_req ) - cols } " ))
657+ links = None
658+ continue
659+
660+ if required_cols .issubset (set (links .columns )):
661+ links_path = cand
662+ break
663+ else :
664+ read_errors .append ((cand , f"missing { required_cols - set (links .columns )} " ))
665+ links = None
666+ except Exception as e :
667+ read_errors .append ((cand , str (e )))
668+ links = None
669+
670+ if links is None :
671+ msg = "; " .join (f"{ p } : { err } " for p , err in read_errors ) or "no candidate files found"
672+ raise FileNotFoundError (
673+ f"Could not load a STRING links file for taxon { taxon_id } . Attempts: { msg } "
674+ )
675+
676+ # 3) Filter to interactions within the user genes and above confidence (and channels if set)
677+ protein_ids = set (proteins .values ())
678+ mask = (
679+ links ["protein1" ].isin (protein_ids )
680+ & links ["protein2" ].isin (protein_ids )
681+ & (links ["combined_score" ] >= int (min_confidence ))
682+ )
683+ if channels_req :
684+ for ch in channels_req :
685+ mask &= links [ch ] > 0
686+ sub = links [mask ]
687+
688+ if sub .empty :
689+ return ({}, None ) if return_graph else {}
690+
691+ # 4) Build igraph
692+ # Map protein IDs to gene symbols for readable vertices
693+ prot_to_gene = {v : k for k , v in proteins .items ()}
694+ edges = [
695+ (prot_to_gene [p1 ], prot_to_gene [p2 ], float (s ))
696+ for p1 , p2 , s in sub [["protein1" , "protein2" , "combined_score" ]].itertuples (index = False )
697+ ]
698+
699+ g = ig .Graph ()
700+ g .add_vertices (sorted (set (prot_to_gene .values ())))
701+ g .add_edges ([(a , b ) for a , b , _ in edges ])
702+ g .es ["weight" ] = [w for _ , _ , w in edges ]
703+
704+ modules : Dict [str , List [str ]] = {}
705+
706+ if clustering .lower () == "leiden" and g .ecount () > 0 and g .vcount () > 1 :
707+ try :
708+ leiden = g .community_leiden (weights = g .es ["weight" ], resolution_parameter = float (resolution ))
709+ groups = leiden .as_cover ().partition
710+ except Exception :
711+ groups = g .components ().membership
712+ else :
713+ groups = g .components ().membership
714+
715+ # Collect modules
716+ group_to_genes : Dict [int , List [str ]] = defaultdict (list )
717+ for gene , grp in zip (g .vs ["name" ], groups ):
718+ group_to_genes [int (grp )].append (gene )
719+
720+ for idx , genes_in_group in sorted (group_to_genes .items (), key = lambda kv : - len (kv [1 ])):
721+ if len (genes_in_group ) == 1 :
722+ continue # skip singletons; they have no supported interaction
723+ modules [f"strmod_{ idx } " ] = sorted (genes_in_group )
724+
725+ if return_graph :
726+ return modules , g
727+ return modules
728+
729+
535730def whoare_genes_neighbors (
536731 adata ,
537732 gene : str ,
0 commit comments