|
1 | 1 | from __future__ import annotations |
2 | 2 | from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, Callable |
| 3 | +from collections import defaultdict |
| 4 | +from functools import lru_cache |
| 5 | +from pathlib import Path |
3 | 6 | import numpy as np |
4 | 7 | import pandas as pd |
5 | 8 | import scanpy as sc |
@@ -278,6 +281,257 @@ def find_gene_modules( |
278 | 281 | return modules |
279 | 282 |
|
280 | 283 |
|
| 284 | +def _species_to_taxon_id(species: str) -> str: |
| 285 | + """Normalize species name to NCBI taxon ID string used by STRING files.""" |
| 286 | + species_norm = str(species).strip().lower().replace(" ", "_") |
| 287 | + lut = { |
| 288 | + "mouse": "10090", |
| 289 | + "mus_musculus": "10090", |
| 290 | + "m_musculus": "10090", |
| 291 | + "human": "9606", |
| 292 | + "homo_sapiens": "9606", |
| 293 | + } |
| 294 | + if species_norm in lut: |
| 295 | + return lut[species_norm] |
| 296 | + if species_norm.isdigit(): |
| 297 | + return species_norm |
| 298 | + raise ValueError(f"Unsupported species '{species}'. Provide an NCBI taxon ID or one of {sorted(lut)}.") |
| 299 | + |
| 300 | + |
| 301 | +def _default_string_data_dir() -> Path: |
| 302 | + """Resolve the default STRING data directory packaged with the repository.""" |
| 303 | + return Path(__file__).resolve().parents[3] / "data" / "STRING" |
| 304 | + |
| 305 | + |
| 306 | +def _clean_term_label(term: str, max_len: int = 70) -> str: |
| 307 | + """Tidy annotation text and keep it concise.""" |
| 308 | + label = " ".join(str(term).split()).rstrip(".;") |
| 309 | + if len(label) > max_len: |
| 310 | + label = label[: max_len - 3].rstrip() + "..." |
| 311 | + return label |
| 312 | + |
| 313 | + |
| 314 | +def _score_terms_for_module( |
| 315 | + module_genes: Sequence[str], |
| 316 | + gene_to_terms: Mapping[str, Sequence[Tuple[str, Optional[int]]]], |
| 317 | + min_genes_for_term: int, |
| 318 | + max_cluster_size: Optional[int], |
| 319 | + banned_terms: Mapping[str, None], |
| 320 | +) -> List[Tuple[str, float, int, Optional[int]]]: |
| 321 | + """ |
| 322 | + Score annotation terms by coverage and specificity for a given module. |
| 323 | +
|
| 324 | + Returns a list of tuples (term, score, support, cluster_size) sorted by descending score. |
| 325 | + """ |
| 326 | + module_genes_unique = list(dict.fromkeys(str(g) for g in module_genes)) |
| 327 | + module_size = len(module_genes_unique) |
| 328 | + if module_size == 0: |
| 329 | + return [] |
| 330 | + |
| 331 | + term_to_genes = defaultdict(set) |
| 332 | + term_to_cluster_size: Dict[str, Optional[int]] = {} |
| 333 | + |
| 334 | + for g in module_genes_unique: |
| 335 | + for term, csize in gene_to_terms.get(g, []): |
| 336 | + term_to_genes[term].add(g) |
| 337 | + if csize is not None: |
| 338 | + prev = term_to_cluster_size.get(term) |
| 339 | + if prev is None or csize < prev: |
| 340 | + term_to_cluster_size[term] = int(csize) |
| 341 | + |
| 342 | + scored: List[Tuple[str, float, int, Optional[int]]] = [] |
| 343 | + for term, genes in term_to_genes.items(): |
| 344 | + term_l = term.lower() |
| 345 | + if term_l in banned_terms: |
| 346 | + continue |
| 347 | + support = len(genes) |
| 348 | + if support < min_genes_for_term: |
| 349 | + continue |
| 350 | + cluster_size = term_to_cluster_size.get(term) |
| 351 | + if max_cluster_size is not None and cluster_size is not None and cluster_size > max_cluster_size: |
| 352 | + continue |
| 353 | + specificity = 1.0 |
| 354 | + if cluster_size is not None and cluster_size > 0: |
| 355 | + specificity = 1.0 / float(np.log1p(cluster_size)) |
| 356 | + coverage = support / float(module_size) |
| 357 | + score = coverage * specificity |
| 358 | + scored.append((term, score, support, cluster_size)) |
| 359 | + |
| 360 | + scored.sort(key=lambda x: (-x[1], -x[2], x[3] if x[3] is not None else float("inf"), x[0])) |
| 361 | + return scored |
| 362 | + |
| 363 | + |
| 364 | +@lru_cache(maxsize=8) |
| 365 | +def _load_string_gene_annotations(species: str, data_dir: Optional[str]) -> Dict[str, List[Tuple[str, Optional[int]]]]: |
| 366 | + """ |
| 367 | + Load STRING cluster descriptions and build a gene -> [(term, cluster_size)] mapping. |
| 368 | + """ |
| 369 | + taxon_id = _species_to_taxon_id(species) |
| 370 | + base_dir = Path(data_dir) if data_dir is not None else _default_string_data_dir() |
| 371 | + cluster_info_path = base_dir / f"{taxon_id}.clusters.info.v12.0.txt" |
| 372 | + cluster_proteins_path = base_dir / f"{taxon_id}.clusters.proteins.v12.0.txt" |
| 373 | + protein_info_path = base_dir / f"{taxon_id}.protein.info.v12.0.txt" |
| 374 | + |
| 375 | + for pth in (cluster_info_path, cluster_proteins_path, protein_info_path): |
| 376 | + if not pth.exists(): |
| 377 | + raise FileNotFoundError( |
| 378 | + f"Expected STRING file not found: {pth}. Provide a valid `string_data_dir` pointing to STRING exports." |
| 379 | + ) |
| 380 | + |
| 381 | + cluster_info = pd.read_csv(cluster_info_path, sep="\t") |
| 382 | + cluster_info = cluster_info[["cluster_id", "cluster_size", "best_described_by"]] |
| 383 | + |
| 384 | + cluster_members = pd.read_csv(cluster_proteins_path, sep="\t") |
| 385 | + protein_info = pd.read_csv(protein_info_path, sep="\t")[["#string_protein_id", "preferred_name"]] |
| 386 | + |
| 387 | + annotated = ( |
| 388 | + cluster_members.merge(protein_info, left_on="protein_id", right_on="#string_protein_id", how="left") |
| 389 | + .merge(cluster_info, on="cluster_id", how="left") |
| 390 | + ) |
| 391 | + |
| 392 | + gene_to_terms: Dict[str, List[Tuple[str, Optional[int]]]] = defaultdict(list) |
| 393 | + for row in annotated.itertuples(): |
| 394 | + gene = getattr(row, "preferred_name", None) |
| 395 | + term = getattr(row, "best_described_by", None) |
| 396 | + cluster_size = getattr(row, "cluster_size", None) |
| 397 | + if pd.isna(gene) or pd.isna(term): |
| 398 | + continue |
| 399 | + term_clean = str(term).strip() |
| 400 | + if not term_clean: |
| 401 | + continue |
| 402 | + cluster_size_int = None |
| 403 | + if pd.notna(cluster_size): |
| 404 | + try: |
| 405 | + cluster_size_int = int(cluster_size) |
| 406 | + except (TypeError, ValueError): |
| 407 | + cluster_size_int = None |
| 408 | + gene_to_terms[str(gene)].append((term_clean, cluster_size_int)) |
| 409 | + |
| 410 | + deduped: Dict[str, List[Tuple[str, Optional[int]]]] = {} |
| 411 | + for gene, entries in gene_to_terms.items(): |
| 412 | + seen = set() |
| 413 | + unique_entries: List[Tuple[str, Optional[int]]] = [] |
| 414 | + for term, size in entries: |
| 415 | + key = (term, size) |
| 416 | + if key in seen: |
| 417 | + continue |
| 418 | + seen.add(key) |
| 419 | + unique_entries.append((term, size)) |
| 420 | + deduped[gene] = unique_entries |
| 421 | + |
| 422 | + return deduped |
| 423 | + |
| 424 | + |
| 425 | +def name_gene_modules_by_annotation( |
| 426 | + modules: Mapping[str, Sequence[str]], |
| 427 | + annotation_source: str, |
| 428 | + *, |
| 429 | + species: str = "mouse", |
| 430 | + string_data_dir: Optional[Union[str, Path]] = None, |
| 431 | + min_genes_for_term: int = 2, |
| 432 | + top_n_terms: int = 1, |
| 433 | + include_source_in_name: bool = True, |
| 434 | + fallback_to_original: bool = True, |
| 435 | + max_cluster_size: Optional[int] = 5000, |
| 436 | + banned_terms: Optional[Sequence[str]] = None, |
| 437 | +) -> Dict[str, List[str]]: |
| 438 | + """ |
| 439 | + Rename gene modules using shared functional annotations. |
| 440 | +
|
| 441 | + Parameters |
| 442 | + ---------- |
| 443 | + modules |
| 444 | + Mapping from module name to a sequence of gene symbols. |
| 445 | + annotation_source |
| 446 | + One of {"string", "go", "reactome", "interpro"}. |
| 447 | + Currently, only "string" is supported with local STRING export files. |
| 448 | + species |
| 449 | + Species name or NCBI taxon ID. Defaults to "mouse" (10090). |
| 450 | + string_data_dir |
| 451 | + Directory containing STRING export files (e.g., data/STRING). If None, uses the |
| 452 | + repository's default data/STRING location. |
| 453 | + min_genes_for_term |
| 454 | + Minimum number of module genes that must share a term to use it for naming. |
| 455 | + top_n_terms |
| 456 | + How many top terms to combine into the module label (joined with " | "). |
| 457 | + include_source_in_name |
| 458 | + If True, append the source tag (e.g., "(STRING)") to the label. |
| 459 | + fallback_to_original |
| 460 | + If True and no annotation is found, keep the original module name; otherwise use |
| 461 | + a generic "module_{i}" placeholder. |
| 462 | + max_cluster_size |
| 463 | + Optional upper bound on STRING cluster size to consider (filters very broad/generic |
| 464 | + terms such as "Cellular_component"). Set to None to disable. |
| 465 | + banned_terms |
| 466 | + Optional list of term strings (case-insensitive exact match) to exclude from naming. |
| 467 | +
|
| 468 | + Returns |
| 469 | + ------- |
| 470 | + Dict[str, List[str]] |
| 471 | + New mapping where keys are descriptive module names and values are the original gene lists. |
| 472 | +
|
| 473 | + Notes |
| 474 | + ----- |
| 475 | + • The scoring favors terms that cover more module genes and come from smaller (more specific) |
| 476 | + STRING clusters via a coverage × specificity heuristic. |
| 477 | + • Extendability: hooks for GO/Reactome/InterPro can reuse the scoring helper by supplying |
| 478 | + a gene -> [(term, specificity_size)] mapping. |
| 479 | +
|
| 480 | + Example |
| 481 | + ------- |
| 482 | + >>> modules = {"gmod_0": ["Med14", "Med15", "Med16"]} |
| 483 | + >>> name_gene_modules_by_annotation(modules, "string", species="mouse", string_data_dir="data/STRING") |
| 484 | + {'Core mediator complex (STRING)': ['Med14', 'Med15', 'Med16']} |
| 485 | + """ |
| 486 | + if not modules: |
| 487 | + raise ValueError("`modules` is empty.") |
| 488 | + if min_genes_for_term < 1: |
| 489 | + raise ValueError("`min_genes_for_term` must be >= 1.") |
| 490 | + if top_n_terms < 1: |
| 491 | + raise ValueError("`top_n_terms` must be >= 1.") |
| 492 | + |
| 493 | + source_norm = annotation_source.strip().lower() |
| 494 | + if source_norm == "string": |
| 495 | + gene_to_terms = _load_string_gene_annotations(species, None if string_data_dir is None else str(string_data_dir)) |
| 496 | + source_tag = "STRING" |
| 497 | + elif source_norm in {"go", "reactome", "interpro"}: |
| 498 | + raise NotImplementedError(f"Annotation source '{annotation_source}' is not yet implemented in this build.") |
| 499 | + else: |
| 500 | + raise ValueError(f"Unknown annotation_source '{annotation_source}'.") |
| 501 | + |
| 502 | + banned = {t.lower() for t in (banned_terms or ["cellular_component", "biological_process", "molecular_function"])} |
| 503 | + |
| 504 | + renamed: Dict[str, List[str]] = {} |
| 505 | + used_names = set() |
| 506 | + for idx, (orig_name, gene_list) in enumerate(modules.items()): |
| 507 | + genes_unique = list(dict.fromkeys(str(g) for g in gene_list)) |
| 508 | + scored_terms = _score_terms_for_module( |
| 509 | + genes_unique, |
| 510 | + gene_to_terms, |
| 511 | + min_genes_for_term, |
| 512 | + max_cluster_size, |
| 513 | + banned, |
| 514 | + ) |
| 515 | + |
| 516 | + if not scored_terms: |
| 517 | + base_name = orig_name if fallback_to_original else f"module_{idx+1}" |
| 518 | + else: |
| 519 | + selected = [_clean_term_label(term) for term, _, _, _ in scored_terms[:top_n_terms]] |
| 520 | + base_name = " | ".join(selected) |
| 521 | + if include_source_in_name: |
| 522 | + base_name = f"{base_name} ({source_tag})" |
| 523 | + |
| 524 | + candidate = base_name |
| 525 | + suffix = 2 |
| 526 | + while candidate in used_names: |
| 527 | + candidate = f"{base_name} #{suffix}" |
| 528 | + suffix += 1 |
| 529 | + used_names.add(candidate) |
| 530 | + renamed[candidate] = genes_unique |
| 531 | + |
| 532 | + return renamed |
| 533 | + |
| 534 | + |
281 | 535 | def whoare_genes_neighbors( |
282 | 536 | adata, |
283 | 537 | gene: str, |
@@ -1036,4 +1290,3 @@ def correlate_module_scores_with_pcs( |
1036 | 1290 | result = result.sort_values("abs_correlation", ascending=False).reset_index(drop=True) |
1037 | 1291 |
|
1038 | 1292 | return result |
1039 | | - |
|
0 commit comments