Skip to content

Commit 0649e1e

Browse files
committed
rename clusters fix+
1 parent 2810413 commit 0649e1e

5 files changed

Lines changed: 315 additions & 49 deletions

File tree

src/pySingleCellNet/tools/__init__.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,14 @@
3131
deg
3232
)
3333

34-
from .gene import (
35-
build_gene_knn,
36-
find_gene_modules,
37-
whoare_genes_neighbors,
38-
what_module_has_gene,
39-
score_gene_sets
40-
)
34+
from .gene import (
35+
build_gene_knn,
36+
find_gene_modules,
37+
whoare_genes_neighbors,
38+
what_module_has_gene,
39+
score_gene_sets,
40+
correlate_module_scores_with_pcs
41+
)
4142

4243
# API
4344
__all__ = [
@@ -57,9 +58,10 @@
5758
"convert_diffExp_to_dict",
5859
"deg",
5960
"build_gene_knn",
60-
"find_gene_modules",
61-
"whoare_genes_neighbors",
62-
"what_module_has_gene",
63-
"score_gene_sets"
64-
]
61+
"find_gene_modules",
62+
"whoare_genes_neighbors",
63+
"what_module_has_gene",
64+
"score_gene_sets",
65+
"correlate_module_scores_with_pcs"
66+
]
6567

src/pySingleCellNet/tools/cluster.py

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,14 @@ def cluster_alot(
2828
2929
Assumptions:
3030
* ``adata.X`` is **already log-transformed**.
31-
* PCA has been computed and ``adata.obsm['X_pca']`` is present; this is
32-
used as the base embedding for PC selection/subsampling.
31+
* A base embedding is stored in ``adata.obsm``. By default this is
32+
``adata.obsm['X_pca']``, but you can override it via
33+
``knn_params['use_rep']`` to leverage an alternative representation.
3334
3435
Args:
3536
adata: AnnData object containing the log-transformed expression matrix.
36-
Must include ``obsm['X_pca']`` (shape ``(n_cells, n_pcs_total)``).
37+
Must include the embedding referenced by ``knn_params['use_rep']``
38+
(defaults to ``obsm['X_pca']``).
3739
leiden_resolutions: Leiden resolution values to evaluate (passed to
3840
``sc.tl.leiden``). Each resolution is combined with every KNN/PC
3941
configuration in the sweep.
@@ -56,6 +58,9 @@ def cluster_alot(
5658
knn_params: KNN graph parameters. Supported keys:
5759
* ``"n_neighbors"`` (List[int], default ``[10]``): Candidate values
5860
for ``K`` used in ``sc.pp.neighbors``.
61+
* ``"use_rep"`` (str, default ``"X_pca"``): Name of the
62+
``adata.obsm`` representation to use as the base embedding (e.g.,
63+
``"X_pca_noPC1"``). PC subsampling operates on this matrix.
5964
random_state: Random seed for PC subset sampling (when
6065
``percent_of_pcs`` is used). Pass ``None`` for non-deterministic
6166
sampling. Defaults to ``None``.
@@ -70,6 +75,7 @@ def cluster_alot(
7075
* **runs** (``pd.DataFrame``): One row per clustering run with metadata columns such as:
7176
- ``obs_key``: Name of the column in ``adata.obs`` that stores cluster labels.
7277
- ``neighbors_key``: Name of the neighbors graph key used/created.
78+
- ``use_rep``: Embedding key that served as the base representation.
7379
- ``resolution``: Leiden resolution value used for the run.
7480
- ``top_n_pcs``: Number of leading PCs considered.
7581
- ``pct_pcs``: Fraction of PCs used when subsampling (``percent_of_pcs``), or ``1.0`` if all were used.
@@ -80,10 +86,10 @@ def cluster_alot(
8086
(``round(pct_pcs * top_n_pcs)`` or ``top_n_pcs`` if no subsampling).
8187
8288
Raises:
83-
KeyError: If ``'X_pca'`` is missing from ``adata.obsm``.
84-
ValueError: If any provided parameter is out of range (e.g.,
85-
``percent_of_pcs`` not in ``(0, 1]``; empty lists; non-positive
86-
``n_neighbors``).
89+
ValueError: If the requested ``knn_params['use_rep']`` embedding is
90+
missing from ``adata.obsm`` or if any provided parameter is out of
91+
range (e.g., ``percent_of_pcs`` not in ``(0, 1]``; empty lists;
92+
non-positive ``n_neighbors``).
8793
RuntimeError: If neighbor graph construction or Leiden clustering fails.
8894
8995
Notes:
@@ -108,17 +114,27 @@ def cluster_alot(
108114
>>> runs[["obs_key", "n_clusters"]].head()
109115
"""
110116

111-
# ---- Validate prerequisites ----
112-
if "X_pca" not in adata.obsm:
113-
raise ValueError("`adata.obsm['X_pca']` not found. Please run PCA first.")
114-
Xpca = adata.obsm["X_pca"]
115-
n_pcs_available = Xpca.shape[1]
116-
if n_pcs_available < 2:
117-
raise ValueError(f"Not enough PCs ({n_pcs_available}) in `X_pca`.")
118-
119117
# ---- Normalize params ----
120118
pca_params = dict(pca_params or {})
121119
knn_params = dict(knn_params or {})
120+
121+
use_rep_key = knn_params.get("use_rep", "X_pca")
122+
if use_rep_key is None:
123+
use_rep_key = "X_pca"
124+
if not isinstance(use_rep_key, str):
125+
raise ValueError("`knn_params['use_rep']` must be a string key in `adata.obsm`.")
126+
127+
# ---- Validate prerequisites ----
128+
if use_rep_key not in adata.obsm:
129+
raise ValueError(
130+
f"`adata.obsm['{use_rep_key}']` not found. Please compute that representation first."
131+
)
132+
X_rep = adata.obsm[use_rep_key]
133+
n_pcs_available = X_rep.shape[1]
134+
if n_pcs_available < 2:
135+
raise ValueError(
136+
f"Not enough components ({n_pcs_available}) in `adata.obsm['{use_rep_key}']`."
137+
)
122138
top_n_pcs: List[int] = pca_params.get("top_n_pcs", [40])
123139
percent_of_pcs: Optional[float] = pca_params.get("percent_of_pcs", None)
124140
n_random_samples: Optional[int] = pca_params.get("n_random_samples", None)
@@ -143,9 +159,9 @@ def cluster_alot(
143159
# ---- Helper: build neighbors from a given PC subspace ----
144160
def _neighbors_from_pc_indices(pc_idx: np.ndarray, n_neighbors: int, neighbors_key: str):
145161
"""Create a neighbors graph using the given PC column indices."""
146-
# Create a temporary representation name
147-
temp_rep_key = f"X_pca_sub_{neighbors_key}"
148-
adata.obsm[temp_rep_key] = Xpca[:, pc_idx]
162+
# Create a temporary representation name derived from the requested embedding
163+
temp_rep_key = f"{use_rep_key}_sub_{neighbors_key}"
164+
adata.obsm[temp_rep_key] = X_rep[:, pc_idx]
149165

150166
# Build neighbors; store under unique keys (in uns & obsp)
151167
sc.pp.neighbors(
@@ -158,6 +174,7 @@ def _neighbors_from_pc_indices(pc_idx: np.ndarray, n_neighbors: int, neighbors_k
158174
# Record which PCs were used (for provenance)
159175
if neighbors_key in adata.uns:
160176
adata.uns[neighbors_key]["pcs_indices"] = pc_idx.astype(int)
177+
adata.uns[neighbors_key]["base_representation"] = use_rep_key
161178

162179
# Clean up the temporary representation to save memory
163180
del adata.obsm[temp_rep_key]
@@ -203,6 +220,7 @@ def _neighbors_from_pc_indices(pc_idx: np.ndarray, n_neighbors: int, neighbors_k
203220
rows.append({
204221
"obs_key": obs_key,
205222
"neighbors_key": neighbors_key,
223+
"use_rep": use_rep_key,
206224
"resolution": res,
207225
"top_n_pcs": N,
208226
"pct_pcs": float(pct_str),
@@ -236,6 +254,7 @@ def _neighbors_from_pc_indices(pc_idx: np.ndarray, n_neighbors: int, neighbors_k
236254
rows.append({
237255
"obs_key": obs_key,
238256
"neighbors_key": neighbors_key,
257+
"use_rep": use_rep_key,
239258
"resolution": float(res),
240259
"top_n_pcs": int(N),
241260
"pct_pcs": float(pct_str),
@@ -248,8 +267,19 @@ def _neighbors_from_pc_indices(pc_idx: np.ndarray, n_neighbors: int, neighbors_k
248267

249268
summary_df = pd.DataFrame(rows)
250269
# nice ordering
251-
cols = ["obs_key","neighbors_key","resolution","top_n_pcs","pct_pcs","sample_idx",
252-
"n_neighbors","pcs_used_count","n_clusters","status"]
270+
cols = [
271+
"obs_key",
272+
"neighbors_key",
273+
"use_rep",
274+
"resolution",
275+
"top_n_pcs",
276+
"pct_pcs",
277+
"sample_idx",
278+
"n_neighbors",
279+
"pcs_used_count",
280+
"n_clusters",
281+
"status",
282+
]
253283
summary_df = summary_df[cols]
254284

255285
return summary_df
@@ -352,6 +382,3 @@ def cluster_subclusters(
352382
# Prefix subcluster labels and write back
353383
new_labels = orig + "_" + sub.obs['leiden_sub'].astype(str)
354384
adata.obs.loc[mask, subcluster_col_name] = new_labels.values
355-
356-
357-

src/pySingleCellNet/tools/gene.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -886,4 +886,154 @@ def what_module_has_gene(
886886
return [key for key, genes in genemodules.items() if target_gene in genes]
887887

888888

def correlate_module_scores_with_pcs(
    adata: AnnData,
    score_key: Union[str, Sequence[float], np.ndarray, pd.Series],
    *,
    pca_key: str = "X_pca",
    variance_key: Optional[str] = "pca",
    method: str = "pearson",
    min_abs_corr: Optional[float] = 0.3,
    drop_na: bool = True,
    sort: bool = True,
) -> pd.DataFrame:
    """Quantify the association between a module score and individual PCs.

    Parameters
    ----------
    adata
        AnnData object containing PCs in ``adata.obsm`` and per-cell module scores.
    score_key
        Either the name of an ``adata.obs`` column holding module scores (e.g., the
        output of :func:`score_gene_sets`) or an explicit array-like of shape
        ``(n_cells,)``.
    pca_key
        Key of the embedding in ``adata.obsm`` to correlate against (defaults to
        ``"X_pca"``).
    variance_key
        Optional ``adata.uns`` key that stores ``"variance_ratio"`` for the chosen
        PCA run (defaults to ``"pca"`` when using ``sc.tl.pca``).
    method
        Correlation metric: ``"pearson"`` (default) or ``"spearman"``.
    min_abs_corr
        Absolute-correlation threshold used to flag PCs that strongly follow the
        module score. Set to ``None`` to skip flagging.
    drop_na
        If ``True`` (default), silently drop cells with missing scores/PC values.
        Otherwise raise when NaNs are detected.
    sort
        If ``True`` (default), sort the output by descending absolute correlation.

    Returns
    -------
    pandas.DataFrame
        Table with one row per PC containing the correlation, absolute correlation,
        two-sided p-value, variance ratio (when available), and a boolean flag
        indicating whether the PC exceeds ``min_abs_corr``. Zero-variance PCs
        yield ``NaN`` correlations and are never flagged.

    Raises
    ------
    ValueError
        If the embedding or score column is missing, shapes disagree, fewer than
        three valid cells remain, the score has zero variance, or ``method`` is
        not one of ``"pearson"`` / ``"spearman"``.
    """

    if pca_key not in adata.obsm:
        raise ValueError(f"'{pca_key}' not found in adata.obsm. Run PCA first.")
    pcs = np.asarray(adata.obsm[pca_key], dtype=np.float64)
    if pcs.ndim != 2:
        raise ValueError(f"adata.obsm['{pca_key}'] must be 2-D (cells × PCs).")
    if pcs.shape[0] != adata.n_obs:
        raise ValueError("Number of rows in the PCA embedding does not match n_obs.")

    # Resolve the module scores vector (obs column name or explicit array)
    score_label = None
    if isinstance(score_key, str):
        if score_key not in adata.obs:
            raise ValueError(f"score_key='{score_key}' not present in adata.obs.")
        scores = adata.obs[score_key].to_numpy(dtype=np.float64)
        score_label = score_key
    else:
        scores = np.asarray(score_key, dtype=np.float64).reshape(-1)
        if scores.shape[0] != adata.n_obs:
            raise ValueError("score_key array must have length equal to adata.n_obs.")

    # Handle missing data: keep only cells finite in both the score and every PC
    finite_scores = np.isfinite(scores)
    finite_pcs = np.all(np.isfinite(pcs), axis=1)
    if drop_na:
        mask = finite_scores & finite_pcs
    else:
        if not (finite_scores.all() and finite_pcs.all()):
            raise ValueError("NaN/inf detected in scores or PCs; set drop_na=True to filter them.")
        mask = np.ones_like(finite_scores, dtype=bool)

    n_valid = int(mask.sum())
    if n_valid < 3:
        # Also guarantees dof = n_valid - 2 >= 1 for the t-based p-values below.
        raise ValueError("Need at least 3 valid cells to compute correlations.")

    y = scores[mask]
    X = pcs[mask]

    method_lc = method.lower()
    if method_lc not in {"pearson", "spearman"}:
        raise ValueError("method must be either 'pearson' or 'spearman'.")

    from scipy import stats as _stats  # local import to avoid a module-level dependency

    if method_lc == "spearman":
        # Spearman = Pearson on ranks; rank the score and each PC column
        # (rankdata's axis argument ranks every column in one vectorized call).
        y = _stats.rankdata(y)
        X = _stats.rankdata(X, axis=0)

    # Vectorized Pearson correlation of y against every PC column
    y = y.astype(np.float64)
    y_centered = y - y.mean()
    y_norm = np.sqrt(np.sum(y_centered ** 2))
    if y_norm == 0:
        raise ValueError("Module score has zero variance; correlation undefined.")

    X_centered = X - X.mean(axis=0)
    X_norm = np.sqrt(np.sum(X_centered ** 2, axis=0))

    # Zero-variance PCs divide by zero here; suppress the warning and let the
    # resulting NaN propagate into the output table.
    with np.errstate(divide="ignore", invalid="ignore"):
        corr = (y_centered @ X_centered) / (y_norm * X_norm)
    corr = corr.astype(np.float64)

    n_pcs = corr.size
    dof = n_valid - 2  # >= 1, enforced by the n_valid >= 3 check above

    # Two-sided p-values from the t-distribution (exact for Pearson under
    # normality; an approximation when applied to Spearman ranks).
    with np.errstate(divide="ignore", invalid="ignore"):
        denom = np.clip(1.0 - corr**2, 1e-12, None)  # guard against |r| ~ 1
        t_stat = corr * np.sqrt(dof / denom)
    p_values = 2.0 * _stats.t.sf(np.abs(t_stat), df=dof)

    # Per-PC variance ratios, when the PCA run recorded them in adata.uns
    var_ratio = np.full(n_pcs, np.nan)
    if variance_key is not None and variance_key in adata.uns:
        uns_entry = adata.uns[variance_key]
        if isinstance(uns_entry, Mapping) and "variance_ratio" in uns_entry:
            vr = np.asarray(uns_entry["variance_ratio"], dtype=np.float64).ravel()
            if vr.size:
                var_ratio[: min(n_pcs, vr.size)] = vr[:n_pcs]

    result = pd.DataFrame({
        "pc": [f"PC{i}" for i in range(1, n_pcs + 1)],
        "pc_index": np.arange(1, n_pcs + 1, dtype=int),
        "correlation": corr,
        "abs_correlation": np.abs(corr),
        "p_value": p_values,
        "variance_ratio": var_ratio,
        "n_cells": n_valid,
        "score_key": score_label or "array",
    })

    if min_abs_corr is not None:
        threshold = float(min_abs_corr)
        # NaN correlations compare False, so zero-variance PCs are never flagged.
        result["flag_high_corr"] = result["abs_correlation"] >= threshold
        result.attrs["min_abs_corr"] = threshold
    else:
        result["flag_high_corr"] = False

    if sort:
        result = result.sort_values("abs_correlation", ascending=False).reset_index(drop=True)

    return result
8891039

src/pySingleCellNet/utils/__init__.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,14 @@
66
)
77

88
from .adataTools import (
9-
split_adata_indices,
10-
rename_cluster_labels,
11-
limit_anndata_to_common_genes,
12-
remove_genes,
13-
filter_anndata_slots,
14-
filter_adata_by_group_size
15-
)
9+
split_adata_indices,
10+
rename_cluster_labels,
11+
limit_anndata_to_common_genes,
12+
remove_genes,
13+
filter_anndata_slots,
14+
filter_adata_by_group_size,
15+
drop_pcs_from_embedding
16+
)
1617

1718
#from .gene import (
1819
# extract_top_bottom_genes,
@@ -52,10 +53,11 @@
5253
"score_sex",
5354
"split_adata_indices",
5455
"rename_cluster_labels",
55-
"limit_anndata_to_common_genes",
56-
"remove_genes",
57-
"filter_anndata_slots",
58-
"filter_adata_by_group_size",
56+
"limit_anndata_to_common_genes",
57+
"remove_genes",
58+
"filter_anndata_slots",
59+
"filter_adata_by_group_size",
60+
"drop_pcs_from_embedding",
5961
# "extract_top_bottom_genes",
6062
# "pull_out_genes",
6163
# "pull_out_genes_v2",

0 commit comments

Comments
 (0)