|
| 1 | +import numpy as np |
| 2 | +import pandas as pd |
| 3 | +from scipy.sparse import csr_matrix |
| 4 | + |
| 5 | + |
| 6 | +def compute_weighted_average_around_target( |
| 7 | + adata, |
| 8 | + target_cell_type_quantile: float = 0.995, |
| 9 | + source_cell_type_quantile: float = 0.95, |
| 10 | + normalisation_quantile: float = 0.95, |
| 11 | + distance_bin: list = None, |
| 12 | + sample_key: str = "sample", |
| 13 | + genes_to_use_as_source: list = None, |
| 14 | + gene_symbols: str = None, |
| 15 | + obsm_spatial_key: str = "X_spatial", |
| 16 | + normalisation_key: str = None, |
| 17 | + layer: str = None, |
| 18 | + cell_abundance_key: str = "cell_abundance_w_sf", |
| 19 | + cell_abundance_quantile_key: str = "q05", |
| 20 | +): |
| 21 | + """ |
| 22 | + Compute average abundance of source cell types or genes around each target cell type. |
| 23 | +
|
| 24 | + Parameters |
| 25 | + ---------- |
| 26 | + adata |
| 27 | + AnnData object of spatial dataset with cell2location results |
| 28 | + target_cell_type_quantile |
| 29 | + Quantile of target cell type abundance to use for defining |
| 30 | + a set locations with highest abundance of target cell types. |
| 31 | + Cell abundance below this thereshold is set to 0. |
| 32 | + source_cell_type_quantile |
| 33 | + Quantile of source cell type abundance to use for defining |
| 34 | + a set locations with highest abundance of source cell types. |
| 35 | + Cell abundance or RNA abundance for genes below this thereshold is set to 0. |
| 36 | + normalisation_quantile |
| 37 | + Quantile of source cell type or source RNA abundance for genes to use as normalising constant. |
| 38 | + This step can be seen as scaling that puts all source cell types or genes into the same scale. |
| 39 | + distance_bin |
| 40 | + If using concentric bins list with two elements specifying inner and outer edge of the bin. |
| 41 | + Distances specified in coordinates of `obsm_spatial_key`. |
| 42 | + sample_key |
| 43 | + `adata.obs` column key specifying distinct sections across |
| 44 | + which distance bin computation is invalid. |
| 45 | + genes_to_use_as_source |
| 46 | + To request RNA abundance of genes around target cells provide a list of |
| 47 | + var_names or gene SYMBOLs. |
| 48 | + gene_symbols |
| 49 | + `adata.var` column key containing gene symbols |
| 50 | + obsm_spatial_key |
| 51 | + `adata.obsm` key containing spatial coordinates (can be 2D or 3D or N-D). |
| 52 | + normalisation_key |
| 53 | + RNA abundance must be normalised using y_s technical effect term |
| 54 | + estimated by cell2location. Provide `adata.obsm` key containing this normalisation term. |
| 55 | + layer |
| 56 | + adata.layers to use for getting RNA abundance. Default: `adata.X` |
| 57 | + cell_abundance_key |
| 58 | + which cell2location variable to use as cell abundance |
| 59 | + cell_abundance_quantile_key |
| 60 | + which quantile of cell abundance to use |
| 61 | +
|
| 62 | + Returns |
| 63 | + ------- |
| 64 | + pd.DataFrame of average abundance of source cell types or RNA abundance of requested genes |
| 65 | + around target cell types. |
| 66 | +
|
| 67 | + """ |
| 68 | + # save initial names |
| 69 | + if genes_to_use_as_source is None: |
| 70 | + source_names = adata.uns["mod"]["factor_names"] |
| 71 | + else: |
| 72 | + source_names = genes_to_use_as_source |
| 73 | + |
| 74 | + cell_abundance_key_ = cell_abundance_quantile_key + cell_abundance_key |
| 75 | + cell_abundance_key = cell_abundance_quantile_key + "_" + cell_abundance_key |
| 76 | + |
| 77 | + # create result data frame to be completed |
| 78 | + weighted_avg = pd.DataFrame( |
| 79 | + index=[f"target {ct}" for ct in adata.uns["mod"]["factor_names"]], |
| 80 | + columns=source_names, |
| 81 | + ) |
| 82 | + if genes_to_use_as_source is None: |
| 83 | + # pick locations where source cell type abundance is above source_cell_type_quantile |
| 84 | + source_cell_type_filter = adata.obsm[cell_abundance_key] > adata.obsm[cell_abundance_key].quantile( |
| 85 | + source_cell_type_quantile |
| 86 | + ) |
| 87 | + # zero-out source cell abundance below selected quantile |
| 88 | + source_cell_type_data = adata.obsm[cell_abundance_key] * source_cell_type_filter |
| 89 | + # get normalising quantile values |
| 90 | + source_normalisation_quantile = adata.obsm[cell_abundance_key].quantile(normalisation_quantile, axis=0) |
| 91 | + # compute average abundance above this quantile |
| 92 | + source_normalisation_quantile = np.average( |
| 93 | + adata.obsm[cell_abundance_key], |
| 94 | + weights=adata.obsm[cell_abundance_key] > source_normalisation_quantile, |
| 95 | + axis=0, |
| 96 | + ) |
| 97 | + else: |
| 98 | + # if using gene symbols get var names: |
| 99 | + if gene_symbols is not None: |
| 100 | + genes_to_use_as_source = adata.var_names[adata.var[gene_symbols].isin(genes_to_use_as_source)] |
| 101 | + # get RNA abundance data |
| 102 | + if layer is None: |
| 103 | + source_cell_type_data = adata[:, genes_to_use_as_source].X.toarray() |
| 104 | + else: |
| 105 | + source_cell_type_data = adata[:, genes_to_use_as_source].layers[layer].toarray() |
| 106 | + # apply technical across-location normalisation |
| 107 | + if normalisation_key: |
| 108 | + source_cell_type_data = source_cell_type_data / adata.obsm[normalisation_key] |
| 109 | + # pick locations where source cell type abundance is above source_cell_type_quantile |
| 110 | + source_cell_type_filter = source_cell_type_data > np.quantile( |
| 111 | + source_cell_type_data, q=source_cell_type_quantile, axis=0 |
| 112 | + ) |
| 113 | + # zero-out source cell abundance below selected quantile |
| 114 | + source_cell_type_data = source_cell_type_data * source_cell_type_filter |
| 115 | + # create a dataframe with initial source RNA abundance |
| 116 | + source_cell_type_data = pd.DataFrame( |
| 117 | + source_cell_type_data, |
| 118 | + index=adata.obs_names, |
| 119 | + columns=source_names, |
| 120 | + ) |
| 121 | + # get normalising quantile values |
| 122 | + source_normalisation_quantile = source_cell_type_data.quantile(normalisation_quantile, axis=0) |
| 123 | + # compute average abundance above this quantile |
| 124 | + source_normalisation_quantile = np.average( |
| 125 | + source_cell_type_data, |
| 126 | + weights=source_cell_type_data > source_normalisation_quantile, |
| 127 | + axis=0, |
| 128 | + ) |
| 129 | + |
| 130 | + # [optional] compute average source_cell_type_data across closes locations (concentric circles) |
| 131 | + if distance_bin is not None: |
| 132 | + # iterate over samples of connected location from the same sections |
| 133 | + # or independent chunks registered 3D data |
| 134 | + for s in adata.obs[sample_key].unique(): |
| 135 | + # get sample observations |
| 136 | + sample_ind = adata.obs[sample_key].isin([s]) |
| 137 | + |
| 138 | + # compute distances bewteen locations |
| 139 | + from scipy.spatial.distance import cdist |
| 140 | + |
| 141 | + distances = cdist(adata[sample_ind, :].obsm[obsm_spatial_key], adata[sample_ind, :].obsm[obsm_spatial_key]) |
| 142 | + # select locations in distance bin |
| 143 | + binary_distance = csr_matrix((distances > distance_bin[0]) & (distances <= distance_bin[1])) |
| 144 | + # compute average abundance across locations within a bin |
| 145 | + data_ = ( |
| 146 | + (binary_distance @ csr_matrix(source_cell_type_data.loc[sample_ind, :].values)) |
| 147 | + .multiply(1 / binary_distance.sum(1)) |
| 148 | + .toarray() |
| 149 | + ) |
| 150 | + # to account for locations with no neighbours within a bin (sum == 0) |
| 151 | + data_[np.isnan(data_)] = 0 |
| 152 | + # complete the average for a given sample |
| 153 | + source_cell_type_data.loc[sample_ind, :] = data_ |
| 154 | + # normalise data by normalising quantile (global value across distance bins) |
| 155 | + source_cell_type_data = source_cell_type_data / source_normalisation_quantile |
| 156 | + # account for cases of undetected signal |
| 157 | + source_cell_type_data[source_cell_type_data.isna()] = 0 |
| 158 | + |
| 159 | + # compute average for each target cell type |
| 160 | + for ct in adata.uns["mod"]["factor_names"]: |
| 161 | + # find locations containing high abundance of target cell type |
| 162 | + target_cell_type_filter = adata.obsm[cell_abundance_key][f"{cell_abundance_key_}_{ct}"] > adata.obsm[ |
| 163 | + cell_abundance_key |
| 164 | + ][f"{cell_abundance_key_}_{ct}"].quantile(target_cell_type_quantile) |
| 165 | + # use thresholded abundance of target cell type as a weight |
| 166 | + weights = adata.obsm[cell_abundance_key][f"{cell_abundance_key_}_{ct}"] * target_cell_type_filter |
| 167 | + # normalise for target cell type abundance |
| 168 | + target_quantile = adata.obsm[cell_abundance_key][f"{cell_abundance_key_}_{ct}"].quantile(normalisation_quantile) |
| 169 | + target_quantile = np.average( |
| 170 | + adata.obsm[cell_abundance_key][f"{cell_abundance_key_}_{ct}"].values, |
| 171 | + weights=adata.obsm[cell_abundance_key][f"{cell_abundance_key_}_{ct}"].values > target_quantile, |
| 172 | + ).flatten() |
| 173 | + assert target_quantile.shape == (1,), target_quantile.shape |
| 174 | + weights = weights / target_quantile |
| 175 | + # compute the final weighted average |
| 176 | + weighted_avg_ = np.average( |
| 177 | + source_cell_type_data, |
| 178 | + weights=weights, |
| 179 | + axis=0, |
| 180 | + ) |
| 181 | + # weighted_avg_[weighted_avg_.isna()] = 0 |
| 182 | + |
| 183 | + weighted_avg_ = pd.Series(weighted_avg_, name=ct, index=source_names) |
| 184 | + |
| 185 | + # hack to make self interactions less apparent |
| 186 | + weighted_avg_[ct] = weighted_avg_[~weighted_avg_.index.isin([ct])].max() + 0.02 |
| 187 | + # complete the results dataframe |
| 188 | + weighted_avg.loc[f"target {ct}", :] = weighted_avg_ |
| 189 | + |
| 190 | + return weighted_avg.astype("float32") |
| 191 | + |
| 192 | + |
| 193 | +def melt_data_frame_per_signal(weighted_avg_dict, source_var, distance_bins): |
| 194 | + source_var_1 = pd.DataFrame( |
| 195 | + np.array([weighted_avg_dict[str(distance_bin)][source_var].values for distance_bin in distance_bins]), |
| 196 | + columns=weighted_avg_dict[str(distance_bins[0])].index, |
| 197 | + index=[np.mean(distance_bin) for distance_bin in distance_bins], |
| 198 | + ).T |
| 199 | + |
| 200 | + source_var_1 = source_var_1.melt( |
| 201 | + value_name="Abundance", |
| 202 | + var_name="Distance bin", |
| 203 | + ignore_index=False, |
| 204 | + ) |
| 205 | + source_var_1["Target"] = source_var_1.index |
| 206 | + source_var_1["Signal"] = source_var |
| 207 | + return source_var_1 |
| 208 | + |
| 209 | + |
| 210 | +def melt_signal_target_data_frame(weighted_avg_dict, distance_bins): |
| 211 | + source_vars = weighted_avg_dict[str(distance_bins[0])].columns |
| 212 | + |
| 213 | + return pd.concat( |
| 214 | + [melt_data_frame_per_signal(weighted_avg_dict, source_var, distance_bins) for source_var in source_vars] |
| 215 | + ) |
0 commit comments