Skip to content

Commit 7a9b4a1

Browse files
Update version to 0.19.2 and enhance metadata handling in CyteType (#71)
- Bump package version to 0.19.2.
- Introduce max_metadata_categories parameter to limit unique values in categorical obs columns during cluster metadata aggregation, improving memory efficiency.
- Increase maximum upload size for obs_duckdb from 100MB to 2GB, accommodating larger datasets.
- Refactor save_obs_duckdb function to ensure proper cleanup of temporary columns after processing.
1 parent 98cbecd commit 7a9b4a1

5 files changed

Lines changed: 36 additions & 10 deletions

File tree

cytetype/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.19.1"
1+
__version__ = "0.19.2"
22

33
import requests
44

cytetype/api/client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414

1515
MAX_UPLOAD_BYTES: dict[UploadFileKind, int] = {
16-
"obs_duckdb": 100 * 1024 * 1024, # 100MB
16+
"obs_duckdb": 2 * 1024 * 1024 * 1024, # 2GB
1717
"vars_h5": 50 * 1024 * 1024 * 1024, # 50GB
1818
}
1919

cytetype/core/artifacts.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -491,16 +491,23 @@ def save_obs_duckdb(
491491
"Invalid table_name. Use letters, numbers, and underscores only."
492492
)
493493

494+
added_cols: list[str] = []
494495
if obsm_coordinates is not None and coordinates_key is not None:
495-
obs_df = obs_df.copy()
496-
obs_df[f"__vis_coordinates_{coordinates_key}_1"] = obsm_coordinates[:, 0]
497-
obs_df[f"__vis_coordinates_{coordinates_key}_2"] = obsm_coordinates[:, 1]
496+
col1 = f"__vis_coordinates_{coordinates_key}_1"
497+
col2 = f"__vis_coordinates_{coordinates_key}_2"
498+
obs_df[col1] = obsm_coordinates[:, 0]
499+
obs_df[col2] = obsm_coordinates[:, 1]
500+
added_cols = [col1, col2]
498501

499502
dd_config: dict[str, Any] = {
500503
"threads": threads,
501504
"memory_limit": memory_limit,
502505
"temp_directory": temp_directory,
503506
}
504-
with duckdb.connect(out_file, config=dd_config) as con:
505-
con.register("obs_df", obs_df)
506-
con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM obs_df")
507+
try:
508+
with duckdb.connect(out_file, config=dd_config) as con:
509+
con.register("obs_df", obs_df)
510+
con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM obs_df")
511+
finally:
512+
for col in added_cols:
513+
obs_df.drop(columns=col, inplace=True, errors="ignore")

cytetype/main.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def __init__(
8484
max_cells_per_group: int = 1000,
8585
vars_h5_path: str = "vars.h5",
8686
obs_duckdb_path: str = "obs.duckdb",
87+
max_metadata_categories: int = 500,
8788
api_url: str = "https://prod.cytetype.nygen.io",
8889
auth_token: str | None = None,
8990
) -> None:
@@ -116,6 +117,10 @@ def __init__(
116117
max_cells_per_group (int, optional): Maximum number of cells to sample per group
117118
for visualization. If a group has more cells than this limit, a random sample
118119
will be taken. Defaults to 1000.
120+
max_metadata_categories (int, optional): Maximum number of unique values a categorical
121+
obs column may have to be included in cluster metadata aggregation. Columns with
122+
more unique values (e.g. cell barcodes, per-cell IDs) are skipped to avoid
123+
excessive memory usage. Defaults to 500.
119124
api_url (str, optional): URL for the CyteType API endpoint. Only change if using a custom
120125
deployment. Defaults to "https://prod.cytetype.nygen.io".
121126
auth_token (str | None, optional): Bearer token for API authentication. If provided,
@@ -186,6 +191,7 @@ def __init__(
186191
adata=self.adata,
187192
group_key=self.group_key,
188193
min_percentage=min_percentage,
194+
max_categories=max_metadata_categories,
189195
)
190196
# Replace keys in group_metadata using cluster_map
191197
self.group_metadata = {

cytetype/preprocessing/aggregation.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import anndata
22
import numpy as np
33

4+
from ..config import logger
45
from .marker_detection import _accumulate_group_stats
56

67

@@ -55,6 +56,7 @@ def aggregate_cluster_metadata(
5556
adata: anndata.AnnData,
5657
group_key: str,
5758
min_percentage: int = 10,
59+
max_categories: int = 500,
5860
) -> dict[str, dict[str, dict[str, int]]]:
5961
"""Aggregate categorical metadata per cluster.
6062
@@ -66,6 +68,9 @@ def aggregate_cluster_metadata(
6668
adata: AnnData object containing single-cell data
6769
group_key: Column name in adata.obs to group cells by
6870
min_percentage: Minimum percentage of cells in a group to include
71+
max_categories: Maximum number of unique values a column may have to be
72+
included. Columns exceeding this threshold are skipped to avoid
73+
memory-expensive intermediate DataFrames.
6974
7075
Returns:
7176
Nested dictionary structure:
@@ -76,14 +81,22 @@ def aggregate_cluster_metadata(
7681
grouped_data = adata.obs.groupby(group_key, observed=False)
7782
column_distributions: dict[str, dict[str, dict[str, int]]] = {}
7883

79-
# Process each column in adata.obs
8084
for column_name in adata.obs.columns:
8185
if column_name == group_key:
8286
continue
8387

8488
column_dtype = adata.obs[column_name].dtype
8589
if column_dtype in ["object", "category", "string"]:
86-
# Calculate value counts for each group
90+
n_unique = adata.obs[column_name].nunique()
91+
if n_unique > max_categories:
92+
logger.debug(
93+
"Skipping column '{}' ({} unique values > max_categories={}).",
94+
column_name,
95+
n_unique,
96+
max_categories,
97+
)
98+
continue
99+
87100
value_counts_df = grouped_data[column_name].value_counts().unstack().T
88101

89102
# Convert to percentages and filter for values >min_percentage

0 commit comments

Comments
 (0)