diff --git a/bbconf/config_parser/models.py b/bbconf/config_parser/models.py index 8f1d37ec..bf2f39a4 100644 --- a/bbconf/config_parser/models.py +++ b/bbconf/config_parser/models.py @@ -1,5 +1,6 @@ import logging from pathlib import Path +from typing import Literal from pydantic import BaseModel, ConfigDict, computed_field, field_validator from yacman import load_yaml @@ -126,6 +127,17 @@ class ConfigPepHubClient(BaseModel): tag: str | None = DEFAULT_PEPHUB_TAG +class ConfigAnalysis(BaseModel): + """Analysis backend configuration. + + Controls which statistics engine is used for BED file analysis. + """ + + backend: Literal["r", "gtars"] = "r" + + model_config = ConfigDict(extra="forbid") + + class ConfigFile(BaseModel): database: ConfigDB qdrant: ConfigQdrant = None @@ -134,6 +146,7 @@ class ConfigFile(BaseModel): access_methods: AccessMethods = None s3: ConfigS3 = None phc: ConfigPepHubClient = None + analysis: ConfigAnalysis = ConfigAnalysis() model_config = ConfigDict(extra="allow") diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index 050c7e61..8261154b 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -255,6 +255,12 @@ class BedStats(Base): promotercore_percentage: Mapped[Optional[float]] tssdist: Mapped[Optional[float]] + distributions: Mapped[Optional[dict]] = mapped_column( + JSON, + nullable=True, + comment="Full distribution arrays from gtars genomicdist (JSONB)", + ) + bed: Mapped["Bed"] = relationship("Bed", back_populates="stats") @@ -337,6 +343,11 @@ class BedSets(Base): bedset_standard_deviation: Mapped[Optional[dict]] = mapped_column( JSON, comment="Median values of the bedset" ) + bedset_stats: Mapped[Optional[dict]] = mapped_column( + JSON, + nullable=True, + comment="Pre-aggregated distribution statistics from gtars (JSONB)", + ) bedfiles: Mapped[list["BedFileBedSetRelation"]] = relationship( "BedFileBedSetRelation", back_populates="bedset", cascade="all, delete-orphan" diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index d16d0c51..fec4a11b 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -74,6 +74,8 @@ class BedStatsModel(BaseModel): promoterprox_frequency: float | None = None promoterprox_percentage: float | None = None + distributions: dict | None = None + model_config = ConfigDict(extra="ignore", populate_by_name=True) diff --git a/bbconf/models/bedset_models.py b/bbconf/models/bedset_models.py index ca074a99..45dae379 100644 --- a/bbconf/models/bedset_models.py +++ b/bbconf/models/bedset_models.py @@ -1,4 +1,5 @@ import datetime +from typing import Optional from pydantic import BaseModel, ConfigDict, model_validator @@ -7,10 +8,34 @@ class BedSetStats(BaseModel): + """Bedset statistics: mean/sd of scalar columns. + + Populated from bedset_means and bedset_standard_deviation database columns. + """ + mean: BedStatsModel = None sd: BedStatsModel = None +class BedSetDistributions(BaseModel): + """Collection-level aggregated distribution statistics for a bedset. + + Stored in the bedset_stats JSONB database column. Populated when + member bed files have been processed with the gtars analysis backend. + """ + + n_files: int = 0 + composition: Optional[dict] = None + scalar_summaries: Optional[dict] = None + tss_histogram: Optional[dict] = None + widths_histogram: Optional[dict] = None + neighbor_distances: Optional[dict] = None + gc_content: Optional[dict] = None + region_distribution: Optional[dict] = None + partitions: Optional[dict] = None + chromosome_summaries: Optional[dict] = None + + class BedSetPlots(BaseModel): region_commonality: FileModel = None @@ -24,6 +49,7 @@ class BedSetMetadata(BaseModel): submission_date: datetime.datetime = None last_update_date: datetime.datetime = None statistics: BedSetStats | None = None + distributions: BedSetDistributions | None = None plots: BedSetPlots | None = None description: str = None summary: str = None diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 83566ac3..cad38dce 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -427,7 +427,11 @@ def _calculate_statistics(self, bed_ids: list[str]) -> BedSetStats: """ _LOGGER.info("Calculating bedset statistics") - numeric_columns = BedStatsModel.model_fields + numeric_columns = [ + name + for name, field in BedStatsModel.model_fields.items() + if field.annotation in (float, float | None) + ] bedset_sd = {} bedset_mean = {}