From 6e0ef984f7a6546337504669001bd79d1debf789 Mon Sep 17 00:00:00 2001 From: Sam Park Date: Fri, 27 Mar 2026 12:53:13 -0400 Subject: [PATCH 1/2] Add modular analysis backend config and distribution schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ConfigAnalysis model with backend: "r" | "gtars" (defaults to "r") to support switching analysis engines via config. Add nullable JSONB columns (distributions on BedStats, bedset_stats on BedSets) and BedSetDistributions model for gtars output. Existing code and behavior unchanged — pure additive schema and config additions. Co-Authored-By: Claude Opus 4.6 (1M context) --- bbconf/config_parser/models.py | 13 +++++++++++++ bbconf/db_utils.py | 11 +++++++++++ bbconf/models/bed_models.py | 2 ++ bbconf/models/bedset_models.py | 26 ++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) diff --git a/bbconf/config_parser/models.py b/bbconf/config_parser/models.py index 8f1d37ec..bf2f39a4 100644 --- a/bbconf/config_parser/models.py +++ b/bbconf/config_parser/models.py @@ -1,5 +1,6 @@ import logging from pathlib import Path +from typing import Literal from pydantic import BaseModel, ConfigDict, computed_field, field_validator from yacman import load_yaml @@ -126,6 +127,17 @@ class ConfigPepHubClient(BaseModel): tag: str | None = DEFAULT_PEPHUB_TAG +class ConfigAnalysis(BaseModel): + """Analysis backend configuration. + + Controls which statistics engine is used for BED file analysis. + """ + + backend: Literal["r", "gtars"] = "r" + + model_config = ConfigDict(extra="forbid") + + class ConfigFile(BaseModel): database: ConfigDB qdrant: ConfigQdrant = None @@ -134,6 +146,7 @@ class ConfigFile(BaseModel): access_methods: AccessMethods = None s3: ConfigS3 = None phc: ConfigPepHubClient = None + analysis: ConfigAnalysis = ConfigAnalysis() model_config = ConfigDict(extra="allow") diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index 050c7e61..8261154b 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -255,6 +255,12 @@ class BedStats(Base): promotercore_percentage: Mapped[Optional[float]] tssdist: Mapped[Optional[float]] + distributions: Mapped[Optional[dict]] = mapped_column( + JSON, + nullable=True, + comment="Full distribution arrays from gtars genomicdist (JSONB)", + ) + bed: Mapped["Bed"] = relationship("Bed", back_populates="stats") @@ -337,6 +343,11 @@ class BedSets(Base): bedset_standard_deviation: Mapped[Optional[dict]] = mapped_column( JSON, comment="Median values of the bedset" ) + bedset_stats: Mapped[Optional[dict]] = mapped_column( + JSON, + nullable=True, + comment="Pre-aggregated distribution statistics from gtars (JSONB)", + ) bedfiles: Mapped[list["BedFileBedSetRelation"]] = relationship( "BedFileBedSetRelation", back_populates="bedset", cascade="all, delete-orphan" diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index d16d0c51..fec4a11b 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -74,6 +74,8 @@ class BedStatsModel(BaseModel): promoterprox_frequency: float | None = None promoterprox_percentage: float | None = None + distributions: dict | None = None + model_config = ConfigDict(extra="ignore", populate_by_name=True) diff --git a/bbconf/models/bedset_models.py b/bbconf/models/bedset_models.py index ca074a99..45dae379 100644 --- a/bbconf/models/bedset_models.py +++ b/bbconf/models/bedset_models.py @@ -1,4 +1,5 @@ import datetime +from typing import Optional from pydantic import BaseModel, ConfigDict, model_validator @@ -7,10 +8,34 @@ class BedSetStats(BaseModel): + """Bedset statistics: mean/sd of scalar columns. + + Populated from bedset_means and bedset_standard_deviation database columns. + """ + mean: BedStatsModel = None sd: BedStatsModel = None +class BedSetDistributions(BaseModel): + """Collection-level aggregated distribution statistics for a bedset. + + Stored in the bedset_stats JSONB database column. Populated when + member bed files have been processed with the gtars analysis backend. + """ + + n_files: int = 0 + composition: Optional[dict] = None + scalar_summaries: Optional[dict] = None + tss_histogram: Optional[dict] = None + widths_histogram: Optional[dict] = None + neighbor_distances: Optional[dict] = None + gc_content: Optional[dict] = None + region_distribution: Optional[dict] = None + partitions: Optional[dict] = None + chromosome_summaries: Optional[dict] = None + + class BedSetPlots(BaseModel): region_commonality: FileModel = None @@ -24,6 +49,7 @@ class BedSetMetadata(BaseModel): submission_date: datetime.datetime = None last_update_date: datetime.datetime = None statistics: BedSetStats | None = None + distributions: BedSetDistributions | None = None plots: BedSetPlots | None = None description: str = None summary: str = None From f0ecd82cb7d10a72b851107f459d061b6e134a5c Mon Sep 17 00:00:00 2001 From: Sam Park Date: Fri, 3 Apr 2026 14:44:24 -0400 Subject: [PATCH 2/2] Fix bedset stats aggregation: skip non-numeric columns The stddev/avg loop over BedStatsModel.model_fields was hitting the new JSON `distributions` column, causing PostgreSQL to fail with "function stddev(json) does not exist". Filter to float fields only. Co-Authored-By: Claude Opus 4.6 (1M context) --- bbconf/modules/bedsets.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 83566ac3..cad38dce 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -427,7 +427,11 @@ def _calculate_statistics(self, bed_ids: list[str]) -> BedSetStats: """ _LOGGER.info("Calculating bedset statistics") - numeric_columns = BedStatsModel.model_fields + numeric_columns = [ + name + for name, field in BedStatsModel.model_fields.items() + if field.annotation in (float, float | None) + ] bedset_sd = {} bedset_mean = {}