Skip to content
Open
8 changes: 4 additions & 4 deletions bbconf/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
event,
select,
)
from sqlalchemy.dialects.postgresql import ARRAY, JSON
from sqlalchemy.dialects.postgresql import ARRAY, JSON, JSONB
from sqlalchemy.engine import URL, Engine, create_engine
from sqlalchemy.event import listens_for
from sqlalchemy.exc import IntegrityError, ProgrammingError
Expand Down Expand Up @@ -253,10 +253,10 @@ class BedStats(Base):
intron_percentage: Mapped[Optional[float]]
intergenic_percentage: Mapped[Optional[float]]
promotercore_percentage: Mapped[Optional[float]]
tssdist: Mapped[Optional[float]]
median_neighbor_distance: Mapped[Optional[float]]

distributions: Mapped[Optional[dict]] = mapped_column(
JSON,
JSONB,
nullable=True,
comment="Full distribution arrays from gtars genomicdist (JSONB)",
)
Expand Down Expand Up @@ -344,7 +344,7 @@ class BedSets(Base):
JSON, comment="Median values of the bedset"
)
bedset_stats: Mapped[Optional[dict]] = mapped_column(
JSON,
JSONB,
nullable=True,
comment="Pre-aggregated distribution statistics from gtars (JSONB)",
)
Expand Down
8 changes: 8 additions & 0 deletions bbconf/models/bed_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class BedStatsModel(BaseModel):
number_of_regions: float | None = None
gc_content: float | None = None
median_tss_dist: float | None = None
median_neighbor_distance: float | None = None
mean_region_width: float | None = None

exon_frequency: float | None = None
Expand Down Expand Up @@ -209,6 +210,13 @@ class BedListResult(BaseModel):
results: list[BedMetadataBasic]


class BedBatchResult(BaseModel):
    # Response envelope for a batch of bed records: three integer paging
    # fields followed by the records themselves. Unlike BedListResult, whose
    # `results` holds BedMetadataBasic entries, this model carries the full
    # BedMetadataAll entries.

    # NOTE(review): presumably the total number of matching records rather
    # than len(results) -- confirm against the code that builds this model.
    count: int

    # NOTE(review): presumably the page size that was requested -- confirm.
    limit: int

    # NOTE(review): presumably the number of records skipped before the
    # first entry of `results` -- confirm.
    offset: int

    # The records in this batch, each validated as BedMetadataAll.
    results: list[BedMetadataAll]


class QdrantSearchResult(BaseModel):
id: str
payload: dict = None
Expand Down
21 changes: 17 additions & 4 deletions bbconf/models/bedset_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,31 @@ class BedSetDistributions(BaseModel):

Stored in the bedset_stats JSONB database column. Populated when
member bed files have been processed with the gtars analysis backend.

Only distributions that are meaningful at the collection level are kept:
- scalar_summaries: mean ± sd + 25-bin histograms of per-file scalar values
- tss_histogram: summed per-bin counts across files (fixed ±100 kb axis)
- region_distribution: per-chrom bin-wise mean ± sd across files
(requires a gtars build that includes PR #248 or later, for reference-aligned bin widths)
- partitions: mean ± sd of per-file partition percentages

Dropped (retained in per-file distributions blob, not aggregated):
- widths_histogram: per-file variable-range bins aren't summable; use
scalar_summaries.mean_region_width histogram instead
- neighbor_distances KDE: per-file within-bedset variance is low; use
scalar_summaries.median_neighbor_distance instead
- gc_content KDE: per-file distribution is unimodal; use
scalar_summaries.gc_content mean instead
- chromosome_summaries: redundant with region_distribution
- expected_partitions: per-file null hypothesis, not collection property
"""

n_files: int = 0
composition: Optional[dict] = None
scalar_summaries: Optional[dict] = None
tss_histogram: Optional[dict] = None
widths_histogram: Optional[dict] = None
neighbor_distances: Optional[dict] = None
gc_content: Optional[dict] = None
region_distribution: Optional[dict] = None
partitions: Optional[dict] = None
chromosome_summaries: Optional[dict] = None


class BedSetPlots(BaseModel):
Expand Down
Loading