|
| 1 | +"""Public API surface for CRISPR-Correct v0.1.0. |
| 2 | +
|
| 3 | +Exposes the three-stage workflow (map / count / alleles) cleanly, with |
| 4 | +dataclass configuration to replace the 50-kwarg entry point. The legacy |
| 5 | +`mapping.get_whitelist_reporter_counts_from_fastq(...)` entry point continues |
| 6 | +to work and is what `map_fastq` delegates to — this module is currently a |
| 7 | +thin re-packaging with IDE-friendly signatures; a future release can split |
| 8 | +the stages more aggressively once drivers are migrated. |
| 9 | +
|
| 10 | +Typical usage: |
| 11 | +
|
| 12 | +```python |
| 13 | +import crispr_ambiguous_mapping as cam |
| 14 | +from crispr_ambiguous_mapping.api import map_fastq, count, alleles, ParsingConfig |
| 15 | +from crispr_ambiguous_mapping.models import MatchTier |
| 16 | +
|
| 17 | +cfg = ParsingConfig( |
| 18 | + protospacer_start_position=0, protospacer_length=20, |
| 19 | + is_protospacer_r1=True, is_protospacer_header=False, revcomp_protospacer=False, |
| 20 | + protospacer_hamming_threshold_strict=7, |
| 21 | + surrogate_start_position=0, surrogate_length=32, |
| 22 | + is_surrogate_r1=False, is_surrogate_header=False, revcomp_surrogate=True, |
| 23 | + surrogate_hamming_threshold_strict=10, |
| 24 | + retain_inference_results=True, |
| 25 | + cores=4, |
| 26 | +) |
| 27 | +
|
| 28 | +result = map_fastq(library, fastq_r1_fns=["R1.fq.gz"], fastq_r2_fns=["R2.fq.gz"], config=cfg) |
| 29 | +counts_per_tier = count(result) |
| 30 | +allele_df = alleles(result, tier=MatchTier.PM_SM_BM, contains_guide_surrogate=True, contains_guide_barcode=False, contains_guide_umi=False) |
| 31 | +``` |
| 32 | +""" |
| 33 | +from __future__ import annotations |
| 34 | +from dataclasses import dataclass, asdict, fields |
| 35 | +from typing import Any, Dict, List, Optional |
| 36 | +import pandas as pd |
| 37 | + |
| 38 | +from .models.mapping_models import ( |
| 39 | + WhitelistReporterCountsResult, |
| 40 | + AllMatchSetWhitelistReporterCounterSeriesResults, |
| 41 | + MatchTier, |
| 42 | +) |
| 43 | + |
| 44 | + |
@dataclass
class ParsingConfig:
    """Flat, autocomplete-friendly container for the parsing/threshold options.

    The fields are intended to mirror keyword arguments of the legacy
    `mapping.get_whitelist_reporter_counts_from_fastq` entry point. Hand an
    instance to `map_fastq`, which expands it via `to_kwargs()` into that
    legacy call.

    §4.1 / §7.3: deliberately the smallest possible dataclass replacement —
    one flat struct, no nesting. Splitting it into ComponentConfig /
    ThresholdConfig subtrees is left for a later release.

    Every per-component option defaults to ``None``, which means "not set":
    such fields are omitted by `to_kwargs()`. Field order is part of the
    positional constructor signature, so do not reorder.
    """

    # ---- protospacer: regex / flanks / position / read / orientation ----
    protospacer_pattern_regex: Optional[str] = None
    protospacer_left_flank: Optional[str] = None
    protospacer_right_flank: Optional[str] = None
    protospacer_start_position: Optional[int] = None
    protospacer_end_position: Optional[int] = None
    protospacer_length: Optional[int] = None
    is_protospacer_r1: Optional[bool] = None
    is_protospacer_header: Optional[bool] = None
    revcomp_protospacer: Optional[bool] = None
    protospacer_hamming_threshold_strict: Optional[int] = None

    # ---- surrogate ----
    surrogate_pattern_regex: Optional[str] = None
    surrogate_left_flank: Optional[str] = None
    surrogate_right_flank: Optional[str] = None
    surrogate_start_position: Optional[int] = None
    surrogate_end_position: Optional[int] = None
    surrogate_length: Optional[int] = None
    is_surrogate_r1: Optional[bool] = None
    is_surrogate_header: Optional[bool] = None
    revcomp_surrogate: Optional[bool] = None
    surrogate_hamming_threshold_strict: Optional[int] = None

    # ---- guide barcode ----
    guide_barcode_pattern_regex: Optional[str] = None
    guide_barcode_left_flank: Optional[str] = None
    guide_barcode_right_flank: Optional[str] = None
    guide_barcode_start_position: Optional[int] = None
    guide_barcode_end_position: Optional[int] = None
    guide_barcode_length: Optional[int] = None
    is_guide_barcode_r1: Optional[bool] = None
    is_guide_barcode_header: Optional[bool] = None
    revcomp_guide_barcode: Optional[bool] = None
    guide_barcode_hamming_threshold_strict: Optional[int] = None

    # ---- guide UMI (no Hamming-threshold field is declared for it) ----
    guide_umi_pattern_regex: Optional[str] = None
    guide_umi_left_flank: Optional[str] = None
    guide_umi_right_flank: Optional[str] = None
    guide_umi_start_position: Optional[int] = None
    guide_umi_end_position: Optional[int] = None
    guide_umi_length: Optional[int] = None
    is_guide_umi_r1: Optional[bool] = None
    is_guide_umi_header: Optional[bool] = None
    revcomp_guide_umi: Optional[bool] = None

    # ---- sample barcode (no Hamming-threshold field is declared for it) ----
    sample_barcode_pattern_regex: Optional[str] = None
    sample_barcode_left_flank: Optional[str] = None
    sample_barcode_right_flank: Optional[str] = None
    sample_barcode_start_position: Optional[int] = None
    sample_barcode_end_position: Optional[int] = None
    sample_barcode_length: Optional[int] = None
    is_sample_barcode_r1: Optional[bool] = None
    is_sample_barcode_header: Optional[bool] = None
    revcomp_sample_barcode: Optional[bool] = None

    # ---- miscellaneous: these two are never None, so always forwarded ----
    retain_inference_results: bool = False
    cores: int = 1

    def to_kwargs(self) -> Dict[str, Any]:
        """Return the populated fields as a dict ready to be `**`-splatted.

        Fields whose current value is ``None`` are left out; the remaining
        entries keep the dataclass declaration order.
        """
        populated: Dict[str, Any] = {}
        for spec in fields(self):
            value = getattr(self, spec.name)
            if value is not None:
                populated[spec.name] = value
        return populated
| 122 | + |
| 123 | + |
def map_fastq(
    whitelist_guide_reporter_df: pd.DataFrame,
    fastq_r1_fns: List[str],
    fastq_r2_fns: Optional[List[str]] = None,
    *,
    config: Optional[ParsingConfig] = None,
    **kwargs: Any,
) -> WhitelistReporterCountsResult:
    """Run the legacy FASTQ-to-whitelist mapping through the public API.

    §4.5 / §7.1: this is a wrapper that forwards to
    `get_whitelist_reporter_counts_from_fastq` (imported from
    `.mapping.main_mapping`). Options may come from a `ParsingConfig`,
    from flat keyword arguments, or both.

    Parameters
    ----------
    whitelist_guide_reporter_df, fastq_r1_fns, fastq_r2_fns
        Forwarded unchanged, by keyword, to the legacy entry point.
    config
        Optional `ParsingConfig`; its `to_kwargs()` output (non-``None``
        fields only) forms the base set of options.
    **kwargs
        Extra flat options. On a name clash with a `config` field the flat
        value wins.

    Returns
    -------
    WhitelistReporterCountsResult
        Whatever the legacy entry point returns.
    """
    # Deferred import: pulling this in at module load would create an
    # import cycle.
    from .mapping.main_mapping import (
        get_whitelist_reporter_counts_from_fastq as legacy_entry_point,
    )

    config_options: Dict[str, Any] = {} if config is None else dict(config.to_kwargs())
    # Later unpacking wins, so flat kwargs take precedence over config fields.
    call_options: Dict[str, Any] = {**config_options, **kwargs}

    return legacy_entry_point(
        whitelist_guide_reporter_df=whitelist_guide_reporter_df,
        fastq_r1_fns=fastq_r1_fns,
        fastq_r2_fns=fastq_r2_fns,
        **call_options,
    )
| 164 | + |
| 165 | + |
def count(result: WhitelistReporterCountsResult) -> AllMatchSetWhitelistReporterCounterSeriesResults:
    """Expose the per-tier count container carried by a mapping result.

    §4.5 / §7.1: a pure accessor — it reads
    `result.all_match_set_whitelist_reporter_counter_series_results` and
    hands it back untouched; nothing is computed here. A later release may
    separate mapping from counting so that counts are produced on demand.
    """
    per_tier_counts = result.all_match_set_whitelist_reporter_counter_series_results
    return per_tier_counts
| 175 | + |
| 176 | + |
def alleles(
    result: WhitelistReporterCountsResult,
    tier: str,
    *,
    contains_guide_surrogate: bool,
    contains_guide_barcode: bool,
    contains_guide_umi: bool,
):
    """Build allele count Series for one mapping tier from a retained mapping result.

    §4.5 / §7.1: wraps `processing.get_matchset_alleleseries`. The mapping
    call must have been run with `retain_inference_results=True` (the
    `ParsingConfig` default is ``False``).

    Parameters
    ----------
    result
        A mapping result from `map_fastq` built with
        `retain_inference_results=True`.
    tier
        A `MatchTier` enum member or its string value. Enum members are
        resolved through `.value`, because `str()` on a standard `Enum`
        member yields ``"ClassName.MEMBER"`` rather than the value.
    contains_guide_surrogate, contains_guide_barcode, contains_guide_umi
        Must match the mapping configuration; forwarded unchanged.

    Returns
    -------
    The return value of `get_matchset_alleleseries` (documented upstream as
    `MatchSetWhitelistReporterObservedSequenceCounterSeriesResults`).

    Raises
    ------
    ValueError
        If `result.observed_guide_reporter_umi_counts_inferred` is ``None``,
        i.e. the inference results were not retained by the mapping call.
    """
    # Fail early with an actionable message instead of letting a None value
    # surface as an obscure error deep inside the processing code.
    # NOTE(review): assumes a slim result leaves this attribute as None —
    # confirm against WhitelistReporterCountsResult.
    inferred_counts = result.observed_guide_reporter_umi_counts_inferred
    if inferred_counts is None:
        raise ValueError(
            "alleles() needs the retained inference results, but "
            "`result.observed_guide_reporter_umi_counts_inferred` is None. "
            "Re-run `map_fastq` with `retain_inference_results=True` "
            "(e.g. `ParsingConfig(retain_inference_results=True)`)."
        )

    from enum import Enum

    # Accept both enum members and plain strings for the tier name.
    attribute_name = tier.value if isinstance(tier, Enum) else tier
    attribute_name = str(attribute_name)

    # Deferred import, matching the other wrappers in this module.
    from .processing.crispr_editing_processing import get_matchset_alleleseries

    return get_matchset_alleleseries(
        observed_guide_reporter_umi_counts_inferred=inferred_counts,
        attribute_name=attribute_name,
        contains_guide_surrogate=contains_guide_surrogate,
        contains_guide_barcode=contains_guide_barcode,
        contains_guide_umi=contains_guide_umi,
    )
0 commit comments