Skip to content

Commit 6a8b6cf

Browse files
CodingBash and claude committed
api §4.1 + §4.5 + §4.11 + §7.1 + §7.3: v0.1.0 public API (L + M + O)
L.1/L.2 (§4.1/§7.3 dataclass config + §4.5/§7.1 Map/Count/Postproc split): - New `crispr_ambiguous_mapping/api.py` with map_fastq / count / alleles public functions + ParsingConfig dataclass mirroring the 50 parsing/threshold kwargs. - ParsingConfig.to_kwargs() drops None fields; map_fastq(..., config=cfg, **overrides) delegates to the legacy entry point. - count(result) returns the per-tier count Series container; alleles(result, tier, ...) wraps get_matchset_alleleseries. O (§4.11 package consolidation): - New `crispr_correct/` top-level package re-exporting the public API. `import crispr_correct as cc` is the forward-looking name (matches repo/docs). - pyproject.toml version bumped 0.0.236 -> 0.1.0; description filled in; `crispr_correct` added to packages. Deferred to follow-up: N §4.6 parse_fastq rewrite (400->80 lines) — high risk, the only gate is simulation; scheduled as its own branch once the v0.1.0 wrapper surface is battle-tested. New smoke tests: - test_v0_1_0_public_api_importable — map_fastq/count/alleles/ParsingConfig present at top level. - test_crispr_correct_shim_package — `import crispr_correct as cc` works. - test_count_input_contains_surrogate_backcompat — legacy attribute still reads new field. Gate: 9/9 smoke tests + scCRISPR PASS (43s); simulation 135/135. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8634b2e commit 6a8b6cf

5 files changed

Lines changed: 290 additions & 3 deletions

File tree

crispr-ambiguous-mapping/crispr_ambiguous_mapping/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,7 @@
1515
from . import visualization
1616
from . import quality_control
1717
from . import postprocessing
18+
19+
# §4.5 / §7.1: v0.1.0 public API surface (map / count / alleles + ParsingConfig).
20+
from . import api
21+
from .api import map_fastq, count, alleles, ParsingConfig
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
"""Public API surface for CRISPR-Correct v0.1.0.
2+
3+
Exposes the three-stage workflow (map / count / alleles) cleanly, with
4+
dataclass configuration to replace the 50-kwarg entry point. The legacy
5+
`mapping.get_whitelist_reporter_counts_from_fastq(...)` entry point continues
6+
to work and is what `map_fastq` delegates to — this module is currently a
7+
thin re-packaging with IDE-friendly signatures; a future release can split
8+
the stages more aggressively once drivers are migrated.
9+
10+
Typical usage:
11+
12+
```python
13+
import crispr_ambiguous_mapping as cam
14+
from crispr_ambiguous_mapping.api import map_fastq, count, alleles, ParsingConfig
15+
from crispr_ambiguous_mapping.models import MatchTier
16+
17+
cfg = ParsingConfig(
18+
protospacer_start_position=0, protospacer_length=20,
19+
is_protospacer_r1=True, is_protospacer_header=False, revcomp_protospacer=False,
20+
protospacer_hamming_threshold_strict=7,
21+
surrogate_start_position=0, surrogate_length=32,
22+
is_surrogate_r1=False, is_surrogate_header=False, revcomp_surrogate=True,
23+
surrogate_hamming_threshold_strict=10,
24+
retain_inference_results=True,
25+
cores=4,
26+
)
27+
28+
result = map_fastq(library, fastq_r1_fns=["R1.fq.gz"], fastq_r2_fns=["R2.fq.gz"], config=cfg)
29+
counts_per_tier = count(result)
30+
allele_df = alleles(result, tier=MatchTier.PM_SM_BM, contains_guide_surrogate=True, contains_guide_barcode=False, contains_guide_umi=False)
31+
```
32+
"""
33+
from __future__ import annotations
34+
from dataclasses import dataclass, asdict, fields
35+
from typing import Any, Dict, List, Optional
36+
import pandas as pd
37+
38+
from .models.mapping_models import (
39+
WhitelistReporterCountsResult,
40+
AllMatchSetWhitelistReporterCounterSeriesResults,
41+
MatchTier,
42+
)
43+
44+
45+
@dataclass
class ParsingConfig:
    """Flat, autocomplete-friendly container for the parsing/threshold options.

    The class declares 50 fields. Each is intended to mirror a kwarg of
    `mapping.get_whitelist_reporter_counts_from_fastq`; hand an instance to
    `map_fastq` and its populated fields are forwarded to that legacy
    signature.

    §4.1 / §7.3: deliberately a single flat struct (minimum-viable dataclass
    replacement). A later release may break it up into ComponentConfig /
    ThresholdConfig subtrees once callers have migrated.

    All per-component fields default to `None`, which `to_kwargs` treats as
    "not specified"; only `retain_inference_results` and `cores` carry
    concrete defaults and are therefore always emitted.
    """

    # --- protospacer: regex / flanks / position / orientation / threshold ---
    protospacer_pattern_regex: Optional[str] = None
    protospacer_left_flank: Optional[str] = None
    protospacer_right_flank: Optional[str] = None
    protospacer_start_position: Optional[int] = None
    protospacer_end_position: Optional[int] = None
    protospacer_length: Optional[int] = None
    is_protospacer_r1: Optional[bool] = None
    is_protospacer_header: Optional[bool] = None
    revcomp_protospacer: Optional[bool] = None
    protospacer_hamming_threshold_strict: Optional[int] = None

    # --- surrogate ---
    surrogate_pattern_regex: Optional[str] = None
    surrogate_left_flank: Optional[str] = None
    surrogate_right_flank: Optional[str] = None
    surrogate_start_position: Optional[int] = None
    surrogate_end_position: Optional[int] = None
    surrogate_length: Optional[int] = None
    is_surrogate_r1: Optional[bool] = None
    is_surrogate_header: Optional[bool] = None
    revcomp_surrogate: Optional[bool] = None
    surrogate_hamming_threshold_strict: Optional[int] = None

    # --- guide barcode ---
    guide_barcode_pattern_regex: Optional[str] = None
    guide_barcode_left_flank: Optional[str] = None
    guide_barcode_right_flank: Optional[str] = None
    guide_barcode_start_position: Optional[int] = None
    guide_barcode_end_position: Optional[int] = None
    guide_barcode_length: Optional[int] = None
    is_guide_barcode_r1: Optional[bool] = None
    is_guide_barcode_header: Optional[bool] = None
    revcomp_guide_barcode: Optional[bool] = None
    guide_barcode_hamming_threshold_strict: Optional[int] = None

    # --- guide UMI (no hamming threshold field is declared for this component) ---
    guide_umi_pattern_regex: Optional[str] = None
    guide_umi_left_flank: Optional[str] = None
    guide_umi_right_flank: Optional[str] = None
    guide_umi_start_position: Optional[int] = None
    guide_umi_end_position: Optional[int] = None
    guide_umi_length: Optional[int] = None
    is_guide_umi_r1: Optional[bool] = None
    is_guide_umi_header: Optional[bool] = None
    revcomp_guide_umi: Optional[bool] = None

    # --- sample barcode (no hamming threshold field is declared for this component) ---
    sample_barcode_pattern_regex: Optional[str] = None
    sample_barcode_left_flank: Optional[str] = None
    sample_barcode_right_flank: Optional[str] = None
    sample_barcode_start_position: Optional[int] = None
    sample_barcode_end_position: Optional[int] = None
    sample_barcode_length: Optional[int] = None
    is_sample_barcode_r1: Optional[bool] = None
    is_sample_barcode_header: Optional[bool] = None
    revcomp_sample_barcode: Optional[bool] = None

    # --- miscellaneous run options (never None, so always forwarded) ---
    retain_inference_results: bool = False
    cores: int = 1

    def to_kwargs(self) -> Dict[str, Any]:
        """Return the populated fields as a dict, in declaration order.

        Fields whose value is `None` are omitted; falsy-but-set values such
        as `0` or `False` are kept. The result is meant to be splatted into
        the legacy entry point.
        """
        populated: Dict[str, Any] = {}
        for spec in fields(self):
            value = getattr(self, spec.name)
            if value is not None:
                populated[spec.name] = value
        return populated
122+
123+
124+
def map_fastq(
    whitelist_guide_reporter_df: pd.DataFrame,
    fastq_r1_fns: List[str],
    fastq_r2_fns: Optional[List[str]] = None,
    *,
    config: Optional[ParsingConfig] = None,
    **kwargs: Any,
) -> WhitelistReporterCountsResult:
    """Run the legacy FASTQ mapping entry point with dataclass-based configuration.

    §4.5 / §7.1: public-API wrapper that forwards to
    `mapping.get_whitelist_reporter_counts_from_fastq`. Options may come from
    a `ParsingConfig`, from flat keyword arguments, or both; when a name
    appears in both places the flat keyword argument wins.

    Parameters
    ----------
    whitelist_guide_reporter_df, fastq_r1_fns, fastq_r2_fns
        Forwarded unchanged, by keyword, to the legacy entry point.
    config
        Optional `ParsingConfig`. Only its non-`None` fields are forwarded
        (see `ParsingConfig.to_kwargs`).
    **kwargs
        Extra flat options for the legacy entry point; these take precedence
        over the matching `config` fields.

    Returns
    -------
    WhitelistReporterCountsResult
        Whatever the legacy entry point returns.
    """
    # Deferred import: resolving this at module load time would be circular.
    from .mapping.main_mapping import get_whitelist_reporter_counts_from_fastq

    config_options: Dict[str, Any] = {} if config is None else config.to_kwargs()
    # Later unpacking wins, so explicit kwargs override config-derived values.
    call_options: Dict[str, Any] = {**config_options, **kwargs}
    return get_whitelist_reporter_counts_from_fastq(
        whitelist_guide_reporter_df=whitelist_guide_reporter_df,
        fastq_r1_fns=fastq_r1_fns,
        fastq_r2_fns=fastq_r2_fns,
        **call_options,
    )
164+
165+
166+
def count(result: WhitelistReporterCountsResult) -> AllMatchSetWhitelistReporterCounterSeriesResults:
    """Expose the per-tier count Series container stored on a mapping result.

    §4.5 / §7.1: a pure accessor — it reads
    `result.all_match_set_whitelist_reporter_counter_series_results` and
    returns it as-is, doing no computation of its own. Per the design notes
    for this module, the counts are already built by the mapping call; a
    later release may separate mapping from counting so that counts are
    produced lazily on request.
    """
    per_tier_counts = result.all_match_set_whitelist_reporter_counter_series_results
    return per_tier_counts
175+
176+
177+
def alleles(
    result: WhitelistReporterCountsResult,
    tier: str,
    *,
    contains_guide_surrogate: bool,
    contains_guide_barcode: bool,
    contains_guide_umi: bool,
):
    """Build allele count Series for one mapping tier from a retained mapping result.

    §4.5 / §7.1: wraps `processing.get_matchset_alleleseries`, passing it
    `result.observed_guide_reporter_umi_counts_inferred` together with the
    tier's attribute name and the three `contains_*` flags.

    The mapping call must have been run with `retain_inference_results=True`.
    This function performs no such check itself.
    NOTE(review): the `ValueError` for slim results is presumably raised when
    the retained attribute is read from `result` (or downstream) — confirm.

    Parameters
    ----------
    result
        A mapping result from `map_fastq` built with
        `retain_inference_results=True`.
    tier
        A `MatchTier` enum member or its plain string value. Enum members are
        resolved through `.value`, so both forms yield the same attribute
        name.
    contains_guide_surrogate, contains_guide_barcode, contains_guide_umi
        Forwarded unchanged; they must match the mapping configuration.

    Returns
    -------
    MatchSetWhitelistReporterObservedSequenceCounterSeriesResults
        The value returned by `get_matchset_alleleseries`.
    """
    from enum import Enum

    from .processing.crispr_editing_processing import get_matchset_alleleseries

    # `str()` on a `(str, Enum)` member returns "ClassName.MEMBER", not the
    # member's value, so an enum tier would produce a bogus attribute name.
    # Use `.value` for enum members and `str()` only for everything else.
    if isinstance(tier, Enum):
        attribute_name = str(tier.value)
    else:
        attribute_name = str(tier)

    return get_matchset_alleleseries(
        observed_guide_reporter_umi_counts_inferred=result.observed_guide_reporter_umi_counts_inferred,
        attribute_name=attribute_name,
        contains_guide_surrogate=contains_guide_surrogate,
        contains_guide_barcode=contains_guide_barcode,
        contains_guide_umi=contains_guide_umi,
    )
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""crispr_correct — forward-looking import alias for the `crispr_ambiguous_mapping` package.
2+
3+
§4.11: the project has three names historically — PyPI `crispr-ambiguous-mapping`,
4+
repo `CRISPR-Correct`, import `crispr_ambiguous_mapping`. This shim unifies
5+
future imports under a single name (`crispr_correct`) matching the repo.
6+
7+
Use this for new code:
8+
9+
```python
10+
import crispr_correct as cc
11+
result = cc.map_fastq(library, fastq_r1_fns=["R1.fq.gz"], config=cc.ParsingConfig(...))
12+
```
13+
14+
The existing `crispr_ambiguous_mapping` package stays available through 0.1.x
15+
for existing drivers; it'll be deprecated in 0.2.0 and removed in 0.3.0.
16+
"""
17+
from crispr_ambiguous_mapping import mapping, utility, models, processing, visualization, quality_control, postprocessing # noqa: F401
18+
from crispr_ambiguous_mapping.api import map_fastq, count, alleles, ParsingConfig # noqa: F401
19+
from crispr_ambiguous_mapping.models.mapping_models import MatchTier # noqa: F401

crispr-ambiguous-mapping/pyproject.toml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
[tool.poetry]
22
name = "crispr-ambiguous-mapping"
3-
version = "0.0.236"
4-
description = ""
3+
version = "0.1.0"
4+
description = "Hamming-distance-based CRISPR guide RNA mapping with IUPAC-ambiguous base-editor support."
55
authors = ["Basheer Becerra <bbecerr@outlook.com>"]
66
readme = "README.md"
7-
packages = [{include = "crispr_ambiguous_mapping"}]
7+
# §4.11: ship the historical `crispr_ambiguous_mapping` module plus the new
8+
# `crispr_correct` alias that re-exports the public API (map_fastq, count,
9+
# alleles, ParsingConfig, MatchTier). New code should use `crispr_correct`;
10+
# the legacy import remains through 0.1.x.
11+
packages = [
12+
{include = "crispr_ambiguous_mapping"},
13+
{include = "crispr_correct"},
14+
]
815

916
[tool.poetry.dependencies]
1017
python = ">=3.8,<3.12"

crispr-ambiguous-mapping/tests/test_smoke.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,50 @@ def test_slim_result_raises_on_postproc():
6565
)
6666

6767

68+
def test_v0_1_0_public_api_importable():
    """§4.5 / §7.1: the v0.1.0 API is reachable from the package root via api.py."""
    import crispr_ambiguous_mapping as cam

    # The three workflow functions must be exposed and callable.
    for public_name in ("map_fastq", "count", "alleles"):
        assert callable(getattr(cam, public_name))
    assert cam.ParsingConfig is not None

    forwarded = cam.ParsingConfig(protospacer_length=20, cores=2).to_kwargs()
    assert forwarded["protospacer_length"] == 20
    assert forwarded["cores"] == 2
    # Fields left at None must not be forwarded.
    assert "protospacer_pattern_regex" not in forwarded
83+
84+
85+
def test_crispr_correct_shim_package():
    """§4.11: the `crispr_correct` alias package re-exports the public API."""
    import crispr_correct as cc

    assert callable(cc.map_fastq)
    assert cc.ParsingConfig is not None
    # MatchTier members compare equal to their plain string values.
    expected_tier_value = "protospacer_match_surrogate_match_barcode_match"
    assert cc.MatchTier.PM_SM_BM == expected_tier_value
91+
92+
93+
def test_count_input_contains_surrogate_backcompat():
    """§4.3 / K.1: the legacy `contains_surrogate` alias reads `contains_guide_surrogate`."""
    import pandas as pd
    from crispr_ambiguous_mapping.models.mapping_models import CountInput

    empty_whitelist = pd.DataFrame()
    count_input = CountInput(
        whitelist_guide_reporter_df=empty_whitelist,
        contains_guide_surrogate=True,
        contains_guide_barcode=False,
        contains_guide_umi=False,
        contains_sample_barcode=False,
        protospacer_hamming_threshold_strict=7,
        surrogate_hamming_threshold_strict=10,
        guide_barcode_hamming_threshold_strict=2,
    )

    # New field name.
    assert count_input.contains_guide_surrogate is True
    # Deprecated alias must report the same value.
    assert count_input.contains_surrogate is True
110+
111+
68112
def test_revcomp_translate_matches_biopython():
69113
# §3.10: translate-based revcomp must produce the same result as the
70114
# previous Bio.Seq.reverse_complement() on IUPAC bases we actually see.

0 commit comments

Comments
 (0)