api §4.1 + §4.5 + §4.11 + §7.1 + §7.3: v0.1.0 public API (L + M + O)

CodingBash · claude · CodingBash · commit 6a8b6cf45bf7 · 2026-04-22T01:00:24.000Z
L.1/L.2 (§4.1/§7.3 dataclass config + §4.5/§7.1 Map/Count/Postproc split):
- New `crispr_ambiguous_mapping/api.py` with map_fastq / count / alleles public functions + ParsingConfig dataclass mirroring the 50 parsing/threshold kwargs.
- ParsingConfig.to_kwargs() drops None fields; map_fastq(..., config=cfg, **overrides) delegates to the legacy entry point.
- count(result) returns the per-tier count Series container; alleles(result, tier, ...) wraps get_matchset_alleleseries.

O (§4.11 package consolidation):
- New `crispr_correct/` top-level package re-exporting the public API. `import crispr_correct as cc` is the forward-looking name (matches repo/docs).
- pyproject.toml version bumped 0.0.236 -> 0.1.0; description filled in; `crispr_correct` added to packages.

Deferred to follow-up: N §4.6 parse_fastq rewrite (400->80 lines) — high risk, the only gate is simulation; scheduled as its own branch once the v0.1.0 wrapper surface is battle-tested.

New smoke tests:
- test_v0_1_0_public_api_importable — map_fastq/count/alleles/ParsingConfig present at top level.
- test_crispr_correct_shim_package — `import crispr_correct as cc` works.
- test_count_input_contains_surrogate_backcompat — legacy attribute still reads new field.

Gate: 9/9 smoke tests + scCRISPR PASS (43s); simulation 135/135.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/crispr-ambiguous-mapping/crispr_ambiguous_mapping/__init__.py b/crispr-ambiguous-mapping/crispr_ambiguous_mapping/__init__.py
@@ -15,3 +15,7 @@
 from . import visualization
 from . import quality_control
 from . import postprocessing
+
+# §4.5 / §7.1: v0.1.0 public API surface (map / count / alleles + ParsingConfig).
+from . import api
+from .api import map_fastq, count, alleles, ParsingConfig
diff --git a/crispr-ambiguous-mapping/crispr_ambiguous_mapping/api.py b/crispr-ambiguous-mapping/crispr_ambiguous_mapping/api.py
@@ -0,0 +1,213 @@
+"""Public API surface for CRISPR-Correct v0.1.0.
+
+Exposes the three-stage workflow (map / count / alleles) cleanly, with
+dataclass configuration to replace the 50-kwarg entry point. The legacy
+`mapping.get_whitelist_reporter_counts_from_fastq(...)` entry point continues
+to work and is what `map_fastq` delegates to — this module is currently a
+thin re-packaging with IDE-friendly signatures; a future release can split
+the stages more aggressively once drivers are migrated.
+
+Typical usage:
+
+```python
+import crispr_ambiguous_mapping as cam
+from crispr_ambiguous_mapping.api import map_fastq, count, alleles, ParsingConfig
+from crispr_ambiguous_mapping.models.mapping_models import MatchTier
+
+cfg = ParsingConfig(
+    protospacer_start_position=0, protospacer_length=20,
+    is_protospacer_r1=True, is_protospacer_header=False, revcomp_protospacer=False,
+    protospacer_hamming_threshold_strict=7,
+    surrogate_start_position=0, surrogate_length=32,
+    is_surrogate_r1=False, is_surrogate_header=False, revcomp_surrogate=True,
+    surrogate_hamming_threshold_strict=10,
+    retain_inference_results=True,
+    cores=4,
+)
+
+result = map_fastq(library, fastq_r1_fns=["R1.fq.gz"], fastq_r2_fns=["R2.fq.gz"], config=cfg)
+counts_per_tier = count(result)
+allele_df = alleles(result, tier=MatchTier.PM_SM_BM, contains_guide_surrogate=True, contains_guide_barcode=False, contains_guide_umi=False)
+```
+"""
+from __future__ import annotations
+from dataclasses import dataclass, asdict, fields
+from typing import Any, Dict, List, Optional
+import pandas as pd
+
+from .models.mapping_models import (
+    WhitelistReporterCountsResult,
+    AllMatchSetWhitelistReporterCounterSeriesResults,
+    MatchTier,
+)
+
+
+@dataclass
+class ParsingConfig:
+    """Flat, IDE-friendly container for the ~50 parsing/threshold kwargs.
+
+    Field names are intended to match the kwargs of
+    `mapping.get_whitelist_reporter_counts_from_fastq` one-to-one; hand an
+    instance to `map_fastq` and it is expanded into that legacy signature.
+
+    §4.1 / §7.3: deliberately the smallest workable dataclass — one flat
+    struct so IDE autocomplete works. Splitting into ComponentConfig /
+    ThresholdConfig subtrees is left to a future release after migration.
+    """
+    # protospacer — regex / flank / position / read-location / revcomp / threshold kwargs
+    protospacer_pattern_regex: Optional[str] = None
+    protospacer_left_flank: Optional[str] = None
+    protospacer_right_flank: Optional[str] = None
+    protospacer_start_position: Optional[int] = None
+    protospacer_end_position: Optional[int] = None
+    protospacer_length: Optional[int] = None
+    is_protospacer_r1: Optional[bool] = None
+    is_protospacer_header: Optional[bool] = None
+    revcomp_protospacer: Optional[bool] = None
+    protospacer_hamming_threshold_strict: Optional[int] = None
+
+    # surrogate — same field layout as the protospacer group
+    surrogate_pattern_regex: Optional[str] = None
+    surrogate_left_flank: Optional[str] = None
+    surrogate_right_flank: Optional[str] = None
+    surrogate_start_position: Optional[int] = None
+    surrogate_end_position: Optional[int] = None
+    surrogate_length: Optional[int] = None
+    is_surrogate_r1: Optional[bool] = None
+    is_surrogate_header: Optional[bool] = None
+    revcomp_surrogate: Optional[bool] = None
+    surrogate_hamming_threshold_strict: Optional[int] = None
+
+    # guide barcode — same field layout as the protospacer group
+    guide_barcode_pattern_regex: Optional[str] = None
+    guide_barcode_left_flank: Optional[str] = None
+    guide_barcode_right_flank: Optional[str] = None
+    guide_barcode_start_position: Optional[int] = None
+    guide_barcode_end_position: Optional[int] = None
+    guide_barcode_length: Optional[int] = None
+    is_guide_barcode_r1: Optional[bool] = None
+    is_guide_barcode_header: Optional[bool] = None
+    revcomp_guide_barcode: Optional[bool] = None
+    guide_barcode_hamming_threshold_strict: Optional[int] = None
+
+    # guide UMI — this group has no Hamming-threshold field
+    guide_umi_pattern_regex: Optional[str] = None
+    guide_umi_left_flank: Optional[str] = None
+    guide_umi_right_flank: Optional[str] = None
+    guide_umi_start_position: Optional[int] = None
+    guide_umi_end_position: Optional[int] = None
+    guide_umi_length: Optional[int] = None
+    is_guide_umi_r1: Optional[bool] = None
+    is_guide_umi_header: Optional[bool] = None
+    revcomp_guide_umi: Optional[bool] = None
+
+    # sample barcode — this group has no Hamming-threshold field
+    sample_barcode_pattern_regex: Optional[str] = None
+    sample_barcode_left_flank: Optional[str] = None
+    sample_barcode_right_flank: Optional[str] = None
+    sample_barcode_start_position: Optional[int] = None
+    sample_barcode_end_position: Optional[int] = None
+    sample_barcode_length: Optional[int] = None
+    is_sample_barcode_r1: Optional[bool] = None
+    is_sample_barcode_header: Optional[bool] = None
+    revcomp_sample_barcode: Optional[bool] = None
+
+    # misc — defaults here are not None, so to_kwargs() always emits these two
+    retain_inference_results: bool = False
+    cores: int = 1
+
+    def to_kwargs(self) -> Dict[str, Any]:
+        """Return every field whose value is not None as a kwargs dict for the legacy entry point."""
+        return {name: value for name, value in ((f.name, getattr(self, f.name)) for f in fields(self)) if value is not None}
+
+
+def map_fastq(
+    whitelist_guide_reporter_df: pd.DataFrame,
+    fastq_r1_fns: List[str],
+    fastq_r2_fns: Optional[List[str]] = None,
+    *,
+    config: Optional[ParsingConfig] = None,
+    **kwargs: Any,
+) -> WhitelistReporterCountsResult:
+    """Map FASTQ reads against a whitelist library via the legacy entry point.
+
+    §4.5 / §7.1: public wrapper that forwards to
+    `mapping.get_whitelist_reporter_counts_from_fastq`. Options may come from a
+    `ParsingConfig`, from flat kwargs, or both (flat kwargs take precedence).
+
+    Parameters
+    ----------
+    whitelist_guide_reporter_df, fastq_r1_fns, fastq_r2_fns
+        Forwarded unchanged, by keyword, to the legacy entry point.
+    config
+        Optional `ParsingConfig`; only its non-None fields are forwarded, so
+        anything the config does not carry must be supplied via `**kwargs`.
+    **kwargs
+        Flat kwargs; each one replaces the `config` field of the same name.
+
+    Returns
+    -------
+    WhitelistReporterCountsResult
+    """
+    # Imported lazily: pulling the mapping module in at module load time would
+    # be a circular import.
+    from .mapping.main_mapping import get_whitelist_reporter_counts_from_fastq
+    # Config-derived kwargs go in first so that explicit kwargs overwrite them.
+    config_kwargs: Dict[str, Any] = config.to_kwargs() if config is not None else {}
+    legacy_kwargs: Dict[str, Any] = {**config_kwargs, **kwargs}
+    return get_whitelist_reporter_counts_from_fastq(
+        whitelist_guide_reporter_df=whitelist_guide_reporter_df,
+        fastq_r1_fns=fastq_r1_fns,
+        fastq_r2_fns=fastq_r2_fns,
+        **legacy_kwargs,
+    )
+
+
+def count(result: WhitelistReporterCountsResult) -> AllMatchSetWhitelistReporterCounterSeriesResults:
+    """Expose the per-tier count Series container stored on a mapping result.
+
+    §4.5 / §7.1: pure accessor — nothing is computed here. Per the v0.1.0
+    design note, mapping currently builds the counts eagerly; a later release
+    may split mapping and counting so counts are only evaluated on request.
+    """
+    per_tier_counts = result.all_match_set_whitelist_reporter_counter_series_results
+    return per_tier_counts
+
+
+def alleles(
+    result: WhitelistReporterCountsResult,
+    tier: str,
+    *,
+    contains_guide_surrogate: bool,
+    contains_guide_barcode: bool,
+    contains_guide_umi: bool,
+):
+    """Build allele count Series for one mapping tier from a retained mapping result.
+
+    §4.5 / §7.1: wraps `processing.get_matchset_alleleseries`, passing it
+    `result.observed_guide_reporter_umi_counts_inferred`. The mapping must have
+    run with `retain_inference_results=True` (the default is slim); otherwise a
+    `ValueError` is expected from the result/wrapped call — none is raised here.
+
+    Parameters
+    ----------
+    result
+        A mapping result from `map_fastq` built with `retain_inference_results=True`.
+    tier
+        A `MatchTier` member or its string value (members are reduced to `.value`).
+    contains_guide_surrogate, contains_guide_barcode, contains_guide_umi
+        Must match the mapping configuration.
+
+    Returns
+    -------
+    MatchSetWhitelistReporterObservedSequenceCounterSeriesResults
+    """
+    from .processing.crispr_editing_processing import get_matchset_alleleseries
+    # If MatchTier is a `(str, Enum)`, str(member) is "MatchTier.X" rather than the value, so unwrap `.value` first.
+    return get_matchset_alleleseries(
+        observed_guide_reporter_umi_counts_inferred=result.observed_guide_reporter_umi_counts_inferred,
+        attribute_name=str(getattr(tier, "value", tier)),
+        contains_guide_surrogate=contains_guide_surrogate,
+        contains_guide_barcode=contains_guide_barcode,
+        contains_guide_umi=contains_guide_umi,
+    )
diff --git a/crispr-ambiguous-mapping/crispr_correct/__init__.py b/crispr-ambiguous-mapping/crispr_correct/__init__.py
@@ -0,0 +1,19 @@
+"""crispr_correct — forward-looking import alias for the `crispr_ambiguous_mapping` package.
+
+§4.11: the project has three names historically — PyPI `crispr-ambiguous-mapping`,
+repo `CRISPR-Correct`, import `crispr_ambiguous_mapping`. This shim unifies
+future imports under a single name (`crispr_correct`) matching the repo.
+
+Use this for new code:
+
+```python
+import crispr_correct as cc
+result = cc.map_fastq(library, fastq_r1_fns=["R1.fq.gz"], config=cc.ParsingConfig(...))
+```
+
+The existing `crispr_ambiguous_mapping` package stays available through 0.1.x
+for existing drivers; it'll be deprecated in 0.2.0 and removed in 0.3.0.
+"""
+from crispr_ambiguous_mapping import mapping, utility, models, processing, visualization, quality_control, postprocessing  # noqa: F401
+from crispr_ambiguous_mapping.api import map_fastq, count, alleles, ParsingConfig  # noqa: F401
+from crispr_ambiguous_mapping.models.mapping_models import MatchTier  # noqa: F401
diff --git a/crispr-ambiguous-mapping/pyproject.toml b/crispr-ambiguous-mapping/pyproject.toml
@@ -1,10 +1,17 @@
 [tool.poetry]
 name = "crispr-ambiguous-mapping"
-version = "0.0.236"
-description = ""
+version = "0.1.0"
+description = "Hamming-distance-based CRISPR guide RNA mapping with IUPAC-ambiguous base-editor support."
 authors = ["Basheer Becerra <bbecerr@outlook.com>"]
 readme = "README.md"
-packages = [{include = "crispr_ambiguous_mapping"}]
+# §4.11: ship the historical `crispr_ambiguous_mapping` module plus the new
+# `crispr_correct` alias that re-exports the public API (map_fastq, count,
+# alleles, ParsingConfig, MatchTier). New code should use `crispr_correct`;
+# the legacy import remains through 0.1.x.
+packages = [
+    {include = "crispr_ambiguous_mapping"},
+    {include = "crispr_correct"},
+]
 
 [tool.poetry.dependencies]
 python = ">=3.8,<3.12"
diff --git a/crispr-ambiguous-mapping/tests/test_smoke.py b/crispr-ambiguous-mapping/tests/test_smoke.py
@@ -65,6 +65,50 @@ def test_slim_result_raises_on_postproc():
         )
 
 
+def test_v0_1_0_public_api_importable():
+    # §4.5 / §7.1: the package root must expose the api.py surface
+    # (map_fastq / count / alleles / ParsingConfig).
+    import crispr_ambiguous_mapping as cam
+    for public_name in ("map_fastq", "count", "alleles"):
+        assert callable(getattr(cam, public_name))
+    assert cam.ParsingConfig is not None
+
+    # Round-trip two explicitly set fields through to_kwargs().
+    config = cam.ParsingConfig(protospacer_length=20, cores=2)
+    forwarded = config.to_kwargs()
+    assert forwarded["protospacer_length"] == 20
+    assert forwarded["cores"] == 2
+    # Fields left at None must not be forwarded.
+    assert "protospacer_pattern_regex" not in forwarded
+
+
+def test_crispr_correct_shim_package():
+    # §4.11: the `crispr_correct` shim must re-export the public API names.
+    import crispr_correct as cc
+    expected_tier_value = "protospacer_match_surrogate_match_barcode_match"
+    assert callable(cc.map_fastq) and cc.ParsingConfig is not None
+    assert cc.MatchTier.PM_SM_BM == expected_tier_value
+
+
+def test_count_input_contains_surrogate_backcompat():
+    # §4.3 / K.1: deprecated `contains_surrogate` must still mirror `contains_guide_surrogate`.
+    import pandas as pd
+    from crispr_ambiguous_mapping.models.mapping_models import CountInput
+    count_input_kwargs = dict(
+        whitelist_guide_reporter_df=pd.DataFrame(),
+        contains_guide_surrogate=True,
+        contains_guide_barcode=False,
+        contains_guide_umi=False,
+        contains_sample_barcode=False,
+        protospacer_hamming_threshold_strict=7,
+        surrogate_hamming_threshold_strict=10,
+        guide_barcode_hamming_threshold_strict=2,
+    )
+    count_input = CountInput(**count_input_kwargs)
+    assert count_input.contains_guide_surrogate is True  # new field name
+    assert count_input.contains_surrogate is True  # deprecated alias
+
+
 def test_revcomp_translate_matches_biopython():
     # §3.10: translate-based revcomp must produce the same result as the
     # previous Bio.Seq.reverse_complement() on IUPAC bases we actually see.