docs §5.7 + §5.6 + §3.14: public docstrings + Dockerfile version bump

CodingBash · claude · CodingBash · commit ade0b936bf20 · 2026-04-21T23:50:14.000Z
§5.7: NumPy-style docstrings on the 3 main public entry points:
- mapping.get_whitelist_reporter_counts_from_fastq (params, return fields, raises, see-also)
- processing.get_matchset_alleleseries
- processing.get_mutation_profile

§3.14: documents that *_hamming_threshold_strict is strict-less-than (value 7 => dist <= 6) in the main entry docstring, resolving the long-standing ambiguity.

§5.6: Dockerfile pin bumped 0.0.156 -> 0.0.236 (current multi-sample-support).

Gate: scCRISPR + smoke (7 tests) pass (44s); simulation 135/135.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/Dockerfile b/Dockerfile
@@ -15,4 +15,4 @@ ENV PATH="${VENV}/bin:$PATH"
 
 # Install from PyPI
 RUN pip install --upgrade pip
-RUN pip install crispr-ambiguous-mapping==0.0.156
+RUN pip install crispr-ambiguous-mapping==0.0.236
diff --git a/crispr-ambiguous-mapping/crispr_ambiguous_mapping/mapping/main_mapping.py b/crispr-ambiguous-mapping/crispr_ambiguous_mapping/mapping/main_mapping.py
@@ -107,6 +107,74 @@ def get_whitelist_reporter_counts_from_fastq(whitelist_guide_reporter_df: Option
 
                                                        retain_inference_results: bool = False,
                                                        cores: int=1) -> WhitelistReporterCountsResult:
+    """Map observed CRISPR reads from FASTQs to a whitelist guide library via per-base Hamming distance.
+
+    This is the canonical entry point for the multi-sample-support branch. It
+    parses the configured components (protospacer, optional surrogate, guide
+    barcode, guide UMI, sample/cell barcode) from R1/R2/header, runs Hamming
+    inference in parallel, builds the per-tier count series, and returns a
+    `WhitelistReporterCountsResult` dataclass.
+
+    Parameters
+    ----------
+    whitelist_guide_reporter_df
+        DataFrame with one row per guide. Required column: ``protospacer``. If
+        ``contains_guide_surrogate`` is inferred from the parsing kwargs, add a
+        ``surrogate`` column; if ``contains_guide_barcode`` is inferred, add a
+        ``barcode`` column.
+    fastq_r1_fns
+        List of R1 FASTQ paths (gzipped accepted). Single-end calls still pass
+        a single-element list.
+    fastq_r2_fns
+        List of R2 FASTQ paths or ``None`` for single-end.
+    protospacer_* / surrogate_* / guide_barcode_* / guide_umi_* / sample_barcode_*
+        Per-component extraction knobs. Provide one of:
+        ``*_pattern_regex`` (capture-group-1 parsed from sequence or header),
+        ``*_left_flank`` / ``*_right_flank`` (flank-based extraction), or
+        ``*_start_position`` + ``*_length`` / ``*_end_position`` (fixed offset).
+        ``is_*_r1`` / ``is_*_header`` select the source; ``revcomp_*``
+        reverse-complements the extracted fragment.
+    protospacer_hamming_threshold_strict, surrogate_hamming_threshold_strict, guide_barcode_hamming_threshold_strict
+        Strict-less-than thresholds. A value of 7 means distances ``<= 6`` are
+        matches — the ``_strict`` suffix is deliberate. Typical: 7 for 20bp
+        protospacers, 10 for 32bp surrogates, 2 for 4bp barcodes. Pass ``None``
+        to auto-determine from the library (5th percentile of pairwise Hamming
+        distances, sample=100).
+    retain_inference_results
+        Default ``False`` — the slim result drops the per-observation
+        inference dict (15x smaller pickle, ~45% smaller peak RSS). Set to
+        ``True`` if you plan to call ``get_matchset_alleleseries`` /
+        ``get_mutation_profile`` / ``tally_linked_mutation_count_per_sequence``
+        downstream (they raise ``ValueError`` on a slim result with a clear
+        remediation message).
+    cores
+        Number of worker processes for inference. FASTQ parsing is
+        single-threaded (§3.12 streaming is a future upgrade).
+
+    Returns
+    -------
+    WhitelistReporterCountsResult
+        Fields of note:
+        - ``all_match_set_whitelist_reporter_counter_series_results`` — six tiers
+          (protospacer_match, PM+SM, PM+BM, PM+SM+BM, PM_mismatch_SM, PM_mismatch_SM_BM),
+          each with 9 Series (3 ambiguity strategies x 3 UMI strategies).
+        - ``quality_control_result`` — per-tier error counts (``num_total_*``, ``num_non_error_*``).
+        - ``count_input`` — echo of parsing flags (``contains_guide_surrogate``, etc.).
+        - ``observed_guide_reporter_umi_counts_inferred`` — raw per-observation
+          inference dict, present only when ``retain_inference_results=True``.
+
+    Raises
+    ------
+    ValueError
+        If ``whitelist_guide_reporter_df`` is missing required columns for
+        the configured components.
+
+    See Also
+    --------
+    crispr_ambiguous_mapping.processing.get_matchset_alleleseries
+    crispr_ambiguous_mapping.processing.get_mutation_profile
+    crispr_ambiguous_mapping.models.MatchTier
+    """
     # Input parameter validation checks
 
     protospacer_pattern_regex = None if ((protospacer_pattern_regex is not None) and  (protospacer_pattern_regex.strip() == "")) else protospacer_pattern_regex
diff --git a/crispr-ambiguous-mapping/crispr_ambiguous_mapping/processing/crispr_editing_processing.py b/crispr-ambiguous-mapping/crispr_ambiguous_mapping/processing/crispr_editing_processing.py
@@ -54,6 +54,36 @@ def _require_inference_dict(observed_guide_reporter_umi_counts_inferred, caller:
 
 
 def get_matchset_alleleseries(observed_guide_reporter_umi_counts_inferred: GeneralMappingInferenceDict, attribute_name: str, contains_surrogate: bool, contains_guide_barcode: bool, contains_guide_umi: bool):
+    """Build per-tier observed-allele count Series from a full (retained) mapping result.
+
+    For each whitelist guide, aggregates the observed protospacer/surrogate/barcode
+    alleles that mapped to it under the chosen tier, across nine (ambiguity x UMI)
+    counting strategies. Required input for downstream mutation profiling.
+
+    Parameters
+    ----------
+    observed_guide_reporter_umi_counts_inferred
+        The per-observation inference dict — i.e. ``result.observed_guide_reporter_umi_counts_inferred``
+        from a mapping call made with ``retain_inference_results=True``. On a slim
+        result (the default) this field is ``None``, and passing it raises ``ValueError``.
+    attribute_name
+        Match tier to extract. Pass a ``MatchTier`` enum member (or its string
+        value). Typical: ``MatchTier.PM_SM_BM`` for full-triplet screens.
+    contains_surrogate, contains_guide_barcode, contains_guide_umi
+        Must match what was configured during mapping (these drive the output
+        DataFrame column shape).
+
+    Returns
+    -------
+    MatchSetWhitelistReporterObservedSequenceCounterSeriesResults
+        Dataclass with 9 alleledict + 9 alleleseries_dict + 9 allele_df fields,
+        one per (ambiguity, UMI) strategy.
+
+    Raises
+    ------
+    ValueError
+        If called on a slim mapping result (re-run with ``retain_inference_results=True``).
+    """
     _require_inference_dict(observed_guide_reporter_umi_counts_inferred, "get_matchset_alleleseries")
     #
     #   DEFINE THE DEFAULTDICTS FOR COUNTING
@@ -264,8 +294,36 @@ def determine_mutations_in_sequence(true_sequence, observed_sequence):
     return observed_sequence_mutation_df
 
 
-def get_mutation_profile(match_set_whitelist_reporter_observed_sequence_counter_series_results: MatchSetWhitelistReporterObservedSequenceCounterSeriesResults, whitelist_reporter_df: pd.DataFrame, contains_surrogate: bool, contains_guide_barcode: bool) -> MatchSetWhitelistReporterObservedSequenceMutationProfiles: 
-    
+def get_mutation_profile(match_set_whitelist_reporter_observed_sequence_counter_series_results: MatchSetWhitelistReporterObservedSequenceCounterSeriesResults, whitelist_reporter_df: pd.DataFrame, contains_surrogate: bool, contains_guide_barcode: bool) -> MatchSetWhitelistReporterObservedSequenceMutationProfiles:
+    """Compute per-position mutation profiles from allele count series.
+
+    Given the allele Series built by ``get_matchset_alleleseries``, this walks
+    each (whitelist, observed_allele) pair and records per-base mutations
+    against the whitelist reference, producing both linked (allele-level) and
+    unlinked (position-level) mutation tables for all nine counting strategies.
+
+    Parameters
+    ----------
+    match_set_whitelist_reporter_observed_sequence_counter_series_results
+        Return value of ``get_matchset_alleleseries``.
+    whitelist_reporter_df
+        The same DataFrame that was passed into the mapping call. Used as the
+        reference sequence for computing mutations.
+    contains_surrogate, contains_guide_barcode
+        Must match the mapping configuration.
+
+    Returns
+    -------
+    MatchSetWhitelistReporterObservedSequenceMutationProfiles
+        Mutation tables per strategy. Consume via
+        ``tally_linked_mutation_count_per_sequence`` for aggregate counters, or
+        drive ``visualization.plot_mutation_count_histogram`` /
+        ``plot_trinucleotide_mutational_signature`` directly.
+
+    See Also
+    --------
+    tally_linked_mutation_count_per_sequence
+    """
     # Function to generate unlinked mutations for particular count type
     def generate_mutations_results(alleleseries: Optional[GeneralAlleleCountSeriesDict], whitelist_reporter_df: pd.DataFrame, contains_surrogate: bool, contains_guide_barcode: bool) -> Optional[ObservedSequenceMutationProfile]:
         if alleleseries is not None: