Skip to content

Commit df3a3df

Browse files
committed
Push commit
1 parent c2e95de commit df3a3df

35 files changed

Lines changed: 489 additions & 280 deletions

File tree

crispr-ambiguous-mapping/crispr_ambiguous_mapping/mapping/main_mapping.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from ..processing import crispr_guide_counting
2626
from ..processing import crispr_sequence_encoding
2727
from ..parsing import reporter_umitools_fastq_parsing, reporter_standard_fastq_parsing
28-
from ..models.mapping_models import WhitelistReporterCountsResult, SampleWhitelistReporterCountsResult, GeneralGuideCountType
28+
from ..models.mapping_models import WhitelistReporterCountsResult, GeneralGuideCountType
2929

3030
#
3131
# Deprecated
@@ -159,7 +159,7 @@ def get_whitelist_reporter_counts_from_fastq(whitelist_guide_reporter_df: Option
159159
protospacer_hamming_threshold_strict: Optional[int] = None,
160160

161161
store_intermediates: bool = False,
162-
cores: int=1) -> Union[WhitelistReporterCountsResult, SampleWhitelistReporterCountsResult]:
162+
cores: int=1) -> WhitelistReporterCountsResult:
163163
# Input parameter validation checks
164164

165165
protospacer_pattern_regex = None if ((protospacer_pattern_regex is not None) and (protospacer_pattern_regex.strip() == "")) else protospacer_pattern_regex

crispr-ambiguous-mapping/crispr_ambiguous_mapping/models/mapping_models.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -255,19 +255,7 @@ class CountInput:
255255
@dataclass
256256
class WhitelistReporterCountsResult:
257257
all_match_set_whitelist_reporter_counter_series_results: AllMatchSetWhitelistReporterCounterSeriesResults
258-
observed_guide_reporter_umi_counts_inferred: GeneralMappingInferenceDict
258+
observed_guide_reporter_umi_counts_inferred: Union[GeneralMappingInferenceDict, DefaultDict[str, GeneralMappingInferenceDict]]
259259
quality_control_result: QualityControlResult
260260
count_input: CountInput
261261

262-
@dataclass
263-
class SampleWhitelistReporterCountsResult:
264-
all_match_set_whitelist_reporter_counter_series_results_all_samples: DefaultDict[str, AllMatchSetWhitelistReporterCounterSeriesResults]
265-
observed_guide_reporter_umi_counts_inferred_all_samples: DefaultDict[str, GeneralMappingInferenceDict]
266-
quality_control_result_all_samples: DefaultDict[str, GeneralMappingInferenceDict]
267-
count_input: CountInput
268-
269-
#
270-
# Types
271-
#
272-
273-

crispr-ambiguous-mapping/crispr_ambiguous_mapping/processing/crispr_count_processing.py

Lines changed: 315 additions & 188 deletions
Large diffs are not rendered by default.

crispr-ambiguous-mapping/crispr_ambiguous_mapping/processing/crispr_editing_processing.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,25 @@ def check_match_result_non_error(match_result):
1515
return False if match_result is None else match_result.error is None # If match_result is None, treat as error. If match_result is not None, but error is None, then non_error
1616

1717
# Filter dict with observed sequence inference results for only those that do not contain any mapping errors
18-
def get_non_error_dict(observed_guide_reporter_umi_counts_inferred: GeneralMappingInferenceDict, attribute_name: str) -> MatchSetWhitelistReporterObservedSequenceCounterSeriesResults:
19-
return {observed_guide_reporter_key: observed_guide_reporter_umi_counts_inferred_value for observed_guide_reporter_key, observed_guide_reporter_umi_counts_inferred_value in observed_guide_reporter_umi_counts_inferred.items() if check_match_result_non_error(getattr(observed_guide_reporter_umi_counts_inferred_value.inferred_value, attribute_name))}
18+
def get_non_error_dict(observed_guide_reporter_umi_counts_inferred: Union[GeneralMappingInferenceDict, DefaultDict[str, GeneralMappingInferenceDict]], attribute_name: str) -> MatchSetWhitelistReporterObservedSequenceCounterSeriesResults:
19+
"""
20+
Return a dict of all entries that are non-error.
21+
Works with both sample-barcode (nested dict) and no-sample-barcode (flat dict) cases.
22+
"""
23+
non_error_dict = {}
24+
25+
for key, val in observed_guide_reporter_umi_counts_inferred.items():
26+
if hasattr(val, "inferred_value"): # flat dict
27+
attr = getattr(val.inferred_value, attribute_name, None)
28+
if check_match_result_non_error(attr):
29+
non_error_dict[key] = val
30+
else: # nested dict (sample-barcode case)
31+
for inner_key, inner_val in val.items():
32+
attr = getattr(inner_val.inferred_value, attribute_name, None)
33+
if check_match_result_non_error(attr):
34+
non_error_dict[(key, inner_key)] = inner_val # tuple key: (sample, guide)
35+
36+
return non_error_dict
2037

2138
#
2239
# Given the datastructure containing the inference results "observed_guide_reporter_umi_counts_inferred", iterate through the entire datastructure to generate

crispr-ambiguous-mapping/crispr_ambiguous_mapping/processing/crispr_guide_counting.py

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from .crispr_count_processing import get_counterseries_all_results
2727
from ..quality_control.crispr_mapping_quality_control import perform_counts_quality_control
2828
from ..models.mapping_models import GeneralGuideCountType, GeneralMappingInferenceDict
29-
from ..models.mapping_models import AllMatchSetWhitelistReporterCounterSeriesResults, WhitelistReporterCountsResult, SampleWhitelistReporterCountsResult, InferenceResult, CountInput, QualityControlResult
29+
from ..models.mapping_models import AllMatchSetWhitelistReporterCounterSeriesResults, WhitelistReporterCountsResult, InferenceResult, CountInput, QualityControlResult
3030

3131

3232
# TODO: There will probably be some type errors with the DefaultDict when testing on non UMI (since it requires CounterType), so make sure to test with different variations of inputs
@@ -41,7 +41,7 @@ def get_whitelist_reporter_counts_with_umi(observed_guide_reporter_umi_counts: G
4141
surrogate_hamming_threshold_strict: Optional[int] = 2,
4242
guide_barcode_hamming_threshold_strict: Optional[int] = 2,
4343
store_intermediates: bool = False,
44-
cores: int=1) -> Union[WhitelistReporterCountsResult, SampleWhitelistReporterCountsResult]:
44+
cores: int=1) -> WhitelistReporterCountsResult:
4545

4646
# Generate whitelist dataframe based on all observed sequences if none provided
4747
if whitelist_guide_reporter_df is None:
@@ -163,8 +163,12 @@ def pad_series(series):
163163
print(f"Mapping inference results of length {len(inferred_true_reporter_sequences)} to the result object")
164164
# Some organization: Map the inferred result of each observed sequence to a dict with the inferred result and corresponding count
165165

166+
167+
168+
# NOTE 20251031: This may be able to be removed
166169
if contains_sample_barcode:
167170
observed_guide_reporter_umi_counts_inferred_all_samples: DefaultDict[str, GeneralMappingInferenceDict] = defaultdict(lambda: defaultdict(dict))
171+
168172
# Add all cell_barcodes
169173
for observed_guide_reporter_key_index, observed_guide_reporter_key in enumerate(observed_guide_reporter_list): # Iterate through each observed guide key
170174
observed_guide_reporter_cell_counts = observed_guide_reporter_umi_counts[observed_guide_reporter_key]
@@ -185,19 +189,12 @@ def pad_series(series):
185189
# GET THE MAPPED COUNT SERIES BASED ON THE INFERENCE RESULTS
186190
print("Prepare the processed count series ")
187191
all_cell_barcodes: List[str] = list(observed_guide_reporter_umi_counts_inferred_all_samples.keys())
188-
all_match_set_whitelist_reporter_counter_series_results_all_samples: DefaultDict[str, AllMatchSetWhitelistReporterCounterSeriesResults] = defaultdict(AllMatchSetWhitelistReporterCounterSeriesResults)
189-
quality_control_result_all_samples: DefaultDict[str, QualityControlResult] = defaultdict(QualityControlResult)
190-
for cell_barcode_i, cell_barcode in enumerate(all_cell_barcodes):
191-
observed_guide_reporter_umi_counts_inferred_per_sample = observed_guide_reporter_umi_counts_inferred_all_samples[cell_barcode]
192-
all_match_set_whitelist_reporter_counter_series_results_per_sample = get_counterseries_all_results(observed_guide_reporter_umi_counts_inferred_per_sample, whitelist_guide_reporter_df, contains_guide_barcode, contains_guide_surrogate, contains_guide_umi)
193-
quality_control_result_per_sample: QualityControlResult = perform_counts_quality_control(observed_guide_reporter_umi_counts_inferred_per_sample, contains_guide_umi, contains_guide_surrogate, contains_guide_barcode)
194-
195-
all_match_set_whitelist_reporter_counter_series_results_all_samples[cell_barcode] = all_match_set_whitelist_reporter_counter_series_results_per_sample
196-
quality_control_result_all_samples[cell_barcode] = quality_control_result_per_sample
197-
198-
if cell_barcode_i % 2500 == 0:
199-
print(f"- Processed cell_barcode {cell_barcode_i} out of {len(all_cell_barcodes)}")
200-
192+
193+
all_match_set_whitelist_reporter_counter_series_results: AllMatchSetWhitelistReporterCounterSeriesResults
194+
quality_control_result: QualityControlResult
195+
196+
all_match_set_whitelist_reporter_counter_series_results = get_counterseries_all_results(observed_guide_reporter_umi_counts_inferred_all_samples, whitelist_guide_reporter_df, contains_guide_barcode, contains_guide_surrogate, contains_guide_umi, contains_sample_barcode)
197+
quality_control_result: QualityControlResult = perform_counts_quality_control(observed_guide_reporter_umi_counts_inferred_all_samples, contains_guide_umi, contains_guide_surrogate, contains_guide_barcode, contains_sample_barcode)
201198

202199
count_input= CountInput(whitelist_guide_reporter_df=whitelist_guide_reporter_df,
203200
contains_surrogate=contains_guide_surrogate,
@@ -208,10 +205,10 @@ def pad_series(series):
208205
surrogate_hamming_threshold_strict=surrogate_hamming_threshold,
209206
guide_barcode_hamming_threshold_strict=guide_barcode_hamming_threshold)
210207

211-
return SampleWhitelistReporterCountsResult(all_match_set_whitelist_reporter_counter_series_results_all_samples=all_match_set_whitelist_reporter_counter_series_results_all_samples,
212-
observed_guide_reporter_umi_counts_inferred_all_samples=observed_guide_reporter_umi_counts_inferred_all_samples,
213-
quality_control_result_all_samples=quality_control_result_all_samples,
214-
count_input=count_input)
208+
return WhitelistReporterCountsResult(all_match_set_whitelist_reporter_counter_series_results=all_match_set_whitelist_reporter_counter_series_results,
209+
observed_guide_reporter_umi_counts_inferred=observed_guide_reporter_umi_counts_inferred_all_samples,
210+
quality_control_result=quality_control_result,
211+
count_input=count_input)
215212
else:
216213

217214
observed_guide_reporter_umi_counts_inferred: GeneralMappingInferenceDict = defaultdict(dict)
@@ -229,13 +226,13 @@ def pad_series(series):
229226
# GET THE MAPPED COUNT SERIES BASED ON THE INFERENCE RESULTS
230227
print("Prepare the processed count series ")
231228
# Count
232-
all_match_set_whitelist_reporter_counter_series_results = get_counterseries_all_results(observed_guide_reporter_umi_counts_inferred, whitelist_guide_reporter_df, contains_guide_barcode, contains_guide_surrogate, contains_guide_umi)
229+
all_match_set_whitelist_reporter_counter_series_results = get_counterseries_all_results(observed_guide_reporter_umi_counts_inferred, whitelist_guide_reporter_df, contains_guide_barcode, contains_guide_surrogate, contains_guide_umi, contains_sample_barcode)
233230

234231
after_counterseries_time = datetime.now()
235232
print(f"{(after_counterseries_time-after_inference_processing_time).seconds} seconds for counter series generation")
236233

237234
print("Preparing quality control")
238-
quality_control_result: QualityControlResult = perform_counts_quality_control(observed_guide_reporter_umi_counts_inferred, contains_guide_umi, contains_guide_surrogate, contains_guide_barcode)
235+
quality_control_result: QualityControlResult = perform_counts_quality_control(observed_guide_reporter_umi_counts_inferred, contains_guide_umi, contains_guide_surrogate, contains_guide_barcode, contains_sample_barcode)
239236

240237
after_qualitycontrol_time = datetime.now()
241238
print(f"{(after_qualitycontrol_time-after_counterseries_time).seconds} seconds for quality control")

0 commit comments

Comments
 (0)