Fix pepatac.py SE+bwa NameError, harden filter_pair check, add --skip-dedup

jpsmith5 · jpsmith5 · commit 8bda2e487dbe · 2026-05-11T09:16:39.000-04:00
- _align() referenced an undefined `cmd` in the single-end + bwa branch; it now runs the paired chain (minus filter_pair) for both --keep and no-keep paths: pm.run([cmd1, cmd2, cmd3, cmd4], <target>). (#299) - Fix missing pm.fail_pipeline in the unmap_fq1 branch of the filter_paired_fq.pl handle check; previously a stuck filter on R1 set an error string that was never raised. Reworked the error message into a shared template that points at the underlying psutil introspection issue and recommends both --keep and --noFIFO as workarounds. (#234) - Add --skip-dedup flag for protocols where duplicates are biologically meaningful (CUT&Tag, CUT&RUN). When set: copy mapping_genome_bam to _sort_dedup.bam so downstream peak calling finds the expected path; report Duplicate_reads=0 and pass through Dedup_aligned_reads/Dedup_alignment_rate/Dedup_total_efficiency from the pre-dedup metrics. Plumbed through sample_pipeline_interface.yaml so it can be set per-sample. (#249) - Drop redundant Time/Success keys from pepatac_output_schema.yaml (both samples: and project: blocks). These are pipestat's auto-tracked status fields and the duplicate declaration triggered "SchemaError: Overlap between project- and sample-level keys" on newer pipestat. (#322, #305) - Fix _LOGGER NameError in tools/bamQC.py and bamSitesToWig.py: the variable was only defined inside the `if __name__ == "__main__":` block, so pararead workers re-importing the module under multiprocessing 'spawn' (macOS default) hit NameError when class methods logged. Added a module-level fallback logger above each class definition. (#266) - Fix peakCounts() ref-peaks ignored when *_peaks_coverage.bed.gz coexists with *_ref_peaks_coverage.bed: the shared `ext` variable preferred .bed.gz from the regular peaks file and then looked for a non-existent _ref_peaks_coverage.bed.gz, falling through to the "not derived from a singular reference peak set" warning. Detect ref vs regular extensions independently.
(#218, #219) - Guard refgenie[sample.genome] lookups in sample_pipeline_interface.yaml with `sample.genome in refgenie`, so projects with non-refgenie genomes (e.g. galGal6, bosTau9) no longer crash the Jinja template with an attmap AttributeError; instead they fall through to the per-sample paths or error cleanly from pepatac.py. (#231) - Fix plotAnno() empty-input fallback path bug: was constructing file.path(<output_file>, "<sample>_partition_dist.pdf") (treating the output pdf as a directory) and quit()ing the R session. Replaced with return(ggplot()), matching the function's other empty-data branches; the caller writes a clean blank placeholder at the expected target. (#232) Closes #299, #234, #249, #322, #305, #266, #218, #219, #231, #232.
diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R
@@ -1423,12 +1423,13 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"),
         if (file.exists(file.path(input)) && info$size != 0) {
             in_file  <- data.table::fread(file.path(input))
         } else {
-            out_file <- file.path(output, paste(basename(sample_path),
-                                                output_type,
-                                                "partition_dist.pdf",
-                                                sep="_"))
-            system2(paste("touch"), out_file)
-            quit()
+            # No input data — return an empty ggplot so the caller's
+            # pdf()/png() writes a blank placeholder at the expected
+            # target path. (Earlier versions tried to touch a hard-coded
+            # *_partition_dist.pdf inside `output` treating it as a
+            # directory, which silently dropped the placeholder when
+            # `output` was a file path. See #232.)
+            return(ggplot())
         }
     }
 
@@ -2855,29 +2856,33 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets,
         }
     }
 
-    # check if coverage files are compressed
-    if (any(file.exists(file.path(results_subdir,
-                        sample_names, paste0("peak_calling_", genomes),
-                        paste0(sample_names, "_ref_peaks_coverage.bed.gz"))))) {
-        ext <- ".bed.gz"
-    } else if (any(file.exists(file.path(results_subdir,
-                        sample_names, paste0("peak_calling_", genomes),
-                        paste0(sample_names, "_peaks_coverage.bed.gz"))))) {
-        ext <- ".bed.gz"
-    } else {
-        ext <- ".bed"
+    # Detect extension independently for reference vs sample peak coverage,
+    # since users commonly have *_peaks_coverage.bed.gz from the first sample
+    # run plus *_ref_peaks_coverage.bed from the re-run (or vice versa). A
+    # single shared `ext` defeats the ref-peaks lookup in that mixed state.
+    detect_ext <- function(suffix) {
+        for (e in c(".bed.gz", ".bed")) {
+            if (any(file.exists(file.path(
+                results_subdir,
+                sample_names, paste0("peak_calling_", genomes),
+                paste0(sample_names, suffix, e))))) {
+                return(e)
+            }
+        }
+        return(NA_character_)
     }
+    ref_ext      <- detect_ext("_ref_peaks_coverage")
+    fallback_ext <- detect_ext("_peaks_coverage")
+    if (is.na(fallback_ext)) fallback_ext <- ".bed"
 
     # Use reference peak coverage file if available
-    if (any(file.exists(file.path(results_subdir,
-                        sample_names, paste0("peak_calling_", genomes),
-                        paste0(sample_names, "_ref_peaks_coverage", ext))))) {
-        peak_file_name = paste0("_ref_peaks_coverage", ext)
+    if (!is.na(ref_ext)) {
+        peak_file_name = paste0("_ref_peaks_coverage", ref_ext)
         reference = TRUE
     } else {
         warning("Peak coverage files are not derived from a singular reference peak set.")
         reference = FALSE
-        peak_file_name = paste0("_peaks_coverage", ext)
+        peak_file_name = paste0("_peaks_coverage", fallback_ext)
     }
     
     # generate paths to peak coverage files
diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml
@@ -309,12 +309,6 @@ properties:
           required:
             - path
             - title
-        Time:
-          type: string
-          description: "time"
-        Success:
-          type: string
-          description: "success"
   project:
     type: object
     properties:
@@ -411,10 +405,4 @@ properties:
             type: string
         required:
           - path
-          - title
-      Time:
-        type: string
-        description: "Total elapsed pipeline time"
-      Success:
-        type: string
-        description: "Timestamp when pipeline completed successfully"
+          - title
diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py
@@ -118,6 +118,11 @@ def parse_arguments():
                         help="Skip FastQC. Useful for bugs in FastQC "
                              "that appear with some sequence read files.")
 
+    parser.add_argument("--skip-dedup", dest="skip_dedup", action='store_true',
+                        help="Skip duplicate removal. Recommended for protocols "
+                             "where duplicates are biologically meaningful "
+                             "(e.g. CUT&Tag, CUT&RUN).")
+
     # Prealignment genome assets
     parser.add_argument("--prealignment-names", default=[], type=str,
                         nargs="+",
@@ -342,9 +347,9 @@ def _align(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2,
             pm.run([cmd1, cmd2, cmd3, cmd4, filter_pair], out_fastq_r2_gz)
         else:
             if args.keep:
-                pm.run(cmd, mapped_bam)
+                pm.run([cmd1, cmd2, cmd3, cmd4], mapped_bam)
             else:
-                pm.run(cmd, out_fastq_tmp_gz)
+                pm.run([cmd1, cmd2, cmd3, cmd4], out_fastq_tmp_gz)
 
         cmd = tools.samtools + " view -c " + mapped_bam
         align_exact = pm.checkprint(cmd)       
@@ -981,6 +986,12 @@ def no_handle(fq):
         pm.debug("{} is released! \n".format(os.path.abspath(fq)))
         return True
     
+    handle_fail_msg = (
+        "Fastq filter_paired_fq.pl function did not complete successfully. "
+        "Re-run with `--keep` or `--noFIFO` to bypass the non-blocking "
+        "filter_pair path, which relies on psutil process introspection "
+        "and can fail in environments where it lacks permission to inspect "
+        "other processes' file handles.")
     if args.paired_end and not os.path.exists(mapping_genome_bam):
         if not pypiper.is_gzipped_fastq(unmap_fq1):
             checks = 1
@@ -989,20 +1000,15 @@ def no_handle(fq):
                 checks += 1
                 pm.debug("Check count fq1: {}".format(str(checks)))
             if checks > 100 and not no_handle(unmap_fq1):
-                err_msg = ("Fastq filter_paired_fq.pl function did not "
-                           "complete successfully. Try running the pipeline "
-                           "with `--keep`.")
+                pm.fail_pipeline(IOError(handle_fail_msg))
         if not pypiper.is_gzipped_fastq(unmap_fq2):
             checks = 1
             # Check unmap_fq2
             while not no_handle(unmap_fq2) and checks < 10000:
                 checks += 1
                 pm.debug("Check count fq2: {}".format(str(checks)))
             if checks > 100 and not no_handle(unmap_fq2):
-                err_msg = ("Fastq filter_paired_fq.pl function did not "
-                           "complete successfully. Try running the pipeline "
-                           "with `--keep`.")
-                pm.fail_pipeline(IOError(err_msg))
+                pm.fail_pipeline(IOError(handle_fail_msg))
 
     for unmapped_fq in to_compress:
         # Compress unmapped fastq reads
@@ -1222,6 +1228,17 @@ def estimate_lib_size(dedup_log):
         pm.report_result("Picard_est_lib_size", picard_est_lib_size)
 
     def post_dup_aligned_reads(dedup_log):
+        if args.skip_dedup:
+            ar = float(pm.get_stat("Aligned_reads"))
+            tr = float(pm.get_stat("Trimmed_reads"))
+            rr = float(pm.get_stat("Raw_reads"))
+            pm.report_result("Duplicate_reads", 0)
+            pm.report_result("Dedup_aligned_reads", ar)
+            pm.report_result("Dedup_alignment_rate",
+                             round(float(ar) * 100 / float(tr), 2))
+            pm.report_result("Dedup_total_efficiency",
+                             round(float(ar) * 100 / float(rr), 2))
+            return
         if args.deduplicator == "picard":
             cmd = ("grep -A2 'METRICS CLASS' " + dedup_log +
                    " | tail -n 1 | awk '{print $(NF-3)}'")
@@ -1269,8 +1286,14 @@ def post_dup_aligned_reads(dedup_log):
         java_settings = '-Xmx{mem}'.format(mem=pm.mem)
     else:
         java_settings = param.java_settings.params
-    if args.deduplicator == "picard":
-        cmd1 = (tools.java + " " + java_settings + " -jar " + 
+    if args.skip_dedup:
+        # User opted out of duplicate removal (e.g. CUT&Tag/CUT&RUN protocols).
+        # Reuse the post-alignment BAM as the "dedup" endpoint so downstream
+        # steps can find _sort_dedup.bam without a code-path fork.
+        cmd1 = "cp {} {}".format(mapping_genome_bam, rmdup_bam)
+        cmd2 = tools.samtools + " index " + rmdup_bam
+    elif args.deduplicator == "picard":
+        cmd1 = (tools.java + " " + java_settings + " -jar " +
                 tools.picard + " MarkDuplicates")
         cmd1 += " INPUT=" + mapping_genome_bam
         cmd1 += " OUTPUT=" + rmdup_bam
diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml
@@ -12,15 +12,15 @@ sample_interface:
     {% if sample.read2 is defined %} --input2 { sample.read2 } {% endif %}
     --single-or-paired { sample.read_type }
     --genome { sample.genome }
-    {% if sample.chrom_sizes is defined %} --chrom-sizes { sample.chrom_sizes } {% elif refgenie[sample.genome].fasta is defined %} --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } {% endif %}
-    {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %}
-    {% if sample.blacklist is defined %} --blacklist { sample.blacklist } {% elif refgenie[sample.genome].blacklist is defined %} --blacklist { refgenie[sample.genome].blacklist.blacklist } {% endif %}
-    {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif refgenie[sample.genome].feat_annotation is defined %} --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %}
+    {% if sample.chrom_sizes is defined %} --chrom-sizes { sample.chrom_sizes } {% elif sample.genome in refgenie and refgenie[sample.genome].fasta is defined %} --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } {% endif %}
+    {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif sample.genome in refgenie and refgenie[sample.genome].refgene_anno is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %}
+    {% if sample.blacklist is defined %} --blacklist { sample.blacklist } {% elif sample.genome in refgenie and refgenie[sample.genome].blacklist is defined %} --blacklist { refgenie[sample.genome].blacklist.blacklist } {% endif %}
+    {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif sample.genome in refgenie and refgenie[sample.genome].feat_annotation is defined %} --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %}
     {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% else %} --trimmer "skewer" {% endif %}
     {% if sample.aligner is defined %} --aligner { sample.aligner } {% set aligner = sample.aligner %} {% else %} --aligner "bowtie2" {% set aligner = "bowtie2" %} {% endif %}
-    {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bwa_index is defined %} --genome-index { refgenie[sample.genome].bwa_index.dir } {% endif %} {% endif %}
+    {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif sample.genome in refgenie and refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif sample.genome in refgenie and refgenie[sample.genome].bwa_index is defined %} --genome-index { refgenie[sample.genome].bwa_index.dir } {% endif %} {% endif %}
     {% if sample.prealignment_index is defined %} --prealignment-index { sample.prealignment_index } {% endif %}
-    {% if sample.prealignment_names is defined %} {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endfor %} {% else %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bwa_index.dir } {% endfor %} {% endif %} {% endif %}
+    {% if sample.prealignment_names is defined %} {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} --prealignment-index {% for p in sample.prealignment_names %} {% if p in refgenie %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endif %} {% endfor %} {% else %} --prealignment-index {% for p in sample.prealignment_names %} {% if p in refgenie %} { p ~ '=' ~ refgenie[p].bwa_index.dir } {% endif %} {% endfor %} {% endif %} {% endif %}
     {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %}
     {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% else %} --peak-caller "macs3" {% endif %}
     {% if sample.peak_type is defined %} --peak-type { sample.peak_type } {% else %} --peak-type "fixed" {% endif %}
@@ -29,15 +29,16 @@ sample_interface:
     {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks { sample.frip_ref_peaks } {% endif %}
     {% if sample.motif is defined %} --motif {% endif %}
     {% if sample.sob is defined %} --sob {% endif %}
-    {% if sample.sob is defined %} {% if refgenie[sample.genome].tallymer_index is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} {% endif %}
-    {% if sample.sob is defined %} {% if refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %} {% endif %}
-    {% if sample.fasta is defined %} --fasta { sample.fasta } {% elif refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %}
+    {% if sample.sob is defined %} {% if sample.genome in refgenie and refgenie[sample.genome].tallymer_index is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} {% endif %}
+    {% if sample.sob is defined %} {% if sample.genome in refgenie and refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %} {% endif %}
+    {% if sample.fasta is defined %} --fasta { sample.fasta } {% elif sample.genome in refgenie and refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %}
     {% if sample.no_scale is defined %} --no-scale {% endif %}
     {% if sample.prioritize is defined %} --prioritize {% endif %}
     {% if sample.keep is defined %} --keep {% endif %}
     {% if sample.no_fifo is defined %} --noFIFO {% endif %}
     {% if sample.lite is defined %} --lite {% endif %}
     {% if sample.skipqc is defined %} --skipqc {% endif %}
+    {% if sample.skip_dedup is defined %} --skip-dedup {% endif %}
     --pipestat-config {pipestat.config_file}
 
 compute:
diff --git a/tools/bamQC.py b/tools/bamQC.py
@@ -11,13 +11,22 @@
 __email__ = "jasonsmith@virginia.edu"
 
 from argparse import ArgumentParser
+import logging
 import os
 import sys
 import pararead
 import logmuse
 import pandas as _pd
 import numpy as np
 
+# Module-level fallback so class methods always have a logger, even when
+# pararead workers re-import this module under multiprocessing 'spawn'
+# (macOS default, and elsewhere when fork is unavailable). The __main__
+# block below upgrades this to a logmuse-configured logger for the
+# parent process.
+_LOGGER = logging.getLogger(__name__)
+
+
 class bamQC(pararead.ParaReadProcessor):
     def __init__(self, reads_filename, n_proc, out_filename, verbosity):
         """
diff --git a/tools/bamSitesToWig.py b/tools/bamSitesToWig.py
@@ -8,6 +8,7 @@
 
 from argparse import ArgumentParser
 import itertools # Used for nested region looping across reads
+import logging
 import numpy
 from operator import methodcaller
 import os
@@ -21,6 +22,12 @@
 
 MODES = ["dnase", "atac"]
 
+# Module-level fallback logger so class methods always have one available,
+# even when pararead workers re-import this module under multiprocessing
+# 'spawn' (macOS default). The __main__ block upgrades this to a
+# logmuse-configured logger for the parent process. (#266)
+_LOGGER = logging.getLogger(__name__)
+
 # A function object like this will be pickled by the parallel call to map,
 # So it cannot contain huge files or the pickling will limit everything.
 # For this reason I must rely on global vars for the big stuff.