From 1974df2c70ec694b6518850654bc5f57b19fb6e7 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:28 +0000
Subject: [PATCH 01/14] Fix BLANK_LINE_IN_SAMPLES_FILE: skip blank lines in
 init_summary_by_sample()

Trailing newline in samplesID file caused IndexError on splitted[3],
crashing post-processing after all off-target search had completed.
---
 PostProcess/process_summaries.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/PostProcess/process_summaries.py b/PostProcess/process_summaries.py
index eb04847..48ac2d7 100755
--- a/PostProcess/process_summaries.py
+++ b/PostProcess/process_summaries.py
@@ -32,6 +32,8 @@ def init_summary_by_sample(path_samplesID):
         sid.readline()
         for line in sid:
             splitted = line.strip().split("\t")
+            if not splitted[0]:
+                continue
             dict_samples[splitted[0]] = [
                 splitted[3],
                 splitted[1],

From ce38ecd54f73e93acd3e75879263c76cb617f847 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:29 +0000
Subject: [PATCH 02/14] Fix CHR_FROM_FILENAME: read chromosome from VCF
 content, not filename

Filename-based chromosome extraction failed for VCFs whose names do not
embed the chromosome (e.g. whole-genome single-file VCFs, DeepVariant output).
Replace with _chrom_from_vcf() helper that reads the CHROM column from the
first data line in pool_post_analisi_indel.py and pool_search_indels.py.
In dictionary_creation_indels.py, set chr_name directly from first_line[0]
which is already read from the file.
---
 PostProcess/pool_post_analisi_indel.py        | 14 ++++++++++--
 PostProcess/pool_search_indels.py             | 22 +++++++++++--------
 .../dictionary_creation_indels.py             |  6 +----
 3 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/PostProcess/pool_post_analisi_indel.py b/PostProcess/pool_post_analisi_indel.py
index 1d83206..c5582d8 100755
--- a/PostProcess/pool_post_analisi_indel.py
+++ b/PostProcess/pool_post_analisi_indel.py
@@ -2,10 +2,21 @@
 
 from multiprocessing import Pool
 
+import gzip
 import subprocess
 import sys
 import os
 
+
+def _chrom_from_vcf(vcf_path: str) -> str:
+    """Return the chromosome name from the first data line of a VCF (plain or gzipped)."""
+    open_fn = gzip.open if vcf_path.endswith(".gz") else open
+    with open_fn(vcf_path, "rt") as fh:
+        for line in fh:
+            if not line.startswith("#"):
+                return line.split("\t")[0]
+    raise ValueError(f"No data lines found in VCF: {vcf_path}")
+
 # post-analysis script name
 POSTANALYSIS = "./post_analisi_indel.sh"
 
@@ -26,8 +37,7 @@
 
 
 def start_analysis(fname: str) -> None:
-    chrom = next((e for e in fname.split(".") if e.startswith("chr")), None)
-    assert chrom
+    chrom = _chrom_from_vcf(os.path.join(vcf_folder, fname))
     code = subprocess.call(
         f"{POSTANALYSIS} {output_folder} {ref_folder} {vcf_folder} {guide_file} "
         f"{mm} {bDNA} {bRNA} {annotation_file} {pam_file} {dict_folder} "
diff --git a/PostProcess/pool_search_indels.py b/PostProcess/pool_search_indels.py
index 6646678..6180116 100755
--- a/PostProcess/pool_search_indels.py
+++ b/PostProcess/pool_search_indels.py
@@ -1,10 +1,21 @@
 #!/usr/bin/env python
 
+import gzip
 import os
 import sys
 from multiprocessing import Pool
 from datetime import datetime
 
+
+def _chrom_from_vcf(vcf_path: str) -> str:
+    """Return the chromosome name from the first data line of a VCF (plain or gzipped)."""
+    open_fn = gzip.open if vcf_path.endswith(".gz") else open
+    with open_fn(vcf_path, "rt") as fh:
+        for line in fh:
+            if not line.startswith("#"):
+                return line.split("\t")[0]
+    raise ValueError(f"No data lines found in VCF: {vcf_path}")
+
 ref_folder = sys.argv[1]
 ref_name = os.path.basename(sys.argv[1])
 vcf_dir = sys.argv[2]
@@ -24,11 +35,7 @@
 
 
 def search_indels(f):
-    # global use_thread
-    splitted = f.split(".")
-    for elem in splitted:
-        if "chr" in elem:
-            chrom = elem
+    chrom = _chrom_from_vcf(os.path.join(vcf_dir, f))
     print("Searching for INDELs in", chrom)
     if bDNA != "0" or bRNA != "0":
         os.system(
@@ -62,10 +69,7 @@ def search_indels(f):
 
 
 for key in chrs:
-    splitted = key.split(".")
-    for elem in splitted:
-        if "chr" in elem:
-            chrom = elem
+    chrom = _chrom_from_vcf(os.path.join(vcf_dir, key))
     os.system(
         f"tail -n +2 {output_folder}/fake{chrom}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt >> {output_folder}/indels_{ref_name}+{vcf_name}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt"
     )
diff --git a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py
index da86fa8..ddd7341 100755
--- a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py
+++ b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py
@@ -15,11 +15,6 @@
 import time
 
 vcf_file = sys.argv[1]
-chr_name = vcf_file.split('.')
-for i in chr_name:
-    if 'chr' in i:
-        chr_name = i
-        break
 selected_sample = sys.argv[2]
 save_directory = sys.argv[3]
 
@@ -37,6 +32,7 @@
             break
 
     first_line = next(targets).decode('ascii').strip().split('\t')
+    chr_name = first_line[0]   # authoritative chromosome from VCF data
     if 'chr' not in first_line[0]:
         add_to_name = 'chr'
     list_samples = []

From 3c5dc00aa5500a7ed885a47e9f5bc918d326d511 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:29 +0000
Subject: [PATCH 03/14] Fix FILTER_PASS: add --vcf-filter-pass-values flag
 (default "PASS,.")
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expose a new --vcf-filter-pass-values CLI flag on crisprme.py so users
can control which VCF FILTER values are treated as passing. The default
"PASS,." covers the VCF spec §1.6.1 "not evaluated" sentinel (".") in
addition to the standard PASS value, making it backwards-compatible.

Changes:
- crisprme.py: parse --vcf-filter-pass-values in complete_search() and
  gnomAD_converter(); pass the value to the shell script (as positional
  arg 24) and to convert_gnomAD_vcfs.py as a 7th positional argument.
- submit_job_automated_new_multiple_vcfs.sh: accept vcf_filter_pass_values
  as ${24} (default "PASS,."); insert an awk loop before crispritz.py
  add-variants that rewrites accepted non-PASS FILTER values to PASS so
  crispritz includes those records during genome enrichment.
- convert_gnomAD_vcfs.py: accept filter_pass_values as an optional 7th
  argument (parse_commandline); add itertools import; add a pre-flight
  warning if no accepted-filter records are found in the first 10k rows;
  replace the hardcoded "PASS" check with a set-intersection test against
  filter_pass_values throughout convert_vcf() and run_conversion_pipeline().
---
 PostProcess/convert_gnomAD_vcfs.py            | 51 +++++++++++++++----
 .../submit_job_automated_new_multiple_vcfs.sh | 39 +++++++++++++-
 crisprme.py                                   | 23 ++++++++-
 3 files changed, 98 insertions(+), 15 deletions(-)

diff --git a/PostProcess/convert_gnomAD_vcfs.py b/PostProcess/convert_gnomAD_vcfs.py
index c205991..01878b3 100644
--- a/PostProcess/convert_gnomAD_vcfs.py
+++ b/PostProcess/convert_gnomAD_vcfs.py
@@ -33,6 +33,7 @@
 
 import multiprocessing
 import subprocess
+import itertools
 import pysam
 import gzip
 import time
@@ -44,7 +45,7 @@
 BCFTOOLSNORM = "bcftools norm"
 
 
-def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int]:
+def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, frozenset]:
     """Parse and validate command line arguments for gnomAD VCF conversion.
 
     This function checks the provided arguments for correctness and extracts the
@@ -56,10 +57,11 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int]
         args (List[str]): A list of command line arguments.
 
     Returns:
-        Tuple[str, str, bool, bool, bool, int]: A tuple containing the gnomAD
+        Tuple[str, str, bool, bool, bool, int, frozenset]: A tuple containing the gnomAD
             VCF directory, sample IDs, a flag indicating whether to use joint
             VCF processing, a flag indicating whether to keep files, a flag for
-            multiallelic processing, and the number of threads to use.
+            multiallelic processing, the number of threads to use, and a frozenset
+            of VCF FILTER values to treat as passing.
 
     Raises:
         ValueError: If the number of arguments is incorrect, if the specified
@@ -67,11 +69,12 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int]
             allowed range.
     """
 
-    if len(args) != 6:
+    if len(args) not in (6, 7):
         raise ValueError(
             "Wrong number of input arguments, cannot proceed with gnomAD VCF conversion"
         )
-    gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = args
+    gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = args[:6]
+    filter_pass_values = frozenset(args[6].split(",")) if len(args) == 7 else frozenset({"PASS", "."})
     if not os.path.isdir(gnomad_vcfs_dir):
         raise ValueError(
             f"The specified gnomAD VCF directory is not a directory ({gnomad_vcfs_dir})"
@@ -82,7 +85,7 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int]
     joint = joint == "True"
     keep = keep == "True"
     multiallelic = multiallelic == "True"
-    return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads
+    return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values
 
 
 def read_samples_ids(samples_ids: str):
@@ -226,7 +229,13 @@ def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str:
     return "\t".join([f"{e}" if e is not None else "." for e in variant_format])
 
 
-def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool):
+def convert_vcf(
+    vcf_fname: str,
+    samples: List[str],
+    joint: bool,
+    keep: bool,
+    filter_pass_values: frozenset = frozenset({"PASS", "."}),
+):
     """
     Converts a VCF file by updating the header, filtering variants, and creating
     a new compressed VCF file.
@@ -236,6 +245,8 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool):
         samples (List[str]): List of sample names.
         keep (bool): Flag to determine whether to keep variants that do not pass
             the filter.
+        filter_pass_values (frozenset): Set of VCF FILTER values to treat as
+            passing. Default: frozenset({"PASS", "."}).
 
     Returns:
         str: Path to the converted and compressed VCF file.
@@ -250,6 +261,17 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool):
         vcf = load_vcf(vcf_fname)  # use pysam for optimized VCF loading
     except OSError as e:
         raise OSError(f"An error occurred while loading {vcf_fname}") from e
+    # Pre-flight: warn if no accepted FILTER records found in the first 10,000 rows.
+    # If every record has a hard-fail filter the output will be silently empty.
+    _sample = list(itertools.islice(vcf.fetch(), 10_000))
+    _n_ok = sum(1 for v in _sample if filter_pass_values & set(v.filter.keys()))
+    if not keep and _n_ok == 0 and _sample:
+        sys.stderr.write(
+            f"WARNING: {vcf_fname}: sampled {len(_sample)} records, none have "
+            f"FILTER in {filter_pass_values} — all variants will be skipped. "
+            f"Re-run with keep=True or adjust --vcf-filter-pass-values.\n"
+        )
+    vcf.reset()  # rewind before the main loop below
     samples_ac = [
         f"AC_joint_{sample}" if joint else f"AC_{sample}" for sample in samples
     ]  # recover allele count field for each input sample
@@ -258,7 +280,7 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool):
             # write the upated header to the converted vcf
             outfile.write(update_header(vcf.header.copy(), samples, joint))
             for variant in vcf:
-                if not keep and "PASS" not in variant.filter.keys():
+                if not keep and not (filter_pass_values & set(variant.filter.keys())):
                     continue
                 genotypes = "\t".join(
                     [
@@ -314,7 +336,12 @@ def bcftools_merge(vcf_fname: str, multiallelic: bool) -> str:
 
 
 def run_conversion_pipeline(
-    vcf_fname: str, samples: List[str], joint: bool, keep: bool, multiallelic: bool
+    vcf_fname: str,
+    samples: List[str],
+    joint: bool,
+    keep: bool,
+    multiallelic: bool,
+    filter_pass_values: frozenset = frozenset({"PASS", "."}),
 ) -> None:
     """
     Runs a conversion pipeline to process a VCF file by adding genotypes and merging
@@ -327,13 +354,14 @@ def run_conversion_pipeline(
             the filter.
         multiallelic (bool): Flag indicating whether to handle multiallelic variants
             during merging.
+        filter_pass_values (frozenset): Set of VCF FILTER values to treat as passing.
 
     Returns:
         None
     """
 
     # add genotypes to input VCF
-    vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep)  
+    vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep, filter_pass_values)
     # merge variants into mutlialleic/biallelic sites
     vcf_merged = bcftools_merge(vcf_genotypes, multiallelic)
     assert os.path.isfile(vcf_merged) and os.stat(vcf_merged).st_size > 0
@@ -354,7 +382,7 @@ def convert_gnomad_vcfs() -> None:
     """
 
     # read input arguments
-    gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = (
+    gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values = (
         parse_commandline(sys.argv[1:])
     )
     start = time.time()
@@ -372,6 +400,7 @@ def convert_gnomad_vcfs() -> None:
             joint=joint,
             keep=keep,
             multiallelic=multiallelic,
+            filter_pass_values=filter_pass_values,
         )
         pool.map(partial_run_conversion_pipeline, gnomad_vcfs)
         pool.close()
diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh
index a13dc18..92d2712 100755
--- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh
+++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh
@@ -24,6 +24,13 @@
 #   ${15}: Current working directory path.
 #   ${16}: Gene proximity file path.
 #   ${17}: Email address for notifications.
+#   ${18}: Base check start (base editing window).
+#   ${19}: Base check end (base editing window).
+#   ${20}: Base check set (base editing bases).
+#   ${21}: Sorting criteria for scoring columns.
+#   ${22}: Sorting criteria (mm+bulges) columns.
+#   ${23}: CI/CD test mode flag.
+#   ${24}: Comma-separated VCF FILTER values to treat as passing (default "PASS,.").
 #
 # Returns:
 #   Generates various output files including target lists, merged results, and a database.
@@ -63,6 +70,9 @@ sorting_criteria=${22}
 # CI/CD test mode
 cicd_test=${23}
 
+# Comma-separated VCF FILTER values to treat as passing (default "PASS,.")
+vcf_filter_pass_values="${24:-PASS,.}"
+
 # log files
 log="$output_folder/log.txt"
 touch $log
@@ -176,10 +186,35 @@ while read vcf_f; do
 		if ! [ -d "$enriched_folder" ]; then
 			echo -e 'Add-variants\tStart\t'$(date) >>$log
 			echo -e "Adding variants"
-			
+
+			# Rewrite accepted non-PASS FILTER values to PASS before genome enrichment.
+			# crispritz only includes records with FILTER=PASS; any other accepted value
+			# (e.g. ".") must be normalised here. The accepted list comes from
+			# --vcf-filter-pass-values (default "PASS,.").
+			for _vcf in "$vcf_folder"/*.vcf.gz; do
+				_tmp=$(mktemp --suffix=.vcf.gz)
+				_fname=$(basename "$_vcf")
+				zcat "$_vcf" | \
+					awk -v accept="$vcf_filter_pass_values" -v fname="$_fname" '
+						BEGIN {
+							FS="\t"; OFS="\t"; n=0
+							split(accept, _a, ",")
+							for (i in _a) ok[_a[i]] = 1
+						}
+						/^#/ { print; next }
+						$7 != "PASS" && ($7 in ok) { $7 = "PASS"; n++ }
+						{ print }
+						END {
+							if (n > 0)
+								printf "WARNING: %d non-PASS accepted FILTER record(s) in %s normalised to PASS for CRISPRme compatibility.\n", n, fname > "/dev/stderr"
+						}' | bgzip > "$_tmp"
+				mv "$_tmp" "$_vcf"
+				tabix -p vcf "$_vcf"
+			done
+
 			# enrich genome using crispritz
 			cd "$current_working_directory/Genomes"
-			crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" 
+			crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true"
 			if [ -s $logerror ]; then
 				printf "ERROR: Genome enrichment failed on %s\n" "$vcf_name" >&2
 				# since failure happened, force genome enrichment to be repeated
diff --git a/crisprme.py b/crisprme.py
index e2f6b1d..710d322 100755
--- a/crisprme.py
+++ b/crisprme.py
@@ -1153,6 +1153,15 @@ def complete_search() -> None:
     sorting_criteria = _check_sorting_criteria(args, "--sorting-criteria" in args)  # sorting criteria (mm+bulges) columns
     outputfolder = _check_output(args)  # output folder
     thread = _check_threads(args, "--thread" in args)  # number of threads
+    # VCF FILTER values to treat as passing (default: "PASS,." per VCF spec §1.6.1)
+    vcf_filter_pass_values = "PASS,."
+    if "--vcf-filter-pass-values" in args:
+        try:
+            vcf_filter_pass_values = args[args.index("--vcf-filter-pass-values") + 1]
+            if vcf_filter_pass_values.startswith("--"):
+                raise ValueError("Please input a value for flag --vcf-filter-pass-values")
+        except IndexError as e:
+            raise ValueError("Missing input for --vcf-filter-pass-values") from e
 
     # extract pam seq from file
     pam_len = 0
@@ -1304,7 +1313,8 @@ def complete_search() -> None:
                 f"{pamfile} {annotationfile} {samplefile} {bMax + 1} {mm} {bDNA} {bRNA} "
                 f"{merge_t} {outputfolder} {script_path} {thread} {current_working_directory} "
                 f"{gene_annotation} {void_mail} {base_start} {base_end} {base_set} "
-                f"{sorting_criteria_scoring} {sorting_criteria} {cicd_test}"
+                f"{sorting_criteria_scoring} {sorting_criteria} {cicd_test} "
+                f"{vcf_filter_pass_values}"
             )
             code = subprocess.call(
                 crisprme_run, shell=True, stderr=log_error, stdout=log_verbose
@@ -1550,11 +1560,20 @@ def gnomAD_converter():
                 raise ValueError(f"Forbidden number of threads ({threads})")
         except IndexError as e:
             raise ValueError("Missing or forbidden threads value") from e
+    # read vcf filter pass values arg
+    vcf_filter_pass_values = "PASS,."
+    if "--vcf-filter-pass-values" in args:
+        try:
+            vcf_filter_pass_values = args[args.index("--vcf-filter-pass-values") + 1]
+            if vcf_filter_pass_values.startswith("--"):
+                raise ValueError("Please input a value for flag --vcf-filter-pass-values")
+        except IndexError as e:
+            raise ValueError("Missing input for --vcf-filter-pass-values") from e
     # run gnom AD converter
     gnomad_converter_script = os.path.join(script_path, "convert_gnomAD_vcfs.py")
     cmd = (
         f"python {gnomad_converter_script} {gnomad_dir} {samples_ids} {joint} "
-        f"{keep} {multiallelic} {threads}"
+        f"{keep} {multiallelic} {threads} {vcf_filter_pass_values}"
     )
     code = subprocess.call(cmd, shell=True)
     if code != 0:

From c8af3f8b15ada770ada8a1f472c9a5b93deb0b34 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:30 +0000
Subject: [PATCH 04/14] Fix GT_HAPLOID: guard against bare 0/1 haploid GT (no |
 separator)

Two code paths assumed all GTs are diploid and split on "|", raising
IndexError when encountering hemizygous records (e.g. chrX in male samples).

- resultIntegrator.py: split into alleles list and use alleles[0] for
  allele2 when only one element is present, correctly treating hemizygous
  as homozygous for haplotype frequency counting.
- dictionary_creation_indels.py: after splitting on "|", duplicate a
  single-element list so the haploid allele is counted twice, matching
  the biological expectation for hemizygous variants.
---
 PostProcess/resultIntegrator.py                              | 5 +++--
 .../dictionaryIndel/dictionary_creation_indels.py            | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/PostProcess/resultIntegrator.py b/PostProcess/resultIntegrator.py
index 912f9bd..59d0fa4 100755
--- a/PostProcess/resultIntegrator.py
+++ b/PostProcess/resultIntegrator.py
@@ -140,8 +140,9 @@ def createBedforMultiAlternative(variantList, samples):
     # e.g. 0|1 and 1|1 will return 0|1 that will be counted as 1 for that sample haplotype
     for snp in haplotypeDict:
         for count, sample in enumerate(haplotypeDict[snp]):
-            allele1_snp = sample.strip().split("|")[0]
-            allele2_snp = sample.strip().split("|")[1]
+            alleles = sample.strip().split("|")
+            allele1_snp = alleles[0]
+            allele2_snp = alleles[1] if len(alleles) > 1 else alleles[0]  # haploid guard
             allele1[count] = int(allele1[count]) and int(allele1_snp)
             allele2[count] = int(allele2[count]) and int(allele2_snp)
     # final sum to generete haplotype frequency of the specific target
diff --git a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py
index ddd7341..6ed3830 100755
--- a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py
+++ b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py
@@ -39,6 +39,8 @@
     list_chars = []
     
     hap = first_line[sample_header_pos].split('|')
+    if len(hap) == 1:
+        hap = hap * 2  # haploid GT: treat as homozygous for frequency counting
     for h in hap:   #NOTE posso avere anche 2|1 #TODO da finire
         if '0' == h:
             continue

From 6a94b049021a61e89cb2b7e1feaa6d2a61b82199 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:31 +0000
Subject: [PATCH 05/14] Fix CONCURRENT_RUNS: unique temp dir and mkdir locks
 for shared build steps

Two classes of race condition existed when multiple CRISPRme jobs ran
simultaneously:

1. Shared variants_genome/ temp folder: crispritz.py add-variants always
   writes to variants_genome/ relative to its working directory, which was
   fixed at Genomes/. Concurrent jobs with different VCF sets would
   overwrite each other's intermediate enriched genome.

   Fix: create a per-run temp dir with mktemp -d (run_XXXXXX under Genomes/)
   and cd into it before calling crispritz, so each run gets its own
   variants_genome/ subtree. The temp dir is removed after the mv steps.

2. Shared index/enriched-genome folders built without atomicity: two runs
   with identical parameters could both pass an "if ! [ -d folder ]" check
   simultaneously and interleave writes into the same directory, corrupting
   the output.

   Fix: wrap all four shared-folder build steps with a mkdir-based lock
   (atomic on POSIX). The winning process builds the folder and rmdir's the
   lock; losing processes wait in a 5-second poll loop until the lock
   disappears. Applied to: enriched genome, ref genome index, variant genome
   index, and indels index.
---
 .../submit_job_automated_new_multiple_vcfs.sh | 129 +++++++++++-------
 1 file changed, 81 insertions(+), 48 deletions(-)

diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh
index 92d2712..4873f2d 100755
--- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh
+++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh
@@ -178,12 +178,15 @@ while read vcf_f; do
 	# START STEP 1 - Genome enrichment
 	if [ "$vcf_name" != "_" ]; then
 		enriched_folder="$current_working_directory/Genomes/${ref_name}+${vcf_name}"
-		variants_tmp="$current_working_directory/Genomes/variants_genome"
+		_run_tmp=$(mktemp -d "$current_working_directory/Genomes/run_XXXXXX")
+		variants_tmp="$_run_tmp/variants_genome"
 		dict_folder="$current_working_directory/Dictionaries/dictionaries_${vcf_name}/"
 		indel_dict_folder="$current_working_directory/Dictionaries/log_indels_${vcf_name}/"
 		indels_out="$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS"
 
-		if ! [ -d "$enriched_folder" ]; then
+		_lock="$enriched_folder.lock"
+		if ! [ -d "$enriched_folder" ] && mkdir "$_lock" 2>/dev/null; then
+			# only one process enters here; mkdir is atomic
 			echo -e 'Add-variants\tStart\t'$(date) >>$log
 			echo -e "Adding variants"
 
@@ -212,13 +215,14 @@ while read vcf_f; do
 				tabix -p vcf "$_vcf"
 			done
 
-			# enrich genome using crispritz
-			cd "$current_working_directory/Genomes"
+			# enrich genome using crispritz (run from unique temp dir so crispritz's
+			# fixed-name variants_genome/ output does not collide with concurrent runs)
+			cd "$_run_tmp"
 			crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true"
 			if [ -s $logerror ]; then
 				printf "ERROR: Genome enrichment failed on %s\n" "$vcf_name" >&2
-				# since failure happened, force genome enrichment to be repeated
-				rm -r "$enriched_folder"* "$variants_tmp"
+				rm -rf "$_run_tmp"
+				rmdir "$_lock"
 				exit 1
 			fi
 
@@ -236,12 +240,18 @@ while read vcf_f; do
 			mv $variants_tmp/SNPs_genome/*.json "$dict_folder"
 			mv $variants_tmp/SNPs_genome/log*.txt "$indel_dict_folder"
 
-			# remove temporary variant genome folder
-			rm -r "$variants_tmp"
+			# remove unique temp dir (variants_genome lives inside it)
+			rm -rf "$_run_tmp"
 
+			rmdir "$_lock"
 			echo -e 'Add-variants\tEnd\t'$(date) >>"$log"
+		elif [ -d "$_lock" ]; then
+			echo "Another run is building $enriched_folder — waiting..."
+			while [ -d "$_lock" ]; do sleep 5; done
+			rm -rf "$_run_tmp"
 		else
 			echo -e "Variants already added"
+			rm -rf "$_run_tmp"
 		fi
 
 		cd $current_working_directory
@@ -281,17 +291,24 @@ while read vcf_f; do
 		echo -e 'Index-genome Reference\tEnd\t'$(date) >>"$log"
 		idx_ref="$idx_folder3"
 	else
-		# no valid index found, compute it
-		echo -e 'Index-genome Reference\tStart\t'$(date) >>$log
-		echo -e "Indexing reference genome"
-		# index reference genome using crispritz
-		crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus
-		if [ -s $logerror ]; then
-			printf "ERROR: reference genome indexing failed\n" >&2
-			[ -d "$idx_folder1" ] && rm -r "$idx_folder1"
-			exit 1
+		# no valid index found, compute it; use mkdir lock to prevent concurrent builds
+		_lock="${idx_folder1}.lock"
+		if mkdir "$_lock" 2>/dev/null; then
+			echo -e 'Index-genome Reference\tStart\t'$(date) >>$log
+			echo -e "Indexing reference genome"
+			# index reference genome using crispritz
+			crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus
+			if [ -s $logerror ]; then
+				printf "ERROR: reference genome indexing failed\n" >&2
+				[ -d "$idx_folder1" ] && rm -r "$idx_folder1"
+				rmdir "$_lock"
+				exit 1
+			fi
+			rmdir "$_lock"
+		elif [ -d "$_lock" ]; then
+			echo "Another run is indexing $idx_folder1 — waiting..."
+			while [ -d "$_lock" ]; do sleep 5; done
 		fi
-		pid_index_ref=$!
 		echo -e 'Index-genome Reference\tEnd\t'$(date) >>$log
 		idx_ref="$idx_folder1"
 	fi
@@ -319,21 +336,28 @@ while read vcf_f; do
 			echo -e 'Index-genome Variant\tEnd\t'$(date) >>"$log"
 			idx_var="$idx_folder3"
 		else
-			# no index found, compute it
-			echo -e 'Index-genome Variant\tStart\t'$(date) >>$log
-			echo -e "Indexing variant genome"
-			# index alternative genome using crispritz
-			crispritz.py index-genome \
-				"${ref_name}+${vcf_name}" \
-				"$current_working_directory/Genomes/${ref_name}+${vcf_name}/" \
-				"$pam_file" \
-				-bMax $bMax -th $ncpus
-			if [ -s "$logerror" ]; then
-				printf "ERROR: alternative genome indexing failed on %s\n" "$vcf_name" >&2
-				[ -d "$idx_folder1" ] && rm -r "$idx_folder1"
-				exit 1
+			# no index found, compute it; use mkdir lock to prevent concurrent builds
+			_lock="${idx_folder1}.lock"
+			if mkdir "$_lock" 2>/dev/null; then
+				echo -e 'Index-genome Variant\tStart\t'$(date) >>$log
+				echo -e "Indexing variant genome"
+				# index alternative genome using crispritz
+				crispritz.py index-genome \
+					"${ref_name}+${vcf_name}" \
+					"$current_working_directory/Genomes/${ref_name}+${vcf_name}/" \
+					"$pam_file" \
+					-bMax $bMax -th $ncpus
+				if [ -s "$logerror" ]; then
+					printf "ERROR: alternative genome indexing failed on %s\n" "$vcf_name" >&2
+					[ -d "$idx_folder1" ] && rm -r "$idx_folder1"
+					rmdir "$_lock"
+					exit 1
+				fi
+				rmdir "$_lock"
+			elif [ -d "$_lock" ]; then
+				echo "Another run is indexing $idx_folder1 — waiting..."
+				while [ -d "$_lock" ]; do sleep 5; done
 			fi
-			pid_index_var=$!
 			echo -e 'Index-genome Variant\tEnd\t'$(date) >>"$log"
 			idx_var="$idx_folder1"
 		fi
@@ -343,23 +367,32 @@ while read vcf_f; do
 		indels_out="$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS"
 
 		if ! [ -d "$indels_index_dir" ]; then
-			echo -e 'Indexing Indels\tStart\t'$(date) >>"$log"
-			"$starting_dir/pool_index_indels.py" \
-				"$indels_out/" \
-				"$pam_file" \
-				"$true_pam" \
-				"$ref_name" \
-				"$vcf_name" \
-				"$bMax" \
-				"$ncpus"
-
-			if [ -s "$logerror" ]; then
-				printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2
-				[ -d "$indels_index_dir" ] && rm -r "$indels_index_dir"
-				exit 1
-			fi
+			# use mkdir lock to prevent concurrent builds
+			_lock="${indels_index_dir}.lock"
+			if mkdir "$_lock" 2>/dev/null; then
+				echo -e 'Indexing Indels\tStart\t'$(date) >>"$log"
+				"$starting_dir/pool_index_indels.py" \
+					"$indels_out/" \
+					"$pam_file" \
+					"$true_pam" \
+					"$ref_name" \
+					"$vcf_name" \
+					"$bMax" \
+					"$ncpus"
+
+				if [ -s "$logerror" ]; then
+					printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2
+					[ -d "$indels_index_dir" ] && rm -r "$indels_index_dir"
+					rmdir "$_lock"
+					exit 1
+				fi
 
-			echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log"
+				rmdir "$_lock"
+				echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log"
+			elif [ -d "$_lock" ]; then
+				echo "Another run is indexing $indels_index_dir — waiting..."
+				while [ -d "$_lock" ]; do sleep 5; done
+			fi
 		else
 			echo "Indels Index already present"
 		fi

From a6fc4cb3635a1291baae16bc95ef66fea715f239 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:31 +0000
Subject: [PATCH 06/14] Fix DTYPE_WARNING: add dtype=str to chunked pd.read_csv
 in remove_n_and_dots

Without an explicit dtype, pandas infers column types independently per
chunk. A column that is all-NaN in one chunk and has values in another gets
different inferred dtypes, triggering a DtypeWarning to stderr. Because the
pipeline treats any non-empty log_error.txt as a fatal error, this warning
causes runs to be reported as failed even when all results are correct.

Adding dtype=str suppresses the inference entirely. It is also semantically
correct: the sole purpose of this script is text replacement ("n" -> "NA",
"." -> "NA"), so there is no reason to parse numeric types at all.
---
 PostProcess/remove_n_and_dots.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PostProcess/remove_n_and_dots.py b/PostProcess/remove_n_and_dots.py
index 04a917f..0deab9a 100755
--- a/PostProcess/remove_n_and_dots.py
+++ b/PostProcess/remove_n_and_dots.py
@@ -33,7 +33,7 @@ def read_report_chunks(report_fname: str) -> TextFileReader:
     Returns:
         TextFileReader: An iterator over DataFrame chunks.
     """
-    return pd.read_csv(report_fname, sep="\t", chunksize=CHUNKSIZE)
+    return pd.read_csv(report_fname, sep="\t", chunksize=CHUNKSIZE, dtype=str)
 
 
 def polish_report(report_chunks: TextFileReader, report_fname: str) -> None:

From 990d34a50c5d997bae06acb883b62cce4c4dc8df Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:31 +0000
Subject: [PATCH 07/14] Fix CHR_FROM_FILENAME: normalize chr prefix after
 _chrom_from_vcf()

Some VCF datasets (e.g. 1000G GRCh38) store chromosomes without a chr
prefix in the CHROM field (e.g. '22' instead of 'chr22'), while the
genome indices and fake FASTA files created by pool_index_indels.py use
chr-prefixed names. Add _normalize_chrom() to pool_search_indels.py and
pool_post_analisi_indel.py to ensure the derived chromosome name always
matches the index naming convention.
---
 PostProcess/pool_post_analisi_indel.py | 10 +++++++++-
 PostProcess/pool_search_indels.py      | 12 ++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/PostProcess/pool_post_analisi_indel.py b/PostProcess/pool_post_analisi_indel.py
index c5582d8..8448704 100755
--- a/PostProcess/pool_post_analisi_indel.py
+++ b/PostProcess/pool_post_analisi_indel.py
@@ -17,6 +17,14 @@ def _chrom_from_vcf(vcf_path: str) -> str:
                 return line.split("\t")[0]
     raise ValueError(f"No data lines found in VCF: {vcf_path}")
 
+
+def _normalize_chrom(chrom: str) -> str:
+    """Ensure chromosome name has chr prefix (e.g. '22' -> 'chr22').
+    Some VCF datasets (e.g. 1000G GRCh38) store chromosomes without the
+    prefix in the CHROM field, while the genome indices use 'chr'-prefixed names.
+    """
+    return chrom if chrom.startswith("chr") else "chr" + chrom
+
 # post-analysis script name
 POSTANALYSIS = "./post_analisi_indel.sh"
 
@@ -37,7 +45,7 @@ def _chrom_from_vcf(vcf_path: str) -> str:
 
 
 def start_analysis(fname: str) -> None:
-    chrom = _chrom_from_vcf(os.path.join(vcf_folder, fname))
+    chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_folder, fname)))
     code = subprocess.call(
         f"{POSTANALYSIS} {output_folder} {ref_folder} {vcf_folder} {guide_file} "
         f"{mm} {bDNA} {bRNA} {annotation_file} {pam_file} {dict_folder} "
diff --git a/PostProcess/pool_search_indels.py b/PostProcess/pool_search_indels.py
index 6180116..a7ecf8c 100755
--- a/PostProcess/pool_search_indels.py
+++ b/PostProcess/pool_search_indels.py
@@ -16,6 +16,14 @@ def _chrom_from_vcf(vcf_path: str) -> str:
                 return line.split("\t")[0]
     raise ValueError(f"No data lines found in VCF: {vcf_path}")
 
+
+def _normalize_chrom(chrom: str) -> str:
+    """Ensure chromosome name has chr prefix (e.g. '22' -> 'chr22').
+    Some VCF datasets (e.g. 1000G GRCh38) store chromosomes without the
+    prefix in the CHROM field, while the genome indices use 'chr'-prefixed names.
+    """
+    return chrom if chrom.startswith("chr") else "chr" + chrom
+
 ref_folder = sys.argv[1]
 ref_name = os.path.basename(sys.argv[1])
 vcf_dir = sys.argv[2]
@@ -35,7 +43,7 @@ def _chrom_from_vcf(vcf_path: str) -> str:
 
 
 def search_indels(f):
-    chrom = _chrom_from_vcf(os.path.join(vcf_dir, f))
+    chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_dir, f)))
     print("Searching for INDELs in", chrom)
     if bDNA != "0" or bRNA != "0":
         os.system(
@@ -69,7 +77,7 @@ def search_indels(f):
 
 
 for key in chrs:
-    chrom = _chrom_from_vcf(os.path.join(vcf_dir, key))
+    chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_dir, key)))
     os.system(
         f"tail -n +2 {output_folder}/fake{chrom}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt >> {output_folder}/indels_{ref_name}+{vcf_name}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt"
     )

From 5743ba01c89ddd02126f53f8bcec913e9266bed6 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:32 +0000
Subject: [PATCH 08/14] Fix TBI_FILTER: exclude .vcf.gz.tbi files from
 chromosome list in pool_post_analisi_indel

---
 PostProcess/pool_post_analisi_indel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PostProcess/pool_post_analisi_indel.py b/PostProcess/pool_post_analisi_indel.py
index 8448704..76e2407 100755
--- a/PostProcess/pool_post_analisi_indel.py
+++ b/PostProcess/pool_post_analisi_indel.py
@@ -59,6 +59,6 @@ def start_analysis(fname: str) -> None:
 
 
 # chromosome-wise vcfs list
-chrs = [f for f in os.listdir(vcf_folder) if "vcf.gz" in f]
+chrs = [f for f in os.listdir(vcf_folder) if f.endswith(".vcf.gz")]
 with Pool(processes=ncpus) as pool:  # run chrom-wise post-analysis in parallel
     pool.map(start_analysis, chrs)

From 6a47dd889af88cd15def822db878bcb8b32b2d93 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:32 +0000
Subject: [PATCH 09/14] Fix TABIX_CONCURRENT_WRITE: only regenerate .tbi when
 absent to prevent parallel write race

---
 PostProcess/annotation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PostProcess/annotation.py b/PostProcess/annotation.py
index 9353800..e084523 100644
--- a/PostProcess/annotation.py
+++ b/PostProcess/annotation.py
@@ -60,7 +60,8 @@ def load_annotation_bed(annotation_fname: str) -> TabixFile:
         SamtoolsError: If the annotation BED file cannot be loaded or indexed.
     """
     # check that tabix index is available for all annotation bed
-    pysam.tabix_index(annotation_fname, force=True, preset="bed")
+    if not os.path.isfile(annotation_fname + ".tbi"):
+        pysam.tabix_index(annotation_fname, force=True, preset="bed")
     try:  # return tabix indexes for each annotation bed
         return pysam.TabixFile(annotation_fname)
     except (SamtoolsError, Exception) as e:

From 2c8c13a987362661efd1770f6867ea3ba4822b09 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:33 +0000
Subject: [PATCH 10/14] Fix TABIX_CONCURRENT_WRITE: flock + timestamp check
 replaces bare existence check

Existence-only check missed stale .tbi after complete-test re-bgzips annotation.
Use fcntl.flock exclusive lock so only one parallel annotation process regenerates
the index, and only when the .tbi is absent or older than the .gz.
---
 PostProcess/annotation.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/PostProcess/annotation.py b/PostProcess/annotation.py
index e084523..27a279e 100644
--- a/PostProcess/annotation.py
+++ b/PostProcess/annotation.py
@@ -5,6 +5,7 @@
 from typing import List, Tuple, Optional
 from time import time
 
+import fcntl
 import pysam
 import sys
 import os
@@ -59,9 +60,15 @@ def load_annotation_bed(annotation_fname: str) -> TabixFile:
     Raises:
         SamtoolsError: If the annotation BED file cannot be loaded or indexed.
     """
-    # check that tabix index is available for all annotation bed
-    if not os.path.isfile(annotation_fname + ".tbi"):
-        pysam.tabix_index(annotation_fname, force=True, preset="bed")
+    # check that tabix index is available and up-to-date; use a lock file so
+    # that parallel annotation processes don't write the .tbi simultaneously
+    tbi_path = annotation_fname + ".tbi"
+    lock_path = annotation_fname + ".tbi.lock"
+    with open(lock_path, "w") as lock_fh:
+        fcntl.flock(lock_fh, fcntl.LOCK_EX)
+        if (not os.path.isfile(tbi_path)
+                or os.path.getmtime(tbi_path) < os.path.getmtime(annotation_fname)):
+            pysam.tabix_index(annotation_fname, force=True, preset="bed")
     try:  # return tabix indexes for each annotation bed
         return pysam.TabixFile(annotation_fname)
     except (SamtoolsError, Exception) as e:

From 1f80a45a953e01dd10b9398f45baa24fea740f84 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:33 +0000
Subject: [PATCH 11/14] Fix FILTER_PASS: remove VCF rewrite loop from
 submit_job
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The awk+bgzip+tabix loop that normalised non-PASS FILTER values to PASS
before calling crispritz was incorrect — it rewrote source VCF files on
disk (replacing symlinks with real files and creating .tbi files), which
then triggered a pre-existing crispritz modify-while-iterating bug on the
.tbi files in the VCF directory.

VCF files must never be modified by CRISPRme. The FILTER_PASS flag is
already correctly handled for gnomAD VCFs in convert_gnomAD_vcfs.py
(in-memory filtering during conversion). For VCFs passed directly to
crispritz (1000G, AoU, etc.), those files are expected to already carry
appropriate FILTER values (PASS) and no rewriting is needed.
---
 .../submit_job_automated_new_multiple_vcfs.sh | 25 -------------------
 1 file changed, 25 deletions(-)

diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh
index 4873f2d..97f4661 100755
--- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh
+++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh
@@ -190,31 +190,6 @@ while read vcf_f; do
 			echo -e 'Add-variants\tStart\t'$(date) >>$log
 			echo -e "Adding variants"
 
-			# Rewrite accepted non-PASS FILTER values to PASS before genome enrichment.
-			# crispritz only includes records with FILTER=PASS; any other accepted value
-			# (e.g. ".") must be normalised here. The accepted list comes from
-			# --vcf-filter-pass-values (default "PASS,.").
-			for _vcf in "$vcf_folder"/*.vcf.gz; do
-				_tmp=$(mktemp --suffix=.vcf.gz)
-				_fname=$(basename "$_vcf")
-				zcat "$_vcf" | \
-					awk -v accept="$vcf_filter_pass_values" -v fname="$_fname" '
-						BEGIN {
-							FS="\t"; OFS="\t"; n=0
-							split(accept, _a, ",")
-							for (i in _a) ok[_a[i]] = 1
-						}
-						/^#/ { print; next }
-						$7 != "PASS" && ($7 in ok) { $7 = "PASS"; n++ }
-						{ print }
-						END {
-							if (n > 0)
-								printf "WARNING: %d non-PASS accepted FILTER record(s) in %s normalised to PASS for CRISPRme compatibility.\n", n, fname > "/dev/stderr"
-						}' | bgzip > "$_tmp"
-				mv "$_tmp" "$_vcf"
-				tabix -p vcf "$_vcf"
-			done
-
 			# enrich genome using crispritz (run from unique temp dir so crispritz's
 			# fixed-name variants_genome/ output does not collide with concurrent runs)
 			cd "$_run_tmp"

From c85f1958c97c661d418131ce2531a8836bd0bb21 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:34 +0000
Subject: [PATCH 12/14] Fix MISSING_ALLELE: guard against '.' alleles in
 resultIntegrator haplotype counting

---
 PostProcess/resultIntegrator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PostProcess/resultIntegrator.py b/PostProcess/resultIntegrator.py
index 59d0fa4..33416c5 100755
--- a/PostProcess/resultIntegrator.py
+++ b/PostProcess/resultIntegrator.py
@@ -141,8 +141,9 @@ def createBedforMultiAlternative(variantList, samples):
     for snp in haplotypeDict:
         for count, sample in enumerate(haplotypeDict[snp]):
             alleles = sample.strip().split("|")
-            allele1_snp = alleles[0]
+            allele1_snp = alleles[0] if alleles[0] != "." else "0"  # missing allele guard
             allele2_snp = alleles[1] if len(alleles) > 1 else alleles[0]  # haploid guard
+            allele2_snp = allele2_snp if allele2_snp != "." else "0"  # missing allele guard
             allele1[count] = int(allele1[count]) and int(allele1_snp)
             allele2[count] = int(allele2[count]) and int(allele2_snp)
     # final sum to generete haplotype frequency of the specific target

From 5ceb1dddabebe743a1bad7d438e4569b16082235 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:34 +0000
Subject: [PATCH 13/14] Fix GT_HAPLOID (extension): add haploid guard to
 new_simple_analysis.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same IndexError as Fix #4 — sampleInfo[1].split('|')[haplo] crashes when
GT has no pipe (haploid encoding). Triggered by 1KG 2021 eagle2-phased
chrX VCF which stores male hemizygous calls as bare '0'/'1' rather than
forcing diploid '0|0'/'1|1' as shapeit2 did in the 2019 VCF.
---
 PostProcess/new_simple_analysis.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/PostProcess/new_simple_analysis.py b/PostProcess/new_simple_analysis.py
index 4c11f80..d0f05af 100755
--- a/PostProcess/new_simple_analysis.py
+++ b/PostProcess/new_simple_analysis.py
@@ -227,8 +227,10 @@ def iupac_decomposition(split, guide_no_bulge, guide_no_pam, cluster_to_save):
                     haploSamples = {0: [], 1: []}
                     for count, sample in enumerate(sampleSet[i]):
                         sampleInfo = sample.split(":")
+                        gt_alleles = sampleInfo[1].split("|")
                         for haplo in totalDict:
-                            if sampleInfo[1].split("|")[haplo] != "0":
+                            allele = gt_alleles[haplo] if len(gt_alleles) > haplo else gt_alleles[0]  # haploid guard
+                            if allele != "0":
                                 haploSamples[haplo].append(sampleInfo[0])
                     totalDict[0][0][(pos_c, elem)] = [
                         listReplaceTarget,

From 302dc8425eebce658c7b0345c6be14dac421f362 Mon Sep 17 00:00:00 2001
From: anncir1 <19394514+anncir1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:04:35 +0000
Subject: [PATCH 14/14] Fix GNOMAD_CONVERTER: add --af-threshold and
 --output-dir flags; fix AF field lookup

- parse_commandline: parse optional --af-threshold and --output-dir keyword args
  from sys.argv (positional args unchanged for backward compat); create output_dir
  if specified
- convert_vcf: add af_threshold (default 0.0, no filter) and output_dir (default
  "", same dir as input) params; skip variants where AF_joint < af_threshold;
  write output to output_dir when specified; set af_field = "AF_joint" if joint
- format_variant_record: add af_field param (default "AF") so gnomAD joint VCFs
  correctly read AF_joint instead of falling back to AF=0.0 in the output INFO
- run_conversion_pipeline: pass af_threshold and output_dir through to convert_vcf
- convert_gnomad_vcfs: unpack two new values from parse_commandline; pass to pipeline
---
 PostProcess/convert_gnomAD_vcfs.py | 93 +++++++++++++++++++++++++-----
 1 file changed, 77 insertions(+), 16 deletions(-)

diff --git a/PostProcess/convert_gnomAD_vcfs.py b/PostProcess/convert_gnomAD_vcfs.py
index 01878b3..63d38a9 100644
--- a/PostProcess/convert_gnomAD_vcfs.py
+++ b/PostProcess/convert_gnomAD_vcfs.py
@@ -45,7 +45,7 @@
 BCFTOOLSNORM = "bcftools norm"
 
 
-def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, frozenset]:
+def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, frozenset, float, str]:
     """Parse and validate command line arguments for gnomAD VCF conversion.
 
     This function checks the provided arguments for correctness and extracts the
@@ -53,15 +53,22 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int,
     the directory, joint processing flag, and thread count are valid before
     returning the parsed values.
 
+    Positional args (6 required, 7th optional):
+        gnomad_vcfs_dir samples_ids joint keep multiallelic threads [filter_pass_values]
+
+    Optional keyword args (may appear anywhere after the positional args):
+        --af-threshold FLOAT   Only include variants with AF_joint >= this value
+                               (default: 0.0, i.e. no filtering).
+        --output-dir PATH      Directory for converted output VCFs. Created if
+                               absent. Default: same directory as each input VCF.
+
     Args:
         args (List[str]): A list of command line arguments.
 
     Returns:
-        Tuple[str, str, bool, bool, bool, int, frozenset]: A tuple containing the gnomAD
-            VCF directory, sample IDs, a flag indicating whether to use joint
-            VCF processing, a flag indicating whether to keep files, a flag for
-            multiallelic processing, the number of threads to use, and a frozenset
-            of VCF FILTER values to treat as passing.
+        Tuple containing: gnomAD VCF directory, sample IDs, joint flag, keep flag,
+        multiallelic flag, threads, filter_pass_values frozenset, af_threshold float,
+        and output_dir string (empty string means same dir as input).
 
     Raises:
         ValueError: If the number of arguments is incorrect, if the specified
@@ -69,12 +76,34 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int,
             allowed range.
     """
 
-    if len(args) not in (6, 7):
+    # Extract optional keyword args before counting positional args
+    af_threshold = 0.0
+    output_dir = ""
+    positional = []
+    i = 0
+    while i < len(args):
+        if args[i] == "--af-threshold":
+            if i + 1 >= len(args):
+                raise ValueError("Missing value for --af-threshold")
+            af_threshold = float(args[i + 1])
+            if not (0.0 <= af_threshold < 1.0):
+                raise ValueError(f"--af-threshold must be in [0, 1), got {af_threshold}")
+            i += 2
+        elif args[i] == "--output-dir":
+            if i + 1 >= len(args):
+                raise ValueError("Missing value for --output-dir")
+            output_dir = args[i + 1]
+            i += 2
+        else:
+            positional.append(args[i])
+            i += 1
+
+    if len(positional) not in (6, 7):
         raise ValueError(
             "Wrong number of input arguments, cannot proceed with gnomAD VCF conversion"
         )
-    gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = args[:6]
-    filter_pass_values = frozenset(args[6].split(",")) if len(args) == 7 else frozenset({"PASS", "."})
+    gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = positional[:6]
+    filter_pass_values = frozenset(positional[6].split(",")) if len(positional) == 7 else frozenset({"PASS", "."})
     if not os.path.isdir(gnomad_vcfs_dir):
         raise ValueError(
             f"The specified gnomAD VCF directory is not a directory ({gnomad_vcfs_dir})"
@@ -85,7 +114,9 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int,
     joint = joint == "True"
     keep = keep == "True"
     multiallelic = multiallelic == "True"
-    return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values, af_threshold, output_dir
 
 
 def read_samples_ids(samples_ids: str):
@@ -197,7 +228,7 @@ def variant_observed(allele_count: Tuple[int]) -> bool:
     return any((ac > 0 for ac in allele_count))
 
 
-def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str:
+def format_variant_record(variant: pysam.VariantRecord, genotypes: str, af_field: str = "AF") -> str:
     """
     Formats a variant record into a string with specified genotypes and variant
     information.
@@ -205,13 +236,15 @@ def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str:
     Args:
         variant (pysam.VariantRecord): The variant record to be formatted.
         genotypes (str): Genotypes information to be included in the format.
+        af_field (str): INFO field name to read allele frequency from. Use
+            "AF_joint" for gnomAD joint VCFs, "AF" otherwise. Default: "AF".
 
     Returns:
         str: Formatted variant record as a tab-separated string.
     """
 
     try:
-        af = ",".join(list(map(str, variant.info["AF"])))
+        af = ",".join(list(map(str, variant.info[af_field])))
     except KeyError:  # catch potential AF missing in variant INFO field
         af = ",".join(["0.0" for _ in variant.alts])  # type: ignore
     variant_format = [
@@ -235,6 +268,8 @@ def convert_vcf(
     joint: bool,
     keep: bool,
     filter_pass_values: frozenset = frozenset({"PASS", "."}),
+    af_threshold: float = 0.0,
+    output_dir: str = "",
 ):
     """
     Converts a VCF file by updating the header, filtering variants, and creating
@@ -247,6 +282,10 @@ def convert_vcf(
             the filter.
         filter_pass_values (frozenset): Set of VCF FILTER values to treat as
             passing. Default: frozenset({"PASS", "."}).
+        af_threshold (float): Minimum AF_joint value required to include a variant.
+            Default 0.0 (no filtering).
+        output_dir (str): Directory for the output VCF. If empty, the output is
+            written alongside the input file. Default: "".
 
     Returns:
         str: Path to the converted and compressed VCF file.
@@ -256,7 +295,12 @@ def convert_vcf(
         IOError: If an error occurs during the conversion process.
     """
 
-    vcf_outfname = f"{os.path.splitext(vcf_fname)[0]}.gz"  # replace bgz with gz
+    # Determine output path: use output_dir if provided, else same dir as input
+    out_basename = f"{os.path.splitext(os.path.basename(vcf_fname))[0]}.gz"
+    if output_dir:
+        vcf_outfname = os.path.join(output_dir, out_basename)
+    else:
+        vcf_outfname = os.path.join(os.path.dirname(vcf_fname), out_basename)
     try:
         vcf = load_vcf(vcf_fname)  # use pysam for optimized VCF loading
     except OSError as e:
@@ -272,6 +316,7 @@ def convert_vcf(
             f"Re-run with keep=True or adjust --vcf-filter-pass-values.\n"
         )
     vcf.reset()  # rewind before the main loop below
+    af_field = "AF_joint" if joint else "AF"  # field name for AF threshold filtering
     samples_ac = [
         f"AC_joint_{sample}" if joint else f"AC_{sample}" for sample in samples
     ]  # recover allele count field for each input sample
@@ -282,6 +327,15 @@ def convert_vcf(
             for variant in vcf:
                 if not keep and not (filter_pass_values & set(variant.filter.keys())):
                     continue
+                # AF threshold filter (skipped when af_threshold == 0.0)
+                if af_threshold > 0.0:
+                    try:
+                        af_vals = variant.info[af_field]
+                        af_max = max(af_vals) if hasattr(af_vals, "__len__") else af_vals
+                    except KeyError:
+                        af_max = 0.0
+                    if af_max < af_threshold:
+                        continue
                 genotypes = "\t".join(
                     [
                         GT[1] if variant_observed(variant.info[sac]) else GT[0]
@@ -289,7 +343,7 @@ def convert_vcf(
                     ]
                 )
                 outfile.write(
-                    f"{format_variant_record(variant, genotypes)}\n"
+                    f"{format_variant_record(variant, genotypes, af_field)}\n"
                 )  # write the formatted VCF line to the out vcf
     except IOError as e:
         raise IOError(f"An error occurred while converting {vcf_fname}") from e
@@ -342,6 +396,8 @@ def run_conversion_pipeline(
     keep: bool,
     multiallelic: bool,
     filter_pass_values: frozenset = frozenset({"PASS", "."}),
+    af_threshold: float = 0.0,
+    output_dir: str = "",
 ) -> None:
     """
     Runs a conversion pipeline to process a VCF file by adding genotypes and merging
@@ -355,13 +411,16 @@ def run_conversion_pipeline(
         multiallelic (bool): Flag indicating whether to handle multiallelic variants
             during merging.
         filter_pass_values (frozenset): Set of VCF FILTER values to treat as passing.
+        af_threshold (float): Minimum AF_joint value required to include a variant.
+            Default 0.0 (no filtering).
+        output_dir (str): Directory for output VCFs. Default "" (same as input).
 
     Returns:
         None
     """
 
     # add genotypes to input VCF
-    vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep, filter_pass_values)
+    vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep, filter_pass_values, af_threshold, output_dir)
     # merge variants into mutlialleic/biallelic sites
     vcf_merged = bcftools_merge(vcf_genotypes, multiallelic)
     assert os.path.isfile(vcf_merged) and os.stat(vcf_merged).st_size > 0
@@ -382,7 +441,7 @@ def convert_gnomad_vcfs() -> None:
     """
 
     # read input arguments
-    gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values = (
+    gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values, af_threshold, output_dir = (
         parse_commandline(sys.argv[1:])
     )
     start = time.time()
@@ -401,6 +460,8 @@ def convert_gnomad_vcfs() -> None:
             keep=keep,
             multiallelic=multiallelic,
             filter_pass_values=filter_pass_values,
+            af_threshold=af_threshold,
+            output_dir=output_dir,
         )
         pool.map(partial_run_conversion_pipeline, gnomad_vcfs)
         pool.close()