From 1974df2c70ec694b6518850654bc5f57b19fb6e7 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:28 +0000 Subject: [PATCH 01/14] Fix BLANK_LINE_IN_SAMPLES_FILE: skip blank lines in init_summary_by_sample() Trailing newline in samplesID file caused IndexError on splitted[3], crashing post-processing after all off-target search had completed. --- PostProcess/process_summaries.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PostProcess/process_summaries.py b/PostProcess/process_summaries.py index eb04847..48ac2d7 100755 --- a/PostProcess/process_summaries.py +++ b/PostProcess/process_summaries.py @@ -32,6 +32,8 @@ def init_summary_by_sample(path_samplesID): sid.readline() for line in sid: splitted = line.strip().split("\t") + if not splitted[0]: + continue dict_samples[splitted[0]] = [ splitted[3], splitted[1], From ce38ecd54f73e93acd3e75879263c76cb617f847 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:29 +0000 Subject: [PATCH 02/14] Fix CHR_FROM_FILENAME: read chromosome from VCF content, not filename Filename-based chromosome extraction failed for VCFs whose names do not embed the chromosome (e.g. whole-genome single-file VCFs, DeepVariant output). Replace with _chrom_from_vcf() helper that reads the CHROM column from the first data line in pool_post_analisi_indel.py and pool_search_indels.py. In dictionary_creation_indels.py, set chr_name directly from first_line[0] which is already read from the file. --- PostProcess/pool_post_analisi_indel.py | 14 ++++++++++-- PostProcess/pool_search_indels.py | 22 +++++++++++-------- .../dictionary_creation_indels.py | 6 +---- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/PostProcess/pool_post_analisi_indel.py b/PostProcess/pool_post_analisi_indel.py index 1d83206..c5582d8 100755 --- a/PostProcess/pool_post_analisi_indel.py +++ b/PostProcess/pool_post_analisi_indel.py @@ -2,10 +2,21 @@ from multiprocessing import Pool +import gzip import subprocess import sys import os + +def _chrom_from_vcf(vcf_path: str) -> str: + """Return the chromosome name from the first data line of a VCF (plain or gzipped).""" + open_fn = gzip.open if vcf_path.endswith(".gz") else open + with open_fn(vcf_path, "rt") as fh: + for line in fh: + if not line.startswith("#"): + return line.split("\t")[0] + raise ValueError(f"No data lines found in VCF: {vcf_path}") + # post-analysis script name POSTANALYSIS = "./post_analisi_indel.sh" @@ -26,8 +37,7 @@ def start_analysis(fname: str) -> None: - chrom = next((e for e in fname.split(".") if e.startswith("chr")), None) - assert chrom + chrom = _chrom_from_vcf(os.path.join(vcf_folder, fname)) code = subprocess.call( f"{POSTANALYSIS} {output_folder} {ref_folder} {vcf_folder} {guide_file} " f"{mm} {bDNA} {bRNA} {annotation_file} {pam_file} {dict_folder} " diff --git a/PostProcess/pool_search_indels.py b/PostProcess/pool_search_indels.py index 6646678..6180116 100755 --- a/PostProcess/pool_search_indels.py +++ b/PostProcess/pool_search_indels.py @@ -1,10 +1,21 @@ #!/usr/bin/env python +import gzip import os import sys from multiprocessing import Pool from datetime import datetime + +def _chrom_from_vcf(vcf_path: str) -> str: + """Return the chromosome name from the first data line of a VCF (plain or gzipped).""" + open_fn = gzip.open if vcf_path.endswith(".gz") else open + with open_fn(vcf_path, "rt") as fh: + for line in fh: + if not line.startswith("#"): + return line.split("\t")[0] + raise ValueError(f"No data lines found in VCF: {vcf_path}") + ref_folder = sys.argv[1] ref_name = os.path.basename(sys.argv[1]) vcf_dir = sys.argv[2] @@ -24,11 +35,7 @@ def search_indels(f): - # global use_thread - splitted = f.split(".") - for elem in splitted: - if "chr" in elem: - chrom = elem + chrom = _chrom_from_vcf(os.path.join(vcf_dir, f)) print("Searching for INDELs in", chrom) if bDNA != "0" or bRNA != "0": os.system( @@ -62,10 +69,7 @@ def search_indels(f): for key in chrs: - splitted = key.split(".") - for elem in splitted: - if "chr" in elem: - chrom = elem + chrom = _chrom_from_vcf(os.path.join(vcf_dir, key)) os.system( f"tail -n +2 {output_folder}/fake{chrom}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt >> {output_folder}/indels_{ref_name}+{vcf_name}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt" ) diff --git a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py index da86fa8..ddd7341 100755 --- a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py +++ b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py @@ -15,11 +15,6 @@ import time vcf_file = sys.argv[1] -chr_name = vcf_file.split('.') -for i in chr_name: - if 'chr' in i: - chr_name = i - break selected_sample = sys.argv[2] save_directory = sys.argv[3] @@ -37,6 +32,7 @@ break first_line = next(targets).decode('ascii').strip().split('\t') + chr_name = first_line[0] # authoritative chromosome from VCF data if 'chr' not in first_line[0]: add_to_name = 'chr' list_samples = [] From 3c5dc00aa5500a7ed885a47e9f5bc918d326d511 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:29 +0000 Subject: [PATCH 03/14] Fix FILTER_PASS: add --vcf-filter-pass-values flag (default "PASS,.") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose a new --vcf-filter-pass-values CLI flag on crisprme.py so users can control which VCF FILTER values are treated as passing. The default "PASS,." covers the VCF spec §1.6.1 "not evaluated" sentinel (".") in addition to the standard PASS value, making it backwards-compatible. Changes: - crisprme.py: parse --vcf-filter-pass-values in complete_search() and gnomAD_converter(); pass the value to the shell script (as positional arg 24) and to convert_gnomAD_vcfs.py as a 7th positional argument. - submit_job_automated_new_multiple_vcfs.sh: accept vcf_filter_pass_values as ${24} (default "PASS,."); insert an awk loop before crispritz.py add-variants that rewrites accepted non-PASS FILTER values to PASS so crispritz includes those records during genome enrichment. - convert_gnomAD_vcfs.py: accept filter_pass_values as an optional 7th argument (parse_commandline); add itertools import; add a pre-flight warning if no accepted-filter records are found in the first 10k rows; replace the hardcoded "PASS" check with a set-intersection test against filter_pass_values throughout convert_vcf() and run_conversion_pipeline(). --- PostProcess/convert_gnomAD_vcfs.py | 51 +++++++++++++++---- .../submit_job_automated_new_multiple_vcfs.sh | 39 +++++++++++++- crisprme.py | 23 ++++++++- 3 files changed, 98 insertions(+), 15 deletions(-) diff --git a/PostProcess/convert_gnomAD_vcfs.py b/PostProcess/convert_gnomAD_vcfs.py index c205991..01878b3 100644 --- a/PostProcess/convert_gnomAD_vcfs.py +++ b/PostProcess/convert_gnomAD_vcfs.py @@ -33,6 +33,7 @@ import multiprocessing import subprocess +import itertools import pysam import gzip import time @@ -44,7 +45,7 @@ BCFTOOLSNORM = "bcftools norm" -def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int]: +def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, frozenset]: """Parse and validate command line arguments for gnomAD VCF conversion. This function checks the provided arguments for correctness and extracts the @@ -56,10 +57,11 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int] args (List[str]): A list of command line arguments. Returns: - Tuple[str, str, bool, bool, bool, int]: A tuple containing the gnomAD + Tuple[str, str, bool, bool, bool, int, frozenset]: A tuple containing the gnomAD VCF directory, sample IDs, a flag indicating whether to use joint VCF processing, a flag indicating whether to keep files, a flag for - multiallelic processing, and the number of threads to use. + multiallelic processing, the number of threads to use, and a frozenset + of VCF FILTER values to treat as passing. Raises: ValueError: If the number of arguments is incorrect, if the specified @@ -67,11 +69,12 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int] allowed range. """ - if len(args) != 6: + if len(args) not in (6, 7): raise ValueError( "Wrong number of input arguments, cannot proceed with gnomAD VCF conversion" ) - gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = args + gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = args[:6] + filter_pass_values = frozenset(args[6].split(",")) if len(args) == 7 else frozenset({"PASS", "."}) if not os.path.isdir(gnomad_vcfs_dir): raise ValueError( f"The specified gnomAD VCF directory is not a directory ({gnomad_vcfs_dir})" @@ -82,7 +85,7 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int] joint = joint == "True" keep = keep == "True" multiallelic = multiallelic == "True" - return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads + return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values def read_samples_ids(samples_ids: str): @@ -226,7 +229,13 @@ def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str: return "\t".join([f"{e}" if e is not None else "." for e in variant_format]) -def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): +def convert_vcf( + vcf_fname: str, + samples: List[str], + joint: bool, + keep: bool, + filter_pass_values: frozenset = frozenset({"PASS", "."}), +): """ Converts a VCF file by updating the header, filtering variants, and creating a new compressed VCF file. @@ -236,6 +245,8 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): samples (List[str]): List of sample names. keep (bool): Flag to determine whether to keep variants that do not pass the filter. + filter_pass_values (frozenset): Set of VCF FILTER values to treat as + passing. Default: frozenset({"PASS", "."}). Returns: str: Path to the converted and compressed VCF file. @@ -250,6 +261,17 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): vcf = load_vcf(vcf_fname) # use pysam for optimized VCF loading except OSError as e: raise OSError(f"An error occurred while loading {vcf_fname}") from e + # Pre-flight: warn if no accepted FILTER records found in the first 10,000 rows. + # If every record has a hard-fail filter the output will be silently empty. + _sample = list(itertools.islice(vcf.fetch(), 10_000)) + _n_ok = sum(1 for v in _sample if filter_pass_values & set(v.filter.keys())) + if not keep and _n_ok == 0 and _sample: + sys.stderr.write( + f"WARNING: {vcf_fname}: sampled {len(_sample)} records, none have " + f"FILTER in {filter_pass_values} — all variants will be skipped. " + f"Re-run with keep=True or adjust --vcf-filter-pass-values.\n" + ) + vcf.reset() # rewind before the main loop below samples_ac = [ f"AC_joint_{sample}" if joint else f"AC_{sample}" for sample in samples ] # recover allele count field for each input sample @@ -258,7 +280,7 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): # write the upated header to the converted vcf outfile.write(update_header(vcf.header.copy(), samples, joint)) for variant in vcf: - if not keep and "PASS" not in variant.filter.keys(): + if not keep and not (filter_pass_values & set(variant.filter.keys())): continue genotypes = "\t".join( [ @@ -314,7 +336,12 @@ def bcftools_merge(vcf_fname: str, multiallelic: bool) -> str: def run_conversion_pipeline( - vcf_fname: str, samples: List[str], joint: bool, keep: bool, multiallelic: bool + vcf_fname: str, + samples: List[str], + joint: bool, + keep: bool, + multiallelic: bool, + filter_pass_values: frozenset = frozenset({"PASS", "."}), ) -> None: """ Runs a conversion pipeline to process a VCF file by adding genotypes and merging @@ -327,13 +354,14 @@ def run_conversion_pipeline( the filter. multiallelic (bool): Flag indicating whether to handle multiallelic variants during merging. + filter_pass_values (frozenset): Set of VCF FILTER values to treat as passing. Returns: None """ # add genotypes to input VCF - vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep) + vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep, filter_pass_values) # merge variants into mutlialleic/biallelic sites vcf_merged = bcftools_merge(vcf_genotypes, multiallelic) assert os.path.isfile(vcf_merged) and os.stat(vcf_merged).st_size > 0 @@ -354,7 +382,7 @@ def convert_gnomad_vcfs() -> None: """ # read input arguments - gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = ( + gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values = ( parse_commandline(sys.argv[1:]) ) start = time.time() @@ -372,6 +400,7 @@ def convert_gnomad_vcfs() -> None: joint=joint, keep=keep, multiallelic=multiallelic, + filter_pass_values=filter_pass_values, ) pool.map(partial_run_conversion_pipeline, gnomad_vcfs) pool.close() diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh index a13dc18..92d2712 100755 --- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh +++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh @@ -24,6 +24,13 @@ # ${15}: Current working directory path. # ${16}: Gene proximity file path. # ${17}: Email address for notifications. +# ${18}: Base check start (base editing window). +# ${19}: Base check end (base editing window). +# ${20}: Base check set (base editing bases). +# ${21}: Sorting criteria for scoring columns. +# ${22}: Sorting criteria (mm+bulges) columns. +# ${23}: CI/CD test mode flag. +# ${24}: Comma-separated VCF FILTER values to treat as passing (default "PASS,."). # # Returns: # Generates various output files including target lists, merged results, and a database. @@ -63,6 +70,9 @@ sorting_criteria=${22} # CI/CD test mode cicd_test=${23} +# Comma-separated VCF FILTER values to treat as passing (default "PASS,.") +vcf_filter_pass_values="${24:-PASS,.}" + # log files log="$output_folder/log.txt" touch $log @@ -176,10 +186,35 @@ while read vcf_f; do if ! [ -d "$enriched_folder" ]; then echo -e 'Add-variants\tStart\t'$(date) >>$log echo -e "Adding variants" - + + # Rewrite accepted non-PASS FILTER values to PASS before genome enrichment. + # crispritz only includes records with FILTER=PASS; any other accepted value + # (e.g. ".") must be normalised here. The accepted list comes from + # --vcf-filter-pass-values (default "PASS,."). + for _vcf in "$vcf_folder"/*.vcf.gz; do + _tmp=$(mktemp --suffix=.vcf.gz) + _fname=$(basename "$_vcf") + zcat "$_vcf" | \ + awk -v accept="$vcf_filter_pass_values" -v fname="$_fname" ' + BEGIN { + FS="\t"; OFS="\t"; n=0 + split(accept, _a, ",") + for (i in _a) ok[_a[i]] = 1 + } + /^#/ { print; next } + $7 != "PASS" && ($7 in ok) { $7 = "PASS"; n++ } + { print } + END { + if (n > 0) + printf "WARNING: %d non-PASS accepted FILTER record(s) in %s normalised to PASS for CRISPRme compatibility.\n", n, fname > "/dev/stderr" + }' | bgzip > "$_tmp" + mv "$_tmp" "$_vcf" + tabix -p vcf "$_vcf" + done + # enrich genome using crispritz cd "$current_working_directory/Genomes" - crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" + crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" if [ -s $logerror ]; then printf "ERROR: Genome enrichment failed on %s\n" "$vcf_name" >&2 # since failure happened, force genome enrichment to be repeated diff --git a/crisprme.py b/crisprme.py index e2f6b1d..710d322 100755 --- a/crisprme.py +++ b/crisprme.py @@ -1153,6 +1153,15 @@ def complete_search() -> None: sorting_criteria = _check_sorting_criteria(args, "--sorting-criteria" in args) # sorting criteria (mm+bulges) columns outputfolder = _check_output(args) # output folder thread = _check_threads(args, "--thread" in args) # number of threads + # VCF FILTER values to treat as passing (default: "PASS,." per VCF spec §1.6.1) + vcf_filter_pass_values = "PASS,." + if "--vcf-filter-pass-values" in args: + try: + vcf_filter_pass_values = args[args.index("--vcf-filter-pass-values") + 1] + if vcf_filter_pass_values.startswith("--"): + raise ValueError("Please input a value for flag --vcf-filter-pass-values") + except IndexError as e: + raise ValueError("Missing input for --vcf-filter-pass-values") from e # extract pam seq from file pam_len = 0 @@ -1304,7 +1313,8 @@ def complete_search() -> None: f"{pamfile} {annotationfile} {samplefile} {bMax + 1} {mm} {bDNA} {bRNA} " f"{merge_t} {outputfolder} {script_path} {thread} {current_working_directory} " f"{gene_annotation} {void_mail} {base_start} {base_end} {base_set} " - f"{sorting_criteria_scoring} {sorting_criteria} {cicd_test}" + f"{sorting_criteria_scoring} {sorting_criteria} {cicd_test} " + f"{vcf_filter_pass_values}" ) code = subprocess.call( crisprme_run, shell=True, stderr=log_error, stdout=log_verbose @@ -1550,11 +1560,20 @@ def gnomAD_converter(): raise ValueError(f"Forbidden number of threads ({threads})") except IndexError as e: raise ValueError("Missing or forbidden threads value") from e + # read vcf filter pass values arg + vcf_filter_pass_values = "PASS,." + if "--vcf-filter-pass-values" in args: + try: + vcf_filter_pass_values = args[args.index("--vcf-filter-pass-values") + 1] + if vcf_filter_pass_values.startswith("--"): + raise ValueError("Please input a value for flag --vcf-filter-pass-values") + except IndexError as e: + raise ValueError("Missing input for --vcf-filter-pass-values") from e # run gnom AD converter gnomad_converter_script = os.path.join(script_path, "convert_gnomAD_vcfs.py") cmd = ( f"python {gnomad_converter_script} {gnomad_dir} {samples_ids} {joint} " - f"{keep} {multiallelic} {threads}" + f"{keep} {multiallelic} {threads} {vcf_filter_pass_values}" ) code = subprocess.call(cmd, shell=True) if code != 0: From c8af3f8b15ada770ada8a1f472c9a5b93deb0b34 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:30 +0000 Subject: [PATCH 04/14] Fix GT_HAPLOID: guard against bare 0/1 haploid GT (no | separator) Two code paths assumed all GTs are diploid and split on "|", raising IndexError when encountering hemizygous records (e.g. chrX in male samples). - resultIntegrator.py: split into alleles list and use alleles[0] for allele2 when only one element is present, correctly treating hemizygous as homozygous for haplotype frequency counting. - dictionary_creation_indels.py: after splitting on "|", duplicate a single-element list so the haploid allele is counted twice, matching the biological expectation for hemizygous variants. --- PostProcess/resultIntegrator.py | 5 +++-- .../dictionaryIndel/dictionary_creation_indels.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/PostProcess/resultIntegrator.py b/PostProcess/resultIntegrator.py index 912f9bd..59d0fa4 100755 --- a/PostProcess/resultIntegrator.py +++ b/PostProcess/resultIntegrator.py @@ -140,8 +140,9 @@ def createBedforMultiAlternative(variantList, samples): # e.g. 0|1 and 1|1 will return 0|1 that will be counted as 1 for that sample haplotype for snp in haplotypeDict: for count, sample in enumerate(haplotypeDict[snp]): - allele1_snp = sample.strip().split("|")[0] - allele2_snp = sample.strip().split("|")[1] + alleles = sample.strip().split("|") + allele1_snp = alleles[0] + allele2_snp = alleles[1] if len(alleles) > 1 else alleles[0] # haploid guard allele1[count] = int(allele1[count]) and int(allele1_snp) allele2[count] = int(allele2[count]) and int(allele2_snp) # final sum to generete haplotype frequency of the specific target diff --git a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py index ddd7341..6ed3830 100755 --- a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py +++ b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py @@ -39,6 +39,8 @@ list_chars = [] hap = first_line[sample_header_pos].split('|') + if len(hap) == 1: + hap = hap * 2 # haploid GT: treat as homozygous for frequency counting for h in hap: #NOTE posso avere anche 2|1 #TODO da finire if '0' == h: continue From 6a94b049021a61e89cb2b7e1feaa6d2a61b82199 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:31 +0000 Subject: [PATCH 05/14] Fix CONCURRENT_RUNS: unique temp dir and mkdir locks for shared build steps Two classes of race condition existed when multiple CRISPRme jobs ran simultaneously: 1. Shared variants_genome/ temp folder: crispritz.py add-variants always writes to variants_genome/ relative to its working directory, which was fixed at Genomes/. Concurrent jobs with different VCF sets would overwrite each other's intermediate enriched genome. Fix: create a per-run temp dir with mktemp -d (run_XXXXXX under Genomes/) and cd into it before calling crispritz, so each run gets its own variants_genome/ subtree. The temp dir is removed after the mv steps. 2. Shared index/enriched-genome folders built without atomicity: two runs with identical parameters could both pass an "if ! [ -d folder ]" check simultaneously and interleave writes into the same directory, corrupting the output. Fix: wrap all four shared-folder build steps with a mkdir-based lock (atomic on POSIX). The winning process builds the folder and rmdir's the lock; losing processes wait in a 5-second poll loop until the lock disappears. Applied to: enriched genome, ref genome index, variant genome index, and indels index. --- .../submit_job_automated_new_multiple_vcfs.sh | 129 +++++++++++------- 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh index 92d2712..4873f2d 100755 --- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh +++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh @@ -178,12 +178,15 @@ while read vcf_f; do # START STEP 1 - Genome enrichment if [ "$vcf_name" != "_" ]; then enriched_folder="$current_working_directory/Genomes/${ref_name}+${vcf_name}" - variants_tmp="$current_working_directory/Genomes/variants_genome" + _run_tmp=$(mktemp -d "$current_working_directory/Genomes/run_XXXXXX") + variants_tmp="$_run_tmp/variants_genome" dict_folder="$current_working_directory/Dictionaries/dictionaries_${vcf_name}/" indel_dict_folder="$current_working_directory/Dictionaries/log_indels_${vcf_name}/" indels_out="$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS" - if ! [ -d "$enriched_folder" ]; then + _lock="$enriched_folder.lock" + if ! [ -d "$enriched_folder" ] && mkdir "$_lock" 2>/dev/null; then + # only one process enters here; mkdir is atomic echo -e 'Add-variants\tStart\t'$(date) >>$log echo -e "Adding variants" @@ -212,13 +215,14 @@ while read vcf_f; do tabix -p vcf "$_vcf" done - # enrich genome using crispritz - cd "$current_working_directory/Genomes" + # enrich genome using crispritz (run from unique temp dir so crispritz's + # fixed-name variants_genome/ output does not collide with concurrent runs) + cd "$_run_tmp" crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" if [ -s $logerror ]; then printf "ERROR: Genome enrichment failed on %s\n" "$vcf_name" >&2 - # since failure happened, force genome enrichment to be repeated - rm -r "$enriched_folder"* "$variants_tmp" + rm -rf "$_run_tmp" + rmdir "$_lock" exit 1 fi @@ -236,12 +240,18 @@ while read vcf_f; do mv $variants_tmp/SNPs_genome/*.json "$dict_folder" mv $variants_tmp/SNPs_genome/log*.txt "$indel_dict_folder" - # remove temporary variant genome folder - rm -r "$variants_tmp" + # remove unique temp dir (variants_genome lives inside it) + rm -rf "$_run_tmp" + rmdir "$_lock" echo -e 'Add-variants\tEnd\t'$(date) >>"$log" + elif [ -d "$_lock" ]; then + echo "Another run is building $enriched_folder — waiting..." + while [ -d "$_lock" ]; do sleep 5; done + rm -rf "$_run_tmp" else echo -e "Variants already added" + rm -rf "$_run_tmp" fi cd $current_working_directory @@ -281,17 +291,24 @@ while read vcf_f; do echo -e 'Index-genome Reference\tEnd\t'$(date) >>"$log" idx_ref="$idx_folder3" else - # no valid index found, compute it - echo -e 'Index-genome Reference\tStart\t'$(date) >>$log - echo -e "Indexing reference genome" - # index reference genome using crispritz - crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus - if [ -s $logerror ]; then - printf "ERROR: reference genome indexing failed\n" >&2 - [ -d "$idx_folder1" ] && rm -r "$idx_folder1" - exit 1 + # no valid index found, compute it; use mkdir lock to prevent concurrent builds + _lock="${idx_folder1}.lock" + if mkdir "$_lock" 2>/dev/null; then + echo -e 'Index-genome Reference\tStart\t'$(date) >>$log + echo -e "Indexing reference genome" + # index reference genome using crispritz + crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus + if [ -s $logerror ]; then + printf "ERROR: reference genome indexing failed\n" >&2 + [ -d "$idx_folder1" ] && rm -r "$idx_folder1" + rmdir "$_lock" + exit 1 + fi + rmdir "$_lock" + elif [ -d "$_lock" ]; then + echo "Another run is indexing $idx_folder1 — waiting..." + while [ -d "$_lock" ]; do sleep 5; done fi - pid_index_ref=$! echo -e 'Index-genome Reference\tEnd\t'$(date) >>$log idx_ref="$idx_folder1" fi @@ -319,21 +336,28 @@ while read vcf_f; do echo -e 'Index-genome Variant\tEnd\t'$(date) >>"$log" idx_var="$idx_folder3" else - # no index found, compute it - echo -e 'Index-genome Variant\tStart\t'$(date) >>$log - echo -e "Indexing variant genome" - # index alternative genome using crispritz - crispritz.py index-genome \ - "${ref_name}+${vcf_name}" \ - "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" \ - "$pam_file" \ - -bMax $bMax -th $ncpus - if [ -s "$logerror" ]; then - printf "ERROR: alternative genome indexing failed on %s\n" "$vcf_name" >&2 - [ -d "$idx_folder1" ] && rm -r "$idx_folder1" - exit 1 + # no index found, compute it; use mkdir lock to prevent concurrent builds + _lock="${idx_folder1}.lock" + if mkdir "$_lock" 2>/dev/null; then + echo -e 'Index-genome Variant\tStart\t'$(date) >>$log + echo -e "Indexing variant genome" + # index alternative genome using crispritz + crispritz.py index-genome \ + "${ref_name}+${vcf_name}" \ + "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" \ + "$pam_file" \ + -bMax $bMax -th $ncpus + if [ -s "$logerror" ]; then + printf "ERROR: alternative genome indexing failed on %s\n" "$vcf_name" >&2 + [ -d "$idx_folder1" ] && rm -r "$idx_folder1" + rmdir "$_lock" + exit 1 + fi + rmdir "$_lock" + elif [ -d "$_lock" ]; then + echo "Another run is indexing $idx_folder1 — waiting..." + while [ -d "$_lock" ]; do sleep 5; done fi - pid_index_var=$! echo -e 'Index-genome Variant\tEnd\t'$(date) >>"$log" idx_var="$idx_folder1" fi @@ -343,23 +367,32 @@ while read vcf_f; do indels_out="$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS" if ! [ -d "$indels_index_dir" ]; then - echo -e 'Indexing Indels\tStart\t'$(date) >>"$log" - "$starting_dir/pool_index_indels.py" \ - "$indels_out/" \ - "$pam_file" \ - "$true_pam" \ - "$ref_name" \ - "$vcf_name" \ - "$bMax" \ - "$ncpus" - - if [ -s "$logerror" ]; then - printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2 - [ -d "$indels_index_dir" ] && rm -r "$indels_index_dir" - exit 1 - fi + # use mkdir lock to prevent concurrent builds + _lock="${indels_index_dir}.lock" + if mkdir "$_lock" 2>/dev/null; then + echo -e 'Indexing Indels\tStart\t'$(date) >>"$log" + "$starting_dir/pool_index_indels.py" \ + "$indels_out/" \ + "$pam_file" \ + "$true_pam" \ + "$ref_name" \ + "$vcf_name" \ + "$bMax" \ + "$ncpus" + + if [ -s "$logerror" ]; then + printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2 + [ -d "$indels_index_dir" ] && rm -r "$indels_index_dir" + rmdir "$_lock" + exit 1 + fi - echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log" + rmdir "$_lock" + echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log" + elif [ -d "$_lock" ]; then + echo "Another run is indexing $indels_index_dir — waiting..." + while [ -d "$_lock" ]; do sleep 5; done + fi else echo "Indels Index already present" fi From a6fc4cb3635a1291baae16bc95ef66fea715f239 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:31 +0000 Subject: [PATCH 06/14] Fix DTYPE_WARNING: add dtype=str to chunked pd.read_csv in remove_n_and_dots Without an explicit dtype, pandas infers column types independently per chunk. A column that is all-NaN in one chunk and has values in another gets different inferred dtypes, triggering a DtypeWarning to stderr. Because the pipeline treats any non-empty log_error.txt as a fatal error, this warning causes runs to be reported as failed even when all results are correct. Adding dtype=str suppresses the inference entirely. It is also semantically correct: the sole purpose of this script is text replacement ("n" -> "NA", "." -> "NA"), so there is no reason to parse numeric types at all. --- PostProcess/remove_n_and_dots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PostProcess/remove_n_and_dots.py b/PostProcess/remove_n_and_dots.py index 04a917f..0deab9a 100755 --- a/PostProcess/remove_n_and_dots.py +++ b/PostProcess/remove_n_and_dots.py @@ -33,7 +33,7 @@ def read_report_chunks(report_fname: str) -> TextFileReader: Returns: TextFileReader: An iterator over DataFrame chunks. """ - return pd.read_csv(report_fname, sep="\t", chunksize=CHUNKSIZE) + return pd.read_csv(report_fname, sep="\t", chunksize=CHUNKSIZE, dtype=str) def polish_report(report_chunks: TextFileReader, report_fname: str) -> None: From 990d34a50c5d997bae06acb883b62cce4c4dc8df Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:31 +0000 Subject: [PATCH 07/14] Fix CHR_FROM_FILENAME: normalize chr prefix after _chrom_from_vcf() Some VCF datasets (e.g. 1000G GRCh38) store chromosomes without a chr prefix in the CHROM field (e.g. '22' instead of 'chr22'), while the genome indices and fake FASTA files created by pool_index_indels.py use chr-prefixed names. Add _normalize_chrom() to pool_search_indels.py and pool_post_analisi_indel.py to ensure the derived chromosome name always matches the index naming convention. --- PostProcess/pool_post_analisi_indel.py | 10 +++++++++- PostProcess/pool_search_indels.py | 12 ++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/PostProcess/pool_post_analisi_indel.py b/PostProcess/pool_post_analisi_indel.py index c5582d8..8448704 100755 --- a/PostProcess/pool_post_analisi_indel.py +++ b/PostProcess/pool_post_analisi_indel.py @@ -17,6 +17,14 @@ def _chrom_from_vcf(vcf_path: str) -> str: return line.split("\t")[0] raise ValueError(f"No data lines found in VCF: {vcf_path}") + +def _normalize_chrom(chrom: str) -> str: + """Ensure chromosome name has chr prefix (e.g. '22' -> 'chr22'). + Some VCF datasets (e.g. 1000G GRCh38) store chromosomes without the + prefix in the CHROM field, while the genome indices use 'chr'-prefixed names. + """ + return chrom if chrom.startswith("chr") else "chr" + chrom + # post-analysis script name POSTANALYSIS = "./post_analisi_indel.sh" @@ -37,7 +45,7 @@ def _chrom_from_vcf(vcf_path: str) -> str: def start_analysis(fname: str) -> None: - chrom = _chrom_from_vcf(os.path.join(vcf_folder, fname)) + chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_folder, fname))) code = subprocess.call( f"{POSTANALYSIS} {output_folder} {ref_folder} {vcf_folder} {guide_file} " f"{mm} {bDNA} {bRNA} {annotation_file} {pam_file} {dict_folder} " diff --git a/PostProcess/pool_search_indels.py b/PostProcess/pool_search_indels.py index 6180116..a7ecf8c 100755 --- a/PostProcess/pool_search_indels.py +++ b/PostProcess/pool_search_indels.py @@ -16,6 +16,14 @@ def _chrom_from_vcf(vcf_path: str) -> str: return line.split("\t")[0] raise ValueError(f"No data lines found in VCF: {vcf_path}") + +def _normalize_chrom(chrom: str) -> str: + """Ensure chromosome name has chr prefix (e.g. '22' -> 'chr22'). + Some VCF datasets (e.g. 1000G GRCh38) store chromosomes without the + prefix in the CHROM field, while the genome indices use 'chr'-prefixed names. + """ + return chrom if chrom.startswith("chr") else "chr" + chrom + ref_folder = sys.argv[1] ref_name = os.path.basename(sys.argv[1]) vcf_dir = sys.argv[2] @@ -35,7 +43,7 @@ def _chrom_from_vcf(vcf_path: str) -> str: def search_indels(f): - chrom = _chrom_from_vcf(os.path.join(vcf_dir, f)) + chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_dir, f))) print("Searching for INDELs in", chrom) if bDNA != "0" or bRNA != "0": os.system( @@ -69,7 +77,7 @@ def search_indels(f): for key in chrs: - chrom = _chrom_from_vcf(os.path.join(vcf_dir, key)) + chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_dir, key))) os.system( f"tail -n +2 {output_folder}/fake{chrom}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt >> {output_folder}/indels_{ref_name}+{vcf_name}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt" ) From 5743ba01c89ddd02126f53f8bcec913e9266bed6 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:32 +0000 Subject: [PATCH 08/14] Fix TBI_FILTER: exclude .vcf.gz.tbi files from chromosome list in pool_post_analisi_indel --- PostProcess/pool_post_analisi_indel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PostProcess/pool_post_analisi_indel.py b/PostProcess/pool_post_analisi_indel.py index 8448704..76e2407 100755 --- a/PostProcess/pool_post_analisi_indel.py +++ b/PostProcess/pool_post_analisi_indel.py @@ -59,6 +59,6 @@ def start_analysis(fname: str) -> None: # chromosome-wise vcfs list -chrs = [f for f in os.listdir(vcf_folder) if "vcf.gz" in f] +chrs = [f for f in os.listdir(vcf_folder) if f.endswith(".vcf.gz")] with Pool(processes=ncpus) as pool: # run chrom-wise post-analysis in parallel pool.map(start_analysis, chrs) From 6a47dd889af88cd15def822db878bcb8b32b2d93 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:32 +0000 Subject: [PATCH 09/14] Fix TABIX_CONCURRENT_WRITE: only regenerate .tbi when absent to prevent parallel write race --- PostProcess/annotation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PostProcess/annotation.py b/PostProcess/annotation.py index 9353800..e084523 100644 --- a/PostProcess/annotation.py +++ b/PostProcess/annotation.py @@ -60,7 +60,8 @@ def load_annotation_bed(annotation_fname: str) -> TabixFile: SamtoolsError: If the annotation BED file cannot be loaded or indexed. """ # check that tabix index is available for all annotation bed - pysam.tabix_index(annotation_fname, force=True, preset="bed") + if not os.path.isfile(annotation_fname + ".tbi"): + pysam.tabix_index(annotation_fname, force=True, preset="bed") try: # return tabix indexes for each annotation bed return pysam.TabixFile(annotation_fname) except (SamtoolsError, Exception) as e: From 2c8c13a987362661efd1770f6867ea3ba4822b09 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:33 +0000 Subject: [PATCH 10/14] Fix TABIX_CONCURRENT_WRITE: flock + timestamp check replaces bare existence check Existence-only check missed stale .tbi after complete-test re-bgzips annotation. Use fcntl.flock exclusive lock so only one parallel annotation process regenerates the index, and only when the .tbi is absent or older than the .gz. --- PostProcess/annotation.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/PostProcess/annotation.py b/PostProcess/annotation.py index e084523..27a279e 100644 --- a/PostProcess/annotation.py +++ b/PostProcess/annotation.py @@ -5,6 +5,7 @@ from typing import List, Tuple, Optional from time import time +import fcntl import pysam import sys import os @@ -59,9 +60,15 @@ def load_annotation_bed(annotation_fname: str) -> TabixFile: Raises: SamtoolsError: If the annotation BED file cannot be loaded or indexed. """ - # check that tabix index is available for all annotation bed - if not os.path.isfile(annotation_fname + ".tbi"): - pysam.tabix_index(annotation_fname, force=True, preset="bed") + # check that tabix index is available and up-to-date; use a lock file so + # that parallel annotation processes don't write the .tbi simultaneously + tbi_path = annotation_fname + ".tbi" + lock_path = annotation_fname + ".tbi.lock" + with open(lock_path, "w") as lock_fh: + fcntl.flock(lock_fh, fcntl.LOCK_EX) + if (not os.path.isfile(tbi_path) + or os.path.getmtime(tbi_path) < os.path.getmtime(annotation_fname)): + pysam.tabix_index(annotation_fname, force=True, preset="bed") try: # return tabix indexes for each annotation bed return pysam.TabixFile(annotation_fname) except (SamtoolsError, Exception) as e: From 1f80a45a953e01dd10b9398f45baa24fea740f84 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:33 +0000 Subject: [PATCH 11/14] Fix FILTER_PASS: remove VCF rewrite loop from submit_job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The awk+bgzip+tabix loop that normalised non-PASS FILTER values to PASS before calling crispritz was incorrect — it rewrote source VCF files on disk (replacing symlinks with real files and creating .tbi files), which then triggered a pre-existing crispritz modify-while-iterating bug on the .tbi files in the VCF directory. VCF files must never be modified by CRISPRme. The FILTER_PASS flag is already correctly handled for gnomAD VCFs in convert_gnomAD_vcfs.py (in-memory filtering during conversion). For VCFs passed directly to crispritz (1000G, AoU, etc.), those files are expected to already carry appropriate FILTER values (PASS) and no rewriting is needed. --- .../submit_job_automated_new_multiple_vcfs.sh | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh index 4873f2d..97f4661 100755 --- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh +++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh @@ -190,31 +190,6 @@ while read vcf_f; do echo -e 'Add-variants\tStart\t'$(date) >>$log echo -e "Adding variants" - # Rewrite accepted non-PASS FILTER values to PASS before genome enrichment. - # crispritz only includes records with FILTER=PASS; any other accepted value - # (e.g. ".") must be normalised here. The accepted list comes from - # --vcf-filter-pass-values (default "PASS,."). - for _vcf in "$vcf_folder"/*.vcf.gz; do - _tmp=$(mktemp --suffix=.vcf.gz) - _fname=$(basename "$_vcf") - zcat "$_vcf" | \ - awk -v accept="$vcf_filter_pass_values" -v fname="$_fname" ' - BEGIN { - FS="\t"; OFS="\t"; n=0 - split(accept, _a, ",") - for (i in _a) ok[_a[i]] = 1 - } - /^#/ { print; next } - $7 != "PASS" && ($7 in ok) { $7 = "PASS"; n++ } - { print } - END { - if (n > 0) - printf "WARNING: %d non-PASS accepted FILTER record(s) in %s normalised to PASS for CRISPRme compatibility.\n", n, fname > "/dev/stderr" - }' | bgzip > "$_tmp" - mv "$_tmp" "$_vcf" - tabix -p vcf "$_vcf" - done - # enrich genome using crispritz (run from unique temp dir so crispritz's # fixed-name variants_genome/ output does not collide with concurrent runs) cd "$_run_tmp" From c85f1958c97c661d418131ce2531a8836bd0bb21 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:34 +0000 Subject: [PATCH 12/14] Fix MISSING_ALLELE: guard against '.' alleles in resultIntegrator haplotype counting --- PostProcess/resultIntegrator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PostProcess/resultIntegrator.py b/PostProcess/resultIntegrator.py index 59d0fa4..33416c5 100755 --- a/PostProcess/resultIntegrator.py +++ b/PostProcess/resultIntegrator.py @@ -141,8 +141,9 @@ def createBedforMultiAlternative(variantList, samples): for snp in haplotypeDict: for count, sample in enumerate(haplotypeDict[snp]): alleles = sample.strip().split("|") - allele1_snp = alleles[0] + allele1_snp = alleles[0] if alleles[0] != "." else "0" # missing allele guard allele2_snp = alleles[1] if len(alleles) > 1 else alleles[0] # haploid guard + allele2_snp = allele2_snp if allele2_snp != "." else "0" # missing allele guard allele1[count] = int(allele1[count]) and int(allele1_snp) allele2[count] = int(allele2[count]) and int(allele2_snp) # final sum to generete haplotype frequency of the specific target From 5ceb1dddabebe743a1bad7d438e4569b16082235 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:34 +0000 Subject: [PATCH 13/14] Fix GT_HAPLOID (extension): add haploid guard to new_simple_analysis.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same IndexError as Fix #4 — sampleInfo[1].split('|')[haplo] crashes when GT has no pipe (haploid encoding). Triggered by 1KG 2021 eagle2-phased chrX VCF which stores male hemizygous calls as bare '0'/'1' rather than forcing diploid '0|0'/'1|1' as shapeit2 did in the 2019 VCF. --- PostProcess/new_simple_analysis.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PostProcess/new_simple_analysis.py b/PostProcess/new_simple_analysis.py index 4c11f80..d0f05af 100755 --- a/PostProcess/new_simple_analysis.py +++ b/PostProcess/new_simple_analysis.py @@ -227,8 +227,10 @@ def iupac_decomposition(split, guide_no_bulge, guide_no_pam, cluster_to_save): haploSamples = {0: [], 1: []} for count, sample in enumerate(sampleSet[i]): sampleInfo = sample.split(":") + gt_alleles = sampleInfo[1].split("|") for haplo in totalDict: - if sampleInfo[1].split("|")[haplo] != "0": + allele = gt_alleles[haplo] if len(gt_alleles) > haplo else gt_alleles[0] # haploid guard + if allele != "0": haploSamples[haplo].append(sampleInfo[0]) totalDict[0][0][(pos_c, elem)] = [ listReplaceTarget, From 302dc8425eebce658c7b0345c6be14dac421f362 Mon Sep 17 00:00:00 2001 From: anncir1 <19394514+anncir1@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:04:35 +0000 Subject: [PATCH 14/14] Fix GNOMAD_CONVERTER: add --af-threshold and --output-dir flags; fix AF field lookup - parse_commandline: parse optional --af-threshold and --output-dir keyword args from sys.argv (positional args unchanged for backward compat); create output_dir if specified - convert_vcf: add af_threshold (default 0.0, no filter) and output_dir (default "", same dir as input) params; skip variants where AF_joint < af_threshold; write output to output_dir when specified; set af_field = "AF_joint" if joint - format_variant_record: add af_field param (default "AF") so gnomAD joint VCFs correctly read AF_joint instead of falling back to AF=0.0 in the output INFO - run_conversion_pipeline: pass af_threshold and output_dir through to convert_vcf - convert_gnomad_vcfs: unpack two new values from parse_commandline; pass to pipeline --- PostProcess/convert_gnomAD_vcfs.py | 93 +++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/PostProcess/convert_gnomAD_vcfs.py b/PostProcess/convert_gnomAD_vcfs.py index 01878b3..63d38a9 100644 --- a/PostProcess/convert_gnomAD_vcfs.py +++ b/PostProcess/convert_gnomAD_vcfs.py @@ -45,7 +45,7 @@ BCFTOOLSNORM = "bcftools norm" -def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, frozenset]: +def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, frozenset, float, str]: """Parse and validate command line arguments for gnomAD VCF conversion. This function checks the provided arguments for correctness and extracts the @@ -53,15 +53,22 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, the directory, joint processing flag, and thread count are valid before returning the parsed values. + Positional args (6 required, 7th optional): + gnomad_vcfs_dir samples_ids joint keep multiallelic threads [filter_pass_values] + + Optional keyword args (may appear anywhere after the positional args): + --af-threshold FLOAT Only include variants with AF_joint >= this value + (default: 0.0, i.e. no filtering). + --output-dir PATH Directory for converted output VCFs. Created if + absent. Default: same directory as each input VCF. + Args: args (List[str]): A list of command line arguments. Returns: - Tuple[str, str, bool, bool, bool, int, frozenset]: A tuple containing the gnomAD - VCF directory, sample IDs, a flag indicating whether to use joint - VCF processing, a flag indicating whether to keep files, a flag for - multiallelic processing, the number of threads to use, and a frozenset - of VCF FILTER values to treat as passing. + Tuple containing: gnomAD VCF directory, sample IDs, joint flag, keep flag, + multiallelic flag, threads, filter_pass_values frozenset, af_threshold float, + and output_dir string (empty string means same dir as input). Raises: ValueError: If the number of arguments is incorrect, if the specified @@ -69,12 +76,34 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, allowed range. """ - if len(args) not in (6, 7): + # Extract optional keyword args before counting positional args + af_threshold = 0.0 + output_dir = "" + positional = [] + i = 0 + while i < len(args): + if args[i] == "--af-threshold": + if i + 1 >= len(args): + raise ValueError("Missing value for --af-threshold") + af_threshold = float(args[i + 1]) + if not (0.0 <= af_threshold < 1.0): + raise ValueError(f"--af-threshold must be in [0, 1), got {af_threshold}") + i += 2 + elif args[i] == "--output-dir": + if i + 1 >= len(args): + raise ValueError("Missing value for --output-dir") + output_dir = args[i + 1] + i += 2 + else: + positional.append(args[i]) + i += 1 + + if len(positional) not in (6, 7): raise ValueError( "Wrong number of input arguments, cannot proceed with gnomAD VCF conversion" ) - gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = args[:6] - filter_pass_values = frozenset(args[6].split(",")) if len(args) == 7 else frozenset({"PASS", "."}) + gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = positional[:6] + filter_pass_values = frozenset(positional[6].split(",")) if len(positional) == 7 else frozenset({"PASS", "."}) if not os.path.isdir(gnomad_vcfs_dir): raise ValueError( f"The specified gnomAD VCF directory is not a directory ({gnomad_vcfs_dir})" @@ -85,7 +114,9 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, joint = joint == "True" keep = keep == "True" multiallelic = multiallelic == "True" - return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values + if output_dir: + os.makedirs(output_dir, exist_ok=True) + return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values, af_threshold, output_dir def read_samples_ids(samples_ids: str): @@ -197,7 +228,7 @@ def variant_observed(allele_count: Tuple[int]) -> bool: return any((ac > 0 for ac in allele_count)) -def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str: +def format_variant_record(variant: pysam.VariantRecord, genotypes: str, af_field: str = "AF") -> str: """ Formats a variant record into a string with specified genotypes and variant information. @@ -205,13 +236,15 @@ def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str: Args: variant (pysam.VariantRecord): The variant record to be formatted. genotypes (str): Genotypes information to be included in the format. + af_field (str): INFO field name to read allele frequency from. Use + "AF_joint" for gnomAD joint VCFs, "AF" otherwise. Default: "AF". Returns: str: Formatted variant record as a tab-separated string. """ try: - af = ",".join(list(map(str, variant.info["AF"]))) + af = ",".join(list(map(str, variant.info[af_field]))) except KeyError: # catch potential AF missing in variant INFO field af = ",".join(["0.0" for _ in variant.alts]) # type: ignore variant_format = [ @@ -235,6 +268,8 @@ def convert_vcf( joint: bool, keep: bool, filter_pass_values: frozenset = frozenset({"PASS", "."}), + af_threshold: float = 0.0, + output_dir: str = "", ): """ Converts a VCF file by updating the header, filtering variants, and creating @@ -247,6 +282,10 @@ def convert_vcf( the filter. filter_pass_values (frozenset): Set of VCF FILTER values to treat as passing. Default: frozenset({"PASS", "."}). + af_threshold (float): Minimum AF_joint value required to include a variant. + Default 0.0 (no filtering). + output_dir (str): Directory for the output VCF. If empty, the output is + written alongside the input file. Default: "". Returns: str: Path to the converted and compressed VCF file. @@ -256,7 +295,12 @@ def convert_vcf( IOError: If an error occurs during the conversion process. """ - vcf_outfname = f"{os.path.splitext(vcf_fname)[0]}.gz" # replace bgz with gz + # Determine output path: use output_dir if provided, else same dir as input + out_basename = f"{os.path.splitext(os.path.basename(vcf_fname))[0]}.gz" + if output_dir: + vcf_outfname = os.path.join(output_dir, out_basename) + else: + vcf_outfname = os.path.join(os.path.dirname(vcf_fname), out_basename) try: vcf = load_vcf(vcf_fname) # use pysam for optimized VCF loading except OSError as e: @@ -272,6 +316,7 @@ def convert_vcf( f"Re-run with keep=True or adjust --vcf-filter-pass-values.\n" ) vcf.reset() # rewind before the main loop below + af_field = "AF_joint" if joint else "AF" # field name for AF threshold filtering samples_ac = [ f"AC_joint_{sample}" if joint else f"AC_{sample}" for sample in samples ] # recover allele count field for each input sample @@ -282,6 +327,15 @@ def convert_vcf( for variant in vcf: if not keep and not (filter_pass_values & set(variant.filter.keys())): continue + # AF threshold filter (skipped when af_threshold == 0.0) + if af_threshold > 0.0: + try: + af_vals = variant.info[af_field] + af_max = max(af_vals) if hasattr(af_vals, "__len__") else af_vals + except KeyError: + af_max = 0.0 + if af_max < af_threshold: + continue genotypes = "\t".join( [ GT[1] if variant_observed(variant.info[sac]) else GT[0] @@ -289,7 +343,7 @@ def convert_vcf( ] ) outfile.write( - f"{format_variant_record(variant, genotypes)}\n" + f"{format_variant_record(variant, genotypes, af_field)}\n" ) # write the formatted VCF line to the out vcf except IOError as e: raise IOError(f"An error occurred while converting {vcf_fname}") from e @@ -342,6 +396,8 @@ def run_conversion_pipeline( keep: bool, multiallelic: bool, filter_pass_values: frozenset = frozenset({"PASS", "."}), + af_threshold: float = 0.0, + output_dir: str = "", ) -> None: """ Runs a conversion pipeline to process a VCF file by adding genotypes and merging @@ -355,13 +411,16 @@ def run_conversion_pipeline( multiallelic (bool): Flag indicating whether to handle multiallelic variants during merging. filter_pass_values (frozenset): Set of VCF FILTER values to treat as passing. + af_threshold (float): Minimum AF_joint value required to include a variant. + Default 0.0 (no filtering). + output_dir (str): Directory for output VCFs. Default "" (same as input). Returns: None """ # add genotypes to input VCF - vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep, filter_pass_values) + vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep, filter_pass_values, af_threshold, output_dir) # merge variants into mutlialleic/biallelic sites vcf_merged = bcftools_merge(vcf_genotypes, multiallelic) assert os.path.isfile(vcf_merged) and os.stat(vcf_merged).st_size > 0 @@ -382,7 +441,7 @@ def convert_gnomad_vcfs() -> None: """ # read input arguments - gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values = ( + gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values, af_threshold, output_dir = ( parse_commandline(sys.argv[1:]) ) start = time.time() @@ -401,6 +460,8 @@ def convert_gnomad_vcfs() -> None: keep=keep, multiallelic=multiallelic, filter_pass_values=filter_pass_values, + af_threshold=af_threshold, + output_dir=output_dir, ) pool.map(partial_run_conversion_pipeline, gnomad_vcfs) pool.close()