diff --git a/PostProcess/annotation.py b/PostProcess/annotation.py index 9353800..27a279e 100644 --- a/PostProcess/annotation.py +++ b/PostProcess/annotation.py @@ -5,6 +5,7 @@ from typing import List, Tuple, Optional from time import time +import fcntl import pysam import sys import os @@ -59,8 +60,15 @@ def load_annotation_bed(annotation_fname: str) -> TabixFile: Raises: SamtoolsError: If the annotation BED file cannot be loaded or indexed. """ - # check that tabix index is available for all annotation bed - pysam.tabix_index(annotation_fname, force=True, preset="bed") + # check that tabix index is available and up-to-date; use a lock file so + # that parallel annotation processes don't write the .tbi simultaneously + tbi_path = annotation_fname + ".tbi" + lock_path = annotation_fname + ".tbi.lock" + with open(lock_path, "w") as lock_fh: + fcntl.flock(lock_fh, fcntl.LOCK_EX) + if (not os.path.isfile(tbi_path) + or os.path.getmtime(tbi_path) < os.path.getmtime(annotation_fname)): + pysam.tabix_index(annotation_fname, force=True, preset="bed") try: # return tabix indexes for each annotation bed return pysam.TabixFile(annotation_fname) except (SamtoolsError, Exception) as e: diff --git a/PostProcess/convert_gnomAD_vcfs.py b/PostProcess/convert_gnomAD_vcfs.py index c205991..63d38a9 100644 --- a/PostProcess/convert_gnomAD_vcfs.py +++ b/PostProcess/convert_gnomAD_vcfs.py @@ -33,6 +33,7 @@ import multiprocessing import subprocess +import itertools import pysam import gzip import time @@ -44,7 +45,7 @@ BCFTOOLSNORM = "bcftools norm" -def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int]: +def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int, frozenset, float, str]: """Parse and validate command line arguments for gnomAD VCF conversion. This function checks the provided arguments for correctness and extracts the @@ -52,14 +53,22 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int] the directory, joint processing flag, and thread count are valid before returning the parsed values. + Positional args (6 required, 7th optional): + gnomad_vcfs_dir samples_ids joint keep multiallelic threads [filter_pass_values] + + Optional keyword args (may appear anywhere after the positional args): + --af-threshold FLOAT Only include variants with AF_joint >= this value + (default: 0.0, i.e. no filtering). + --output-dir PATH Directory for converted output VCFs. Created if + absent. Default: same directory as each input VCF. + Args: args (List[str]): A list of command line arguments. Returns: - Tuple[str, str, bool, bool, bool, int]: A tuple containing the gnomAD - VCF directory, sample IDs, a flag indicating whether to use joint - VCF processing, a flag indicating whether to keep files, a flag for - multiallelic processing, and the number of threads to use. + Tuple containing: gnomAD VCF directory, sample IDs, joint flag, keep flag, + multiallelic flag, threads, filter_pass_values frozenset, af_threshold float, + and output_dir string (empty string means same dir as input). Raises: ValueError: If the number of arguments is incorrect, if the specified @@ -67,11 +76,34 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int] allowed range. """ - if len(args) != 6: + # Extract optional keyword args before counting positional args + af_threshold = 0.0 + output_dir = "" + positional = [] + i = 0 + while i < len(args): + if args[i] == "--af-threshold": + if i + 1 >= len(args): + raise ValueError("Missing value for --af-threshold") + af_threshold = float(args[i + 1]) + if not (0.0 <= af_threshold < 1.0): + raise ValueError(f"--af-threshold must be in [0, 1), got {af_threshold}") + i += 2 + elif args[i] == "--output-dir": + if i + 1 >= len(args): + raise ValueError("Missing value for --output-dir") + output_dir = args[i + 1] + i += 2 + else: + positional.append(args[i]) + i += 1 + + if len(positional) not in (6, 7): raise ValueError( "Wrong number of input arguments, cannot proceed with gnomAD VCF conversion" ) - gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = args + gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = positional[:6] + filter_pass_values = frozenset(positional[6].split(",")) if len(positional) == 7 else frozenset({"PASS", "."}) if not os.path.isdir(gnomad_vcfs_dir): raise ValueError( f"The specified gnomAD VCF directory is not a directory ({gnomad_vcfs_dir})" @@ -82,7 +114,9 @@ def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int] joint = joint == "True" keep = keep == "True" multiallelic = multiallelic == "True" - return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads + if output_dir: + os.makedirs(output_dir, exist_ok=True) + return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values, af_threshold, output_dir def read_samples_ids(samples_ids: str): @@ -194,7 +228,7 @@ def variant_observed(allele_count: Tuple[int]) -> bool: return any((ac > 0 for ac in allele_count)) -def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str: +def format_variant_record(variant: pysam.VariantRecord, genotypes: str, af_field: str = "AF") -> str: """ Formats a variant record into a string with specified genotypes and variant information. @@ -202,13 +236,15 @@ def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str: Args: variant (pysam.VariantRecord): The variant record to be formatted. genotypes (str): Genotypes information to be included in the format. + af_field (str): INFO field name to read allele frequency from. Use + "AF_joint" for gnomAD joint VCFs, "AF" otherwise. Default: "AF". Returns: str: Formatted variant record as a tab-separated string. """ try: - af = ",".join(list(map(str, variant.info["AF"]))) + af = ",".join(list(map(str, variant.info[af_field]))) except KeyError: # catch potential AF missing in variant INFO field af = ",".join(["0.0" for _ in variant.alts]) # type: ignore variant_format = [ @@ -226,7 +262,15 @@ def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str: return "\t".join([f"{e}" if e is not None else "." for e in variant_format]) -def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): +def convert_vcf( + vcf_fname: str, + samples: List[str], + joint: bool, + keep: bool, + filter_pass_values: frozenset = frozenset({"PASS", "."}), + af_threshold: float = 0.0, + output_dir: str = "", +): """ Converts a VCF file by updating the header, filtering variants, and creating a new compressed VCF file. @@ -236,6 +280,12 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): samples (List[str]): List of sample names. keep (bool): Flag to determine whether to keep variants that do not pass the filter. + filter_pass_values (frozenset): Set of VCF FILTER values to treat as + passing. Default: frozenset({"PASS", "."}). + af_threshold (float): Minimum AF_joint value required to include a variant. + Default 0.0 (no filtering). + output_dir (str): Directory for the output VCF. If empty, the output is + written alongside the input file. Default: "". Returns: str: Path to the converted and compressed VCF file. @@ -245,11 +295,28 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): IOError: If an error occurs during the conversion process. """ - vcf_outfname = f"{os.path.splitext(vcf_fname)[0]}.gz" # replace bgz with gz + # Determine output path: use output_dir if provided, else same dir as input + out_basename = f"{os.path.splitext(os.path.basename(vcf_fname))[0]}.gz" + if output_dir: + vcf_outfname = os.path.join(output_dir, out_basename) + else: + vcf_outfname = os.path.join(os.path.dirname(vcf_fname), out_basename) try: vcf = load_vcf(vcf_fname) # use pysam for optimized VCF loading except OSError as e: raise OSError(f"An error occurred while loading {vcf_fname}") from e + # Pre-flight: warn if no accepted FILTER records found in the first 10,000 rows. + # If every record has a hard-fail filter the output will be silently empty. + _sample = list(itertools.islice(vcf.fetch(), 10_000)) + _n_ok = sum(1 for v in _sample if filter_pass_values & set(v.filter.keys())) + if not keep and _n_ok == 0 and _sample: + sys.stderr.write( + f"WARNING: {vcf_fname}: sampled {len(_sample)} records, none have " + f"FILTER in {filter_pass_values} — all variants will be skipped. " + f"Re-run with keep=True or adjust --vcf-filter-pass-values.\n" + ) + vcf.reset() # rewind before the main loop below + af_field = "AF_joint" if joint else "AF" # field name for AF threshold filtering samples_ac = [ f"AC_joint_{sample}" if joint else f"AC_{sample}" for sample in samples ] # recover allele count field for each input sample @@ -258,8 +325,17 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): # write the upated header to the converted vcf outfile.write(update_header(vcf.header.copy(), samples, joint)) for variant in vcf: - if not keep and "PASS" not in variant.filter.keys(): + if not keep and not (filter_pass_values & set(variant.filter.keys())): continue + # AF threshold filter (skipped when af_threshold == 0.0) + if af_threshold > 0.0: + try: + af_vals = variant.info[af_field] + af_max = max(af_vals) if hasattr(af_vals, "__len__") else af_vals + except KeyError: + af_max = 0.0 + if af_max < af_threshold: + continue genotypes = "\t".join( [ GT[1] if variant_observed(variant.info[sac]) else GT[0] @@ -267,7 +343,7 @@ def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): ] ) outfile.write( - f"{format_variant_record(variant, genotypes)}\n" + f"{format_variant_record(variant, genotypes, af_field)}\n" ) # write the formatted VCF line to the out vcf except IOError as e: raise IOError(f"An error occurred while converting {vcf_fname}") from e @@ -314,7 +390,14 @@ def bcftools_merge(vcf_fname: str, multiallelic: bool) -> str: def run_conversion_pipeline( - vcf_fname: str, samples: List[str], joint: bool, keep: bool, multiallelic: bool + vcf_fname: str, + samples: List[str], + joint: bool, + keep: bool, + multiallelic: bool, + filter_pass_values: frozenset = frozenset({"PASS", "."}), + af_threshold: float = 0.0, + output_dir: str = "", ) -> None: """ Runs a conversion pipeline to process a VCF file by adding genotypes and merging @@ -327,13 +410,17 @@ def run_conversion_pipeline( the filter. multiallelic (bool): Flag indicating whether to handle multiallelic variants during merging. + filter_pass_values (frozenset): Set of VCF FILTER values to treat as passing. + af_threshold (float): Minimum AF_joint value required to include a variant. + Default 0.0 (no filtering). + output_dir (str): Directory for output VCFs. Default "" (same as input). Returns: None """ # add genotypes to input VCF - vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep) + vcf_genotypes = convert_vcf(vcf_fname, samples, joint, keep, filter_pass_values, af_threshold, output_dir) # merge variants into mutlialleic/biallelic sites vcf_merged = bcftools_merge(vcf_genotypes, multiallelic) assert os.path.isfile(vcf_merged) and os.stat(vcf_merged).st_size > 0 @@ -354,7 +441,7 @@ def convert_gnomad_vcfs() -> None: """ # read input arguments - gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = ( + gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads, filter_pass_values, af_threshold, output_dir = ( parse_commandline(sys.argv[1:]) ) start = time.time() @@ -372,6 +459,9 @@ def convert_gnomad_vcfs() -> None: joint=joint, keep=keep, multiallelic=multiallelic, + filter_pass_values=filter_pass_values, + af_threshold=af_threshold, + output_dir=output_dir, ) pool.map(partial_run_conversion_pipeline, gnomad_vcfs) pool.close() diff --git a/PostProcess/new_simple_analysis.py b/PostProcess/new_simple_analysis.py index 4c11f80..d0f05af 100755 --- a/PostProcess/new_simple_analysis.py +++ b/PostProcess/new_simple_analysis.py @@ -227,8 +227,10 @@ def iupac_decomposition(split, guide_no_bulge, guide_no_pam, cluster_to_save): haploSamples = {0: [], 1: []} for count, sample in enumerate(sampleSet[i]): sampleInfo = sample.split(":") + gt_alleles = sampleInfo[1].split("|") for haplo in totalDict: - if sampleInfo[1].split("|")[haplo] != "0": + allele = gt_alleles[haplo] if len(gt_alleles) > haplo else gt_alleles[0] # haploid guard + if allele != "0": haploSamples[haplo].append(sampleInfo[0]) totalDict[0][0][(pos_c, elem)] = [ listReplaceTarget, diff --git a/PostProcess/pool_post_analisi_indel.py b/PostProcess/pool_post_analisi_indel.py index 1d83206..76e2407 100755 --- a/PostProcess/pool_post_analisi_indel.py +++ b/PostProcess/pool_post_analisi_indel.py @@ -2,10 +2,29 @@ from multiprocessing import Pool +import gzip import subprocess import sys import os + +def _chrom_from_vcf(vcf_path: str) -> str: + """Return the chromosome name from the first data line of a VCF (plain or gzipped).""" + open_fn = gzip.open if vcf_path.endswith(".gz") else open + with open_fn(vcf_path, "rt") as fh: + for line in fh: + if not line.startswith("#"): + return line.split("\t")[0] + raise ValueError(f"No data lines found in VCF: {vcf_path}") + + +def _normalize_chrom(chrom: str) -> str: + """Ensure chromosome name has chr prefix (e.g. '22' -> 'chr22'). + Some VCF datasets (e.g. 1000G GRCh38) store chromosomes without the + prefix in the CHROM field, while the genome indices use 'chr'-prefixed names. + """ + return chrom if chrom.startswith("chr") else "chr" + chrom + # post-analysis script name POSTANALYSIS = "./post_analisi_indel.sh" @@ -26,8 +45,7 @@ def start_analysis(fname: str) -> None: - chrom = next((e for e in fname.split(".") if e.startswith("chr")), None) - assert chrom + chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_folder, fname))) code = subprocess.call( f"{POSTANALYSIS} {output_folder} {ref_folder} {vcf_folder} {guide_file} " f"{mm} {bDNA} {bRNA} {annotation_file} {pam_file} {dict_folder} " @@ -41,6 +59,6 @@ def start_analysis(fname: str) -> None: # chromosome-wise vcfs list -chrs = [f for f in os.listdir(vcf_folder) if "vcf.gz" in f] +chrs = [f for f in os.listdir(vcf_folder) if f.endswith(".vcf.gz")] with Pool(processes=ncpus) as pool: # run chrom-wise post-analysis in parallel pool.map(start_analysis, chrs) diff --git a/PostProcess/pool_search_indels.py b/PostProcess/pool_search_indels.py index 6646678..a7ecf8c 100755 --- a/PostProcess/pool_search_indels.py +++ b/PostProcess/pool_search_indels.py @@ -1,10 +1,29 @@ #!/usr/bin/env python +import gzip import os import sys from multiprocessing import Pool from datetime import datetime + +def _chrom_from_vcf(vcf_path: str) -> str: + """Return the chromosome name from the first data line of a VCF (plain or gzipped).""" + open_fn = gzip.open if vcf_path.endswith(".gz") else open + with open_fn(vcf_path, "rt") as fh: + for line in fh: + if not line.startswith("#"): + return line.split("\t")[0] + raise ValueError(f"No data lines found in VCF: {vcf_path}") + + +def _normalize_chrom(chrom: str) -> str: + """Ensure chromosome name has chr prefix (e.g. '22' -> 'chr22'). + Some VCF datasets (e.g. 1000G GRCh38) store chromosomes without the + prefix in the CHROM field, while the genome indices use 'chr'-prefixed names. + """ + return chrom if chrom.startswith("chr") else "chr" + chrom + ref_folder = sys.argv[1] ref_name = os.path.basename(sys.argv[1]) vcf_dir = sys.argv[2] @@ -24,11 +43,7 @@ def search_indels(f): - # global use_thread - splitted = f.split(".") - for elem in splitted: - if "chr" in elem: - chrom = elem + chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_dir, f))) print("Searching for INDELs in", chrom) if bDNA != "0" or bRNA != "0": os.system( @@ -62,10 +77,7 @@ def search_indels(f): for key in chrs: - splitted = key.split(".") - for elem in splitted: - if "chr" in elem: - chrom = elem + chrom = _normalize_chrom(_chrom_from_vcf(os.path.join(vcf_dir, key))) os.system( f"tail -n +2 {output_folder}/fake{chrom}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt >> {output_folder}/indels_{ref_name}+{vcf_name}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA}.targets.txt" ) diff --git a/PostProcess/process_summaries.py b/PostProcess/process_summaries.py index eb04847..48ac2d7 100755 --- a/PostProcess/process_summaries.py +++ b/PostProcess/process_summaries.py @@ -32,6 +32,8 @@ def init_summary_by_sample(path_samplesID): sid.readline() for line in sid: splitted = line.strip().split("\t") + if not splitted[0]: + continue dict_samples[splitted[0]] = [ splitted[3], splitted[1], diff --git a/PostProcess/remove_n_and_dots.py b/PostProcess/remove_n_and_dots.py index 04a917f..0deab9a 100755 --- a/PostProcess/remove_n_and_dots.py +++ b/PostProcess/remove_n_and_dots.py @@ -33,7 +33,7 @@ def read_report_chunks(report_fname: str) -> TextFileReader: Returns: TextFileReader: An iterator over DataFrame chunks. """ - return pd.read_csv(report_fname, sep="\t", chunksize=CHUNKSIZE) + return pd.read_csv(report_fname, sep="\t", chunksize=CHUNKSIZE, dtype=str) def polish_report(report_chunks: TextFileReader, report_fname: str) -> None: diff --git a/PostProcess/resultIntegrator.py b/PostProcess/resultIntegrator.py index 912f9bd..33416c5 100755 --- a/PostProcess/resultIntegrator.py +++ b/PostProcess/resultIntegrator.py @@ -140,8 +140,10 @@ def createBedforMultiAlternative(variantList, samples): # e.g. 0|1 and 1|1 will return 0|1 that will be counted as 1 for that sample haplotype for snp in haplotypeDict: for count, sample in enumerate(haplotypeDict[snp]): - allele1_snp = sample.strip().split("|")[0] - allele2_snp = sample.strip().split("|")[1] + alleles = sample.strip().split("|") + allele1_snp = alleles[0] if alleles[0] != "." else "0" # missing allele guard + allele2_snp = alleles[1] if len(alleles) > 1 else alleles[0] # haploid guard + allele2_snp = allele2_snp if allele2_snp != "." else "0" # missing allele guard allele1[count] = int(allele1[count]) and int(allele1_snp) allele2[count] = int(allele2[count]) and int(allele2_snp) # final sum to generete haplotype frequency of the specific target diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh index a13dc18..97f4661 100755 --- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh +++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh @@ -24,6 +24,13 @@ # ${15}: Current working directory path. # ${16}: Gene proximity file path. # ${17}: Email address for notifications. +# ${18}: Base check start (base editing window). +# ${19}: Base check end (base editing window). +# ${20}: Base check set (base editing bases). +# ${21}: Sorting criteria for scoring columns. +# ${22}: Sorting criteria (mm+bulges) columns. +# ${23}: CI/CD test mode flag. +# ${24}: Comma-separated VCF FILTER values to treat as passing (default "PASS,."). # # Returns: # Generates various output files including target lists, merged results, and a database. @@ -63,6 +70,9 @@ sorting_criteria=${22} # CI/CD test mode cicd_test=${23} +# Comma-separated VCF FILTER values to treat as passing (default "PASS,.") +vcf_filter_pass_values="${24:-PASS,.}" + # log files log="$output_folder/log.txt" touch $log @@ -168,22 +178,26 @@ while read vcf_f; do # START STEP 1 - Genome enrichment if [ "$vcf_name" != "_" ]; then enriched_folder="$current_working_directory/Genomes/${ref_name}+${vcf_name}" - variants_tmp="$current_working_directory/Genomes/variants_genome" + _run_tmp=$(mktemp -d "$current_working_directory/Genomes/run_XXXXXX") + variants_tmp="$_run_tmp/variants_genome" dict_folder="$current_working_directory/Dictionaries/dictionaries_${vcf_name}/" indel_dict_folder="$current_working_directory/Dictionaries/log_indels_${vcf_name}/" indels_out="$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS" - if ! [ -d "$enriched_folder" ]; then + _lock="$enriched_folder.lock" + if ! [ -d "$enriched_folder" ] && mkdir "$_lock" 2>/dev/null; then + # only one process enters here; mkdir is atomic echo -e 'Add-variants\tStart\t'$(date) >>$log echo -e "Adding variants" - - # enrich genome using crispritz - cd "$current_working_directory/Genomes" - crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" + + # enrich genome using crispritz (run from unique temp dir so crispritz's + # fixed-name variants_genome/ output does not collide with concurrent runs) + cd "$_run_tmp" + crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" if [ -s $logerror ]; then printf "ERROR: Genome enrichment failed on %s\n" "$vcf_name" >&2 - # since failure happened, force genome enrichment to be repeated - rm -r "$enriched_folder"* "$variants_tmp" + rm -rf "$_run_tmp" + rmdir "$_lock" exit 1 fi @@ -201,12 +215,18 @@ while read vcf_f; do mv $variants_tmp/SNPs_genome/*.json "$dict_folder" mv $variants_tmp/SNPs_genome/log*.txt "$indel_dict_folder" - # remove temporary variant genome folder - rm -r "$variants_tmp" + # remove unique temp dir (variants_genome lives inside it) + rm -rf "$_run_tmp" + rmdir "$_lock" echo -e 'Add-variants\tEnd\t'$(date) >>"$log" + elif [ -d "$_lock" ]; then + echo "Another run is building $enriched_folder — waiting..." + while [ -d "$_lock" ]; do sleep 5; done + rm -rf "$_run_tmp" else echo -e "Variants already added" + rm -rf "$_run_tmp" fi cd $current_working_directory @@ -246,17 +266,24 @@ while read vcf_f; do echo -e 'Index-genome Reference\tEnd\t'$(date) >>"$log" idx_ref="$idx_folder3" else - # no valid index found, compute it - echo -e 'Index-genome Reference\tStart\t'$(date) >>$log - echo -e "Indexing reference genome" - # index reference genome using crispritz - crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus - if [ -s $logerror ]; then - printf "ERROR: reference genome indexing failed\n" >&2 - [ -d "$idx_folder1" ] && rm -r "$idx_folder1" - exit 1 + # no valid index found, compute it; use mkdir lock to prevent concurrent builds + _lock="${idx_folder1}.lock" + if mkdir "$_lock" 2>/dev/null; then + echo -e 'Index-genome Reference\tStart\t'$(date) >>$log + echo -e "Indexing reference genome" + # index reference genome using crispritz + crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus + if [ -s $logerror ]; then + printf "ERROR: reference genome indexing failed\n" >&2 + [ -d "$idx_folder1" ] && rm -r "$idx_folder1" + rmdir "$_lock" + exit 1 + fi + rmdir "$_lock" + elif [ -d "$_lock" ]; then + echo "Another run is indexing $idx_folder1 — waiting..." + while [ -d "$_lock" ]; do sleep 5; done fi - pid_index_ref=$! echo -e 'Index-genome Reference\tEnd\t'$(date) >>$log idx_ref="$idx_folder1" fi @@ -284,21 +311,28 @@ while read vcf_f; do echo -e 'Index-genome Variant\tEnd\t'$(date) >>"$log" idx_var="$idx_folder3" else - # no index found, compute it - echo -e 'Index-genome Variant\tStart\t'$(date) >>$log - echo -e "Indexing variant genome" - # index alternative genome using crispritz - crispritz.py index-genome \ - "${ref_name}+${vcf_name}" \ - "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" \ - "$pam_file" \ - -bMax $bMax -th $ncpus - if [ -s "$logerror" ]; then - printf "ERROR: alternative genome indexing failed on %s\n" "$vcf_name" >&2 - [ -d "$idx_folder1" ] && rm -r "$idx_folder1" - exit 1 + # no index found, compute it; use mkdir lock to prevent concurrent builds + _lock="${idx_folder1}.lock" + if mkdir "$_lock" 2>/dev/null; then + echo -e 'Index-genome Variant\tStart\t'$(date) >>$log + echo -e "Indexing variant genome" + # index alternative genome using crispritz + crispritz.py index-genome \ + "${ref_name}+${vcf_name}" \ + "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" \ + "$pam_file" \ + -bMax $bMax -th $ncpus + if [ -s "$logerror" ]; then + printf "ERROR: alternative genome indexing failed on %s\n" "$vcf_name" >&2 + [ -d "$idx_folder1" ] && rm -r "$idx_folder1" + rmdir "$_lock" + exit 1 + fi + rmdir "$_lock" + elif [ -d "$_lock" ]; then + echo "Another run is indexing $idx_folder1 — waiting..." + while [ -d "$_lock" ]; do sleep 5; done fi - pid_index_var=$! echo -e 'Index-genome Variant\tEnd\t'$(date) >>"$log" idx_var="$idx_folder1" fi @@ -308,23 +342,32 @@ while read vcf_f; do indels_out="$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS" if ! [ -d "$indels_index_dir" ]; then - echo -e 'Indexing Indels\tStart\t'$(date) >>"$log" - "$starting_dir/pool_index_indels.py" \ - "$indels_out/" \ - "$pam_file" \ - "$true_pam" \ - "$ref_name" \ - "$vcf_name" \ - "$bMax" \ - "$ncpus" - - if [ -s "$logerror" ]; then - printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2 - [ -d "$indels_index_dir" ] && rm -r "$indels_index_dir" - exit 1 - fi + # use mkdir lock to prevent concurrent builds + _lock="${indels_index_dir}.lock" + if mkdir "$_lock" 2>/dev/null; then + echo -e 'Indexing Indels\tStart\t'$(date) >>"$log" + "$starting_dir/pool_index_indels.py" \ + "$indels_out/" \ + "$pam_file" \ + "$true_pam" \ + "$ref_name" \ + "$vcf_name" \ + "$bMax" \ + "$ncpus" + + if [ -s "$logerror" ]; then + printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2 + [ -d "$indels_index_dir" ] && rm -r "$indels_index_dir" + rmdir "$_lock" + exit 1 + fi - echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log" + rmdir "$_lock" + echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log" + elif [ -d "$_lock" ]; then + echo "Another run is indexing $indels_index_dir — waiting..." + while [ -d "$_lock" ]; do sleep 5; done + fi else echo "Indels Index already present" fi diff --git a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py index da86fa8..6ed3830 100755 --- a/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py +++ b/PostProcess/supportFunctions/dictionaryIndel/dictionary_creation_indels.py @@ -15,11 +15,6 @@ import time vcf_file = sys.argv[1] -chr_name = vcf_file.split('.') -for i in chr_name: - if 'chr' in i: - chr_name = i - break selected_sample = sys.argv[2] save_directory = sys.argv[3] @@ -37,12 +32,15 @@ break first_line = next(targets).decode('ascii').strip().split('\t') + chr_name = first_line[0] # authoritative chromosome from VCF data if 'chr' not in first_line[0]: add_to_name = 'chr' list_samples = [] list_chars = [] hap = first_line[sample_header_pos].split('|') + if len(hap) == 1: + hap = hap * 2 # haploid GT: treat as homozygous for frequency counting for h in hap: #NOTE posso avere anche 2|1 #TODO da finire if '0' == h: continue diff --git a/crisprme.py b/crisprme.py index e2f6b1d..710d322 100755 --- a/crisprme.py +++ b/crisprme.py @@ -1153,6 +1153,15 @@ def complete_search() -> None: sorting_criteria = _check_sorting_criteria(args, "--sorting-criteria" in args) # sorting criteria (mm+bulges) columns outputfolder = _check_output(args) # output folder thread = _check_threads(args, "--thread" in args) # number of threads + # VCF FILTER values to treat as passing (default: "PASS,." per VCF spec §1.6.1) + vcf_filter_pass_values = "PASS,." + if "--vcf-filter-pass-values" in args: + try: + vcf_filter_pass_values = args[args.index("--vcf-filter-pass-values") + 1] + if vcf_filter_pass_values.startswith("--"): + raise ValueError("Please input a value for flag --vcf-filter-pass-values") + except IndexError as e: + raise ValueError("Missing input for --vcf-filter-pass-values") from e # extract pam seq from file pam_len = 0 @@ -1304,7 +1313,8 @@ def complete_search() -> None: f"{pamfile} {annotationfile} {samplefile} {bMax + 1} {mm} {bDNA} {bRNA} " f"{merge_t} {outputfolder} {script_path} {thread} {current_working_directory} " f"{gene_annotation} {void_mail} {base_start} {base_end} {base_set} " - f"{sorting_criteria_scoring} {sorting_criteria} {cicd_test}" + f"{sorting_criteria_scoring} {sorting_criteria} {cicd_test} " + f"{vcf_filter_pass_values}" ) code = subprocess.call( crisprme_run, shell=True, stderr=log_error, stdout=log_verbose @@ -1550,11 +1560,20 @@ def gnomAD_converter(): raise ValueError(f"Forbidden number of threads ({threads})") except IndexError as e: raise ValueError("Missing or forbidden threads value") from e + # read vcf filter pass values arg + vcf_filter_pass_values = "PASS,." + if "--vcf-filter-pass-values" in args: + try: + vcf_filter_pass_values = args[args.index("--vcf-filter-pass-values") + 1] + if vcf_filter_pass_values.startswith("--"): + raise ValueError("Please input a value for flag --vcf-filter-pass-values") + except IndexError as e: + raise ValueError("Missing input for --vcf-filter-pass-values") from e # run gnom AD converter gnomad_converter_script = os.path.join(script_path, "convert_gnomAD_vcfs.py") cmd = ( f"python {gnomad_converter_script} {gnomad_dir} {samples_ids} {joint} " - f"{keep} {multiallelic} {threads}" + f"{keep} {multiallelic} {threads} {vcf_filter_pass_values}" ) code = subprocess.call(cmd, shell=True) if code != 0: