1+ import subprocess
2+ import gzip
3+ import shutil
4+ import yaml
5+ import pysam
6+ import unittest
7+ import os
8+
9+ from pathlib import Path
10+ from typing import List
11+ from Bio import bgzf
12+ from Bio .bgzf import BgzfWriter , BgzfReader
13+
14+
15+ # Rearranges the bacterial chromosome by wrapping it around
16+
17+ def wrapper (seq ):
18+ length = len (seq )
19+
20+ if (length % 2 == 0 ):
21+ half_index = length // 2
22+ else :
23+ half_index = (length // 2 ) + 1
24+
25+ first_half = seq [:half_index ]
26+ second_half = seq [half_index :]
27+
28+ new_seq = second_half + first_half
29+
30+ return new_seq
31+
32+
33+ # Writes the newly rearranged chromosome's sequence to a new fasta file
34+
35+ def write_fasta_file (new_seq , bacteria_name , fasta_header , output_dir_path , type ):
36+ fasta_file_name = f"{ type } _{ bacteria_name } .fna"
37+ fasta_file_path = output_dir_path / fasta_file_name
38+ fasta_file = open (fasta_file_path , "w" )
39+
40+ fasta_file .write (fasta_header + "\n " + new_seq )
41+
42+ fasta_file .close ()
43+
44+ return fasta_file_path
45+
46+
47+ # Writes a yml configuration file for the newly rearranged chromosome's fasta sequence
48+ # Splits the coverage in half for the reference and new config files
49+ # These use default values for all other parameters for NEAT
50+
51+ def write_config_file (ref_config_file , rearranged_seq_file , orig_seq_file , bacteria_name , output_dir_path ):
52+ new_config_file_name = f"new_{ bacteria_name } _config_test.yml"
53+ old_config_file_name = f"{ bacteria_name } _config_test.yml"
54+
55+ new_config_file_path = output_dir_path / new_config_file_name
56+ old_config_file_path = output_dir_path / old_config_file_name
57+
58+ with open (ref_config_file , 'r' ) as ref_file , open (new_config_file_path , 'w' ) as new_file , open (old_config_file_path , 'w' ) as old_file :
59+ for line in ref_file :
60+ if line .find ("reference:" ) != - 1 :
61+ new_file .write (f"reference: { rearranged_seq_file } \n " )
62+ old_file .write (f"reference: { orig_seq_file } \n " )
63+ # elif line.find("coverage:") != -1:
64+ # if line.strip() == "coverage: .":
65+ # new_coverage = 5.0
66+ # else:
67+ # new_coverage = float((line.split(" "))[1].strip()) // 2
68+
69+ # new_file.write(f"coverage: {new_coverage}\n")
70+ # old_file.write(f"coverage: {new_coverage}\n")
71+ else :
72+ new_file .write (line )
73+ old_file .write (line )
74+
75+
76+ ref_file .close ()
77+ new_file .close ()
78+ old_file .close ()
79+
80+ return old_config_file_path , new_config_file_path
81+
82+
83+ # Runs the NEAT read simulator using the given config file
84+
85+ def run_neat (config_file , output_dir , prefix ):
86+ subprocess .run (["neat" , "read-simulator" , "-c" , config_file , "-o" , output_dir + "/" + prefix ])
87+
88+
89+ # General function for bacterial wrapper that calls all of the functions defined above
90+
91+ def bacterial_wrapper (reference_file , bacteria_name , ref_config_file , output_dir ):
92+
93+ orig_seq = ""
94+
95+ f = open (reference_file )
96+ fasta_header = f .readline ().strip ()
97+
98+ plasmids = False
99+
100+ for line in f :
101+ if line [0 ] != ">" :
102+ orig_seq += line .strip ()
103+ elif line .lower ().find ("plasmid" ) != - 1 : # exclude plasmids from the sequence to be rearranged
104+ plasmids = True
105+ break
106+
107+ f .close ()
108+
109+ output_dir_path = Path (output_dir )
110+
111+ rearranged_seq = wrapper (orig_seq )
112+ rearranged_seq_file = write_fasta_file (rearranged_seq , bacteria_name , fasta_header , output_dir_path , "wrapped" )
113+
114+ orig_seq_file = reference_file
115+ if plasmids :
116+ orig_seq_file = write_fasta_file (orig_seq , bacteria_name , fasta_header , output_dir_path , "orig" )
117+
118+ config_files = write_config_file (ref_config_file , rearranged_seq_file , orig_seq_file , bacteria_name , output_dir_path )
119+ old_config_file = config_files [0 ]
120+ new_config_file = config_files [1 ]
121+
122+ run_neat (old_config_file , output_dir , "Regular" )
123+ run_neat (new_config_file , output_dir , "Wrapped" )
124+
125+
126+ # Stitching all outputs together - Keshav's script
127+
128+ def concat_fq (input_files : List [Path ], dest : Path ) -> None :
129+
130+ if not input_files :
131+ # Nothing to do, and no error to throw
132+ return
133+
134+ with gzip .open (dest , 'wt' ) as out_f :
135+ for input_file in input_files :
136+ with gzip .open (input_file , 'rt' ) as in_f :
137+ shutil .copyfileobj (in_f , out_f )
138+
139+ def merge_bam (bams : List [Path ], dest : Path , threads : int ) -> None :
140+
141+ if not bams :
142+ return
143+
144+ unsorted = dest .with_suffix (".unsorted.bam" )
145+ pysam .merge ("--no-PG" , "-@" , str (threads ), "-f" , str (unsorted ), * map (str , bams ))
146+ pysam .sort ("-@" , str (threads ), "-o" , str (dest ), str (unsorted ))
147+ unsorted .unlink (missing_ok = True )
148+
149+ def merge_vcf (vcfs : List [Path ], dest : Path ) -> None :
150+ if not vcfs :
151+ return
152+
153+ first , * rest = vcfs
154+ shutil .copy (first , dest )
155+
156+ with dest .open ("ab" ) as out_f :
157+ for vcf in rest :
158+ with vcf .open ("rb" ) as fh :
159+ for line in fh :
160+ if not line .startswith (b"#" ):
161+ out_f .write (line )
162+
163+ def stitch_all_outputs (files : List [Path ], output_dir ) -> None :
164+ fq1_list = []
165+ fq2_list = []
166+ vcf_list = []
167+ bam_list = []
168+
169+ for file in files :
170+ file_name = file .stem # use stem to differentiate fq1 and fq2
171+ suffixes = file .suffixes # use suffixes to catch vcf and bam files
172+
173+ if "r2.fastq" in file_name :
174+ fq2_list .append (file )
175+ elif "r1.fastq" in file_name or ".fastq" in suffixes :
176+ fq1_list .append (file )
177+ elif ".vcf" in suffixes and ".tbi" not in suffixes :
178+ vcf_list .append (file )
179+ elif ".bam" in suffixes and ".bai" not in suffixes :
180+ bam_list .append (file )
181+
182+ dest_fq1 = Path (f"{ output_dir } /stitched_fq1.gz" )
183+ dest_bam = Path (f"{ output_dir } /stitched.bam" )
184+ dest_vcf = Path (f"{ output_dir } /stitched.vcf" )
185+
186+ concat_fq (fq1_list , dest_fq1 )
187+
188+ if (fq2_list ):
189+ dest_fq2 = Path (f"{ output_dir } /stitched_fq2.gz" )
190+ concat_fq (fq2_list , dest_fq2 )
191+
192+ merge_bam (bam_list , dest_bam , 2 )
193+ merge_vcf (vcf_list , dest_vcf )
194+
195+
196+ # Testing functions
197+
198+ class TestWrapper (unittest .TestCase ):
199+ def test_even (self ):
200+ self .assertEqual (wrapper ("ABBCBB" ), "CBBABB" )
201+
202+ def test_odd (self ):
203+ self .assertEqual (wrapper ("ABBCBBC" ), "BBCABBC" )
0 commit comments