Skip to content

Commit 60ea4fd

Browse files
Merge pull request #237 from ncsa/feature/karen_bacterial_wrapper
Feature/karen bacterial wrapper
2 parents 670bc10 + 9cd65cc commit 60ea4fd

11 files changed

Lines changed: 577 additions & 311 deletions

File tree

neat/bacterial_wrapper/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""
2+
Load modules needed for other parts of the program
3+
"""
4+
from .runner import *

neat/bacterial_wrapper/runner.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
import subprocess
2+
import gzip
3+
import shutil
4+
import yaml
5+
import pysam
6+
import unittest
7+
import os
8+
9+
from pathlib import Path
10+
from typing import List
11+
from Bio import bgzf
12+
from Bio.bgzf import BgzfWriter, BgzfReader
13+
14+
15+
# Rearranges the bacterial chromosome by wrapping it around
16+
17+
def wrapper(seq):
    """
    Rotate a sequence so that it starts just past its midpoint.

    The sequence is cut after its first ``ceil(len(seq) / 2)`` elements and
    the two pieces are swapped, so the trailing piece comes first.

    :param seq: The sequence to rearrange (must support ``len``, slicing and ``+``).
    :return: The trailing piece followed by the leading piece.
    """
    # Ceiling division: for odd lengths the extra element stays with the
    # leading piece, which ends up at the tail of the result.
    quotient, remainder = divmod(len(seq), 2)
    cut = quotient + remainder

    return seq[cut:] + seq[:cut]
31+
32+
33+
# Writes the newly rearranged chromosome's sequence to a new fasta file
34+
35+
def write_fasta_file(new_seq, bacteria_name, fasta_header, output_dir_path, type):
    """
    Write a single-record fasta file containing the given sequence.

    The file is named ``{type}_{bacteria_name}.fna`` and holds the header
    line followed by the whole sequence on one line. No trailing newline is
    added after the sequence.

    :param new_seq: Sequence to write.
    :param bacteria_name: Name of the bacteria; used in the file name.
    :param fasta_header: Header line for the record, without a newline.
    :param output_dir_path: Existing directory to write into (str or Path).
    :param type: Label prefixed to the file name (callers in this module
        pass "wrapped" or "orig").
    :return: Path of the file that was written.
    """
    # Accept plain strings as well as Path objects for the directory.
    fasta_file_path = Path(output_dir_path) / f"{type}_{bacteria_name}.fna"

    # The context manager closes the handle even if the write fails.
    with open(fasta_file_path, "w") as fasta_file:
        fasta_file.write(fasta_header + "\n" + new_seq)

    return fasta_file_path
45+
46+
47+
# Writes a yml configuration file for the newly rearranged chromosome's fasta sequence
48+
# Coverage is currently copied through unchanged (the step that would halve it is disabled)
49+
# These use default values for all other parameters for NEAT
50+
51+
def write_config_file(ref_config_file, rearranged_seq_file, orig_seq_file, bacteria_name, output_dir_path):
    """
    Derive two NEAT config files from a reference config file.

    Both outputs are line-for-line copies of ``ref_config_file`` except for
    the ``reference:`` entry, which is pointed at the rearranged sequence in
    one file and at the original sequence in the other. All other settings,
    including coverage, are copied through unchanged.

    :param ref_config_file: Path to the config file used as the template.
    :param rearranged_seq_file: Fasta path written into the "new" config.
    :param orig_seq_file: Fasta path written into the "old" config.
    :param bacteria_name: Name of the bacteria; used in the output file names.
    :param output_dir_path: Existing directory to write into (str or Path).
    :return: Tuple ``(old_config_file_path, new_config_file_path)``.
    """
    output_dir_path = Path(output_dir_path)
    new_config_file_path = output_dir_path / f"new_{bacteria_name}_config_test.yml"
    old_config_file_path = output_dir_path / f"{bacteria_name}_config_test.yml"

    # The with-statement closes all three handles; no explicit close() needed.
    with open(ref_config_file, 'r') as ref_file, \
            open(new_config_file_path, 'w') as new_file, \
            open(old_config_file_path, 'w') as old_file:
        for line in ref_file:
            # Only rewrite the actual "reference:" key. A plain substring
            # search would also clobber comment lines that merely mention it.
            if line.lstrip().startswith("reference:"):
                new_file.write(f"reference: {rearranged_seq_file}\n")
                old_file.write(f"reference: {orig_seq_file}\n")
            else:
                new_file.write(line)
                old_file.write(line)

    return old_config_file_path, new_config_file_path
81+
82+
83+
# Runs the NEAT read simulator using the given config file
84+
85+
def run_neat(config_file, output_dir, prefix):
    """
    Run the NEAT read simulator with the given config file.

    Invokes ``neat read-simulator -c <config_file> -o <output_dir>/<prefix>``.

    :param config_file: Path to the NEAT yml config file (str or Path).
    :param output_dir: Base output directory (str or Path).
    :param prefix: Name joined onto ``output_dir`` to form the "-o" value.
    :raises subprocess.CalledProcessError: If the neat command exits with a
        non-zero status.
    """
    # Join with pathlib so a Path output_dir works as well as a str.
    output_target = Path(output_dir) / prefix

    # check=True surfaces a failed simulation here instead of letting later
    # steps fail on missing output files.
    subprocess.run(
        ["neat", "read-simulator", "-c", str(config_file), "-o", str(output_target)],
        check=True,
    )
87+
88+
89+
# General function for bacterial wrapper that calls all of the functions defined above
90+
91+
def bacterial_wrapper(reference_file, bacteria_name, ref_config_file, output_dir):
    """
    Simulate reads for a bacterial reference and for its wrapped rearrangement.

    Reads the reference fasta, rearranges its sequence with ``wrapper``,
    writes the rearranged fasta plus one config file per sequence, and runs
    NEAT once for each config (output prefixes "Regular" and "Wrapped").

    :param reference_file: Path to the reference fasta file.
    :param bacteria_name: Name of the bacteria; used in generated file names.
    :param ref_config_file: Path to the config file used as the template.
    :param output_dir: Directory in which all generated files are placed.
    """
    seq_chunks = []
    plasmids = False

    # The context manager closes the reference file even if parsing fails.
    with open(reference_file) as f:
        # The first line is taken as the fasta header of the record.
        fasta_header = f.readline().strip()

        for line in f:
            if not line.startswith(">"):
                seq_chunks.append(line.strip())
            elif "plasmid" in line.lower():
                # Exclude plasmids from the sequence to be rearranged: stop
                # reading at the first plasmid header.
                plasmids = True
                break
            # Any other header line is skipped, so the sequence lines that
            # follow it are appended to the same sequence.

    # Join once at the end instead of growing a string line by line.
    orig_seq = "".join(seq_chunks)

    output_dir_path = Path(output_dir)

    rearranged_seq = wrapper(orig_seq)
    rearranged_seq_file = write_fasta_file(rearranged_seq, bacteria_name, fasta_header, output_dir_path, "wrapped")

    # With plasmids present, the unwrapped run must use a plasmid-free copy
    # of the reference rather than the input file itself.
    orig_seq_file = reference_file
    if plasmids:
        orig_seq_file = write_fasta_file(orig_seq, bacteria_name, fasta_header, output_dir_path, "orig")

    old_config_file, new_config_file = write_config_file(
        ref_config_file, rearranged_seq_file, orig_seq_file, bacteria_name, output_dir_path
    )

    run_neat(old_config_file, output_dir, "Regular")
    run_neat(new_config_file, output_dir, "Wrapped")
124+
125+
126+
# Stitching all outputs together - Keshav's script
127+
128+
def concat_fq(input_files: List[Path], dest: Path) -> None:
    """
    Concatenate gzip-compressed fastq files into one gzip-compressed file.

    The inputs are decompressed and appended to ``dest`` in list order.
    When ``input_files`` is empty nothing is written and ``dest`` is not
    created.

    :param input_files: Gzipped files to concatenate, in output order.
    :param dest: Path of the gzipped file to create.
    """
    if input_files:
        with gzip.open(dest, 'wt') as merged:
            for source in input_files:
                with gzip.open(source, 'rt') as reads:
                    shutil.copyfileobj(reads, merged)
138+
139+
def merge_bam(bams: List[Path], dest: Path, threads: int) -> None:
    """
    Merge several BAM files into one sorted BAM file.

    The inputs are merged into an intermediate file next to ``dest`` (same
    name with the suffix replaced by ".unsorted.bam"), which is then sorted
    into ``dest``. When ``bams`` is empty nothing is done.

    :param bams: BAM files to merge.
    :param dest: Path of the sorted, merged BAM file to create.
    :param threads: Value passed to the "-@" option of merge and sort.
    """
    if not bams:
        return

    unsorted = dest.with_suffix(".unsorted.bam")
    try:
        pysam.merge("--no-PG", "-@", str(threads), "-f", str(unsorted), *map(str, bams))
        pysam.sort("-@", str(threads), "-o", str(dest), str(unsorted))
    finally:
        # Remove the intermediate file even when merge or sort raises, so a
        # failed run does not leave it behind.
        unsorted.unlink(missing_ok=True)
148+
149+
def merge_vcf(vcfs: List[Path], dest: Path) -> None:
    """
    Merge uncompressed VCF files into a single file.

    The first file is copied to ``dest`` as-is (header included). From every
    other file, only the lines that do not start with "#" are appended. The
    records are not re-sorted. When ``vcfs`` is empty nothing is done.

    :param vcfs: Uncompressed VCF files to merge, in output order.
    :param dest: Path of the merged VCF file to create.
    """
    if not vcfs:
        return

    first, *rest = vcfs
    shutil.copy(first, dest)

    # Find out whether the copied file ends mid-line. Without this check the
    # first appended record would be glued onto the previous file's last one.
    with dest.open("rb") as copied:
        copied.seek(0, os.SEEK_END)
        if copied.tell() == 0:
            needs_newline = False
        else:
            copied.seek(-1, os.SEEK_END)
            needs_newline = copied.read(1) != b"\n"

    with dest.open("ab") as out_f:
        for vcf in rest:
            with vcf.open("rb") as fh:
                for line in fh:
                    if line.startswith(b"#"):
                        continue
                    if needs_newline:
                        out_f.write(b"\n")
                    out_f.write(line)
                    # Only the last line of a file can lack its terminator.
                    needs_newline = not line.endswith(b"\n")
162+
163+
def stitch_all_outputs(files: List[Path], output_dir, threads: int = 2) -> None:
    """
    Sort output files by kind and stitch each kind into one file.

    Files are classified by name into read-1 fastq, read-2 fastq, VCF and
    BAM; index files (".tbi", ".bai") and anything unrecognized are ignored.
    The results are written to ``output_dir`` as "stitched_fq1.gz",
    "stitched_fq2.gz" (only when read-2 files exist), "stitched.bam" and
    "stitched.vcf".

    :param files: Output files to stitch together.
    :param output_dir: Directory in which the stitched files are created.
    :param threads: Thread count handed to the BAM merge step (default 2).
    """
    fq1_list = []
    fq2_list = []
    vcf_list = []
    bam_list = []

    for file in files:
        file_name = file.stem  # use stem to differentiate fq1 and fq2
        suffixes = file.suffixes  # use suffixes to catch vcf and bam files

        if "r2.fastq" in file_name:
            fq2_list.append(file)
        elif "r1.fastq" in file_name or ".fastq" in suffixes:
            # Any other fastq file is treated as read 1.
            fq1_list.append(file)
        elif ".vcf" in suffixes and ".tbi" not in suffixes:
            vcf_list.append(file)
        elif ".bam" in suffixes and ".bai" not in suffixes:
            bam_list.append(file)

    dest_fq1 = Path(f"{output_dir}/stitched_fq1.gz")
    dest_bam = Path(f"{output_dir}/stitched.bam")
    dest_vcf = Path(f"{output_dir}/stitched.vcf")

    concat_fq(fq1_list, dest_fq1)

    if fq2_list:
        dest_fq2 = Path(f"{output_dir}/stitched_fq2.gz")
        concat_fq(fq2_list, dest_fq2)

    merge_bam(bam_list, dest_bam, threads)
    merge_vcf(vcf_list, dest_vcf)
194+
195+
196+
# Testing functions
197+
198+
class TestWrapper(unittest.TestCase):
    """Unit tests for wrapper()."""

    def test_even(self):
        """An even-length sequence is cut into two equal pieces that are swapped."""
        rotated = wrapper("ABBCBB")
        self.assertEqual(rotated, "CBBABB")

    def test_odd(self):
        """For an odd length the leading piece keeps the extra character."""
        rotated = wrapper("ABBCBBC")
        self.assertEqual(rotated, "BBCABBC")
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
"""
2+
Command line interface for NEAT's bacterial wrapper function
3+
"""
4+
5+
import argparse
6+
import subprocess
7+
import os
8+
from pathlib import Path
9+
10+
from ...bacterial_wrapper import bacterial_wrapper
11+
from ...bacterial_wrapper import stitch_all_outputs
12+
from .base import BaseCommand
13+
from .options import output_group
14+
15+
16+
class Command(BaseCommand):
    """
    Class that generates wrapped bacterial models
    """
    name = "bacterial-wrapper"
    description = "Generate wrapped bacterial model reads"

    def add_arguments(self, parser: argparse.ArgumentParser):
        """
        Add the command's arguments to its parser

        :param parser: The parser to add arguments to
        """

        parser.add_argument('reference',
                            type=str,
                            metavar='reference.fa',
                            help="Reference file for organism in fasta format.")

        parser.add_argument('bacteria_name',
                            type=str,
                            metavar='bacteria_name',
                            help="Name of the bacteria.")

        parser.add_argument(
            "-c", "--config",
            metavar="config",
            type=str,
            required=True,
            help="Path (including filename) to the configuration file for the reference run."
        )

        output_group.add_to_parser(parser)

    def execute(self, arguments: argparse.Namespace):
        """
        Execute the command

        Runs the bacterial wrapper, decompresses the gzipped VCF outputs of
        both runs in place, and stitches the "Regular" and "Wrapped" outputs
        together.

        :param arguments: The namespace with arguments and their values.
        :raises subprocess.CalledProcessError: If decompressing a VCF fails.
        """
        bacterial_wrapper(arguments.reference, arguments.bacteria_name, arguments.config, arguments.output_dir)

        output_path = Path(arguments.output_dir)
        regular_dir = output_path / "Regular"
        wrapped_dir = output_path / "Wrapped"

        output_files = []

        # Same file names are expected in both the Regular and Wrapped
        # folders. Sorting makes the stitching order deterministic.
        for file in sorted(os.listdir(regular_dir)):
            reg_file_path = regular_dir / file
            wrap_file_path = wrapped_dir / file

            # Only gzipped VCFs are decompressed. A looser "vcf" substring
            # test would also hit index files (e.g. ".vcf.gz.tbi") and plain
            # ".vcf" files, and then chop three characters off their names.
            if file.endswith(".vcf.gz"):
                subprocess.run(["gzip", "-d", str(reg_file_path)], check=True)
                subprocess.run(["gzip", "-d", str(wrap_file_path)], check=True)

                # gzip -d replaces "<name>.gz" with "<name>".
                file = file[:-len(".gz")]
                reg_file_path = regular_dir / file
                wrap_file_path = wrapped_dir / file

            output_files.append(reg_file_path)
            output_files.append(wrap_file_path)

        stitch_all_outputs(output_files, arguments.output_dir)

neat/read_simulator/utils/generate_reads.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
import pickle
33
import time
4-
4+
import pdb
55
from math import ceil
66
from pathlib import Path
77

neat/read_simulator/utils/generate_variants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import numpy as np
1010
import re
1111
import sys
12-
12+
import pdb
1313
from Bio.Seq import Seq
1414
from Bio.SeqRecord import SeqRecord
1515
from numpy.random import Generator
@@ -283,7 +283,7 @@ def generate_variants(
283283
genotype = np.zeros(options.ploidy)
284284
genotype[ploid] = 1
285285
temp_variant.genotype = genotype
286-
286+
# pdb.set_trace()
287287
# Make sure this new variant doesn't overlap an existing insertion or deletion
288288
in_deletion = return_variants.check_if_del(temp_variant)
289289
in_insertion = return_variants.check_if_ins(temp_variant)

neat/read_simulator/utils/options.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -249,15 +249,13 @@ def from_cli(output_dir: Path,
249249
for key, (_, default, _, _) in defs.items():
250250
if key not in input_args:
251251
input_args[key] = default
252-
253252
base_options = Options(
254253
output_dir=output_dir,
255254
output_prefix=output_prefix
256255
)
257256

258257
# Read the config file using the definitions for validation
259258
base_options.read_yaml(config_file, defs)
260-
261259
# Merge validated config values over defaults
262260
final_args = dict(input_args)
263261
for key, val in defs.items():
@@ -299,6 +297,8 @@ def read_yaml(self, config_yaml: Path, args: dict):
299297
But I'm not sure how else to accomplish this.
300298
"""
301299
config = yaml.load(open(config_yaml, 'r'), Loader=Loader)
300+
301+
302302
for key, value in config.items():
303303
if key in args:
304304
type_of_var, default, criteria1, criteria2 = args[key]
@@ -444,13 +444,18 @@ def log_configuration(self):
444444
_LOG.info(f'Splitting input by contig.')
445445
if self.reuse_splits:
446446
splits_dir = Path(f'{self.output_dir}/splits/')
447+
_LOG.info(f'Reusing existing splits {splits_dir}.')
447448
if not splits_dir.is_dir():
448449
raise FileNotFoundError(f"reuse_splits=True but splits dir not found: {splits_dir}")
449-
_LOG.info(f'Reusing existing splits {splits_dir}.')
450-
_LOG.info(f'Preserving splits for next run in directory {splits_dir}.')
450+
else:
451+
if self.reuse_splits:
452+
raise FileNotFoundError(f'reuse_splits=True')
453+
else:
454+
_LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. Creating new splits')
455+
_LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.')
451456
elif not self.cleanup_splits:
452457
splits_dir = Path(f'{self.output_dir}/splits/')
453-
_LOG.info(f'Preserving splits for next run in directory {splits_dir}.')
458+
_LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.')
454459
else:
455460
splits_dir = self.temp_dir_path / "splits"
456461

@@ -511,4 +516,4 @@ def log_configuration(self):
511516
_LOG.info(f'Custom average mutation rate for the run: {self.mutation_rate}')
512517
if self.mutation_bed:
513518
_LOG.info(f'BED of mutation rates of different regions: {self.mutation_bed}')
514-
_LOG.info(f'RNG seed value for run: {self.rng_seed}')
519+
_LOG.info(f'RNG seed value for run: {self.rng_seed}')

neat/read_simulator/utils/output_file_writer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,16 @@
1010

1111
import os
1212
import re
13+
import shutil
14+
import time
1315
from struct import pack
1416
import logging
1517
from typing import Any
1618

1719
from Bio import bgzf
20+
from Bio import SeqIO
1821
from pathlib import Path
22+
from numpy.random import Generator
1923

2024
#gzip for temp outs, bgzip for final outs
2125
import gzip

0 commit comments

Comments
 (0)