88 "OutputFileWriter"
99]
1010
11+ import os
1112import re
1213from struct import pack
1314import logging
15+ from typing import Any
1416
1517from Bio import bgzf
1618from pathlib import Path
1719
20+ from Bio .bgzf import BgzfWriter
21+
1822from .read import Read
1923from .options import Options
2024
2125_LOG = logging .getLogger (__name__ )
2226
2327
2428# Some Constants
25- # TODO make bam compression a configurable option
26- BAM_COMPRESSION_LEVEL = 6
2729CIGAR_PACKED = {'M' : 0 , 'I' : 1 , 'D' : 2 , 'N' : 3 , 'S' : 4 , 'H' : 5 , 'P' : 6 , '=' : 7 , 'X' : 8 }
2830SEQ_PACKED = {'=' : 0 , 'A' : 1 , 'C' : 2 , 'M' : 3 , 'G' : 4 , 'R' : 5 , 'S' : 6 , 'V' : 7 ,
2931 'T' : 8 , 'W' : 9 , 'Y' : 10 , 'H' : 11 , 'K' : 12 , 'D' : 13 , 'B' : 14 , 'N' : 15 }
@@ -62,16 +64,16 @@ class OutputFileWriter:
6264 in the various formats.
6365
6466 :param options: Options for the current run.
65- :param bam_header : A dictionary of lengths of each contig from the reference, keyed by contig id.
67+ :param header : A dictionary of lengths of each contig from the reference, keyed by contig id.
6668 """
6769 def __init__ (self ,
6870 options : Options ,
69- bam_header : dict = None ):
71+ header : dict = None ):
7072
7173 self .paired_ended = options .paired_ended
72- self .bam_header = bam_header
74+ self .bam_header = header
7375
74- file_handles = {}
76+ file_handles : dict [ Path , Any ] = {}
7577
7678 # Set up filenames based on booleans
7779 if options .fq1 is not None :
@@ -91,7 +93,6 @@ def __init__(self,
9193 vcf = None
9294 if options .bam is not None :
9395 bam = options .bam
94- file_handles [bam ] = bgzf .BgzfWriter (bam , 'w' , compresslevel = BAM_COMPRESSION_LEVEL )
9596 else :
9697 bam = None
9798
@@ -119,26 +120,27 @@ def __init__(self,
119120 f'#CHROM\t POS\t ID\t REF\t ALT\t QUAL\t FILTER\t INFO\t FORMAT\t NEAT_simulated_sample\n '
120121 self .files_to_write [self .vcf ].write (vcf_header )
121122
122- if options .produce_bam and self . bam_header :
123+ if options .produce_bam :
123124 # bam header
124- bam_handle = self . files_to_write [ self .bam ]
125+ bam_handle = bgzf . BgzfWriter ( self .bam , 'w' , compresslevel = 6 )
125126 bam_handle .write ("BAM\1 " )
126127 # Without a header, we can't write these as bams.
127- bam_header = "@HD\t VN:1.4\t SO:coordinate\n "
128+ header = "@HD\t VN:1.4\t SO:coordinate\n "
128129 for item in self .bam_header :
129- bam_header += f'@SQ\t SN:{ item } \t LN:{ str (self .bam_header [item ])} \n '
130- bam_header += "@RG\t ID:NEAT\t SM:NEAT\t LB:NEAT\t PL:NEAT\n "
131- header_bytes = len (bam_header )
130+ header += f'@SQ\t SN:{ item } \t LN:{ str (self .bam_header [item ])} \n '
131+ header += "@RG\t ID:NEAT\t SM:NEAT\t LB:NEAT\t PL:NEAT\n "
132+ header_bytes = len (header )
132133 num_refs = len (self .bam_header )
133134 bam_handle .write (pack ('<i' , header_bytes ))
134- bam_handle .write (bam_header )
135+ bam_handle .write (header )
135136 bam_handle .write (pack ('<i' , num_refs ))
136137 # Contigs and lengths. If we can skip writing this out for intermediate files, great
137138 for item in self .bam_header :
138139 name_length = len (item ) + 1
139140 bam_handle .write (pack ('<i' , name_length ))
140141 bam_handle .write (f'{ item } \0 ' )
141142 bam_handle .write (pack ('<i' , self .bam_header [item ]))
143+ bam_handle .flush ()
142144 bam_handle .close ()
143145
144146 def write_fastq_record (self , filename : Path , record : str ):
@@ -148,10 +150,15 @@ def write_fastq_record(self, filename: Path, record: str):
148150 _LOG .error (f"Tried to write fastq record to unknown file { filename } " )
149151 raise ValueError
150152
151- def close_files (self ):
153+ def flush_and_close_files (self ):
152154 for file_name in self .files_to_write :
153155 file_handle = self .files_to_write [file_name ]
154- file_handle .close ()
156+ try :
157+ file_handle .flush ()
158+ os .fsync (file_handle .fileno ())
159+ file_handle .close ()
160+ except ValueError :
161+ _LOG .debug (f"file { file_name } already closed" )
155162
156163 def write_vcf_record (self , line : str ):
157164 """
@@ -171,14 +178,15 @@ def write_bam_record(
171178 self ,
172179 read : Read ,
173180 contig_id : int ,
174- bam_handle ,
181+ bam_handle : BgzfWriter ,
175182 read_length : int
176183 ):
177184 """
178185 Takes a read object and writes it out as a bam record
179186
180187 :param read: Read object to write to file
181188 :param contig_id: the index of the reference for this
189+ :param bam_handle: the handle to write data to
182190 :param read_length: the length of the read to output
183191 """
184192 read_bin = reg2bin (read .position , read .end_point )
0 commit comments