11import csv
22import io
3+ from pathlib import Path
34import sys
45import subprocess as sp
56from dataclasses import dataclass
6- from typing import Optional , List
7+ from typing import Literal , Optional , List
78
8- # Requires ish-aligner to have been compiled for the 3 widths of interest: (128, 256, 512)
9- ISH_128 = "/home/ubuntu/dev/ish/ish-aligner-128"
10- ISH_256 = "/home/ubuntu/dev/ish/ish-aligner-256"
11- ISH_512 = "/home/ubuntu/dev/ish/ish-aligner-512"
12- ISH_GPU = "/home/ubuntu/dev/ish/ish-aligner-gpu"
9+ import defopt
1310
14- PARASAIL_ALIGNER = "/home/ubuntu/dev/parasail/apps/parasail_aligner"
11+ # Requires ish-aligner to have been compiled for the 3 widths of interest: (128, 256, 512)
12+ # ISH_128 = "/home/ubuntu/dev/ish/ish-aligner-128"
13+ # ISH_256 = "/home/ubuntu/dev/ish/ish-aligner-256"
14+ # ISH_512 = "/home/ubuntu/dev/ish/ish-aligner-512"
15+ # ISH_GPU = "/home/ubuntu/dev/ish/ish-aligner-gpu"
16+ # curl https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2015_11/knowledgebase/uniprot_sprot-only2015_11.tar.gz --output uniprot_sprot-only2015_11.tar.gz
17+ # REF_DB = "/home/ubuntu/data/uniprot_sprot_5x.fasta"
1518
1619# From parasail data dir
1720# https://github.com/jeffdaily/parasail/tree/600fb26151ff19899ee39a214972dcf2b9b11ed7/data
1821QUERY_SEQS = {
19- "/home/ubuntu/dev/parasail/data/ P56980.fasta" : 24 ,
20- "/home/ubuntu/dev/parasail/data/ O29181.fasta" : 63 ,
21- "/home/ubuntu/dev/parasail/data/ O60341.fasta" : 852 ,
22- "/home/ubuntu/dev/parasail/data/ P00762.fasta" : 246 ,
23- "/home/ubuntu/dev/parasail/data/ P01008.fasta" : 464 ,
24- "/home/ubuntu/dev/parasail/data/ P01111.fasta" : 189 ,
25- "/home/ubuntu/dev/parasail/data/ P02232.fasta" : 144 ,
26- "/home/ubuntu/dev/parasail/data/ P03435.fasta" : 567 ,
27- "/home/ubuntu/dev/parasail/data/ P03630.fasta" : 127 ,
28- "/home/ubuntu/dev/parasail/data/ P03989.fasta" : 362 ,
29- "/home/ubuntu/dev/parasail/data/ P04775.fasta" : 2005 ,
30- "/home/ubuntu/dev/parasail/data/ P05013.fasta" : 189 ,
31- "/home/ubuntu/dev/parasail/data/ P07327.fasta" : 375 ,
32- "/home/ubuntu/dev/parasail/data/ P07756.fasta" : 1500 ,
33- "/home/ubuntu/dev/parasail/data/ P08519.fasta" : 4548 ,
34- "/home/ubuntu/dev/parasail/data/ P0C6B8.fasta" : 3564 ,
35- "/home/ubuntu/dev/parasail/data/ P10635.fasta" : 497 ,
36- "/home/ubuntu/dev/parasail/data/ P14942.fasta" : 222 ,
37- "/home/ubuntu/dev/parasail/data/ P19096.fasta" : 2504 ,
38- "/home/ubuntu/dev/parasail/data/ P20930.fasta" : 4061 ,
39- "/home/ubuntu/dev/parasail/data/ P21177.fasta" : 729 ,
40- "/home/ubuntu/dev/parasail/data/ P25705.fasta" : 553 ,
41- "/home/ubuntu/dev/parasail/data/ P27895.fasta" : 1000 ,
42- "/home/ubuntu/dev/parasail/data/ P28167.fasta" : 3005 ,
43- "/home/ubuntu/dev/parasail/data/ P33450.fasta" : 5147 ,
44- "/home/ubuntu/dev/parasail/data/ P42357.fasta" : 657 ,
45- "/home/ubuntu/dev/parasail/data/ P53765.fasta" : 255 ,
46- "/home/ubuntu/dev/parasail/data/ P58229.fasta" : 511 ,
47- "/home/ubuntu/dev/parasail/data/ Q7TMA5.fasta" : 4743 ,
48- "/home/ubuntu/dev/parasail/data/ Q8ZGB4.fasta" : 361 ,
49- "/home/ubuntu/dev/parasail/data/ Q9UKN1.fasta" : 5478 ,
22+ "P56980.fasta" : 24 ,
23+ "O29181.fasta" : 63 ,
24+ "O60341.fasta" : 852 ,
25+ "P00762.fasta" : 246 ,
26+ "P01008.fasta" : 464 ,
27+ "P01111.fasta" : 189 ,
28+ "P02232.fasta" : 144 ,
29+ "P03435.fasta" : 567 ,
30+ "P03630.fasta" : 127 ,
31+ "P03989.fasta" : 362 ,
32+ "P04775.fasta" : 2005 ,
33+ "P05013.fasta" : 189 ,
34+ "P07327.fasta" : 375 ,
35+ "P07756.fasta" : 1500 ,
36+ "P08519.fasta" : 4548 ,
37+ "P0C6B8.fasta" : 3564 ,
38+ "P10635.fasta" : 497 ,
39+ "P14942.fasta" : 222 ,
40+ "P19096.fasta" : 2504 ,
41+ "P20930.fasta" : 4061 ,
42+ "P21177.fasta" : 729 ,
43+ "P25705.fasta" : 553 ,
44+ "P27895.fasta" : 1000 ,
45+ "P28167.fasta" : 3005 ,
46+ "P33450.fasta" : 5147 ,
47+ "P42357.fasta" : 657 ,
48+ "P53765.fasta" : 255 ,
49+ "P58229.fasta" : 511 ,
50+ "Q7TMA5.fasta" : 4743 ,
51+ "Q8ZGB4.fasta" : 361 ,
52+ "Q9UKN1.fasta" : 5478 ,
5053}
5154
52- MATRIX = ["blosum62" , "blosum50" ]
53-
54- # curl https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2015_11/knowledgebase/uniprot_sprot-only2015_11.tar.gz --output uniprot_sprot-only2015_11.tar.gz
55- REF_DB = "/home/ubuntu/data/uniprot_sprot_5x.fasta"
56-
5755
5856@dataclass
5957class BenchmarkResults :
@@ -114,7 +112,9 @@ def to_csv(results: List["BenchmarkResults"]):
114112 )
115113
116114 @staticmethod
117- def from_ish_csv_str (csv_str : str , aligner : str , devices : int = 0 ) -> List ["BenchmarkResults" ]:
115+ def from_ish_csv_str (
116+ csv_str : str , aligner : str , devices : int = 0
117+ ) -> List ["BenchmarkResults" ]:
118118 csv_file = io .StringIO (csv_str )
119119 reader = csv .DictReader (
120120 csv_file ,
@@ -149,7 +149,7 @@ def from_parasail_blob_str(
149149 blob_str : str ,
150150 query_len : int ,
151151 instruction_set : str ,
152- score_size : int ,
152+ score_size : str ,
153153 aligner : str ,
154154 ) -> "BenchmarkResults" :
155155 file = io .StringIO (blob_str )
@@ -198,7 +198,7 @@ def run_parasail_aligner(
198198 gap_open_score = 3 ,
199199 gap_ext_score = 1 ,
200200 * ,
201- algo = "sg"
201+ algo = "sg" ,
202202):
203203
204204 scoring_matrix = scoring_matrix .lower ()
@@ -212,7 +212,9 @@ def run_parasail_aligner(
212212 raise ValueError ("Invalid score size" )
213213
214214 algorithm = (
215- f"{ algo } _striped_" + (instruction_set if instruction_set else "" ) + f"_{ score_size } "
215+ f"{ algo } _striped_"
216+ + (instruction_set if instruction_set else "" )
217+ + f"_{ score_size } "
216218 )
217219
218220 # fmt: off
@@ -239,7 +241,7 @@ def run_parasail_aligner(
239241 result = BenchmarkResults .from_parasail_blob_str (
240242 out .stdout ,
241243 query_len = query_len ,
242- instruction_set = instruction_set ,
244+ instruction_set = instruction_set if instruction_set else "" ,
243245 score_size = score_size ,
244246 aligner = "parasail_aligner" ,
245247 )
@@ -261,7 +263,7 @@ def run_ish_aligner(
261263 iterations = 3 ,
262264 devices = 0 ,
263265 * ,
264- algo = "striped-local"
266+ algo = "striped-local" ,
265267) -> Optional [BenchmarkResults ]:
266268 # fmt: off
267269 args = [
@@ -286,7 +288,9 @@ def run_ish_aligner(
286288 if "overflow" in out .stdout :
287289 print ("Overflow, no result for: " , " " .join (args ), file = sys .stderr )
288290 return None
289- result = BenchmarkResults .from_ish_csv_str (out .stdout , aligner = "ish-aligner" , devices = devices )[
291+ result = BenchmarkResults .from_ish_csv_str (
292+ out .stdout , aligner = "ish-aligner" , devices = devices
293+ )[
290294 0
291295 ] # Only take the first item since we're running this in such a way that only one will be there anyway
292296 except sp .CalledProcessError as e :
@@ -295,30 +299,53 @@ def run_ish_aligner(
295299 return result
296300
297301
298- def main ():
302+ ScoreSize = Literal ["byte" , "word" , "adaptive" ]
303+ ParasailInstructionSet = Literal ["sse41_128" , "neon_128" , "avx2_256" ]
304+ IshAlgorithm = Literal [
305+ "basic-semi-global-gpu-parallel" , "striped-semi-global" , "striped-local"
306+ ]
307+ ParasailAlgorithm = Literal ["sg" , "sw" ]
308+
309+
310+ def main (
311+ * ,
312+ ish_binaries : list [Path ],
313+ ish_algorithm : IshAlgorithm ,
314+ parasail_aligner_binary : Path = Path ("" ),
315+ parasail_algorithm : ParasailAlgorithm = "sg" ,
316+ query_seqs_base_dir : Path ,
317+ ref_data : Path ,
318+ score_sizes : list [ScoreSize ],
319+ parasail_instruction_sets : list [ParasailInstructionSet ] = [],
320+ output_dir : Path ,
321+ devices : int = 1 ,
322+ iterations : int = 3
323+ ):
299324
300325 # score_sizes = ["byte", "word", "adaptive"]
301- score_sizes = ["word" ]
326+ # score_sizes = ["word"]
302327
303328 writer = csv .DictWriter (sys .stdout , fieldnames = BenchmarkResults .HEADERS )
304329 writer .writeheader ()
305330
306331 results : List [BenchmarkResults ] = []
307- for ish in [ ISH_GPU ]: # , ISH_256, ISH_512]:
332+ for ish in ish_binaries : # , ISH_256, ISH_512]:
308333 for score_size in score_sizes :
309- for device in range (0 , 4 ):
334+ for device in range (0 , devices ):
310335 for query in QUERY_SEQS .keys ():
311- print (f"Running { ish } on { query } with { score_size } " , file = sys .stderr )
336+ print (
337+ f"Running { ish } on { query } with { score_size } " , file = sys .stderr
338+ )
312339 r = run_ish_aligner (
313- ish ,
314- query ,
315- REF_DB ,
340+ str ( ish ) ,
341+ str ( Path ( query_seqs_base_dir ) / query ) ,
342+ str ( ref_data ) ,
316343 score_size = score_size ,
317344 scoring_matrix = "Blosum62" ,
318- output_file = "/home/ubuntu/outputs/ ish-aligner.csv" ,
319- iterations = 3 ,
320- algo = "basic-semi-global-gpu-parallel" ,
321- devices = device + 1
345+ output_file = str ( output_dir / " ish-aligner-result .csv") ,
346+ iterations = iterations ,
347+ algo = ish_algorithm ,
348+ devices = device + 1 ,
322349 )
323350 if r :
324351 writer .writerow (
@@ -341,23 +368,20 @@ def main():
341368 )
342369 results .append (r )
343370
344- for inst in []:
345- # for inst in ["sse41_128"]:
346- # for inst in ["sse41_128", "avx2_256"]:
347- # for inst in ["neon_128"]:
371+ for inst in parasail_instruction_sets :
348372 for score_size in score_sizes :
349373 for query , query_len in QUERY_SEQS .items ():
350- print (f"Running { PARASAIL_ALIGNER } on { query } " , file = sys .stderr )
374+ print (f"Running { parasail_aligner_binary } on { query } " , file = sys .stderr )
351375 r = run_parasail_aligner (
352- PARASAIL_ALIGNER ,
353- REF_DB ,
354- query ,
376+ str ( parasail_aligner_binary ) ,
377+ str ( ref_data ) ,
378+ str ( Path ( query_seqs_base_dir ) / query ) ,
355379 query_len ,
356380 instruction_set = inst ,
357381 score_size = score_size ,
358382 scoring_matrix = "Blosum62" ,
359- output_file = "/home/ubuntu/outputs/ parasail-aligner.csv" ,
360- algo = "sg"
383+ output_file = str ( output_dir / " parasail-aligner-result .csv") ,
384+ algo = parasail_algorithm ,
361385 )
362386 if r :
363387 writer .writerow (
@@ -379,8 +403,8 @@ def main():
379403 )
380404 results .append (r )
381405
382- #BenchmarkResults.to_csv(results)
406+ # BenchmarkResults.to_csv(results)
383407
384408
385409if __name__ == "__main__" :
386- main ( )
410+ defopt . run ( main )
0 commit comments