Skip to content

Commit 93fb94e

Browse files
committed
feat: updates for v0.25.6 and improved benchmarking utilities
Signed-off-by: Seth Stadick <sstadick@gmail.com>
1 parent 0c88df6 commit 93fb94e

52 files changed

Lines changed: 5330 additions & 9018 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
*.egg-info
55
# magic environments
66
.magic
7-
bench
7+
bench
8+
bench_results

BENCHMARKING.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Benchmarking
2+
3+
`pixi run bench-all` will download all needed data, and compile parasail.
4+
5+
## Prereqs and data
6+
7+
1. You will need to clone and build parasail
8+
9+
```bash
10+
sudo apt-get install libtool
11+
git clone https://github.com/jeffdaily/parasail
12+
cd parasail
13+
autoreconf -fi
14+
./configure
15+
make -j $(nproc)
16+
```
17+
18+
2. You will need to have mojo and pixi installed. See the Modular website for install instructions.
19+
20+
## Data
21+
22+
1. Create the benchmarking and data dir
23+
24+
```bash
25+
mkdir -p bench/data && cd bench/data
26+
```
27+
28+
2. The sequences used for benchmarking are the same as those used in the [parasail paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0930-z). To get the query sequences, run the following:
29+
30+
```bash
31+
git clone https://github.com/jeffdaily/parasail
32+
```
33+
34+
The sequences will be in `bench/data/parasail/data`.
35+
36+
3. The reference data can be downloaded with:
37+
38+
```bash
39+
mkdir refdata && cd refdata
40+
curl https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2015_11/knowledgebase/uniprot_sprot-only2015_11.tar.gz --output uniprot_sprot-only2015_11.tar.gz
41+
tar -xvzf uniprot_sprot-only2015_11.tar.gz
42+
```

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11

22
# Next
33

4+
# v1.4.0 (10-31-2025)
5+
6+
- Support for Mojo v0.25.6
7+
- Improved benchmarking pipeline via `pixi run bench-all-cpu` and `pixi run bench-all-gpu`
8+
9+
410
# v1.3.1 (07-23-2025)
511

612
- Revert to lockfile from v1.2.1 to workaround modular-community package `regex` and conda channel ordering.

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,18 @@ This is a benchmarking tool based on `parasail_aligner`.
132132
>
133133
> `ish-aligner` and all variations of it are for development purposes only.
134134
135+
## Running benchmarks
136+
137+
```
138+
pixi run bench-all-cpu
139+
# And if you have a Tier 1 or Tier 2 supported GPU
140+
pixi run bench-all-gpu
141+
```
142+
143+
This will download all bench data needed, run the benchmarks, and produce plots. Look in `bench_results` upon completion.
144+
145+
**Note:** if you run or build individual benchmark binaries, the `SIMD_MOD` argument can be `sse`, `avx2`, or `avx512`. Regardless of whether your hardware natively supports SIMD vectors of the requested width (for example, avx2), Mojo will simulate vectors of that width when they are not available.
146+
135147
## Further Reading
136148
137149
The associated paper can be found [here](https://www.biorxiv.org/content/10.1101/2025.06.04.657890v1).

benchmarking/bench_scripts/bench.py

Lines changed: 98 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,57 @@
11
import csv
22
import io
3+
from pathlib import Path
34
import sys
45
import subprocess as sp
56
from dataclasses import dataclass
6-
from typing import Optional, List
7+
from typing import Literal, Optional, List
78

8-
# Requires ish-aligner to have been compiled for the 3 widths of interest: (128, 256, 512)
9-
ISH_128 = "/home/ubuntu/dev/ish/ish-aligner-128"
10-
ISH_256 = "/home/ubuntu/dev/ish/ish-aligner-256"
11-
ISH_512 = "/home/ubuntu/dev/ish/ish-aligner-512"
12-
ISH_GPU = "/home/ubuntu/dev/ish/ish-aligner-gpu"
9+
import defopt
1310

14-
PARASAIL_ALIGNER = "/home/ubuntu/dev/parasail/apps/parasail_aligner"
11+
# Requires ish-aligner to have been compiled for the 3 widths of interest: (128, 256, 512)
12+
# ISH_128 = "/home/ubuntu/dev/ish/ish-aligner-128"
13+
# ISH_256 = "/home/ubuntu/dev/ish/ish-aligner-256"
14+
# ISH_512 = "/home/ubuntu/dev/ish/ish-aligner-512"
15+
# ISH_GPU = "/home/ubuntu/dev/ish/ish-aligner-gpu"
16+
# curl https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2015_11/knowledgebase/uniprot_sprot-only2015_11.tar.gz --output uniprot_sprot-only2015_11.tar.gz
17+
# REF_DB = "/home/ubuntu/data/uniprot_sprot_5x.fasta"
1518

1619
# From parasail data dir
1720
# https://github.com/jeffdaily/parasail/tree/600fb26151ff19899ee39a214972dcf2b9b11ed7/data
1821
QUERY_SEQS = {
19-
"/home/ubuntu/dev/parasail/data/P56980.fasta": 24,
20-
"/home/ubuntu/dev/parasail/data/O29181.fasta": 63,
21-
"/home/ubuntu/dev/parasail/data/O60341.fasta": 852,
22-
"/home/ubuntu/dev/parasail/data/P00762.fasta": 246,
23-
"/home/ubuntu/dev/parasail/data/P01008.fasta": 464,
24-
"/home/ubuntu/dev/parasail/data/P01111.fasta": 189,
25-
"/home/ubuntu/dev/parasail/data/P02232.fasta": 144,
26-
"/home/ubuntu/dev/parasail/data/P03435.fasta": 567,
27-
"/home/ubuntu/dev/parasail/data/P03630.fasta": 127,
28-
"/home/ubuntu/dev/parasail/data/P03989.fasta": 362,
29-
"/home/ubuntu/dev/parasail/data/P04775.fasta": 2005,
30-
"/home/ubuntu/dev/parasail/data/P05013.fasta": 189,
31-
"/home/ubuntu/dev/parasail/data/P07327.fasta": 375,
32-
"/home/ubuntu/dev/parasail/data/P07756.fasta": 1500,
33-
"/home/ubuntu/dev/parasail/data/P08519.fasta": 4548,
34-
"/home/ubuntu/dev/parasail/data/P0C6B8.fasta": 3564,
35-
"/home/ubuntu/dev/parasail/data/P10635.fasta": 497,
36-
"/home/ubuntu/dev/parasail/data/P14942.fasta": 222,
37-
"/home/ubuntu/dev/parasail/data/P19096.fasta": 2504,
38-
"/home/ubuntu/dev/parasail/data/P20930.fasta": 4061,
39-
"/home/ubuntu/dev/parasail/data/P21177.fasta": 729,
40-
"/home/ubuntu/dev/parasail/data/P25705.fasta": 553,
41-
"/home/ubuntu/dev/parasail/data/P27895.fasta": 1000,
42-
"/home/ubuntu/dev/parasail/data/P28167.fasta": 3005,
43-
"/home/ubuntu/dev/parasail/data/P33450.fasta": 5147,
44-
"/home/ubuntu/dev/parasail/data/P42357.fasta": 657,
45-
"/home/ubuntu/dev/parasail/data/P53765.fasta": 255,
46-
"/home/ubuntu/dev/parasail/data/P58229.fasta": 511,
47-
"/home/ubuntu/dev/parasail/data/Q7TMA5.fasta": 4743,
48-
"/home/ubuntu/dev/parasail/data/Q8ZGB4.fasta": 361,
49-
"/home/ubuntu/dev/parasail/data/Q9UKN1.fasta": 5478,
22+
"P56980.fasta": 24,
23+
"O29181.fasta": 63,
24+
"O60341.fasta": 852,
25+
"P00762.fasta": 246,
26+
"P01008.fasta": 464,
27+
"P01111.fasta": 189,
28+
"P02232.fasta": 144,
29+
"P03435.fasta": 567,
30+
"P03630.fasta": 127,
31+
"P03989.fasta": 362,
32+
"P04775.fasta": 2005,
33+
"P05013.fasta": 189,
34+
"P07327.fasta": 375,
35+
"P07756.fasta": 1500,
36+
"P08519.fasta": 4548,
37+
"P0C6B8.fasta": 3564,
38+
"P10635.fasta": 497,
39+
"P14942.fasta": 222,
40+
"P19096.fasta": 2504,
41+
"P20930.fasta": 4061,
42+
"P21177.fasta": 729,
43+
"P25705.fasta": 553,
44+
"P27895.fasta": 1000,
45+
"P28167.fasta": 3005,
46+
"P33450.fasta": 5147,
47+
"P42357.fasta": 657,
48+
"P53765.fasta": 255,
49+
"P58229.fasta": 511,
50+
"Q7TMA5.fasta": 4743,
51+
"Q8ZGB4.fasta": 361,
52+
"Q9UKN1.fasta": 5478,
5053
}
5154

52-
MATRIX = ["blosum62", "blosum50"]
53-
54-
# curl https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2015_11/knowledgebase/uniprot_sprot-only2015_11.tar.gz --output uniprot_sprot-only2015_11.tar.gz
55-
REF_DB = "/home/ubuntu/data/uniprot_sprot_5x.fasta"
56-
5755

5856
@dataclass
5957
class BenchmarkResults:
@@ -114,7 +112,9 @@ def to_csv(results: List["BenchmarkResults"]):
114112
)
115113

116114
@staticmethod
117-
def from_ish_csv_str(csv_str: str, aligner: str, devices: int = 0) -> List["BenchmarkResults"]:
115+
def from_ish_csv_str(
116+
csv_str: str, aligner: str, devices: int = 0
117+
) -> List["BenchmarkResults"]:
118118
csv_file = io.StringIO(csv_str)
119119
reader = csv.DictReader(
120120
csv_file,
@@ -149,7 +149,7 @@ def from_parasail_blob_str(
149149
blob_str: str,
150150
query_len: int,
151151
instruction_set: str,
152-
score_size: int,
152+
score_size: str,
153153
aligner: str,
154154
) -> "BenchmarkResults":
155155
file = io.StringIO(blob_str)
@@ -198,7 +198,7 @@ def run_parasail_aligner(
198198
gap_open_score=3,
199199
gap_ext_score=1,
200200
*,
201-
algo="sg"
201+
algo="sg",
202202
):
203203

204204
scoring_matrix = scoring_matrix.lower()
@@ -212,7 +212,9 @@ def run_parasail_aligner(
212212
raise ValueError("Invalid score size")
213213

214214
algorithm = (
215-
f"{algo}_striped_" + (instruction_set if instruction_set else "") + f"_{score_size}"
215+
f"{algo}_striped_"
216+
+ (instruction_set if instruction_set else "")
217+
+ f"_{score_size}"
216218
)
217219

218220
# fmt: off
@@ -239,7 +241,7 @@ def run_parasail_aligner(
239241
result = BenchmarkResults.from_parasail_blob_str(
240242
out.stdout,
241243
query_len=query_len,
242-
instruction_set=instruction_set,
244+
instruction_set=instruction_set if instruction_set else "",
243245
score_size=score_size,
244246
aligner="parasail_aligner",
245247
)
@@ -261,7 +263,7 @@ def run_ish_aligner(
261263
iterations=3,
262264
devices=0,
263265
*,
264-
algo="striped-local"
266+
algo="striped-local",
265267
) -> Optional[BenchmarkResults]:
266268
# fmt: off
267269
args = [
@@ -286,7 +288,9 @@ def run_ish_aligner(
286288
if "overflow" in out.stdout:
287289
print("Overflow, no result for: ", " ".join(args), file=sys.stderr)
288290
return None
289-
result = BenchmarkResults.from_ish_csv_str(out.stdout, aligner="ish-aligner", devices=devices)[
291+
result = BenchmarkResults.from_ish_csv_str(
292+
out.stdout, aligner="ish-aligner", devices=devices
293+
)[
290294
0
291295
] # Only take the first item since we're running this in such a way that only one will be there anyways
292296
except sp.CalledProcessError as e:
@@ -295,30 +299,53 @@ def run_ish_aligner(
295299
return result
296300

297301

298-
def main():
302+
ScoreSize = Literal["byte", "word", "adaptive"]
303+
ParasailInstructionSet = Literal["sse41_128", "neon_128", "avx2_256"]
304+
IshAlgorithm = Literal[
305+
"basic-semi-global-gpu-parallel", "striped-semi-global", "striped-local"
306+
]
307+
ParasailAlgorithm = Literal["sg", "sw"]
308+
309+
310+
def main(
311+
*,
312+
ish_binaries: list[Path],
313+
ish_algorithm: IshAlgorithm,
314+
parasail_aligner_binary: Path = Path(""),
315+
parasail_algorithm: ParasailAlgorithm = "sg",
316+
query_seqs_base_dir: Path,
317+
ref_data: Path,
318+
score_sizes: list[ScoreSize],
319+
parasail_instruction_sets: list[ParasailInstructionSet] = [],
320+
output_dir: Path,
321+
devices: int = 1,
322+
iterations: int = 3
323+
):
299324

300325
# score_sizes = ["byte", "word", "adaptive"]
301-
score_sizes = ["word"]
326+
# score_sizes = ["word"]
302327

303328
writer = csv.DictWriter(sys.stdout, fieldnames=BenchmarkResults.HEADERS)
304329
writer.writeheader()
305330

306331
results: List[BenchmarkResults] = []
307-
for ish in [ISH_GPU]: #, ISH_256, ISH_512]:
332+
for ish in ish_binaries: # , ISH_256, ISH_512]:
308333
for score_size in score_sizes:
309-
for device in range(0, 4):
334+
for device in range(0, devices):
310335
for query in QUERY_SEQS.keys():
311-
print(f"Running {ish} on {query} with {score_size}", file=sys.stderr)
336+
print(
337+
f"Running {ish} on {query} with {score_size}", file=sys.stderr
338+
)
312339
r = run_ish_aligner(
313-
ish,
314-
query,
315-
REF_DB,
340+
str(ish),
341+
str(Path(query_seqs_base_dir) / query),
342+
str(ref_data),
316343
score_size=score_size,
317344
scoring_matrix="Blosum62",
318-
output_file="/home/ubuntu/outputs/ish-aligner.csv",
319-
iterations=3,
320-
algo="basic-semi-global-gpu-parallel",
321-
devices = device + 1
345+
output_file=str(output_dir / "ish-aligner-result.csv"),
346+
iterations=iterations,
347+
algo=ish_algorithm,
348+
devices=device + 1,
322349
)
323350
if r:
324351
writer.writerow(
@@ -341,23 +368,20 @@ def main():
341368
)
342369
results.append(r)
343370

344-
for inst in []:
345-
# for inst in ["sse41_128"]:
346-
# for inst in ["sse41_128", "avx2_256"]:
347-
# for inst in ["neon_128"]:
371+
for inst in parasail_instruction_sets:
348372
for score_size in score_sizes:
349373
for query, query_len in QUERY_SEQS.items():
350-
print(f"Running {PARASAIL_ALIGNER} on {query}", file=sys.stderr)
374+
print(f"Running {parasail_aligner_binary} on {query}", file=sys.stderr)
351375
r = run_parasail_aligner(
352-
PARASAIL_ALIGNER,
353-
REF_DB,
354-
query,
376+
str(parasail_aligner_binary),
377+
str(ref_data),
378+
str(Path(query_seqs_base_dir) / query),
355379
query_len,
356380
instruction_set=inst,
357381
score_size=score_size,
358382
scoring_matrix="Blosum62",
359-
output_file="/home/ubuntu/outputs/parasail-aligner.csv",
360-
algo="sg"
383+
output_file=str(output_dir / "parasail-aligner-result.csv"),
384+
algo=parasail_algorithm,
361385
)
362386
if r:
363387
writer.writerow(
@@ -379,8 +403,8 @@ def main():
379403
)
380404
results.append(r)
381405

382-
#BenchmarkResults.to_csv(results)
406+
# BenchmarkResults.to_csv(results)
383407

384408

385409
if __name__ == "__main__":
386-
main()
410+
defopt.run(main)

0 commit comments

Comments
 (0)