Skip to content

Commit f0a340a

Browse files
committed
add fasta input support to the databases in pipeline
1 parent e4b0e3c commit f0a340a

4 files changed

Lines changed: 462 additions & 2 deletions

File tree

pipeline.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from PROBESt.args import arguments_parse
4141
from PROBESt.modeling import run_modeling
4242
from PROBESt.dedegeneration import run_dedegeneration
43+
from PROBESt.prepare_blast import prepare_bases_if_needed
4344

4445
# Functions
4546

@@ -68,6 +69,19 @@ def merge_iter(iter: int):
6869
print("\n---- PROBESt v.0.2.0 ----\n")
6970
print("Arguments passed")
7071

72+
# Create output directory early (needed for BLAST database preparation)
73+
os.makedirs(args.output, exist_ok=True)
74+
75+
# 0.1. Prepare BLAST databases from FASTA directories if needed ----
76+
args.true_base, args.false_base = prepare_bases_if_needed(
77+
true_base=args.true_base,
78+
false_bases=args.false_base,
79+
output_dir=args.output,
80+
contig_table_path=args.contig_table,
81+
tmp_dir=args.prep_db_tmp,
82+
script_path=args.script_path
83+
)
84+
7185
# Create TMP
7286
os.makedirs(out_dir(0), exist_ok=True)
7387

src/PROBESt/args.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,20 @@ def arguments_parse():
4444
help="Path(s) to the BLAST database(s) for non-specific testing. These databases are used to filter out non-specific probes. Wildcards are not accepted.")
4545

4646
parser.add_argument("-c", "--contig_table",
47-
required=True,
48-
help="Path to a .tsv table containing BLAST database information.")
47+
required=False,
48+
default=None,
49+
help="Path to a .tsv table containing BLAST database information. If not provided and FASTA directories are used for bases, it will be auto-generated in the output directory.")
4950

5051
parser.add_argument("-o", "--output",
5152
required=True,
5253
help="Output directory path for storing results.")
5354

55+
# BLAST database preparation arguments (for FASTA directory inputs)
56+
parser.add_argument("--prep_db_tmp",
57+
required=False,
58+
default=None,
59+
help="Temporary directory for BLAST database preparation when using FASTA directories as input. If not provided, a temporary directory will be created in the output directory.")
60+
5461
parser.add_argument("-t", "--threads",
5562
required=False,
5663
default="1",
@@ -286,4 +293,8 @@ def arguments_parse():
286293
if args.output_tmp == "":
287294
args.output_tmp = args.output + "/.tmp/"
288295

296+
# Set default contig_table if not provided
297+
if args.contig_table is None:
298+
args.contig_table = args.output + "/contigs.tsv"
299+
289300
return args

src/PROBESt/prepare_blast.py

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
"""
2+
Module for preparing BLAST databases from FASTA directories.
3+
4+
This module provides functions to detect whether input paths are directories
5+
containing FASTA files (instead of pre-built BLAST databases) and to convert
6+
them to BLAST databases using the prep_db.sh script.
7+
"""
8+
9+
import os
10+
import subprocess
11+
import glob
12+
from typing import List, Tuple, Optional
13+
14+
15+
# Supported FASTA file extensions (plain and gzip-compressed variants)
FASTA_EXTENSIONS = {'.fa', '.fasta', '.fna', '.fa.gz', '.fasta.gz', '.fna.gz'}


def is_fasta_directory(path: str) -> bool:
    """
    Check if the given path is a directory containing FASTA files.

    A directory entry counts as a FASTA file when it is a regular file
    (or a symlink to one) whose name ends with one of FASTA_EXTENSIONS.
    Only the directory itself is inspected; sub-directories are not searched.

    Args:
        path: Path to check.

    Returns:
        True if path is a directory containing at least one FASTA file,
        False otherwise (including when path does not exist or is a file).
    """
    if not os.path.isdir(path):
        return False

    # Escape the directory part so glob metacharacters in its name
    # (e.g. "genomes[v1]") are matched literally rather than as a pattern.
    escaped_dir = glob.escape(path)

    # isfile() keeps a sub-directory that merely happens to be called
    # "something.fa" from making the directory look like FASTA input.
    return any(
        os.path.isfile(candidate)
        for ext in FASTA_EXTENSIONS
        for candidate in glob.iglob(os.path.join(escaped_dir, f"*{ext}"))
    )
39+
40+
41+
def get_fasta_files(directory: str) -> List[str]:
    """
    Get all FASTA files from a directory.

    Collects the regular files directly inside the directory whose names end
    with one of FASTA_EXTENSIONS. Sub-directories are not searched, and
    directory entries that only look like FASTA files by name are ignored.

    Args:
        directory: Path to the directory containing FASTA files.

    Returns:
        Sorted list of paths to FASTA files (empty if none are found).
    """
    # Escape the directory part so glob metacharacters in its name
    # (e.g. "genomes[v1]") are matched literally rather than as a pattern.
    escaped_dir = glob.escape(directory)

    fasta_files = [
        candidate
        for ext in FASTA_EXTENSIONS
        for candidate in glob.glob(os.path.join(escaped_dir, f"*{ext}"))
        if os.path.isfile(candidate)
    ]
    # Sorting makes the result independent of set iteration order.
    return sorted(fasta_files)
56+
57+
58+
def prepare_blast_database(
    fasta_dir: str,
    output_db_path: str,
    contig_table_path: str,
    tmp_dir: Optional[str] = None,
    script_path: Optional[str] = None
) -> str:
    """
    Build a BLAST database from the FASTA files found in a directory.

    The FASTA files are collected with get_fasta_files() and handed to the
    prep_db.sh script, which is run through bash as a subprocess.

    Args:
        fasta_dir: Directory that holds the input FASTA files.
        output_db_path: Database path, passed to prep_db.sh via "-n".
        contig_table_path: Contig table path, passed to prep_db.sh via "-c".
        tmp_dir: Optional temporary directory, passed via "-t" when truthy.
        script_path: Directory that contains prep_db.sh. When None, the
            "scripts/generator" directory three levels above this module
            is used.

    Returns:
        output_db_path, unchanged, once the script has exited successfully.

    Raises:
        ValueError: If no FASTA files are found in fasta_dir.
        FileNotFoundError: If prep_db.sh does not exist in script_path.
        RuntimeError: If prep_db.sh exits with a non-zero return code.
    """
    inputs = get_fasta_files(fasta_dir)
    if not inputs:
        raise ValueError(f"No FASTA files found in directory: {fasta_dir}")

    if script_path is None:
        # Climb three directory levels from this module, then descend
        # into scripts/generator.
        anchor = os.path.abspath(__file__)
        for _ in range(3):
            anchor = os.path.dirname(anchor)
        script_path = os.path.join(anchor, "scripts", "generator")

    launcher = os.path.join(script_path, "prep_db.sh")
    if not os.path.exists(launcher):
        raise FileNotFoundError(f"prep_db.sh script not found at: {launcher}")

    # Make sure the parent directories of both outputs exist; a bare
    # filename has no parent component and maps to the current directory.
    for target in (output_db_path, contig_table_path):
        parent = os.path.dirname(target)
        os.makedirs(parent if parent else '.', exist_ok=True)

    tmp_flag = ["-t", tmp_dir] if tmp_dir else []
    command = [
        "bash", launcher,
        "-n", output_db_path,
        "-c", contig_table_path,
        *tmp_flag,
        *inputs,
    ]

    # Output is captured so it can be included in the error on failure.
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return output_db_path

    raise RuntimeError(
        f"prep_db.sh failed with return code {completed.returncode}.\n"
        f"stdout: {completed.stdout}\n"
        f"stderr: {completed.stderr}"
    )
132+
133+
134+
def _fasta_dir_label(path: str) -> str:
    """Return the final directory name of *path* for use in a database name."""
    # abspath normalizes the path first, so trailing separators of either
    # flavor are dropped and "." / ".." resolve to a real directory name
    # instead of yielding labels such as "." or an empty string.
    return os.path.basename(os.path.abspath(path))


def _build_db_from_fasta(
    fasta_dir: str,
    db_path: str,
    contig_table: str,
    tmp_dir: Optional[str],
    script_path: Optional[str]
) -> str:
    """Build one BLAST database from *fasta_dir*, reporting progress on stdout."""
    print(f"Preparing BLAST database from FASTA directory: {fasta_dir}")
    prepare_blast_database(
        fasta_dir=fasta_dir,
        output_db_path=db_path,
        contig_table_path=contig_table,
        tmp_dir=tmp_dir,
        script_path=script_path
    )
    print(f"Created BLAST database: {db_path}")
    return db_path


def prepare_bases_if_needed(
    true_base: str,
    false_bases: List[str],
    output_dir: str,
    contig_table_path: str,
    tmp_dir: Optional[str] = None,
    script_path: Optional[str] = None
) -> Tuple[str, List[str]]:
    """
    Prepare BLAST databases from FASTA directories if needed.

    Each base that is_fasta_directory() recognises is converted into a BLAST
    database under "<output_dir>/.blast_db"; every other path is returned
    untouched (assumed to be a pre-built BLAST database). The ".blast_db"
    directory is created even when no conversion is required.

    Naming of generated files:
        * true base database:   ".blast_db/true_<dirname>"
        * false base databases: ".blast_db/false_<index>_<dirname>"
        * true base contig table: contig_table_path, or
          ".blast_db/<contig_table_path>" when it is a bare filename
        * false base contig tables: ".blast_db/contigs_false_<index>.tsv"

    Args:
        true_base: Path to true base (BLAST database or FASTA directory).
        false_bases: List of paths to false bases (BLAST databases or FASTA directories).
        output_dir: Output directory for created BLAST databases.
        contig_table_path: Path for the contig names output file.
        tmp_dir: Optional temporary directory for intermediate files.
        script_path: Path to the directory containing prep_db.sh script.

    Returns:
        Tuple of (processed_true_base, processed_false_bases) paths to BLAST
        databases, with false bases in the same order as the input list.

    Raises:
        Whatever prepare_blast_database() raises for a FASTA directory.
    """
    blast_db_dir = os.path.join(output_dir, ".blast_db")
    os.makedirs(blast_db_dir, exist_ok=True)

    # Process true base
    if is_fasta_directory(true_base):
        # NOTE(review): with a bare filename the table is written into
        # .blast_db while the caller's contig_table_path is not updated —
        # confirm the caller looks for it there.
        if os.path.dirname(contig_table_path):
            true_contig_table = contig_table_path
        else:
            true_contig_table = os.path.join(blast_db_dir, contig_table_path)

        processed_true_base = _build_db_from_fasta(
            true_base,
            os.path.join(blast_db_dir, f"true_{_fasta_dir_label(true_base)}"),
            true_contig_table,
            tmp_dir,
            script_path,
        )
    else:
        processed_true_base = true_base

    # Process false bases; the index keeps names unique when two inputs
    # share the same directory name.
    processed_false_bases = []
    for i, false_base in enumerate(false_bases):
        if not is_fasta_directory(false_base):
            processed_false_bases.append(false_base)
            continue

        processed_false_bases.append(
            _build_db_from_fasta(
                false_base,
                os.path.join(
                    blast_db_dir, f"false_{i}_{_fasta_dir_label(false_base)}"
                ),
                os.path.join(blast_db_dir, f"contigs_false_{i}.tsv"),
                tmp_dir,
                script_path,
            )
        )

    return processed_true_base, processed_false_bases

0 commit comments

Comments
 (0)