Skip to content

Commit a457bb9

Browse files
author
feeka
committed
Added advanced settings and usability
1 parent 1f8e0c9 commit a457bb9

9 files changed

Lines changed: 652 additions & 549 deletions

File tree

include/cycle_finder.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <stack>
1010
#include <omp.h>
1111
#include "path_writer.h"
12+
#include "settings.h"
1213
#include <unordered_set>
1314
//#include "progressbar.hpp"
1415
#include <list>
@@ -23,15 +24,15 @@ using namespace std;
2324

2425
class CycleFinder {
2526
private:
26-
uint8_t maximal_length;
27-
uint8_t minimal_length;
27+
// Use Settings to configure CycleFinder globally
28+
Settings& settings;
2829
static constexpr size_t MAX_EDGE_COUNT = 4;
29-
SDBG& sdbg;
30-
30+
// Use SDBG pointer from settings everywhere instead of storing a separate reference
31+
//SDBG& sdbg;
3132
uint16_t cluster_bounds;
3233
vector<bool> visited;
3334
vector<bool> look_up_table;
34-
int threads_count;
35+
// thread count obtained from settings
3536

3637
//#### DEVELOPER FUNCTIONS ####
3738
void _WriteStartNodesToFile(const map<int, vector<uint64_t>, greater<int>>& start_nodes_chunked, const std::string& filename);
@@ -53,8 +54,8 @@ class CycleFinder {
5354
//#### HELPER FUNCTIONS FOR DLS ####
5455

5556
public:
56-
string genome_name;
57-
CycleFinder(SDBG& sdbg, int length_bound, int minimal_length, string genome_name,int threads_count);
57+
// genome/cycles folder available via settings
58+
CycleFinder(Settings& settings);
5859
//write a getter for results
5960
unordered_map<uint64_t, vector<vector<uint64_t>>> results;
6061

include/phage_curator.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class PhageCurator {
5050
std::vector<std::vector<uint64_t>> BeamSearchPathsAvoiding(uint64_t start, int lower, int higher, const std::set<uint64_t>& forbidden, int beam_width, double min_mult, double max_mult, std::function<void(const std::vector<uint64_t>&)> path_callback = nullptr);
5151
std::map<std::string,vector<string>> FindQualityPathsBeamSearchFromGroupedPaths(int min_length, int max_length, const std::string& filename, int beam_width);
5252
std::string ComputeConsensusForCurrentGroup(vector<string> sequences);
53+
std::vector<vector<uint64_t>> GetTopPathsFromBeamPaths(const std::vector<std::vector<uint64_t>>& beam_paths,int max,int min,size_t top_n);
54+
5355
};
5456

5557
#endif // PHAGE_CURATOR_H

include/settings.h

Lines changed: 140 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,53 @@
44
#include <iostream>
55
#include <string>
66
#include <map>
7+
#include <vector>
78
#include <thread>
89
#include <filesystem>
910
#include <chrono>
1011
#include <iomanip>
1112
#include <sstream>
13+
#include <algorithm>
14+
#include <stdexcept>
15+
#include <cctype>
16+
#include "sdbg/sdbg.h"
17+
#include <fstream>
1218

1319
using namespace std;
1420
namespace fs = std::filesystem;
1521

1622
struct Settings {
17-
std::string input_files;
18-
double ram = 0.0; // In gigabytes
19-
size_t threads = 0;
20-
std::string output_folder = "out";
21-
std::string graph_folder;
22-
std::string cycles_folder;
23-
std::string output_file;
23+
std::string input_files; // Path to the input files
24+
double ram = 0.0; // Maximum RAM usage in gigabytes
25+
size_t threads = 0; // Number of threads to use
26+
std::string output_folder; // Output directory path. Empty until chosen by CLI or settings file.
27+
std::string graph_folder; // Folder for graph data
28+
std::string cycles_folder; // Folder for cycle data
29+
std::string output_file; // Path to the main output file
30+
// Sdbg
31+
32+
struct CycleFinderSettings {
33+
uint64_t threshold_multiplicity = 20; // Minimum multiplicity threshold
34+
bool low_abundance = true; // Flag to enable low abundance mode
35+
int cycle_max_length = 77; // Maximum length of a cycle
36+
int cycle_min_length = 27; // Minimum length of a cycle
37+
} cycle_finder_settings;
38+
struct DNASequenceSettings {
39+
int spacer_min_length = 23; // Minimum length of a spacer
40+
int spacer_max_length = 50; // Maximum length of a spacer
41+
int repeat_min_length = 23; // Minimum length of a repeat
42+
int repeat_max_length = 50; // Maximum length of a repeat
43+
} dna_sequence_settings;
44+
45+
SDBG* sdbg = nullptr; // Pointer to the SDBG graph
2446

2547
Settings() {
26-
// Generate timestamp for folder structure
27-
string timestamp = get_timestamp();
28-
output_folder = "" + timestamp;
29-
graph_folder = output_folder + "/graph";
30-
cycles_folder = output_folder + "/cycles";
31-
output_file = output_folder + "/CRISPR_Arrays.txt";
48+
// Defaults are intentionally empty so parse_arguments can apply
49+
// a timestamp-based default only if neither CLI nor settings file specify the output folder.
50+
output_folder = "";
51+
graph_folder = "";
52+
cycles_folder = "";
53+
output_file = "";
3254
}
3355

3456
// Generate timestamp in YYYY-MM-DD_HH-MM-SS format
@@ -91,6 +113,110 @@ struct Settings {
91113
}
92114
return erroneous_properties;
93115
}
116+
117+
// Read settings values from a simple key=value file. Lines starting with # or // are ignored.
118+
// Keys mirror struct property names; examples:
119+
// input_files=/path/a.fa /path/b.fa
120+
// ram=4G
121+
// threads=4
122+
// cycle_max_length=77
123+
// cycle_min_length=27
124+
// threshold_multiplicity=20
125+
// low_abundance=true
126+
bool LoadFromFile(const std::string& path) {
127+
std::ifstream file(path);
128+
if (!file.is_open()) {
129+
std::cerr << "Could not open settings file: " << path << std::endl;
130+
return false;
131+
}
132+
std::string line;
133+
auto trim = [](std::string s) {
134+
// trim in place
135+
const char* ws = " \t\n\r\f\v";
136+
s.erase(0, s.find_first_not_of(ws));
137+
s.erase(s.find_last_not_of(ws) + 1);
138+
return s;
139+
};
140+
while (std::getline(file, line)) {
141+
// Remove comments
142+
size_t posc = line.find('#');
143+
if (posc != std::string::npos) line = line.substr(0, posc);
144+
size_t pos2 = line.find("//");
145+
if (pos2 != std::string::npos) line = line.substr(0, pos2);
146+
string s = trim(line);
147+
if (s.empty()) continue;
148+
size_t eq = s.find('=');
149+
if (eq == std::string::npos) continue;
150+
string key = trim(s.substr(0, eq));
151+
string val = trim(s.substr(eq + 1));
152+
153+
// Interpret known keys
154+
if (key == "input_files") {
155+
// allow space/comma/semicolon-separated file list; normalize to single-space separated
156+
vector<string> tokens;
157+
string cur;
158+
for (char c : val) {
159+
if (c == ',' || c == ';') c = ' ';
160+
if (!isspace(static_cast<unsigned char>(c))) {
161+
cur.push_back(c);
162+
} else {
163+
if (!cur.empty()) {
164+
tokens.push_back(cur);
165+
cur.clear();
166+
}
167+
}
168+
}
169+
if (!cur.empty()) tokens.push_back(cur);
170+
// join with single space so older code (SDBGBuild) gets consistent formatting
171+
this->input_files.clear();
172+
for (size_t i = 0; i < tokens.size(); ++i) {
173+
this->input_files += tokens[i];
174+
if (i + 1 < tokens.size()) this->input_files += " ";
175+
}
176+
} else if (key == "ram") {
177+
// reuse same parsing as CLI: accept B/K/M/G suffix
178+
try {
179+
double value = 0.0;
180+
char unit = 'G';
181+
size_t p = val.find_first_not_of("0123456789.");
182+
if (p != std::string::npos) {
183+
value = stod(val.substr(0, p));
184+
unit = toupper(val[p]);
185+
} else {
186+
value = stod(val);
187+
}
188+
switch (unit) {
189+
case 'B': this->ram = value / (1024.0 * 1024.0 * 1024.0); break;
190+
case 'K': this->ram = value / (1024.0 * 1024.0); break;
191+
case 'M': this->ram = value / 1024.0; break;
192+
case 'G': this->ram = value; break;
193+
default: throw runtime_error("Invalid RAM unit in settings file: " + val);
194+
}
195+
} catch (...) {
196+
std::cerr << "Warning: could not parse RAM value '" << val << "' in settings file" << std::endl;
197+
}
198+
} else if (key == "threads") {
199+
try { this->threads = stoul(val); } catch (...) { }
200+
} else if (key == "output_folder") { this->output_folder = val; }
201+
else if (key == "graph_folder") { this->graph_folder = val; }
202+
else if (key == "cycles_folder") { this->cycles_folder = val; }
203+
else if (key == "output_file") { this->output_file = val; }
204+
else if (key == "cycle_max_length") { this->cycle_finder_settings.cycle_max_length = stoi(val); }
205+
else if (key == "cycle_min_length") { this->cycle_finder_settings.cycle_min_length = stoi(val); }
206+
else if (key == "threshold_multiplicity") { this->cycle_finder_settings.threshold_multiplicity = stoull(val); }
207+
else if (key == "low_abundance") {
208+
std::transform(val.begin(), val.end(), val.begin(), ::tolower);
209+
this->cycle_finder_settings.low_abundance = (val == "true" || val == "1" || val == "yes");
210+
}
211+
else if (key == "spacer_min_length") { this->dna_sequence_settings.spacer_min_length = stoi(val); }
212+
else if (key == "spacer_max_length") { this->dna_sequence_settings.spacer_max_length = stoi(val); }
213+
else if (key == "repeat_min_length") { this->dna_sequence_settings.repeat_min_length = stoi(val); }
214+
else if (key == "repeat_max_length") { this->dna_sequence_settings.repeat_max_length = stoi(val); }
215+
// unknown keys are ignored for forward-compatibility
216+
}
217+
file.close();
218+
return true;
219+
}
94220
};
95221

96-
#endif
222+
#endif // SETTINGS_H

readme.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,32 @@ The tool creates the following directory structure inside the specified output f
130130

131131
---
132132

133+
### ⚙️ Settings file support
134+
135+
Create a simple key=value text file (one setting per line) and pass it with `--settings /path/to/file`.
136+
137+
The program reads values from this file unless you override them with CLI flags. If you change the file, run the program again — new values will be used.
138+
139+
Example of ```settings.txt``` (must include `input_files`):
140+
```
141+
# MUST INCLUDE
142+
input_files=/data/sample_folder/1.fastq /data/sample_folder/2.fastq
143+
ram=128G
144+
threads=26
145+
output_folder=results/run_2025-11-19
146+
# OPTIONAL
147+
cycle_max_length=77
148+
cycle_min_length=27
149+
threshold_multiplicity=20
150+
low_abundance=true
151+
```
152+
153+
Notes:
154+
- `input_files` accepts one or two paths; entries may be separated by spaces, commas, or semicolons.
155+
- Values given on the command line override those in ```settings.txt```. For example, you can keep a shared ```settings.txt``` and change only the ```-i``` parameter on the command line.
156+
157+
---
158+
133159
#### Requirements
134160

135161
- C++17 compiler

0 commit comments

Comments
 (0)