Skip to content

Commit 0aebe30

Browse files
committed
added seqgen v1.0, expanded readme
1 parent 96f6221 commit 0aebe30

6 files changed

Lines changed: 237 additions & 12 deletions

File tree

ChangeLog

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
2019-04-25 Benjamin Jean-Marie Tremblay <benjmtremblay@gmail.com>
2+
3+
* Added seqgen v1.0
4+
15
2019-04-24 Benjamin Jean-Marie Tremblay <benjmtremblay@gmail.com>
26

37
* countlets and shuffler are now v1.0

Makefile

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
SRC := countlets.cpp klets.cpp shuffler.cpp shuffle_euler.cpp \
2-
shuffle_linear.cpp shuffle_markov.cpp
2+
shuffle_linear.cpp shuffle_markov.cpp seqgen.cpp
33
OBJ_COUNTLETS := countlets.o klets.o
44
OBJ_SHUFFLER := shuffler.o klets.o shuffle_euler.o shuffle_linear.o \
55
shuffle_markov.o
6+
OBJ_SEQGEN := seqgen.o
67
CC := g++
78

89
all: build install
@@ -19,10 +20,14 @@ shuffler:
1920
cd src;\
2021
$(CC) $(OBJ_SHUFFLER) -o ../bin/shuffler
2122

23+
seqgen:
24+
cd src;\
25+
$(CC) $(OBJ_SEQGEN) -o ../bin/seqgen
26+
2227
makebin:
2328
mkdir -p bin
2429

25-
install: makebin countlets shuffler
30+
install: makebin countlets shuffler seqgen
2631

2732
clean:
2833
cd src;\

README

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ sequenceshuffler
22
================
33

44
Author: Benjamin Jean-Marie Tremblay <benjmtremblay@gmail.com>
5-
Date: 2019-04-24
5+
Date: 2019-04-25
6+
67

78
Installation
89
------------
@@ -13,24 +14,65 @@ Installation
1314

1415
The following binaries are created:
1516

16-
sequenceshuffler/bin/countlets
17-
sequenceshuffler/bin/shuffler
17+
bin/countlets
18+
bin/seqgen
19+
bin/shuffler
1820

1921
Run these without any arguments or with the -h flag to see usage.
2022

23+
2124
countlets
2225
---------
2326

2427
This utility counts the total number of k-lets in the input sequence. Be aware
2528
that the total number of possible k-lets is n^k, where n is the alphabet length.
2629

30+
Example usage:
31+
2732
bin/./countlets -k 1 -i example/sequence.txt
2833

2934
A 17384
3035
C 8081
3136
G 7583
3237
T 16952
3338

39+
40+
seqgen
41+
------
42+
43+
Create random sequences from any alphabet. Letters can be made up of any
44+
number of characters. Weights can be provided to modify random generation.
45+
If any of the input letters contain spaces, use quotation marks.
46+
47+
Example usage:
48+
49+
bin/./seqgen -a A,C,G,T -l 10 -s 11
50+
51+
CGAACTATTC
52+
53+
bin/./seqgen -a A,C,G,T -l 1000 -s 1 | bin/./countlets
54+
55+
A 246
56+
C 258
57+
G 233
58+
T 263
59+
60+
bin/./seqgen -a "A,B,CD, " -l 10 -s 3
61+
62+
BCD BBBAAB
63+
64+
bin/./seqgen -a A,C,G,T -l 10 -s 11 -w 1,0.5,0.5,1
65+
66+
TTAGAATTTT
67+
68+
bin/./seqgen -a A,C,G,T -l 1000 -s 11 -w 1,0.5,0.5,1 | bin/./countlets
69+
70+
A 338
71+
C 150
72+
G 169
73+
T 343
74+
75+
3476
shuffler
3577
--------
3678

@@ -73,6 +115,7 @@ this type of shuffling is discussed by Fitch (1983).
73115
Note: these methods only apply for k > 1. Otherwise, a simple shuffle call is
74116
performed.
75117

118+
76119
References
77120
----------
78121

src/seqgen.cpp

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
/*
2+
* Copyright (C) 2019 Benjamin Jean-Marie Tremblay
3+
*
4+
* This file is part of sequenceshuffler.
5+
*
6+
* sequenceshuffler is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* sequenceshuffler is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU General Public License
17+
* along with sequenceshuffler. If not, see <https://www.gnu.org/licenses/>.
18+
*
19+
*/
20+
21+
#include <unistd.h>

#include <cerrno>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <exception>
#include <fstream>
#include <iostream>
#include <limits>
#include <random>
#include <string>
#include <vector>
27+
using namespace std;
28+
29+
/*
 * Print the seqgen version banner and command-line help to stdout.
 */
void usage() {
  /* The text is constant, so it is written verbatim instead of being passed
   * through printf as a format string. */
  std::fputs(
    "seqgen v1.0 Copyright (C) 2019 Benjamin Jean-Marie Tremblay \n"
    " \n"
    "Usage: seqgen [options] -a [letters] -l [length] -o [outfile] \n"
    " seqgen [options] -a [letters] -l [length] > [outfile] \n"
    " \n"
    " -o <str> Output filename. Alternatively, prints to stdout. \n"
    " -a <str> Comma-separated sequence letters. Each letter can be made of any \n"
    " number of characters. \n"
    " -w <str> Comma-separated letter weights. Order matches that of letters. If \n"
    " missing, assumes equal likelihood for all letters. \n"
    " -l <int> Final sequence length. This length is the number of characters in \n"
    " output string. \n"
    " -s <int> RNG seed number. Defaults to time in seconds. \n"
    " -h Print usage and exit. \n",
    stdout
  );
}
47+
48+
int main(int argc, char **argv) {
49+
50+
unsigned int seqlen{0};
51+
int alphlen, opt;
52+
ofstream outfile;
53+
bool has_out{false}, has_freqs{false};
54+
default_random_engine gen;
55+
vector<string> lets;
56+
vector<double> freqs;
57+
string outletters, all_lets, all_freqs, final_freq, final_let;
58+
string comma = ",";
59+
unsigned int iseed = time(0);
60+
size_t last, next;
61+
62+
if (argc == 1) {
63+
usage();
64+
return 0;
65+
}
66+
67+
while ((opt = getopt(argc, argv, "a:o:l:s:w:h")) != -1) {
68+
switch (opt) {
69+
case 'o': if (optarg) {
70+
outfile.open(optarg);
71+
if (outfile.bad()) {
72+
cerr << "Error: could not create outfile" << endl;
73+
exit(EXIT_FAILURE);
74+
}
75+
has_out = true;
76+
}
77+
break;
78+
case 'l': if (optarg) seqlen = atoi(optarg);
79+
break;
80+
case 's': if (optarg) iseed = atoi(optarg);
81+
break;
82+
case 'a': if (optarg) all_lets = string(optarg);
83+
break;
84+
case 'w': if (optarg) {
85+
all_freqs = string(optarg);
86+
has_freqs = true;
87+
}
88+
break;
89+
case 'h': usage();
90+
return 0;
91+
default: usage();
92+
return 0;
93+
}
94+
}
95+
96+
if (seqlen < 1) {
97+
cerr << "Error: please input a desired sequence length above 0" << endl;;
98+
exit(EXIT_FAILURE);
99+
}
100+
101+
/* split up letters */
102+
103+
last = 0;
104+
next = 0;
105+
while ((next = all_lets.find(comma, last)) != string::npos) {
106+
lets.push_back(all_lets.substr(last, next - last));
107+
last = next + 1;
108+
}
109+
final_let = all_lets.substr(last);
110+
if (final_let.length() > 0) lets.push_back(final_let);
111+
112+
alphlen = lets.size();
113+
114+
if (alphlen < 1) {
115+
cerr << "Error: could not parse sequence alphabet" << endl;
116+
exit(EXIT_FAILURE);
117+
}
118+
119+
/* split up freqs */
120+
121+
if (has_freqs) {
122+
123+
last = 0;
124+
next = 0;
125+
while ((next = all_freqs.find(comma, last)) != string::npos) {
126+
freqs.push_back(stod(all_freqs.substr(last, next - last)));
127+
last = next + 1;
128+
}
129+
final_freq = all_freqs.substr(last);
130+
if (final_freq.length() > 0) freqs.push_back(stod(final_freq));
131+
132+
if (lets.size() != freqs.size()) {
133+
cerr << "Error: mismatching number of letters [" << lets.size()
134+
<< "] and frequencies [" << freqs.size() << "]" << endl;
135+
exit(EXIT_FAILURE);
136+
}
137+
138+
}
139+
140+
/* main seq generation loop */
141+
142+
gen = default_random_engine(iseed);
143+
outletters = "";
144+
145+
if (!has_freqs) {
146+
147+
while (outletters.length() < seqlen) {
148+
outletters += lets[gen() % alphlen];
149+
}
150+
151+
} else {
152+
153+
discrete_distribution<int> next_let(freqs.begin(), freqs.end());
154+
while (outletters.length() < seqlen) {
155+
outletters += lets[next_let(gen)];
156+
}
157+
158+
}
159+
160+
if (outletters.length() > seqlen)
161+
outletters = outletters.substr(0, seqlen);
162+
163+
/* return */
164+
165+
if (has_out) {
166+
outfile << outletters << endl;
167+
} else {
168+
cout << outletters << endl;
169+
}
170+
171+
return 1;
172+
173+
}

src/shuffle_euler.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ vector<vector<int>> make_edgelist(vector<int> let_counts, int nletsm1, int alphl
3737
*/
3838

3939
vector<vector<int>> edgelist(nletsm1, vector<int>(alphlen));
40-
int counter {0};
40+
int counter{0};
4141

4242
for (int i = 0; i < nletsm1; ++i) {
4343

@@ -58,7 +58,7 @@ vector<int> find_euler(vector<vector<int>> edgelist, int lasti, int nletsm1,
5858

5959
int u;
6060
int nletsm2 = pow(alphlen, k - 2);
61-
int good_v {0}, counter {0};
61+
int good_v{0}, counter{0};
6262
vector<bool> vertices(nletsm1, false);
6363
vector<int> last_letsi(nletsm1, 0);
6464
vector<int> next_let_i;
@@ -160,7 +160,7 @@ vector<int> walk_euler(vector<vector<int>> edgelist, int seqlen, int k,
160160
vector<int> out_i;
161161
int alphlen = lets_uniq.size();
162162
int nletsm1 = edgelist.size();
163-
int current {0};
163+
int current{0};
164164
int n = firstl.length();
165165
vector<int> edgelist_counter(nletsm1, 0);
166166

@@ -202,7 +202,7 @@ string shuffle_euler(vector<char> letters, default_random_engine gen, int k,
202202

203203
int seqlen = letters.size();
204204
int alphlen, nlets, nletsm1;
205-
int lasti {-1};
205+
int lasti{-1};
206206
vector<int> let_counts, last_letsi, out_i;
207207
vector<char> lets_uniq;
208208
set<int> lets_set;

src/shuffle_linear.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ string shuffle_linear(vector<char> letters, default_random_engine gen, int k,
3030
/* variables */
3131

3232
int seqlen1 = letters.size();
33-
int seqlen2 {seqlen1 / k};
34-
int seqrem {seqlen1 % k};
35-
int seqremlen {seqlen1 - seqrem};
33+
int seqlen2{seqlen1 / k};
34+
int seqrem{seqlen1 % k};
35+
int seqremlen{seqlen1 - seqrem};
3636

3737
if (verbose) {
3838
cerr << " Times split: " << seqlen2 << endl;

0 commit comments

Comments
 (0)