Skip to content

Commit 12a62cc

Browse files
authored
nthash.hpp: faster ntBase via building with tetramer (#18)
* nthash.hpp: faster ntBase via building with tetramer * nthash.hpp: remove unnecessary functions * nthash.hpp: reorder header * nthash.hpp: rename function variables * ntHashIterator: add starting kmer pos functionality * ntHashIterator: add starting kmer pos functionality * add start Kmer pos functionality to st/sshashiterator * unittest: fix rna test * unittest: fix rna test
1 parent 80cced0 commit 12a62cc

5 files changed

Lines changed: 217 additions & 15 deletions

File tree

ntHashIterator.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ class ntHashIterator
3535
* @param k k-mer size
3636
* @param h number of hashes
3737
*/
38-
ntHashIterator(const std::string& seq, unsigned h, unsigned k):
39-
m_seq(seq), m_h(h), m_k(k), m_hVec(new uint64_t[h]), m_pos(0)
38+
ntHashIterator(const std::string& seq, unsigned h, unsigned k, size_t pos = 0):
39+
m_seq(seq), m_h(h), m_k(k), m_hVec(new uint64_t[h]), m_pos(pos)
4040
{
4141
init();
4242
}

nthash.hpp

Lines changed: 208 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,173 @@
99
#ifndef NT_HASH_H
1010
#define NT_HASH_H
1111

12+
#include <limits>
1213
#include <stdint.h>
1314
#include <vector>
1415

16+
static const uint8_t RCconvertTab[256] = {
17+
255, 255, 255, 255, 255, 255, 255, 255, // 0..7
18+
255, 255, 255, 255, 255, 255, 255, 255, // 8..15
19+
255, 255, 255, 255, 255, 255, 255, 255, // 16..23
20+
255, 255, 255, 255, 255, 255, 255, 255, // 24..31
21+
255, 255, 255, 255, 255, 255, 255, 255, // 32..39
22+
255, 255, 255, 255, 255, 255, 255, 255, // 40..47
23+
255, 255, 255, 255, 255, 255, 255, 255, // 48..55
24+
255, 255, 255, 255, 255, 255, 255, 255, // 56..63
25+
255, 3, 255, 2, 255, 255, 255, 1, // 64..71
26+
255, 255, 255, 255, 255, 255, 255, 255, // 72..79
27+
255, 255, 255, 255, 0, 3, 255, 255, // 80..87
28+
255, 255, 255, 255, 255, 255, 255, 255, // 88..95
29+
255, 3, 255, 2, 255, 255, 255, 1, // 96..103
30+
255, 255, 255, 255, 255, 255, 255, 255, // 104..111
31+
255, 255, 255, 255, 0, 3, 255, 255, // 112..119
32+
255, 255, 255, 255, 255, 255, 255, 255, // 120..127
33+
255, 255, 255, 255, 255, 255, 255, 255, // 128..135
34+
255, 255, 255, 255, 255, 255, 255, 255, // 136..143
35+
255, 255, 255, 255, 255, 255, 255, 255, // 144..151
36+
255, 255, 255, 255, 255, 255, 255, 255, // 152..159
37+
255, 255, 255, 255, 255, 255, 255, 255, // 160..167
38+
255, 255, 255, 255, 255, 255, 255, 255, // 168..175
39+
255, 255, 255, 255, 255, 255, 255, 255, // 176..183
40+
255, 255, 255, 255, 255, 255, 255, 255, // 184..191
41+
255, 255, 255, 255, 255, 255, 255, 255, // 192..199
42+
255, 255, 255, 255, 255, 255, 255, 255, // 200..207
43+
255, 255, 255, 255, 255, 255, 255, 255, // 208..215
44+
255, 255, 255, 255, 255, 255, 255, 255, // 216..223
45+
255, 255, 255, 255, 255, 255, 255, 255, // 224..231
46+
255, 255, 255, 255, 255, 255, 255, 255, // 232..239
47+
255, 255, 255, 255, 255, 255, 255, 255, // 240..247
48+
255, 255, 255, 255, 255, 255, 255, 255 // 248..255
49+
};
50+
51+
static const uint8_t convertTab[256] = {
52+
255, 255, 255, 255, 255, 255, 255, 255, // 0..7
53+
255, 255, 255, 255, 255, 255, 255, 255, // 8..15
54+
255, 255, 255, 255, 255, 255, 255, 255, // 16..23
55+
255, 255, 255, 255, 255, 255, 255, 255, // 24..31
56+
255, 255, 255, 255, 255, 255, 255, 255, // 32..39
57+
255, 255, 255, 255, 255, 255, 255, 255, // 40..47
58+
255, 255, 255, 255, 255, 255, 255, 255, // 48..55
59+
255, 255, 255, 255, 255, 255, 255, 255, // 56..63
60+
255, 0, 255, 1, 255, 255, 255, 2, // 64..71
61+
255, 255, 255, 255, 255, 255, 255, 255, // 72..79
62+
255, 255, 255, 255, 3, 0, 255, 255, // 80..87
63+
255, 255, 255, 255, 255, 255, 255, 255, // 88..95
64+
255, 0, 255, 1, 255, 255, 255, 2, // 96..103
65+
255, 255, 255, 255, 255, 255, 255, 255, // 104..111
66+
255, 255, 255, 255, 3, 0, 255, 255, // 112..119
67+
255, 255, 255, 255, 255, 255, 255, 255, // 120..127
68+
255, 255, 255, 255, 255, 255, 255, 255, // 128..135
69+
255, 255, 255, 255, 255, 255, 255, 255, // 136..143
70+
255, 255, 255, 255, 255, 255, 255, 255, // 144..151
71+
255, 255, 255, 255, 255, 255, 255, 255, // 152..159
72+
255, 255, 255, 255, 255, 255, 255, 255, // 160..167
73+
255, 255, 255, 255, 255, 255, 255, 255, // 168..175
74+
255, 255, 255, 255, 255, 255, 255, 255, // 176..183
75+
255, 255, 255, 255, 255, 255, 255, 255, // 184..191
76+
255, 255, 255, 255, 255, 255, 255, 255, // 192..199
77+
255, 255, 255, 255, 255, 255, 255, 255, // 200..207
78+
255, 255, 255, 255, 255, 255, 255, 255, // 208..215
79+
255, 255, 255, 255, 255, 255, 255, 255, // 216..223
80+
255, 255, 255, 255, 255, 255, 255, 255, // 224..231
81+
255, 255, 255, 255, 255, 255, 255, 255, // 232..239
82+
255, 255, 255, 255, 255, 255, 255, 255, // 240..247
83+
255, 255, 255, 255, 255, 255, 255, 255 // 248..255
84+
};
85+
86+
static const uint64_t dimerTab[16] = {
87+
5015898201438948509U, 5225361804584821669U, 6423762225589857229U, 5783394398799547583U,
88+
6894017875502584557U, 5959461383092338133U, 4833978511655400893U, 5364573296520205007U,
89+
9002561594443973180U, 8212239310050454788U, 6941810030513055084U, 7579897184553533982U,
90+
7935738758488558809U, 7149836515649299425U, 8257540373175577481U, 8935100007508790523U
91+
};
92+
93+
static const uint64_t trimerTab[64] = {
94+
13237172352163388750U, 13451082378889146998U, 12324706752351386142U, 11704099346423635308U,
95+
12503002411303846718U, 11573033083854154758U, 12770611021816489070U, 13284814289517544220U,
96+
10286336837755622383U, 9500434588327378135U, 10554658215321236671U, 11177611689138066381U,
97+
11245073286936829194U, 10454751004568891954U, 9274956656780491354U, 9930495270120774952U,
98+
9498947889754972591U, 10289371588586147479U, 11487222103436658431U, 10812501148518244749U,
99+
11088845979783725023U, 10735249574334615783U, 9609199230360475791U, 10105458452942995453U,
100+
13447889238169808654U, 13238535845420384310U, 11968673763542288478U, 12645600078955589420U,
101+
12136759312206930411U, 11922809957208297171U, 13031072242070652603U, 13668666814620918217U,
102+
14219262150204358668U, 14433136993975185204U, 15703263506252408668U, 15026899868095529006U,
103+
16097136083696541308U, 15167201938128040260U, 14113514427211577644U, 14608043031429815902U,
104+
18169629015343943341U, 17383691583363408277U, 16185576633819064829U, 16859734366019948175U,
105+
17215452794964541512U, 16425095330967072624U, 17460550829194815256U, 18101973914136232042U,
106+
16197524846324948423U, 17136496960994620159U, 18190301010467109527U, 17660752969549176293U,
107+
18084590689685816247U, 17861669045228104847U, 16591430392433501415U, 17233003275094786965U,
108+
15689030113991676774U, 15321980360070757470U, 14196301091602199606U, 14727918144983470916U,
109+
14660430141886012803U, 14297932370981794491U, 15550237822687034067U, 16044915679164358049U
110+
};
111+
112+
static const uint64_t tetramerTab[256] = {
113+
6047278271377325800U, 6842100033257738704U, 5716751207778949560U, 5058261232784932554U,
114+
5322212292231585944U, 4955210659836481440U, 6153481158060361672U, 6630136099103187130U,
115+
7683058811908681801U, 7460089081761259377U, 8513615477720831769U, 9169618076073996395U,
116+
8669810821731892908U, 8451393064794886548U, 7271235746105367036U, 7894785163577458318U,
117+
7461575445318369801U, 7680024275870068017U, 8878022265940976985U, 8237757801848291883U,
118+
9060296013225843833U, 8116780716040188737U, 6991106539262573353U, 7521593563379047515U,
119+
6845292839028968616U, 6045914992845185936U, 4775672622745250808U, 5413871935584767114U,
120+
5490367161684853325U, 4695435745326017909U, 5803018666222232861U, 6480400171096490607U,
121+
2381043025085637546U, 3175899973157948562U, 4445879008075678970U, 3807116472585741192U,
122+
4268108881087626714U, 3901072061426881250U, 2847008385469766282U, 3379366782720458232U,
123+
1763336001516006667U, 1540401457157816883U, 342666797974407771U, 983493939256405289U,
124+
771890739233563630U, 553508169276984534U, 1589643033626739902U, 2263336780810576844U,
125+
330722743541775969U, 688712796851212633U, 1742668713148160305U, 1245320973785726531U,
126+
2208596672445898769U, 1422777727841816361U, 152919646732699457U, 826464124477841459U,
127+
4460107693596700864U, 3530055095011467256U, 2403999925630162832U, 2899137386794791138U,
128+
3398970977768160805U, 2464498338584432925U, 3716128830812494197U, 4248337413163712007U,
129+
4264326372183459627U, 3906261395711551507U, 2851952150714671227U, 3383149429014333193U,
130+
2386233046276708699U, 3172117876357805667U, 4441779805226941963U, 3801926588820052345U,
131+
170684860043692426U, 1100671402695403186U, 2226926226858061530U, 1693589575942097320U,
132+
1193606390847620975U, 2128144916583147607U, 876319371625685055U, 382305650241144653U,
133+
1102545060664966090U, 168107437338776818U, 1437989166537956506U, 1915072878734195688U,
134+
1548519783094789562U, 1757891215679916674U, 703889661060612842U, 46092416782165400U,
135+
3908715595921208683U, 4262294307145226835U, 3064498623987880507U, 2585134797421409609U,
136+
2661735585529691022U, 3019760716990469302U, 4055956603131813086U, 3543998858204232620U,
137+
5317339067591416425U, 4959238909506745681U, 6157334207435046201U, 6635009461133220427U,
138+
6051307208490845209U, 6837227221258447649U, 5711490920986878793U, 5054232433096901691U,
139+
8122648135453742280U, 9052599496358476784U, 7782418148093113240U, 7307023562816214250U,
140+
7095314801322056237U, 8029818144085865749U, 9137340041034366333U, 8622472983995947535U,
141+
7806751516869674914U, 7011855109925922970U, 8137690373747335410U, 8757695200062998400U,
142+
8531879593853721042U, 8898947385530005226U, 7700757522090507906U, 7186022138009770480U,
143+
6135219772853324035U, 6358123720871388731U, 5304510851123850835U, 4682089562405882145U,
144+
5182028715320330214U, 5400512630465816798U, 6580751683450298550U, 5923625422568720324U,
145+
13124074928584983660U, 13491146941631638356U, 12293650504952193852U, 11816502978180760654U,
146+
12399079312662682140U, 11604187204414436644U, 12730450818222161228U, 13388307479092468286U,
147+
10327209524901530317U, 9388215691182564853U, 10657868830410829213U, 11137168911054473967U,
148+
11357920004770333736U, 10414374197647485712U, 9306325182584103800U, 9818342344138146826U,
149+
9386341947321596045U, 10329786896059045813U, 11455812913355464669U, 10924692575052363951U,
150+
10984992149858150141U, 10766613702172592581U, 9568826821541020077U, 10208598699842184927U,
151+
13488692655530571308U, 13126106942075820308U, 12072096584926548348U, 12605510244625659406U,
152+
12249677498819492041U, 11882645355480553457U, 13062230760632229785U, 13556163143878539499U,
153+
14178740190036597038U, 14545847390080448022U, 15599559227675164286U, 15067834145139579148U,
154+
16065876409530435422U, 15270949115358734438U, 14000758968863088654U, 14640014089599289212U,
155+
18281953465151117199U, 17342994818563569847U, 16217267316526477535U, 16746698532205467565U,
156+
17255653680509032810U, 16312143059561297490U, 17564497017566543418U, 18061360711745100104U,
157+
16237972021990524133U, 17023861349393640413U, 18293930539975648181U, 17619893477009409223U,
158+
18115916316835994261U, 17757855915011241389U, 16704251839199542725U, 17200966263939144375U,
159+
15576639675766950468U, 15362743113290245500U, 14164544455910714644U, 14841019967217601126U,
160+
14620295210399335585U, 14410818688327658393U, 15446357621659116529U, 16085462927495578755U,
161+
18237799192036655099U, 17294270664133710019U, 16258109964509321387U, 16773410497518403545U,
162+
16657084189963477387U, 16875519862962278067U, 18127020052323321563U, 17507580374969491881U,
163+
14153168177888129370U, 14515696771658964578U, 15624080140268688906U, 15110866744451150200U,
164+
15466708232756051903U, 15833797605570023559U, 14563810316809509103U, 14085706539145691037U,
165+
14517711175708869402U, 14150731501263563810U, 15402451490950456394U, 15899948742203982648U,
166+
15224753927964908906U, 16019597712369578578U, 14983744703118572090U, 14310050713553640776U,
167+
17296865610423782843U, 18235907873078829699U, 17055988043521714923U, 16561000163437350297U,
168+
16340222631939670878U, 17283720110790814822U, 18338064546595415054U, 17805706452459078524U,
169+
10375933128878629561U, 9432369415202180481U, 10612588863825479145U, 11105888166746317467U,
170+
10794790039591648457U, 11013260899437695985U, 9905396050428550041U, 9228014311730625771U,
171+
13154226096333843480U, 13516719503928509216U, 12264699899470662472U, 11768891770841246778U,
172+
11836546934201131773U, 12203601119882644933U, 13328994472388527533U, 12798507759874630367U,
173+
12277767672444305266U, 12068343612890878026U, 13176021535246260258U, 13816435502572994384U,
174+
12705517425460601090U, 13640043170446921274U, 12460006250421962322U, 11929369723008524576U,
175+
10597232027372843475U, 11387585128312430315U, 10351852510211364483U, 9713802769929286129U,
176+
9357917249443839798U, 10143859113470169102U, 11342251114164164710U, 10664720106027613972U
177+
};
178+
15179
// offset for the complement base in the random seeds table
16180
const uint8_t cpOff = 0x07;
17181

@@ -187,6 +351,11 @@ inline uint64_t rol1(const uint64_t v) {
187351
return (v << 1) | (v >> 63);
188352
}
189353

354+
// rotate "v" to the left x position
355+
inline uint64_t rolx(const uint64_t v, const unsigned x) {
356+
return (v << x) | (v >> (64 - x));
357+
}
358+
190359
// rotate "v" to the right by 1 position
191360
inline uint64_t ror1(const uint64_t v) {
192361
return (v >> 1) | (v << 63);
@@ -216,24 +385,55 @@ inline uint64_t swapbits3263(const uint64_t v) {
216385
return v ^ ((x << 32) | (x << 63));
217386
}
218387

388+
inline uint64_t swapxbits033(const uint64_t v, const unsigned x) {
389+
uint64_t y = (v ^ (v >> 33)) & (std::numeric_limits<uint64_t>::max() >> (64 - x));
390+
return v ^ (y | (y << 33));
391+
}
392+
219393
// forward-strand hash value of the base kmer, i.e. fhval(kmer_0)
220394
inline uint64_t NTF64(const char * kmerSeq, const unsigned k) {
221395
uint64_t hVal=0;
222-
for(unsigned i=0; i<k; i++) {
223-
hVal = rol1(hVal);
224-
hVal = swapbits033(hVal);
225-
hVal ^= seedTab[(unsigned char)kmerSeq[i]];
396+
for (unsigned i=0; i<k/4; i++) {
397+
hVal = rolx(hVal, 4);
398+
hVal = swapxbits033(hVal, 4);
399+
uint8_t currOffSet = 4 * i;
400+
uint8_t tetramerLoc = 64 * convertTab[(unsigned char)kmerSeq[currOffSet]] + 16 * convertTab[(unsigned char)kmerSeq[currOffSet + 1]] + 4 * convertTab[(unsigned char)kmerSeq[currOffSet + 2]] + convertTab[(unsigned char)kmerSeq[currOffSet + 3]];
401+
hVal ^= tetramerTab[tetramerLoc];
226402
}
403+
unsigned remainder = k % 4;
404+
hVal = rolx(hVal, remainder);
405+
hVal = swapxbits033(hVal, remainder);
406+
if (remainder == 3) {
407+
uint8_t trimerLoc = 16 * convertTab[(unsigned char)kmerSeq[k - 3]] + 4 * convertTab[(unsigned char)kmerSeq[k - 2]] + convertTab[(unsigned char)kmerSeq[k - 1]];
408+
hVal ^= trimerTab[trimerLoc];
409+
} else if (remainder == 2) {
410+
uint8_t dimerLoc = 4 * convertTab[(unsigned char)kmerSeq[k - 2]] + convertTab[(unsigned char)kmerSeq[k - 1]];
411+
hVal ^= dimerTab[dimerLoc];
412+
} else if (remainder == 1) {
413+
hVal ^= seedTab[(unsigned char)kmerSeq[k - 1]];
414+
}
227415
return hVal;
228416
}
229417

230418
// reverse-strand hash value of the base kmer, i.e. rhval(kmer_0)
231419
inline uint64_t NTR64(const char * kmerSeq, const unsigned k) {
232420
uint64_t hVal=0;
233-
for(unsigned i=0; i<k; i++) {
234-
hVal = rol1(hVal);
235-
hVal = swapbits033(hVal);
236-
hVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]&cpOff];
421+
unsigned remainder = k % 4;
422+
if (remainder == 3) {
423+
uint8_t trimerLoc = 16 * RCconvertTab[(unsigned char)kmerSeq[k - 1]] + 4 * RCconvertTab[(unsigned char)kmerSeq[k - 2]] + RCconvertTab[(unsigned char)kmerSeq[k - 3]];
424+
hVal ^= trimerTab[trimerLoc];
425+
} else if (remainder == 2) {
426+
uint8_t dimerLoc = 4 * RCconvertTab[(unsigned char)kmerSeq[k - 1]] + RCconvertTab[(unsigned char)kmerSeq[k - 2]];
427+
hVal ^= dimerTab[dimerLoc];
428+
} else if (remainder == 1) {
429+
hVal ^= seedTab[(unsigned char)kmerSeq[k-1]&cpOff];
430+
}
431+
for(unsigned i=0; i<k/4; i++) {
432+
hVal = rolx(hVal, 4);
433+
hVal = swapxbits033(hVal, 4);
434+
uint8_t currOffSet = 4 * (k / 4 - i) - 1;
435+
uint8_t tetramerLoc= 64 * RCconvertTab[(unsigned char)kmerSeq[currOffSet]] + 16 * RCconvertTab[(unsigned char)kmerSeq[currOffSet - 1]] + 4 * RCconvertTab[(unsigned char)kmerSeq[currOffSet - 2]] + RCconvertTab[(unsigned char)kmerSeq[currOffSet - 3]];
436+
hVal ^= tetramerTab[tetramerLoc];
237437
}
238438
return hVal;
239439
}

ssHashIterator.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ class ssHashIterator
3434
* @param k k-mer size
3535
* @param h number of hashes
3636
*/
37-
ssHashIterator(const std::string& seq, const std::vector<bool>& seed, unsigned k):
38-
m_seq(seq), m_seed(seed), m_k(k), m_hVal(0), m_sVal(0), m_pos(0)
37+
ssHashIterator(const std::string& seq, const std::vector<bool>& seed, unsigned k, size_t pos = 0):
38+
m_seq(seq), m_seed(seed), m_k(k), m_hVal(0), m_sVal(0), m_pos(pos)
3939
{
4040
init();
4141
}

stHashIterator.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ class stHashIterator
5050
* @param h number of seeds
5151
* @param h2 number of hashes per seed
5252
*/
53-
stHashIterator(const std::string& seq, const std::vector<std::vector<unsigned> >& seed, unsigned h, unsigned h2, unsigned k):
54-
m_seq(seq), m_seed(seed), m_h(h), m_h2(h2), m_k(k), m_hVec(new uint64_t[h * h2]), m_hStn(new bool[h * h2]), m_pos(0)
53+
stHashIterator(const std::string& seq, const std::vector<std::vector<unsigned> >& seed, unsigned h, unsigned h2, unsigned k, size_t pos = 0):
54+
m_seq(seq), m_seed(seed), m_h(h), m_h2(h2), m_k(k), m_hVec(new uint64_t[h * h2]), m_hStn(new bool[h * h2]), m_pos(pos)
5555
{
5656
init();
5757
}

unittest/UnitTests.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,11 @@ TEST_CASE("test fixture", "[UnitTests]")
8484

8585
for (int i = 0; i < 3; ++i)
8686
{
87+
ntHashIterator kmerPosIt(kmer, 3, 18, i);
8788
for (int j = 0; j < 3; ++j)
8889
{
8990
assert((*rollingHashIt)[j] == (*vecKmerIt[i])[j]);
91+
assert((*rollingHashIt)[j] == (*kmerPosIt)[j]);
9092
}
9193
++rollingHashIt;
9294
}
@@ -123,7 +125,7 @@ TEST_CASE("test fixture", "[UnitTests]")
123125
ntHashIterator dnaIt(kmer, 3, 20);
124126

125127
std::string rnaKmer = "ACGUACACUGGACUGAGUCU";
126-
ntHashIterator rnaIt(kmer, 3, 20);
128+
ntHashIterator rnaIt(rnaKmer, 3, 20);
127129

128130
for (int i = 0; i < 3; ++i)
129131
{

0 commit comments

Comments
 (0)