|
9 | 9 | #ifndef NT_HASH_H |
10 | 10 | #define NT_HASH_H |
11 | 11 |
|
| 12 | +#include <limits> |
12 | 13 | #include <stdint.h> |
13 | 14 | #include <vector> |
14 | 15 |
|
| 16 | +static const uint8_t RCconvertTab[256] = { |
| 17 | + 255, 255, 255, 255, 255, 255, 255, 255, // 0..7 |
| 18 | + 255, 255, 255, 255, 255, 255, 255, 255, // 8..15 |
| 19 | + 255, 255, 255, 255, 255, 255, 255, 255, // 16..23 |
| 20 | + 255, 255, 255, 255, 255, 255, 255, 255, // 24..31 |
| 21 | + 255, 255, 255, 255, 255, 255, 255, 255, // 32..39 |
| 22 | + 255, 255, 255, 255, 255, 255, 255, 255, // 40..47 |
| 23 | + 255, 255, 255, 255, 255, 255, 255, 255, // 48..55 |
| 24 | + 255, 255, 255, 255, 255, 255, 255, 255, // 56..63 |
| 25 | + 255, 3, 255, 2, 255, 255, 255, 1, // 64..71 |
| 26 | + 255, 255, 255, 255, 255, 255, 255, 255, // 72..79 |
| 27 | + 255, 255, 255, 255, 0, 3, 255, 255, // 80..87 |
| 28 | + 255, 255, 255, 255, 255, 255, 255, 255, // 88..95 |
| 29 | + 255, 3, 255, 2, 255, 255, 255, 1, // 96..103 |
| 30 | + 255, 255, 255, 255, 255, 255, 255, 255, // 104..111 |
| 31 | + 255, 255, 255, 255, 0, 3, 255, 255, // 112..119 |
| 32 | + 255, 255, 255, 255, 255, 255, 255, 255, // 120..127 |
| 33 | + 255, 255, 255, 255, 255, 255, 255, 255, // 128..135 |
| 34 | + 255, 255, 255, 255, 255, 255, 255, 255, // 136..143 |
| 35 | + 255, 255, 255, 255, 255, 255, 255, 255, // 144..151 |
| 36 | + 255, 255, 255, 255, 255, 255, 255, 255, // 152..159 |
| 37 | + 255, 255, 255, 255, 255, 255, 255, 255, // 160..167 |
| 38 | + 255, 255, 255, 255, 255, 255, 255, 255, // 168..175 |
| 39 | + 255, 255, 255, 255, 255, 255, 255, 255, // 176..183 |
| 40 | + 255, 255, 255, 255, 255, 255, 255, 255, // 184..191 |
| 41 | + 255, 255, 255, 255, 255, 255, 255, 255, // 192..199 |
| 42 | + 255, 255, 255, 255, 255, 255, 255, 255, // 200..207 |
| 43 | + 255, 255, 255, 255, 255, 255, 255, 255, // 208..215 |
| 44 | + 255, 255, 255, 255, 255, 255, 255, 255, // 216..223 |
| 45 | + 255, 255, 255, 255, 255, 255, 255, 255, // 224..231 |
| 46 | + 255, 255, 255, 255, 255, 255, 255, 255, // 232..239 |
| 47 | + 255, 255, 255, 255, 255, 255, 255, 255, // 240..247 |
| 48 | + 255, 255, 255, 255, 255, 255, 255, 255 // 248..255 |
| 49 | +}; |
| 50 | + |
| 51 | +static const uint8_t convertTab[256] = { |
| 52 | + 255, 255, 255, 255, 255, 255, 255, 255, // 0..7 |
| 53 | + 255, 255, 255, 255, 255, 255, 255, 255, // 8..15 |
| 54 | + 255, 255, 255, 255, 255, 255, 255, 255, // 16..23 |
| 55 | + 255, 255, 255, 255, 255, 255, 255, 255, // 24..31 |
| 56 | + 255, 255, 255, 255, 255, 255, 255, 255, // 32..39 |
| 57 | + 255, 255, 255, 255, 255, 255, 255, 255, // 40..47 |
| 58 | + 255, 255, 255, 255, 255, 255, 255, 255, // 48..55 |
| 59 | + 255, 255, 255, 255, 255, 255, 255, 255, // 56..63 |
| 60 | + 255, 0, 255, 1, 255, 255, 255, 2, // 64..71 |
| 61 | + 255, 255, 255, 255, 255, 255, 255, 255, // 72..79 |
| 62 | + 255, 255, 255, 255, 3, 0, 255, 255, // 80..87 |
| 63 | + 255, 255, 255, 255, 255, 255, 255, 255, // 88..95 |
| 64 | + 255, 0, 255, 1, 255, 255, 255, 2, // 96..103 |
| 65 | + 255, 255, 255, 255, 255, 255, 255, 255, // 104..111 |
| 66 | + 255, 255, 255, 255, 3, 0, 255, 255, // 112..119 |
| 67 | + 255, 255, 255, 255, 255, 255, 255, 255, // 120..127 |
| 68 | + 255, 255, 255, 255, 255, 255, 255, 255, // 128..135 |
| 69 | + 255, 255, 255, 255, 255, 255, 255, 255, // 136..143 |
| 70 | + 255, 255, 255, 255, 255, 255, 255, 255, // 144..151 |
| 71 | + 255, 255, 255, 255, 255, 255, 255, 255, // 152..159 |
| 72 | + 255, 255, 255, 255, 255, 255, 255, 255, // 160..167 |
| 73 | + 255, 255, 255, 255, 255, 255, 255, 255, // 168..175 |
| 74 | + 255, 255, 255, 255, 255, 255, 255, 255, // 176..183 |
| 75 | + 255, 255, 255, 255, 255, 255, 255, 255, // 184..191 |
| 76 | + 255, 255, 255, 255, 255, 255, 255, 255, // 192..199 |
| 77 | + 255, 255, 255, 255, 255, 255, 255, 255, // 200..207 |
| 78 | + 255, 255, 255, 255, 255, 255, 255, 255, // 208..215 |
| 79 | + 255, 255, 255, 255, 255, 255, 255, 255, // 216..223 |
| 80 | + 255, 255, 255, 255, 255, 255, 255, 255, // 224..231 |
| 81 | + 255, 255, 255, 255, 255, 255, 255, 255, // 232..239 |
| 82 | + 255, 255, 255, 255, 255, 255, 255, 255, // 240..247 |
| 83 | + 255, 255, 255, 255, 255, 255, 255, 255 // 248..255 |
| 84 | +}; |
| 85 | + |
| 86 | +static const uint64_t dimerTab[16] = { |
| 87 | +5015898201438948509U, 5225361804584821669U, 6423762225589857229U, 5783394398799547583U, |
| 88 | +6894017875502584557U, 5959461383092338133U, 4833978511655400893U, 5364573296520205007U, |
| 89 | +9002561594443973180U, 8212239310050454788U, 6941810030513055084U, 7579897184553533982U, |
| 90 | +7935738758488558809U, 7149836515649299425U, 8257540373175577481U, 8935100007508790523U |
| 91 | +}; |
| 92 | + |
| 93 | +static const uint64_t trimerTab[64] = { |
| 94 | +13237172352163388750U, 13451082378889146998U, 12324706752351386142U, 11704099346423635308U, |
| 95 | +12503002411303846718U, 11573033083854154758U, 12770611021816489070U, 13284814289517544220U, |
| 96 | +10286336837755622383U, 9500434588327378135U, 10554658215321236671U, 11177611689138066381U, |
| 97 | +11245073286936829194U, 10454751004568891954U, 9274956656780491354U, 9930495270120774952U, |
| 98 | +9498947889754972591U, 10289371588586147479U, 11487222103436658431U, 10812501148518244749U, |
| 99 | +11088845979783725023U, 10735249574334615783U, 9609199230360475791U, 10105458452942995453U, |
| 100 | +13447889238169808654U, 13238535845420384310U, 11968673763542288478U, 12645600078955589420U, |
| 101 | +12136759312206930411U, 11922809957208297171U, 13031072242070652603U, 13668666814620918217U, |
| 102 | +14219262150204358668U, 14433136993975185204U, 15703263506252408668U, 15026899868095529006U, |
| 103 | +16097136083696541308U, 15167201938128040260U, 14113514427211577644U, 14608043031429815902U, |
| 104 | +18169629015343943341U, 17383691583363408277U, 16185576633819064829U, 16859734366019948175U, |
| 105 | +17215452794964541512U, 16425095330967072624U, 17460550829194815256U, 18101973914136232042U, |
| 106 | +16197524846324948423U, 17136496960994620159U, 18190301010467109527U, 17660752969549176293U, |
| 107 | +18084590689685816247U, 17861669045228104847U, 16591430392433501415U, 17233003275094786965U, |
| 108 | +15689030113991676774U, 15321980360070757470U, 14196301091602199606U, 14727918144983470916U, |
| 109 | +14660430141886012803U, 14297932370981794491U, 15550237822687034067U, 16044915679164358049U |
| 110 | +}; |
| 111 | + |
| 112 | +static const uint64_t tetramerTab[256] = { |
| 113 | +6047278271377325800U, 6842100033257738704U, 5716751207778949560U, 5058261232784932554U, |
| 114 | +5322212292231585944U, 4955210659836481440U, 6153481158060361672U, 6630136099103187130U, |
| 115 | +7683058811908681801U, 7460089081761259377U, 8513615477720831769U, 9169618076073996395U, |
| 116 | +8669810821731892908U, 8451393064794886548U, 7271235746105367036U, 7894785163577458318U, |
| 117 | +7461575445318369801U, 7680024275870068017U, 8878022265940976985U, 8237757801848291883U, |
| 118 | +9060296013225843833U, 8116780716040188737U, 6991106539262573353U, 7521593563379047515U, |
| 119 | +6845292839028968616U, 6045914992845185936U, 4775672622745250808U, 5413871935584767114U, |
| 120 | +5490367161684853325U, 4695435745326017909U, 5803018666222232861U, 6480400171096490607U, |
| 121 | +2381043025085637546U, 3175899973157948562U, 4445879008075678970U, 3807116472585741192U, |
| 122 | +4268108881087626714U, 3901072061426881250U, 2847008385469766282U, 3379366782720458232U, |
| 123 | +1763336001516006667U, 1540401457157816883U, 342666797974407771U, 983493939256405289U, |
| 124 | +771890739233563630U, 553508169276984534U, 1589643033626739902U, 2263336780810576844U, |
| 125 | +330722743541775969U, 688712796851212633U, 1742668713148160305U, 1245320973785726531U, |
| 126 | +2208596672445898769U, 1422777727841816361U, 152919646732699457U, 826464124477841459U, |
| 127 | +4460107693596700864U, 3530055095011467256U, 2403999925630162832U, 2899137386794791138U, |
| 128 | +3398970977768160805U, 2464498338584432925U, 3716128830812494197U, 4248337413163712007U, |
| 129 | +4264326372183459627U, 3906261395711551507U, 2851952150714671227U, 3383149429014333193U, |
| 130 | +2386233046276708699U, 3172117876357805667U, 4441779805226941963U, 3801926588820052345U, |
| 131 | +170684860043692426U, 1100671402695403186U, 2226926226858061530U, 1693589575942097320U, |
| 132 | +1193606390847620975U, 2128144916583147607U, 876319371625685055U, 382305650241144653U, |
| 133 | +1102545060664966090U, 168107437338776818U, 1437989166537956506U, 1915072878734195688U, |
| 134 | +1548519783094789562U, 1757891215679916674U, 703889661060612842U, 46092416782165400U, |
| 135 | +3908715595921208683U, 4262294307145226835U, 3064498623987880507U, 2585134797421409609U, |
| 136 | +2661735585529691022U, 3019760716990469302U, 4055956603131813086U, 3543998858204232620U, |
| 137 | +5317339067591416425U, 4959238909506745681U, 6157334207435046201U, 6635009461133220427U, |
| 138 | +6051307208490845209U, 6837227221258447649U, 5711490920986878793U, 5054232433096901691U, |
| 139 | +8122648135453742280U, 9052599496358476784U, 7782418148093113240U, 7307023562816214250U, |
| 140 | +7095314801322056237U, 8029818144085865749U, 9137340041034366333U, 8622472983995947535U, |
| 141 | +7806751516869674914U, 7011855109925922970U, 8137690373747335410U, 8757695200062998400U, |
| 142 | +8531879593853721042U, 8898947385530005226U, 7700757522090507906U, 7186022138009770480U, |
| 143 | +6135219772853324035U, 6358123720871388731U, 5304510851123850835U, 4682089562405882145U, |
| 144 | +5182028715320330214U, 5400512630465816798U, 6580751683450298550U, 5923625422568720324U, |
| 145 | +13124074928584983660U, 13491146941631638356U, 12293650504952193852U, 11816502978180760654U, |
| 146 | +12399079312662682140U, 11604187204414436644U, 12730450818222161228U, 13388307479092468286U, |
| 147 | +10327209524901530317U, 9388215691182564853U, 10657868830410829213U, 11137168911054473967U, |
| 148 | +11357920004770333736U, 10414374197647485712U, 9306325182584103800U, 9818342344138146826U, |
| 149 | +9386341947321596045U, 10329786896059045813U, 11455812913355464669U, 10924692575052363951U, |
| 150 | +10984992149858150141U, 10766613702172592581U, 9568826821541020077U, 10208598699842184927U, |
| 151 | +13488692655530571308U, 13126106942075820308U, 12072096584926548348U, 12605510244625659406U, |
| 152 | +12249677498819492041U, 11882645355480553457U, 13062230760632229785U, 13556163143878539499U, |
| 153 | +14178740190036597038U, 14545847390080448022U, 15599559227675164286U, 15067834145139579148U, |
| 154 | +16065876409530435422U, 15270949115358734438U, 14000758968863088654U, 14640014089599289212U, |
| 155 | +18281953465151117199U, 17342994818563569847U, 16217267316526477535U, 16746698532205467565U, |
| 156 | +17255653680509032810U, 16312143059561297490U, 17564497017566543418U, 18061360711745100104U, |
| 157 | +16237972021990524133U, 17023861349393640413U, 18293930539975648181U, 17619893477009409223U, |
| 158 | +18115916316835994261U, 17757855915011241389U, 16704251839199542725U, 17200966263939144375U, |
| 159 | +15576639675766950468U, 15362743113290245500U, 14164544455910714644U, 14841019967217601126U, |
| 160 | +14620295210399335585U, 14410818688327658393U, 15446357621659116529U, 16085462927495578755U, |
| 161 | +18237799192036655099U, 17294270664133710019U, 16258109964509321387U, 16773410497518403545U, |
| 162 | +16657084189963477387U, 16875519862962278067U, 18127020052323321563U, 17507580374969491881U, |
| 163 | +14153168177888129370U, 14515696771658964578U, 15624080140268688906U, 15110866744451150200U, |
| 164 | +15466708232756051903U, 15833797605570023559U, 14563810316809509103U, 14085706539145691037U, |
| 165 | +14517711175708869402U, 14150731501263563810U, 15402451490950456394U, 15899948742203982648U, |
| 166 | +15224753927964908906U, 16019597712369578578U, 14983744703118572090U, 14310050713553640776U, |
| 167 | +17296865610423782843U, 18235907873078829699U, 17055988043521714923U, 16561000163437350297U, |
| 168 | +16340222631939670878U, 17283720110790814822U, 18338064546595415054U, 17805706452459078524U, |
| 169 | +10375933128878629561U, 9432369415202180481U, 10612588863825479145U, 11105888166746317467U, |
| 170 | +10794790039591648457U, 11013260899437695985U, 9905396050428550041U, 9228014311730625771U, |
| 171 | +13154226096333843480U, 13516719503928509216U, 12264699899470662472U, 11768891770841246778U, |
| 172 | +11836546934201131773U, 12203601119882644933U, 13328994472388527533U, 12798507759874630367U, |
| 173 | +12277767672444305266U, 12068343612890878026U, 13176021535246260258U, 13816435502572994384U, |
| 174 | +12705517425460601090U, 13640043170446921274U, 12460006250421962322U, 11929369723008524576U, |
| 175 | +10597232027372843475U, 11387585128312430315U, 10351852510211364483U, 9713802769929286129U, |
| 176 | +9357917249443839798U, 10143859113470169102U, 11342251114164164710U, 10664720106027613972U |
| 177 | +}; |
| 178 | + |
15 | 179 | // offset for the complement base in the random seeds table |
16 | 180 | const uint8_t cpOff = 0x07; |
17 | 181 |
|
@@ -187,6 +351,11 @@ inline uint64_t rol1(const uint64_t v) { |
187 | 351 | return (v << 1) | (v >> 63); |
188 | 352 | } |
189 | 353 |
|
| 354 | +// rotate "v" to the left x position |
| 355 | +inline uint64_t rolx(const uint64_t v, const unsigned x) { |
| 356 | + return (v << x) | (v >> (64 - x)); |
| 357 | +} |
| 358 | + |
190 | 359 | // rotate "v" to the right by 1 position |
191 | 360 | inline uint64_t ror1(const uint64_t v) { |
192 | 361 | return (v >> 1) | (v << 63); |
@@ -216,24 +385,55 @@ inline uint64_t swapbits3263(const uint64_t v) { |
216 | 385 | return v ^ ((x << 32) | (x << 63)); |
217 | 386 | } |
218 | 387 |
|
| 388 | +inline uint64_t swapxbits033(const uint64_t v, const unsigned x) { |
| 389 | + uint64_t y = (v ^ (v >> 33)) & (std::numeric_limits<uint64_t>::max() >> (64 - x)); |
| 390 | + return v ^ (y | (y << 33)); |
| 391 | +} |
| 392 | + |
219 | 393 | // forward-strand hash value of the base kmer, i.e. fhval(kmer_0) |
220 | 394 | inline uint64_t NTF64(const char * kmerSeq, const unsigned k) { |
221 | 395 | uint64_t hVal=0; |
222 | | - for(unsigned i=0; i<k; i++) { |
223 | | - hVal = rol1(hVal); |
224 | | - hVal = swapbits033(hVal); |
225 | | - hVal ^= seedTab[(unsigned char)kmerSeq[i]]; |
| 396 | + for (unsigned i=0; i<k/4; i++) { |
| 397 | + hVal = rolx(hVal, 4); |
| 398 | + hVal = swapxbits033(hVal, 4); |
| 399 | + uint8_t currOffSet = 4 * i; |
| 400 | + uint8_t tetramerLoc = 64 * convertTab[(unsigned char)kmerSeq[currOffSet]] + 16 * convertTab[(unsigned char)kmerSeq[currOffSet + 1]] + 4 * convertTab[(unsigned char)kmerSeq[currOffSet + 2]] + convertTab[(unsigned char)kmerSeq[currOffSet + 3]]; |
| 401 | + hVal ^= tetramerTab[tetramerLoc]; |
226 | 402 | } |
| 403 | + unsigned remainder = k % 4; |
| 404 | + hVal = rolx(hVal, remainder); |
| 405 | + hVal = swapxbits033(hVal, remainder); |
| 406 | + if (remainder == 3) { |
| 407 | + uint8_t trimerLoc = 16 * convertTab[(unsigned char)kmerSeq[k - 3]] + 4 * convertTab[(unsigned char)kmerSeq[k - 2]] + convertTab[(unsigned char)kmerSeq[k - 1]]; |
| 408 | + hVal ^= trimerTab[trimerLoc]; |
| 409 | + } else if (remainder == 2) { |
| 410 | + uint8_t dimerLoc = 4 * convertTab[(unsigned char)kmerSeq[k - 2]] + convertTab[(unsigned char)kmerSeq[k - 1]]; |
| 411 | + hVal ^= dimerTab[dimerLoc]; |
| 412 | + } else if (remainder == 1) { |
| 413 | + hVal ^= seedTab[(unsigned char)kmerSeq[k - 1]]; |
| 414 | + } |
227 | 415 | return hVal; |
228 | 416 | } |
229 | 417 |
|
230 | 418 | // reverse-strand hash value of the base kmer, i.e. rhval(kmer_0) |
231 | 419 | inline uint64_t NTR64(const char * kmerSeq, const unsigned k) { |
232 | 420 | uint64_t hVal=0; |
233 | | - for(unsigned i=0; i<k; i++) { |
234 | | - hVal = rol1(hVal); |
235 | | - hVal = swapbits033(hVal); |
236 | | - hVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]&cpOff]; |
| 421 | + unsigned remainder = k % 4; |
| 422 | + if (remainder == 3) { |
| 423 | + uint8_t trimerLoc = 16 * RCconvertTab[(unsigned char)kmerSeq[k - 1]] + 4 * RCconvertTab[(unsigned char)kmerSeq[k - 2]] + RCconvertTab[(unsigned char)kmerSeq[k - 3]]; |
| 424 | + hVal ^= trimerTab[trimerLoc]; |
| 425 | + } else if (remainder == 2) { |
| 426 | + uint8_t dimerLoc = 4 * RCconvertTab[(unsigned char)kmerSeq[k - 1]] + RCconvertTab[(unsigned char)kmerSeq[k - 2]]; |
| 427 | + hVal ^= dimerTab[dimerLoc]; |
| 428 | + } else if (remainder == 1) { |
| 429 | + hVal ^= seedTab[(unsigned char)kmerSeq[k-1]&cpOff]; |
| 430 | + } |
| 431 | + for(unsigned i=0; i<k/4; i++) { |
| 432 | + hVal = rolx(hVal, 4); |
| 433 | + hVal = swapxbits033(hVal, 4); |
| 434 | + uint8_t currOffSet = 4 * (k / 4 - i) - 1; |
| 435 | + uint8_t tetramerLoc= 64 * RCconvertTab[(unsigned char)kmerSeq[currOffSet]] + 16 * RCconvertTab[(unsigned char)kmerSeq[currOffSet - 1]] + 4 * RCconvertTab[(unsigned char)kmerSeq[currOffSet - 2]] + RCconvertTab[(unsigned char)kmerSeq[currOffSet - 3]]; |
| 436 | + hVal ^= tetramerTab[tetramerLoc]; |
237 | 437 | } |
238 | 438 | return hVal; |
239 | 439 | } |
|
0 commit comments