Skip to content

Commit eea1058

Browse files
authored
stHashIterator enhancement (#14)
* nthash.hpp: add Vlad's code for multi-seed multi-hash support * nthash.hpp: remove double semi-colon * stHashIterator.hpp: add Vlad's code for multi-seed multi-hash support * stHashIterator.hpp: fix style * UnitTests.cpp: update code to reflect stHashIterator changes
1 parent 9df1464 commit eea1058

3 files changed

Lines changed: 89 additions & 24 deletions

File tree

nthash.hpp

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,5 +616,66 @@ inline void NTMS64(const char *kmerSeq, const std::vector<std::vector<unsigned>
616616
}
617617
}
618618

619+
// strand-aware multihash spaced seed ntHash with multiple hashes per seed
620+
inline bool NTMSM64(const char *kmerSeq, const std::vector<std::vector<unsigned> > &seedSeq, const unsigned k, const unsigned m, const unsigned m2,
621+
uint64_t& fhVal, uint64_t& rhVal, unsigned& locN, uint64_t* hVal, bool *hStn)
622+
{
623+
fhVal=rhVal=0;
624+
locN=0;
625+
for(int i=k-1; i>=0; i--) {
626+
if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
627+
locN=i;
628+
return false;
629+
}
630+
fhVal = rol1(fhVal);
631+
fhVal = swapbits033(fhVal);
632+
fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
633+
634+
rhVal = rol1(rhVal);
635+
rhVal = swapbits033(rhVal);
636+
rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
637+
}
638+
639+
for(unsigned j=0; j<m; j++) {
640+
uint64_t fsVal=fhVal, rsVal=rhVal;
641+
for(std::vector<unsigned>::const_iterator i=seedSeq[j].begin(); i!=seedSeq[j].end(); ++i) {
642+
fsVal ^= (msTab31l[(unsigned char)kmerSeq[*i]][(k-1-*i)%31] | msTab33r[(unsigned char)kmerSeq[*i]][(k-1-*i)%33]);
643+
rsVal ^= (msTab31l[(unsigned char)kmerSeq[*i]&cpOff][*i%31] | msTab33r[(unsigned char)kmerSeq[*i]&cpOff][*i%33]);
644+
}
645+
hStn[j * m2] = rsVal<fsVal;
646+
hVal[j * m2] = hStn[j * m2]? rsVal : fsVal;
647+
for(unsigned j2=1; j2<m2; j2++) {
648+
uint64_t tVal = hVal[j * m2] * (j2 ^ k * multiSeed);
649+
tVal ^= tVal >> multiShift;
650+
hStn[j * m2 + j2] = hStn[j * m2];
651+
hVal[j * m2 + j2] = tVal;
652+
}
653+
}
654+
return true;
655+
}
656+
657+
// strand-aware multihash spaced seed ntHash for sliding k-mers with multiple hashes per seed
658+
inline void NTMSM64(const char *kmerSeq, const std::vector<std::vector<unsigned> > &seedSeq, const unsigned char charOut, const unsigned char charIn,
659+
const unsigned k, const unsigned m, const unsigned m2, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal, bool *hStn)
660+
{
661+
fhVal = NTF64(fhVal,k,charOut,charIn);
662+
rhVal = NTR64(rhVal,k,charOut,charIn);
663+
for(unsigned j=0; j<m; j++) {
664+
uint64_t fsVal=fhVal, rsVal=rhVal;
665+
for(std::vector<unsigned>::const_iterator i=seedSeq[j].begin(); i!=seedSeq[j].end(); ++i) {
666+
fsVal ^= (msTab31l[(unsigned char)kmerSeq[*i]][(k-1-*i)%31] | msTab33r[(unsigned char)kmerSeq[*i]][(k-1-*i)%33]);
667+
rsVal ^= (msTab31l[(unsigned char)kmerSeq[*i]&cpOff][*i%31] | msTab33r[(unsigned char)kmerSeq[*i]&cpOff][*i%33]);
668+
}
669+
hStn[j * m2] = rsVal<fsVal;
670+
hVal[j * m2] = hStn[j * m2]? rsVal : fsVal;
671+
for(unsigned j2=1; j2<m2; j2++) {
672+
uint64_t tVal = hVal[j * m2] * (j2 ^ k * multiSeed);
673+
tVal ^= tVal >> multiShift;
674+
hStn[j * m2 + j2] = hStn[j * m2];
675+
hVal[j * m2 + j2] = tVal;
676+
}
677+
}
678+
}
679+
619680

620681
#endif

stHashIterator.hpp

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class stHashIterator
1919
{
2020

2121
public:
22-
22+
2323
static std::vector<std::vector<unsigned> > parseSeed(const std::vector<std::string> &seedString) {
2424
std::vector<std::vector<unsigned> > seedSet;
2525
for (unsigned i=0; i< seedString.size(); i++) {
@@ -31,7 +31,7 @@ class stHashIterator
3131
}
3232
return seedSet;
3333
}
34-
34+
3535
/**
3636
* Default constructor. Creates an iterator pointing to
3737
* the end of the iterator range.
@@ -47,14 +47,15 @@ class stHashIterator
4747
* @param seq address of DNA sequence to be hashed
4848
* @param seed address of spaced seed
4949
* @param k k-mer size
50-
* @param h number of hashes
50+
* @param h number of seeds
51+
* @param h2 number of hashes per seed
5152
*/
52-
stHashIterator(const std::string& seq, const std::vector<std::vector<unsigned> >& seed, unsigned h, unsigned k):
53-
m_seq(seq), m_seed(seed), m_h(h), m_k(k), m_hVec(new uint64_t[h]), m_hStn(new bool[h]), m_pos(0)
53+
stHashIterator(const std::string& seq, const std::vector<std::vector<unsigned> >& seed, unsigned h, unsigned h2, unsigned k):
54+
m_seq(seq), m_seed(seed), m_h(h), m_h2(h2), m_k(k), m_hVec(new uint64_t[h * h2]), m_hStn(new bool[h * h2]), m_pos(0)
5455
{
5556
init();
5657
}
57-
58+
5859
/** Initialize internal state of iterator */
5960
void init()
6061
{
@@ -63,7 +64,7 @@ class stHashIterator
6364
return;
6465
}
6566
unsigned locN=0;
66-
while (m_pos<m_seq.length()-m_k+1 && !NTMS64(m_seq.data()+m_pos, m_seed, m_k, m_h, m_fhVal, m_rhVal, locN, m_hVec, m_hStn))
67+
while (m_pos<m_seq.length()-m_k+1 && !NTMSM64(m_seq.data()+m_pos, m_seed, m_k, m_h, m_h2, m_fhVal, m_rhVal, locN, m_hVec, m_hStn))
6768
m_pos+=locN+1;
6869
if (m_pos >= m_seq.length()-m_k+1)
6970
m_pos = std::numeric_limits<std::size_t>::max();
@@ -82,9 +83,9 @@ class stHashIterator
8283
init();
8384
}
8485
else
85-
NTMS64(m_seq.data()+m_pos, m_seed, m_seq.at(m_pos-1), m_seq.at(m_pos-1+m_k), m_k, m_h, m_fhVal, m_rhVal, m_hVec, m_hStn);
86+
NTMSM64(m_seq.data()+m_pos, m_seed, m_seq.at(m_pos-1), m_seq.at(m_pos-1+m_k), m_k, m_h, m_h2, m_fhVal, m_rhVal, m_hVec, m_hStn);
8687
}
87-
88+
8889
size_t pos() const{
8990
return m_pos;
9091
}
@@ -95,13 +96,13 @@ class stHashIterator
9596
return m_hStn;
9697
}
9798

98-
99+
99100
/** get pointer to hash values for current k-mer */
100101
const uint64_t* operator*() const
101102
{
102103
return m_hVec;
103104
}
104-
105+
105106

106107
/** test equality with another iterator */
107108
bool operator==(const stHashIterator& it) const
@@ -114,20 +115,20 @@ class stHashIterator
114115
{
115116
return !(*this == it);
116117
}
117-
118+
118119
/** pre-increment operator */
119120
stHashIterator& operator++()
120121
{
121122
next();
122123
return *this;
123124
}
124-
125+
125126
/** iterator pointing to one past last element */
126127
static const stHashIterator end()
127128
{
128129
return stHashIterator();
129130
}
130-
131+
131132
/** destructor */
132133
~stHashIterator() {
133134
if(m_hVec!=NULL) {
@@ -137,16 +138,19 @@ class stHashIterator
137138
}
138139

139140
private:
140-
141+
141142
/** DNA sequence */
142143
std::string m_seq;
143-
144+
144145
/** Spaced Seed sequence */
145146
std::vector<std::vector<unsigned> > m_seed;
146-
147-
/** number of hashes */
147+
148+
/** number of seeds */
148149
unsigned m_h;
149150

151+
/** number of hashes per seed */
152+
unsigned m_h2;
153+
150154
/** k-mer size */
151155
unsigned m_k;
152156

@@ -155,7 +159,7 @@ class stHashIterator
155159

156160
/** hash strands, forward = 0, reverse-complement = 1 */
157161
bool *m_hStn;
158-
162+
159163
/** position of current k-mer */
160164
size_t m_pos;
161165

unittest/UnitTests.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ TEST_CASE("test fixture", "[UnitTests]")
9797
std::vector<std::string> seedString;
9898
seedString.push_back("11111100000000111111");
9999

100-
auto h = seedString.size();
100+
auto numSeeds = seedString.size();
101101
auto k = seedString[0].size();
102102

103103
std::vector<std::vector<unsigned>> seedSet = stHashIterator::parseSeed(seedString);
@@ -107,11 +107,11 @@ TEST_CASE("test fixture", "[UnitTests]")
107107
std::string kmerM2 = "ACGTACACTGTACTGAGTCT";
108108
std::string kmerM3 = "ACGTACACTGCACTGAGTCT";
109109

110-
stHashIterator ssItr(kmer, seedSet, h, k);
110+
stHashIterator ssItr(kmer, seedSet, numSeeds, 1, k);
111111

112-
stHashIterator ssM1Itr(kmerM1, seedSet, h, k);
113-
stHashIterator ssM2Itr(kmerM2, seedSet, h, k);
114-
stHashIterator ssM3Itr(kmerM3, seedSet, h, k);
112+
stHashIterator ssM1Itr(kmerM1, seedSet, numSeeds, 1, k);
113+
stHashIterator ssM2Itr(kmerM2, seedSet, numSeeds, 1, k);
114+
stHashIterator ssM3Itr(kmerM3, seedSet, numSeeds, 1, k);
115115

116116
assert((*ssItr)[0] == (*ssM1Itr)[0]);
117117
assert((*ssItr)[0] == (*ssM2Itr)[0]);

0 commit comments

Comments
 (0)