Skip to content

Commit 07a3a5e

Browse files
authored
Merge pull request #80 from dib-lab/aaKmers
2 parents e351568 + e03f554 commit 07a3a5e

16 files changed

Lines changed: 416 additions & 111 deletions

File tree

include/kProcessor/algorithms.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,8 @@ kmerDecoder* initialize_kmerDecoder(int kmer_size, int hash_mode = 1);
9494
* Mode 1: Integer Hashing | Reversible | Full Hashing
9595
* Mode 2: TwoBitsHashing | Not considered hashing, just store the two bits representation
9696
*/
97-
void kmerDecoder_setHashing(kmerDecoder * KD, int hash_mode, bool canonical = true);
98-
void kmerDecoder_setHashing(kDataFrame * KF, int hash_mode, bool canonical = true);
97+
void kmerDecoder_setHashing(kmerDecoder * KD, hashingModes hash_mode);
98+
void kmerDecoder_setHashing(kDataFrame * KF, hashingModes hash_mode);
9999

100100
/// Perform indexing to a sequences file with predefined kmers decoding mode, returns a colored kDataframe.
101101
colored_kDataFrame *index(kmerDecoder *KD, string names_fileName, kDataFrame *frame);
@@ -104,5 +104,7 @@ colored_kDataFrame *index(kmerDecoder *KD, string names_fileName, kDataFrame *fr
104104
colored_kDataFrame *
105105
index(kDataFrame *frame, std::map<std::string, int> parse_params, string filename, int chunks, string names_fileName);
106106

107+
colored_kDataFrame * index(kDataFrame *frame, string filename, int chunks, string names_fileName);
108+
107109
}
108110
#endif

include/kProcessor/kDataFrame.hpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -340,13 +340,17 @@ class kDataFrameMQF: public kDataFrame{
340340
public:
341341
kDataFrameMQF();
342342
kDataFrameMQF(uint64_t kSize);
343-
kDataFrameMQF(uint64_t kSize, int mode);
343+
kDataFrameMQF(uint64_t kSize, hashingModes hash_mode);
344+
kDataFrameMQF(readingModes RM, hashingModes HM, map<string, int> params);
344345
kDataFrameMQF(uint64_t ksize,uint8_t q,uint8_t fixedCounterSize,uint8_t tagSize
345346
,double falsePositiveRate);
346347

347-
kDataFrameMQF(uint64_t ksize, uint8_t q, int mode);
348+
kDataFrameMQF(uint64_t ksize, uint8_t q, hashingModes hash_mode = integer_hasher);
348349

349350
kDataFrameMQF(QF* mqf,uint64_t ksize,double falsePositiveRate);
351+
kDataFrameMQF(QF *mqf, uint64_t ksize, hashingModes hash_mode);
352+
kDataFrameMQF(QF *mqf, readingModes RM, hashingModes HM, map<string, int> params);
353+
350354
//count histogram is array where count of kmers repeated n times is found at index n. index 0 holds number of distinct kmers.
351355
kDataFrameMQF(uint64_t ksize,vector<uint64_t> countHistogram,uint8_t tagSize
352356
,double falsePositiveRate);
@@ -439,7 +443,9 @@ class kDataFrameBMQF: public kDataFrame{
439443
kDataFrameIterator* endIterator;
440444
public:
441445
kDataFrameBMQF();
442-
kDataFrameBMQF(uint64_t kSize);
446+
kDataFrameBMQF(readingModes RM, hashingModes HM, map<string, int> params);
447+
kDataFrameBMQF(uint64_t kSize, hashingModes hash_mode = integer_hasher);
448+
kDataFrameBMQF(bufferedMQF* bufferedmqf, readingModes RM, hashingModes HM, map<string, int> params);
443449
kDataFrameBMQF(uint64_t ksize,uint8_t q,uint8_t fixedCounterSize,uint8_t tagSize,double falsePositiveRate);
444450
kDataFrameBMQF(bufferedMQF* bufferedmqf,uint64_t ksize,double falsePositiveRate);
445451
//count histogram is array where count of kmers repeated n times is found at index n. index 0 holds number of distinct kmers.
@@ -511,6 +517,7 @@ class kDataFrameMAP : public kDataFrame
511517
public:
512518
kDataFrameMAP();
513519
kDataFrameMAP(uint64_t ksize);
520+
kDataFrameMAP(readingModes RM, hashingModes HM, map<string, int> params);
514521
kDataFrameMAP(uint64_t kSize,vector<uint64_t> kmersHistogram);
515522
kDataFrame* getTwin();
516523
void reserve (uint64_t n);
@@ -588,9 +595,9 @@ class kDataFramePHMAP : public kDataFrame {
588595
flat_hash_map<uint64_t, uint64_t> MAP;
589596
public:
590597
kDataFramePHMAP();
591-
592598
kDataFramePHMAP(uint64_t ksize);
593-
kDataFramePHMAP(uint64_t ksize, int mode);
599+
kDataFramePHMAP(readingModes RM, hashingModes hash_mode, map<string, int> params);
600+
kDataFramePHMAP(uint64_t ksize, hashingModes hash_mode);
594601
kDataFramePHMAP(uint64_t kSize,vector<uint64_t> kmersHistogram);
595602

596603
kDataFrame *getTwin();

src/algorithms.cpp

Lines changed: 201 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ namespace kProcessor {
355355

356356
// Clone the hashing
357357

358-
kmerDecoder_setHashing(KD, kframe->KD->hash_mode, kframe->KD->canonical);
358+
kmerDecoder_setHashing(KD, kframe->KD->hash_mode);
359359

360360
// Processing
361361

@@ -407,7 +407,7 @@ namespace kProcessor {
407407

408408
// Clone the hashing
409409

410-
kmerDecoder_setHashing(KD, frame->KD->hash_mode, frame->KD->canonical);
410+
kmerDecoder_setHashing(KD, frame->KD->hash_mode);
411411

412412
if (KD->get_kSize() != (int)frame->getkSize()) {
413413
std::cerr << "kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
@@ -537,12 +537,12 @@ namespace kProcessor {
537537
}
538538

539539

540-
void kmerDecoder_setHashing(kmerDecoder * KD, int hash_mode, bool canonical){
541-
KD->setHashingMode(hash_mode, canonical);
540+
void kmerDecoder_setHashing(kmerDecoder * KD, hashingModes hash_mode){
541+
KD->setHashingMode(hash_mode, KD->get_kSize());
542542
}
543543

544-
void kmerDecoder_setHashing(kDataFrame * KF, int hash_mode, bool canonical){
545-
KF->KD->setHashingMode(hash_mode, canonical);
544+
void kmerDecoder_setHashing(kDataFrame * KF, hashingModes hash_mode){
545+
KF->KD->setHashingMode(hash_mode, KF->ksize());
546546
}
547547

548548

@@ -641,18 +641,14 @@ namespace kProcessor {
641641
}
642642
}
643643

644-
kmerDecoder* initialize_kmerDecoder(int kSize, int hash_mode){
645-
// Mode 0: Murmar Hashing | Irreversible
646-
// Mode 1: Integer Hashing | Reversible | Full Hashing
647-
// Mode 2: TwoBitsHashing | Not considered hashing, just store the two bits representation
648-
return new Kmers(kSize, hash_mode);
644+
kmerDecoder* initialize_kmerDecoder(int kSize, hashingModes HM = integer_hasher){
645+
return new Kmers(kSize, HM);
649646
}
650647

651648
colored_kDataFrame *index(kmerDecoder *KD, string names_fileName, kDataFrame *frame) {
652649

653-
if (KD->get_kSize() != (int)frame->ksize()) {
654-
std::cerr << "kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
655-
exit(1);
650+
if (KD->get_kSize() != (int)frame->ksize() && (KD->hash_mode != protein_hasher || KD->hash_mode != proteinDayhoff_hasher)) {
651+
std::cerr << "WARNING: kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
656652
}
657653

658654
flat_hash_map<string, string> namesMap;
@@ -847,7 +843,7 @@ namespace kProcessor {
847843

848844
// Clone the hashing
849845

850-
kmerDecoder_setHashing(KD, frame->KD->hash_mode, frame->KD->canonical);
846+
kmerDecoder_setHashing(KD, frame->KD->hash_mode);
851847

852848
// Processing
853849

@@ -1028,6 +1024,196 @@ namespace kProcessor {
10281024
return res;
10291025
}
10301026

1027+
colored_kDataFrame * index(kDataFrame * frame, string filename, int chunk_size, string names_fileName){
1028+
1029+
// parse_params["mode"] = 1 > Default: Kmers
1030+
// parse_params["mode"] = 2 > Skipmers
1031+
// parse_params["mode"] = 3 > Minimizers
1032+
1033+
// Initialize kmerDecoder
1034+
map<string, int> params = kmerDecoder::string_to_params(frame->KD->params_to_string());
1035+
1036+
kmerDecoder * KD = kmerDecoder::getInstance(filename, chunk_size, frame->KD->slicing_mode, frame->KD->hash_mode, params);
1037+
1038+
// Processing
1039+
1040+
if (KD->get_kSize() != (int)frame->ksize()) {
1041+
std::cerr << "kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
1042+
exit(1);
1043+
}
1044+
1045+
flat_hash_map<string, string> namesMap;
1046+
flat_hash_map<string, uint64_t> tagsMap;
1047+
flat_hash_map<string, uint64_t> groupNameMap;
1048+
auto *legend = new flat_hash_map<uint64_t, std::vector<uint32_t>>();
1049+
flat_hash_map<uint64_t, uint64_t> colorsCount;
1050+
uint64_t readID = 0, groupID = 1;
1051+
ifstream namesFile(names_fileName.c_str());
1052+
string seqName, groupName;
1053+
string line;
1054+
priority_queue<uint64_t, vector<uint64_t>, std::greater<uint64_t>> freeColors;
1055+
flat_hash_map<string, uint64_t> groupCounter;
1056+
// while (namesFile >> seqName >> groupName) {
1057+
while (std::getline(namesFile, line)) {
1058+
std::vector<string> tokens;
1059+
std::istringstream iss(line);
1060+
std::string token;
1061+
while(std::getline(iss, token, '\t')) // but we can specify a different one
1062+
tokens.push_back(token);
1063+
seqName = tokens[0];
1064+
groupName = tokens[1];
1065+
namesMap.insert(make_pair(seqName, groupName));
1066+
auto it = groupNameMap.find(groupName);
1067+
groupCounter[groupName]++;
1068+
if (it == groupNameMap.end()) {
1069+
groupNameMap.insert(make_pair(groupName, groupID));
1070+
tagsMap.insert(make_pair(to_string(groupID), groupID));
1071+
vector<uint32_t> tmp;
1072+
tmp.clear();
1073+
tmp.push_back(groupID);
1074+
legend->insert(make_pair(groupID, tmp));
1075+
colorsCount.insert(make_pair(groupID, 0));
1076+
groupID++;
1077+
}
1078+
}
1079+
1080+
flat_hash_map<uint64_t, string> inv_groupNameMap;
1081+
for (auto &_ : groupNameMap)
1082+
inv_groupNameMap[_.second] = _.first;
1083+
1084+
1085+
vector<kDataFrameMQF *> frames;
1086+
int currIndex = 0;
1087+
string kmer;
1088+
uint64_t tagBits = 0;
1089+
uint64_t maxTagValue = (1ULL << tagBits) - 1;
1090+
// kDataFrame *frame;
1091+
int kSize = KD->get_kSize();
1092+
1093+
1094+
uint64_t lastTag = 0;
1095+
readID = 0;
1096+
int __batch_count = 0;
1097+
while (!KD->end()) {
1098+
KD->next_chunk();
1099+
cout << "Processing Chunk(" << ++__batch_count << "): " << chunk_size*__batch_count << " seqs ..." << endl;
1100+
flat_hash_map<uint64_t, uint64_t> convertMap;
1101+
1102+
for (const auto &seq : *KD->getKmers()) {
1103+
string readName = seq.first;
1104+
1105+
auto it = namesMap.find(readName);
1106+
if (it == namesMap.end()) {
1107+
continue;
1108+
// cout << "read " << readName << "dont have group. Please check the group names file." << endl;
1109+
}
1110+
string groupName = it->second;
1111+
1112+
uint64_t readTag = groupNameMap.find(groupName)->second;
1113+
1114+
1115+
convertMap.clear();
1116+
convertMap.insert(make_pair(0, readTag));
1117+
convertMap.insert(make_pair(readTag, readTag));
1118+
// cout<<readName<<" "<<seq.size()<<endl;
1119+
for (const auto &kmer : seq.second) {
1120+
uint64_t currentTag = frame->getCount(kmer.hash);
1121+
auto itc = convertMap.find(currentTag);
1122+
if (itc == convertMap.end()) {
1123+
vector<uint32_t> colors = legend->find(currentTag)->second;
1124+
auto tmpiT = find(colors.begin(), colors.end(), readTag);
1125+
if (tmpiT == colors.end()) {
1126+
colors.push_back(readTag);
1127+
sort(colors.begin(), colors.end());
1128+
}
1129+
1130+
string colorsString = to_string(colors[0]);
1131+
for (int k = 1; k < colors.size(); k++) {
1132+
colorsString += ";" + to_string(colors[k]);
1133+
}
1134+
1135+
auto itTag = tagsMap.find(colorsString);
1136+
if (itTag == tagsMap.end()) {
1137+
uint64_t newColor;
1138+
if (freeColors.size() == 0) {
1139+
newColor = groupID++;
1140+
} else {
1141+
newColor = freeColors.top();
1142+
freeColors.pop();
1143+
}
1144+
1145+
tagsMap.insert(make_pair(colorsString, newColor));
1146+
legend->insert(make_pair(newColor, colors));
1147+
itTag = tagsMap.find(colorsString);
1148+
colorsCount[newColor] = 0;
1149+
// if(groupID>=maxTagValue){
1150+
// cerr<<"Tag size is not enough. ids reached "<<groupID<<endl;
1151+
// return -1;
1152+
// }
1153+
}
1154+
uint64_t newColor = itTag->second;
1155+
1156+
convertMap.insert(make_pair(currentTag, newColor));
1157+
itc = convertMap.find(currentTag);
1158+
}
1159+
1160+
if (itc->second != currentTag) {
1161+
1162+
colorsCount[currentTag]--;
1163+
if (colorsCount[currentTag] == 0 && currentTag != 0) {
1164+
1165+
auto _invGroupNameIT = inv_groupNameMap.find(currentTag);
1166+
if (_invGroupNameIT == inv_groupNameMap.end()){
1167+
freeColors.push(currentTag);
1168+
vector<uint32_t> colors = legend->find(currentTag)->second;
1169+
string colorsString = to_string(colors[0]);
1170+
for (unsigned int k = 1; k < colors.size(); k++) {
1171+
colorsString += ";" + to_string(colors[k]);
1172+
}
1173+
tagsMap.erase(colorsString);
1174+
legend->erase(currentTag);
1175+
if (convertMap.find(currentTag) != convertMap.end())
1176+
convertMap.erase(currentTag);
1177+
}
1178+
1179+
}
1180+
colorsCount[itc->second]++;
1181+
}
1182+
1183+
frame->setCount(kmer.hash, itc->second);
1184+
if (frame->getCount(kmer.hash) != itc->second) {
1185+
//frame->setC(kmer,itc->second);
1186+
cout << "Error Founded " << kmer.str << " from sequence " << readName << " expected "
1187+
<< itc->second << " found " << frame->getCount(kmer.hash) << endl;
1188+
return NULL;
1189+
}
1190+
}
1191+
readID += 1;
1192+
groupCounter[groupName]--;
1193+
if (colorsCount[readTag] == 0) {
1194+
if (groupCounter[groupName] == 0) {
1195+
freeColors.push(readTag);
1196+
legend->erase(readTag);
1197+
}
1198+
}
1199+
}
1200+
}
1201+
colorTable *colors = new intVectorsTable();
1202+
for (auto it : *legend) {
1203+
colors->setColor(it.first, it.second);
1204+
}
1205+
1206+
colored_kDataFrame *res = new colored_kDataFrame();
1207+
res->setColorTable(colors);
1208+
res->setkDataFrame(frame);
1209+
for (auto iit = namesMap.begin(); iit != namesMap.end(); iit++) {
1210+
uint32_t sampleID = groupNameMap[iit->second];
1211+
res->namesMap[sampleID] = iit->second;
1212+
res->namesMapInv[iit->second] = sampleID;
1213+
}
1214+
return res;
1215+
}
1216+
10311217

10321218
vector<uint64_t> estimateKmersHistogram(string fileName, int kSize ,int threads)
10331219
{

0 commit comments

Comments
 (0)