@@ -355,7 +355,7 @@ namespace kProcessor {
355355
356356 // Clone the hashing
357357
358- kmerDecoder_setHashing (KD, kframe->KD ->hash_mode , kframe-> KD -> canonical );
358+ kmerDecoder_setHashing (KD, kframe->KD ->hash_mode );
359359
360360 // Processing
361361
@@ -407,7 +407,7 @@ namespace kProcessor {
407407
408408 // Clone the hashing
409409
410- kmerDecoder_setHashing (KD, frame->KD ->hash_mode , frame-> KD -> canonical );
410+ kmerDecoder_setHashing (KD, frame->KD ->hash_mode );
411411
412412 if (KD->get_kSize () != (int )frame->getkSize ()) {
413413 std::cerr << " kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
@@ -537,12 +537,12 @@ namespace kProcessor {
537537 }
538538
539539
540- void kmerDecoder_setHashing (kmerDecoder * KD, int hash_mode, bool canonical ){
541- KD->setHashingMode (hash_mode, canonical );
540+ void kmerDecoder_setHashing (kmerDecoder * KD, hashingModes hash_mode){
541+ KD->setHashingMode (hash_mode, KD-> get_kSize () );
542542 }
543543
544- void kmerDecoder_setHashing (kDataFrame * KF, int hash_mode, bool canonical ){
545- KF->KD ->setHashingMode (hash_mode, canonical );
544+ void kmerDecoder_setHashing (kDataFrame * KF, hashingModes hash_mode){
545+ KF->KD ->setHashingMode (hash_mode, KF-> ksize () );
546546 }
547547
548548
@@ -641,18 +641,14 @@ namespace kProcessor {
641641 }
642642 }
643643
644- kmerDecoder* initialize_kmerDecoder (int kSize , int hash_mode){
645- // Mode 0: Murmar Hashing | Irreversible
646- // Mode 1: Integer Hashing | Reversible | Full Hashing
647- // Mode 2: TwoBitsHashing | Not considered hashing, just store the two bits representation
648- return new Kmers (kSize , hash_mode);
644+ kmerDecoder* initialize_kmerDecoder (int kSize , hashingModes HM = integer_hasher){
645+ return new Kmers (kSize , HM);
649646 }
650647
651648 colored_kDataFrame *index (kmerDecoder *KD, string names_fileName, kDataFrame *frame) {
652649
653- if (KD->get_kSize () != (int )frame->ksize ()) {
654- std::cerr << " kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
655- exit (1 );
650+ if (KD->get_kSize () != (int )frame->ksize () && (KD->hash_mode != protein_hasher || KD->hash_mode != proteinDayhoff_hasher)) {
651+ std::cerr << " WARNING: kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
656652 }
657653
658654 flat_hash_map<string, string> namesMap;
@@ -847,7 +843,7 @@ namespace kProcessor {
847843
848844 // Clone the hashing
849845
850- kmerDecoder_setHashing (KD, frame->KD ->hash_mode , frame-> KD -> canonical );
846+ kmerDecoder_setHashing (KD, frame->KD ->hash_mode );
851847
852848 // Processing
853849
@@ -1028,6 +1024,196 @@ namespace kProcessor {
10281024 return res;
10291025 }
10301026
1027+ colored_kDataFrame * index (kDataFrame * frame, string filename, int chunk_size, string names_fileName){
1028+
1029+ // parse_params["mode"] = 1 > Default: Kmers
1030+ // parse_params["mode"] = 2 > Skipmers
1031+ // parse_params["mode"] = 3 > Minimizers
1032+
1033+ // Initialize kmerDecoder
1034+ map<string, int > params = kmerDecoder::string_to_params (frame->KD ->params_to_string ());
1035+
1036+ kmerDecoder * KD = kmerDecoder::getInstance (filename, chunk_size, frame->KD ->slicing_mode , frame->KD ->hash_mode , params);
1037+
1038+ // Processing
1039+
1040+ if (KD->get_kSize () != (int )frame->ksize ()) {
1041+ std::cerr << " kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
1042+ exit (1 );
1043+ }
1044+
1045+ flat_hash_map<string, string> namesMap;
1046+ flat_hash_map<string, uint64_t > tagsMap;
1047+ flat_hash_map<string, uint64_t > groupNameMap;
1048+ auto *legend = new flat_hash_map<uint64_t , std::vector<uint32_t >>();
1049+ flat_hash_map<uint64_t , uint64_t > colorsCount;
1050+ uint64_t readID = 0 , groupID = 1 ;
1051+ ifstream namesFile (names_fileName.c_str ());
1052+ string seqName, groupName;
1053+ string line;
1054+ priority_queue<uint64_t , vector<uint64_t >, std::greater<uint64_t >> freeColors;
1055+ flat_hash_map<string, uint64_t > groupCounter;
1056+ // while (namesFile >> seqName >> groupName) {
1057+ while (std::getline (namesFile, line)) {
1058+ std::vector<string> tokens;
1059+ std::istringstream iss (line);
1060+ std::string token;
1061+ while (std::getline (iss, token, ' \t ' )) // but we can specify a different one
1062+ tokens.push_back (token);
1063+ seqName = tokens[0 ];
1064+ groupName = tokens[1 ];
1065+ namesMap.insert (make_pair (seqName, groupName));
1066+ auto it = groupNameMap.find (groupName);
1067+ groupCounter[groupName]++;
1068+ if (it == groupNameMap.end ()) {
1069+ groupNameMap.insert (make_pair (groupName, groupID));
1070+ tagsMap.insert (make_pair (to_string (groupID), groupID));
1071+ vector<uint32_t > tmp;
1072+ tmp.clear ();
1073+ tmp.push_back (groupID);
1074+ legend->insert (make_pair (groupID, tmp));
1075+ colorsCount.insert (make_pair (groupID, 0 ));
1076+ groupID++;
1077+ }
1078+ }
1079+
1080+ flat_hash_map<uint64_t , string> inv_groupNameMap;
1081+ for (auto &_ : groupNameMap)
1082+ inv_groupNameMap[_.second ] = _.first ;
1083+
1084+
1085+ vector<kDataFrameMQF *> frames;
1086+ int currIndex = 0 ;
1087+ string kmer;
1088+ uint64_t tagBits = 0 ;
1089+ uint64_t maxTagValue = (1ULL << tagBits) - 1 ;
1090+ // kDataFrame *frame;
1091+ int kSize = KD->get_kSize ();
1092+
1093+
1094+ uint64_t lastTag = 0 ;
1095+ readID = 0 ;
1096+ int __batch_count = 0 ;
1097+ while (!KD->end ()) {
1098+ KD->next_chunk ();
1099+ cout << " Processing Chunk(" << ++__batch_count << " ): " << chunk_size*__batch_count << " seqs ..." << endl;
1100+ flat_hash_map<uint64_t , uint64_t > convertMap;
1101+
1102+ for (const auto &seq : *KD->getKmers ()) {
1103+ string readName = seq.first ;
1104+
1105+ auto it = namesMap.find (readName);
1106+ if (it == namesMap.end ()) {
1107+ continue ;
1108+ // cout << "read " << readName << "dont have group. Please check the group names file." << endl;
1109+ }
1110+ string groupName = it->second ;
1111+
1112+ uint64_t readTag = groupNameMap.find (groupName)->second ;
1113+
1114+
1115+ convertMap.clear ();
1116+ convertMap.insert (make_pair (0 , readTag));
1117+ convertMap.insert (make_pair (readTag, readTag));
1118+ // cout<<readName<<" "<<seq.size()<<endl;
1119+ for (const auto &kmer : seq.second ) {
1120+ uint64_t currentTag = frame->getCount (kmer.hash );
1121+ auto itc = convertMap.find (currentTag);
1122+ if (itc == convertMap.end ()) {
1123+ vector<uint32_t > colors = legend->find (currentTag)->second ;
1124+ auto tmpiT = find (colors.begin (), colors.end (), readTag);
1125+ if (tmpiT == colors.end ()) {
1126+ colors.push_back (readTag);
1127+ sort (colors.begin (), colors.end ());
1128+ }
1129+
1130+ string colorsString = to_string (colors[0 ]);
1131+ for (int k = 1 ; k < colors.size (); k++) {
1132+ colorsString += " ;" + to_string (colors[k]);
1133+ }
1134+
1135+ auto itTag = tagsMap.find (colorsString);
1136+ if (itTag == tagsMap.end ()) {
1137+ uint64_t newColor;
1138+ if (freeColors.size () == 0 ) {
1139+ newColor = groupID++;
1140+ } else {
1141+ newColor = freeColors.top ();
1142+ freeColors.pop ();
1143+ }
1144+
1145+ tagsMap.insert (make_pair (colorsString, newColor));
1146+ legend->insert (make_pair (newColor, colors));
1147+ itTag = tagsMap.find (colorsString);
1148+ colorsCount[newColor] = 0 ;
1149+ // if(groupID>=maxTagValue){
1150+ // cerr<<"Tag size is not enough. ids reached "<<groupID<<endl;
1151+ // return -1;
1152+ // }
1153+ }
1154+ uint64_t newColor = itTag->second ;
1155+
1156+ convertMap.insert (make_pair (currentTag, newColor));
1157+ itc = convertMap.find (currentTag);
1158+ }
1159+
1160+ if (itc->second != currentTag) {
1161+
1162+ colorsCount[currentTag]--;
1163+ if (colorsCount[currentTag] == 0 && currentTag != 0 ) {
1164+
1165+ auto _invGroupNameIT = inv_groupNameMap.find (currentTag);
1166+ if (_invGroupNameIT == inv_groupNameMap.end ()){
1167+ freeColors.push (currentTag);
1168+ vector<uint32_t > colors = legend->find (currentTag)->second ;
1169+ string colorsString = to_string (colors[0 ]);
1170+ for (unsigned int k = 1 ; k < colors.size (); k++) {
1171+ colorsString += " ;" + to_string (colors[k]);
1172+ }
1173+ tagsMap.erase (colorsString);
1174+ legend->erase (currentTag);
1175+ if (convertMap.find (currentTag) != convertMap.end ())
1176+ convertMap.erase (currentTag);
1177+ }
1178+
1179+ }
1180+ colorsCount[itc->second ]++;
1181+ }
1182+
1183+ frame->setCount (kmer.hash , itc->second );
1184+ if (frame->getCount (kmer.hash ) != itc->second ) {
1185+ // frame->setC(kmer,itc->second);
1186+ cout << " Error Founded " << kmer.str << " from sequence " << readName << " expected "
1187+ << itc->second << " found " << frame->getCount (kmer.hash ) << endl;
1188+ return NULL ;
1189+ }
1190+ }
1191+ readID += 1 ;
1192+ groupCounter[groupName]--;
1193+ if (colorsCount[readTag] == 0 ) {
1194+ if (groupCounter[groupName] == 0 ) {
1195+ freeColors.push (readTag);
1196+ legend->erase (readTag);
1197+ }
1198+ }
1199+ }
1200+ }
1201+ colorTable *colors = new intVectorsTable ();
1202+ for (auto it : *legend) {
1203+ colors->setColor (it.first , it.second );
1204+ }
1205+
1206+ colored_kDataFrame *res = new colored_kDataFrame ();
1207+ res->setColorTable (colors);
1208+ res->setkDataFrame (frame);
1209+ for (auto iit = namesMap.begin (); iit != namesMap.end (); iit++) {
1210+ uint32_t sampleID = groupNameMap[iit->second ];
1211+ res->namesMap [sampleID] = iit->second ;
1212+ res->namesMapInv [iit->second ] = sampleID;
1213+ }
1214+ return res;
1215+ }
1216+
10311217
10321218 vector<uint64_t > estimateKmersHistogram (string fileName, int kSize ,int threads)
10331219{
0 commit comments