diff --git a/kmc_api/mmer.cpp b/kmc_api/mmer.cpp index c2e4250d..ff782f36 100644 --- a/kmc_api/mmer.cpp +++ b/kmc_api/mmer.cpp @@ -24,38 +24,41 @@ CMmer::_si CMmer::_init; //-------------------------------------------------------------------------- -CMmer::CMmer(uint32 _len) +CMmer::CMmer(uint32 _len, CMmerNorm* _norm) { - switch (_len) - { - case 5: - norm = norm5; - break; - case 6: - norm = norm6; - break; - case 7: - norm = norm7; - break; - case 8: - norm = norm8; - break; - case 9: - norm = norm9; - break; - case 10: - norm = norm10; - break; - case 11: - norm = norm11; - break; - default: - break; - } - len = _len; - mask = (1 << _len * 2) - 1; - str = 0; + switch (_len) + { + case 5: + norm = norm5; + break; + case 6: + norm = norm6; + break; + case 7: + norm = norm7; + break; + case 8: + norm = norm8; + break; + case 9: + norm = norm9; + break; + case 10: + norm = norm10; + break; + case 11: + norm = norm11; + break; + default: + break; + } + len = _len; + mask = (1 << _len * 2) - 1; + str = 0; + + new_norm = _norm; } //-------------------------------------------------------------------------- +// ***** EOF \ No newline at end of file diff --git a/kmc_api/mmer.h b/kmc_api/mmer.h index d4e7e77e..5dc26dbf 100644 --- a/kmc_api/mmer.h +++ b/kmc_api/mmer.h @@ -18,97 +18,100 @@ class CMmer { - uint32 str; - uint32 mask; - uint32 current_val; - uint32* norm; - uint32 len; - static uint32 norm5[1 << 10]; - static uint32 norm6[1 << 12]; - static uint32 norm7[1 << 14]; - static uint32 norm8[1 << 16]; - static uint32 norm9[1 << 18]; - static uint32 norm10[1 << 20]; - static uint32 norm11[1 << 22]; - - static bool is_allowed(uint32 mmer, uint32 len) - { - if ((mmer & 0x3f) == 0x3f) // TTT suffix - return false; - if ((mmer & 0x3f) == 0x3b) // TGT suffix - return false; - if ((mmer & 0x3c) == 0x3c) // TG* suffix !!!! consider issue #152 - return false; - - for (uint32 j = 0; j < len - 3; ++j) - if ((mmer & 0xf) == 0) // AA inside - return false; - else - mmer >>= 2; - - if (mmer == 0) // AAA prefix - return false; - if (mmer == 0x04) // ACA prefix - return false; - if ((mmer & 0xf) == 0) // *AA prefix - return false; - - return true; - } - - friend class CSignatureMapper; - struct _si - { - static uint32 get_rev(uint32 mmer, uint32 len) - { - uint32 rev = 0; - uint32 shift = len*2 - 2; - for(uint32 i = 0 ; i < len ; ++i) - { - rev += (3 - (mmer & 3)) << shift; - mmer >>= 2; - shift -= 2; - } - return rev; - } - - - - static void init_norm(uint32* norm, uint32 len) - { - uint32 special = 1 << len * 2; - for(uint32 i = 0 ; i < special ; ++i) - { - uint32 rev = get_rev(i, len); - uint32 str_val = is_allowed(i, len) ? i : special; - uint32 rev_val = is_allowed(rev, len) ? rev : special; - norm[i] = MIN(str_val, rev_val); - } - } - - _si() - { - init_norm(norm5, 5); - init_norm(norm6, 6); - init_norm(norm7, 7); - init_norm(norm8, 8); - init_norm(norm9, 9); - init_norm(norm10, 10); - init_norm(norm11, 11); - } - - }static _init; + uint32 str; + uint32 mask; + uint32 current_val; + uint32* norm; + uint32 len; + + CMmerNorm* new_norm; + + static uint32 norm5[1 << 10]; + static uint32 norm6[1 << 12]; + static uint32 norm7[1 << 14]; + static uint32 norm8[1 << 16]; + static uint32 norm9[1 << 18]; + static uint32 norm10[1 << 20]; + static uint32 norm11[1 << 22]; + + static bool is_allowed(uint32 mmer, uint32 len) + { + if ((mmer & 0x3f) == 0x3f) // TTT suffix + return false; + if ((mmer & 0x3f) == 0x3b) // TGT suffix + return false; + if ((mmer & 0x3c) == 0x3c) // TG* suffix !!!! consider issue #152 + return false; + + for (uint32 j = 0; j < len - 3; ++j) + if ((mmer & 0xf) == 0) // AA inside + return false; + else + mmer >>= 2; + + if (mmer == 0) // AAA prefix + return false; + if (mmer == 0x04) // ACA prefix + return false; + if ((mmer & 0xf) == 0) // *AA prefix + return false; + + return true; + } + + friend class CSignatureMapper; + struct _si + { + static uint32 get_rev(uint32 mmer, uint32 len) + { + uint32 rev = 0; + uint32 shift = len*2 - 2; + for(uint32 i = 0 ; i < len ; ++i) + { + rev += (3 - (mmer & 3)) << shift; + mmer >>= 2; + shift -= 2; + } + return rev; + } + + + + static void init_norm(uint32* norm, uint32 len) + { + uint32 special = 1 << len * 2; + for(uint32 i = 0 ; i < special ; ++i) + { + uint32 rev = get_rev(i, len); + uint32 str_val = is_allowed(i, len) ? i : special; + uint32 rev_val = is_allowed(rev, len) ? rev : special; + norm[i] = MIN(str_val, rev_val); + } + } + + _si() + { + init_norm(norm5, 5); + init_norm(norm6, 6); + init_norm(norm7, 7); + init_norm(norm8, 8); + init_norm(norm9, 9); + init_norm(norm10, 10); + init_norm(norm11, 11); + } + + }static _init; public: - CMmer(uint32 _len); - inline void insert(uchar symb); - inline uint32 get() const; - inline bool operator==(const CMmer& x); - inline bool operator<(const CMmer& x); - inline void clear(); - inline bool operator<=(const CMmer& x); - inline void set(const CMmer& x); - inline void insert(const char* seq); - + CMmer(uint32 _len, CMmerNorm* _norm); + inline void insert(uchar symb); + inline uint32 get() const; + inline bool operator==(const CMmer& x); + inline bool operator<(const CMmer& x); + inline void clear(); + inline bool operator<=(const CMmer& x); + inline void set(const CMmer& x); + inline void insert(const char* seq); + }; @@ -116,82 +119,93 @@ class CMmer //-------------------------------------------------------------------------- inline void CMmer::insert(uchar symb) { - str <<= 2; - str += symb; - str &= mask; + str <<= 2; + str += symb; + str &= mask; - current_val = norm[str]; + current_val = norm[str]; } //-------------------------------------------------------------------------- inline uint32 CMmer::get() const { - return current_val; + return current_val; } //-------------------------------------------------------------------------- inline bool CMmer::operator==(const CMmer& x) { - return current_val == x.current_val; + return current_val == x.current_val; } //-------------------------------------------------------------------------- inline bool CMmer::operator<(const CMmer& x) { - return current_val < x.current_val; + return current_val < x.current_val; } //-------------------------------------------------------------------------- inline void CMmer::clear() { - str = 0; + str = 0; } //-------------------------------------------------------------------------- inline bool CMmer::operator<=(const CMmer& x) { - return current_val <= x.current_val; + return current_val <= x.current_val; } //-------------------------------------------------------------------------- inline void CMmer::set(const CMmer& x) { - str = x.str; - current_val = x.current_val; + str = x.str; + current_val = x.current_val; } //-------------------------------------------------------------------------- inline void CMmer::insert(const char* seq) { - switch (len) - { - case 5: - str = (seq[0] << 8) + (seq[1] << 6) + (seq[2] << 4) + (seq[3] << 2) + (seq[4]); - break; - case 6: - str = (seq[0] << 10) + (seq[1] << 8) + (seq[2] << 6) + (seq[3] << 4) + (seq[4] << 2) + (seq[5]); - break; - case 7: - str = (seq[0] << 12) + (seq[1] << 10) + (seq[2] << 8) + (seq[3] << 6) + (seq[4] << 4 ) + (seq[5] << 2) + (seq[6]); - break; - case 8: - str = (seq[0] << 14) + (seq[1] << 12) + (seq[2] << 10) + (seq[3] << 8) + (seq[4] << 6) + (seq[5] << 4) + (seq[6] << 2) + (seq[7]); - break; - case 9: - str = (seq[0] << 16) + (seq[1] << 14) + (seq[2] << 12) + (seq[3] << 10) + (seq[4] << 8) + (seq[5] << 6) + (seq[6] << 4) + (seq[7] << 2) + (seq[8]); - break; - case 10: - str = (seq[0] << 18) + (seq[1] << 16) + (seq[2] << 14) + (seq[3] << 12) + (seq[4] << 10) + (seq[5] << 8) + (seq[6] << 6) + (seq[7] << 4) + (seq[8] << 2) + (seq[9]); - break; - case 11: - str = (seq[0] << 20) + (seq[1] << 18) + (seq[2] << 16) + (seq[3] << 14) + (seq[4] << 12) + (seq[5] << 10) + (seq[6] << 8) + (seq[7] << 6) + (seq[8] << 4) + (seq[9] << 2) + (seq[10]); - break; - default: - break; - } - - current_val = norm[str]; + switch (len) + { + case 5: + str = (seq[0] << 8) + (seq[1] << 6) + (seq[2] << 4) + (seq[3] << 2) + (seq[4]); + break; + case 6: + str = (seq[0] << 10) + (seq[1] << 8) + (seq[2] << 6) + (seq[3] << 4) + (seq[4] << 2) + (seq[5]); + break; + case 7: + str = (seq[0] << 12) + (seq[1] << 10) + (seq[2] << 8) + (seq[3] << 6) + (seq[4] << 4 ) + (seq[5] << 2) + (seq[6]); + break; + case 8: + str = (seq[0] << 14) + (seq[1] << 12) + (seq[2] << 10) + (seq[3] << 8) + (seq[4] << 6) + (seq[5] << 4) + (seq[6] << 2) + (seq[7]); + break; + case 9: + str = (seq[0] << 16) + (seq[1] << 14) + (seq[2] << 12) + (seq[3] << 10) + (seq[4] << 8) + (seq[5] << 6) + (seq[6] << 4) + (seq[7] << 2) + (seq[8]); + break; + case 10: + str = (seq[0] << 18) + (seq[1] << 16) + (seq[2] << 14) + (seq[3] << 12) + (seq[4] << 10) + (seq[5] << 8) + (seq[6] << 6) + (seq[7] << 4) + (seq[8] << 2) + (seq[9]); + break; + case 11: + str = (seq[0] << 20) + (seq[1] << 18) + (seq[2] << 16) + (seq[3] << 14) + (seq[4] << 12) + (seq[5] << 10) + (seq[6] << 8) + (seq[7] << 6) + (seq[8] << 4) + (seq[9] << 2) + (seq[10]); + break; + default: + break; + } + + if(norm == nullptr) + { + current_val = norm[str]; + } + else + { + current_val = new_norm->get_norm_value(str); + } + + } -#endif \ No newline at end of file +#endif + +// ***** EOF \ No newline at end of file diff --git a/kmer_counter/kmc.h b/kmer_counter/kmc.h index 9eacf2be..34c51117 100644 --- a/kmer_counter/kmc.h +++ b/kmer_counter/kmc.h @@ -42,6 +42,7 @@ #include "bkb_merger.h" #include "bkb_writer.h" #include "binary_reader.h" +#include "mmer_compartator.h" using namespace std; @@ -919,6 +920,8 @@ template bool CKMC::Process() delete Queues.input_files_queue; Queues.input_files_queue = new CInputFilesQueue(Params.input_file_names); + CMmerNorm* norm = new CMmerNorm(stats); // this line alters stats + heuristic_time.startTimer(); Queues.s_mapper->Init(stats); heuristic_time.stopTimer(); @@ -926,12 +929,12 @@ template bool CKMC::Process() cerr << "\n"; w0.stopTimer(); - - Queues.pmm_stats->free(stats); - Queues.pmm_stats->release(); - delete Queues.pmm_stats; - Queues.pmm_stats = nullptr; - +//TODO: FIX +// Queues.pmm_stats->free(stats); +// Queues.pmm_stats->release(); +// delete Queues.pmm_stats; +// Queues.pmm_stats = nullptr; + // ***** Stage 1 ***** ShowSettingsStage1(); Queues.missingEOL_at_EOF_counter->Reset(); @@ -940,7 +943,7 @@ template bool CKMC::Process() for(int i = 0; i < Params.n_splitters; ++i) { - w_splitters[i] = new CWSplitter(Params, Queues); + w_splitters[i] = new CWSplitter(Params, Queues, norm); gr1_2.push_back(thread(std::ref(*w_splitters[i]))); } diff --git a/kmer_counter/mmer.cpp b/kmer_counter/mmer.cpp index a690a144..d0c3ad01 100644 --- a/kmer_counter/mmer.cpp +++ b/kmer_counter/mmer.cpp @@ -24,7 +24,7 @@ CMmer::_si CMmer::_init; //-------------------------------------------------------------------------- -CMmer::CMmer(uint32 _len) +CMmer::CMmer(uint32 _len, CMmerNorm* _norm) { switch (_len) { @@ -55,6 +55,8 @@ CMmer::CMmer(uint32 _len) len = _len; mask = (1 << _len * 2) - 1; str = 0; + + new_norm = _norm; } //-------------------------------------------------------------------------- diff --git a/kmer_counter/mmer.h b/kmer_counter/mmer.h index 7dc70ca0..450dc446 100644 --- a/kmer_counter/mmer.h +++ b/kmer_counter/mmer.h @@ -11,6 +11,8 @@ #ifndef _MMER_H #define _MMER_H #include "defs.h" +#include "mmer_compartator.h" + // ************************************************************************* // ************************************************************************* @@ -23,6 +25,9 @@ class CMmer uint32 current_val; uint32* norm; uint32 len; + + CMmerNorm* new_norm; + static uint32 norm5[1 << 10]; static uint32 norm6[1 << 12]; static uint32 norm7[1 << 14]; @@ -99,7 +104,7 @@ class CMmer }static _init; public: - CMmer(uint32 _len); + CMmer(uint32 _len, CMmerNorm* _norm); inline void insert(uchar symb); inline uint32 get() const; inline bool operator==(const CMmer& x); @@ -120,7 +125,16 @@ inline void CMmer::insert(uchar symb) str += symb; str &= mask; - current_val = norm[str]; + //current_val = norm[str]; + if(new_norm == nullptr) + { + // current_val = norm[str]; // this is the original + current_val = str; + } + else + { + current_val = new_norm->get_norm_value(str); + } } //-------------------------------------------------------------------------- @@ -190,7 +204,17 @@ inline void CMmer::insert(const char* seq) break; } - current_val = norm[str]; + if(new_norm == nullptr) + { + // current_val = norm[str]; // this is the original + current_val = str; + } + else + { + current_val = new_norm->get_norm_value(str); + } + + } diff --git a/kmer_counter/mmer_compartator.cpp b/kmer_counter/mmer_compartator.cpp new file mode 100644 index 00000000..a6f1bd3a --- /dev/null +++ b/kmer_counter/mmer_compartator.cpp @@ -0,0 +1,14 @@ +// +// Created by danflomin on 08/02/2021. +// +#include "mmer_compartator.h" +#include + +uint32 CMmerNorm::norm[]; + +CMmerNorm::CMmerNorm(uint32* stats) +{ + std::cout << "before stats" << std::endl; + Init(stats); + std::cout << "after stats" << std::endl; +} diff --git a/kmer_counter/mmer_compartator.h b/kmer_counter/mmer_compartator.h new file mode 100644 index 00000000..72e4bbd8 --- /dev/null +++ b/kmer_counter/mmer_compartator.h @@ -0,0 +1,273 @@ +// +// Created by danflomin on 08/02/2021. +// + +#ifndef KMC_MMER_COMPARTATOR_H +#define KMC_MMER_COMPARTATOR_H + +#include "defs.h" +#include +#include +#include +#include +#include + + + + +class CMmerNorm +{ + static uint32 norm[1<<18]; + + class MmerComp + { + uint32 stats[1<<18];//uint32* stats; + uint32 uhs[1<<18]; + uint32 my_norm[1<<18]; + char num_codes[256]; + public: + bool operator()(uint32 mmer1_idx, uint32 mmer2_idx) + { + uint32 mmer1 = my_norm[mmer1_idx]; + uint32 mmer2 = my_norm[mmer2_idx]; + + if(uhs[mmer1] == 1 && uhs[mmer2] == 0) + return true; + if(uhs[mmer1] == 0 && uhs[mmer2] == 1) + return false; + + if(is_allowed(mmer1) && !is_allowed(mmer2)) + return true; + if(!is_allowed(mmer1) && is_allowed(mmer2)) + return false; + + if(stats[mmer1] > stats[mmer2]) + return false; + if(stats[mmer1] < stats[mmer2]) + return true; + + // frequency is equal + return mmer1 < mmer2; + } + MmerComp(uint32* _signature_occurrences) + { + // stats = _signature_occurrences; + for (int j = 0; j < (1<<18); j++) + { + stats[j] = _signature_occurrences[j] + _signature_occurrences[get_reversed(j)]; + } + + for (int i = 0; i < 256; i++) + num_codes[i] = -1; + num_codes['A'] = num_codes['a'] = 0; + num_codes['C'] = num_codes['c'] = 1; + num_codes['G'] = num_codes['g'] = 2; + num_codes['T'] = num_codes['t'] = 3; + std::cout << "before bitset" << std::endl << std::flush; + init_uhs_bitset(); + std::cout << "after bitset" << std::endl<< std::flush; + + for (int j = 0; j < (1<<18); j++) + { + my_norm[j] = norm[j]; + } + for (int j = 0; j < (1<<18); j++) + { + _signature_occurrences[j] = 0; + if(uhs[j] == 1) + _signature_occurrences[MAX(j,get_reversed(j))] = stats[j]; + _signature_occurrences[MIN(j,get_reversed(j))] = 0; + + } + } + void init_uhs_bitset() + { + for(int i = 0; i< 1 << 18; i++) + { + uhs[i] = 0; + } + + + std::ifstream infile("res_9.txt"); + int str_value; + std::string line; + std::cout << "lolz0" << std::endl<< std::flush; + while (std::getline(infile, line)) + { + const char *seq = line.c_str(); + str_value = (num_codes[seq[0]] << 16) + (num_codes[seq[1]] << 14) + (num_codes[seq[2]] << 12) + (num_codes[seq[3]] << 10) + (num_codes[seq[4]] << 8) + (num_codes[seq[5]] << 6) + (num_codes[seq[6]] << 4) + (num_codes[seq[7]]<< 2) + (num_codes[seq[8]]); + uhs[str_value] = 1; + } + std::cout << "lolz6" << std::endl<< std::flush; + infile.close(); + std::cout << "lolz7" << std::endl<< std::flush; + } + bool is_allowed(uint32 mmer) + { + if ((mmer & 0x3f) == 0x3f) // TTT suffix + return false; + if ((mmer & 0x3f) == 0x3b) // TGT suffix + return false; + if ((mmer & 0x3c) == 0x3c) // TG* suffix !!!! consider issue #152 + return false; + + for (uint32 j = 0; j < 9 - 3; ++j) + if ((mmer & 0xf) == 0) // AA inside + return false; + else + mmer >>= 2; + + if (mmer == 0) // AAA prefix + return false; + if (mmer == 0x04) // ACA prefix + return false; + if ((mmer & 0xf) == 0) // *AA prefix + return false; + + return true; + } + + uint32 get_reversed(uint32 mmer) + { + uint32 rev = 0; + uint32 shift = 9*2 - 2; + for(uint32 i = 0 ; i < 9 ; ++i) + { + rev += (3 - (mmer & 3)) << shift; + mmer >>= 2; + shift -= 2; + } + return rev; + } + + + }; + + void merge(uint32 arr[], int l, int m, int r, MmerComp* comp) + { + int n1 = m - l + 1; + int n2 = r - m; + + // Create temp arrays + int L[n1], R[n2]; + + // Copy data to temp arrays L[] and R[] + for (int i = 0; i < n1; i++) + L[i] = arr[l + i]; + for (int j = 0; j < n2; j++) + R[j] = arr[m + 1 + j]; + + // Merge the temp arrays back into arr[l..r] + // Initial index of first subarray + int i = 0; + // Initial index of second subarray + int j = 0; + // Initial index of merged subarray + int k = l; + + while (i < n1 && j < n2) { + if (comp->operator()(L[i],R[j]) || norm[L[i]] == norm[R[j]]) { + arr[k] = L[i]; + i++; + } + else { + arr[k] = R[j]; + j++; + } + k++; + } + + // Copy the remaining elements of + // L[], if there are any + while (i < n1) { + arr[k] = L[i]; + i++; + k++; + } + + // Copy the remaining elements of + // R[], if there are any + while (j < n2) { + arr[k] = R[j]; + j++; + k++; + } + } + void mergeSort(uint32 arr[],int l,int r, MmerComp* comp){ + if(l>=r){ + return;//returns recursively + } + int m =l+ (r-l)/2; + mergeSort(arr,l,m, comp); + mergeSort(arr,m+1,r, comp); + merge(arr,l,m,r, comp); + } + + +public: + void Init(uint32* stats) + { + std::cout << "in INITTT" << std::endl << std::flush; + uint32 temp[1 << 18]; + uint32 special = 1 << 18; + uint32 i; + +// TODO: TAKE CARE OF CANONICAL KMERS. COMBINE stats[i] and stats[rev]. norm[i] == norm[rev]. I think that now they are 1 idx apart. + + for(i = 0 ; i < special ; ++i) + { + uint32 rev = get_rev(i); + norm[i] = MIN(i, rev); + temp[i] = i; + } + + MmerComp* comp = new MmerComp(stats); + mergeSort(temp, 0, special-1, comp); + std::cout << "after sort" << std::endl; + + for(uint32 i = 0 ; i < special ; ++i) + { + temp[i] = norm[temp[i]]; + } + std::cout << "after seconds for loop" << std::endl; + for(uint32 i = 0 ; i < special ; ++i) + { + norm[i] = temp[i]; + } + std::cout << "after reverse normalizing for loop" << std::endl; + for(uint32 i = 0 ; i < special ; ++i) + { + uint32 rev = get_rev(i); + uint32 min_value = MIN(norm[i], norm[rev]); + norm[i] = min_value; + norm[rev] = min_value; + } + std::cout << "after final for loop" << std::endl; + } + + CMmerNorm(uint32* stats); + + inline uint32 get_norm_value(uint32 value) + { + return norm[value]; + } + inline uint32 get_rev(uint32 mmer) + { + uint32 rev = 0; + uint32 shift = 9*2 - 2; + for(uint32 i = 0 ; i < 9 ; ++i) + { + rev += (3 - (mmer & 3)) << shift; + mmer >>= 2; + shift -= 2; + } + return rev; + } +}; + +// ***** EOF + + +#endif //KMC_MMER_COMPARTATOR_H + + diff --git a/kmer_counter/s_mapper.h b/kmer_counter/s_mapper.h index bfd84bf8..cc2db939 100644 --- a/kmer_counter/s_mapper.h +++ b/kmer_counter/s_mapper.h @@ -13,6 +13,8 @@ #include "defs.h" #include "mmer.h" #include "params.h" +#include + #ifdef DEVELOP_MODE #include "develop.h" @@ -43,7 +45,7 @@ class CSignatureMapper }; public: - void Init(uint32* stats) + void Init(uint32* stats) // TODO: stats do not contain special characters in original implementation { uint32 *sorted; pmm_stats->reserve(sorted); @@ -54,7 +56,7 @@ class CSignatureMapper list> _stats; for (uint32 i = 0; i < map_size ; ++i) { - if (CMmer::is_allowed(sorted[i], signature_len)) + //if (CMmer::is_allowed(sorted[i], signature_len)) _stats.push_back(make_pair(sorted[i], stats[sorted[i]])); } @@ -163,7 +165,8 @@ class CSignatureMapper } inline int32 get_bin_id(uint32 signature) { - return signature_map[signature]; + // return signature % n_bins; + return signature_map[signature]; // TODO: FIX??? } inline int32 get_max_bin_no() diff --git a/kmer_counter/splitter.cpp b/kmer_counter/splitter.cpp index 301969d6..1b4b0bf5 100644 --- a/kmer_counter/splitter.cpp +++ b/kmer_counter/splitter.cpp @@ -20,8 +20,9 @@ uint32 CSplitter::MAX_LINE_SIZE = 1 << 16; //---------------------------------------------------------------------------------- // Assigns queues -CSplitter::CSplitter(CKMCParams &Params, CKMCQueues &Queues) +CSplitter::CSplitter(CKMCParams &Params, CKMCQueues &Queues, CMmerNorm* _norm) { + norm = _norm; mm = Queues.mm; file_type = Params.file_type; both_strands = Params.both_strands; @@ -447,7 +448,7 @@ void CSplitter::CalcStats(uchar* _part, uint64 _part_size, ReadType read_type, u pmm_reads->reserve(seq); uint32 signature_start_pos; - CMmer current_signature(signature_len), end_mmer(signature_len); + CMmer current_signature(signature_len, norm), end_mmer(signature_len, norm); uint32 i; uint32 len;//length of extended kmer @@ -549,7 +550,7 @@ bool CSplitter::ProcessReads(uchar *_part, uint64 _part_size, ReadType read_type pmm_reads->reserve(seq); uint32 signature_start_pos; - CMmer current_signature(signature_len), end_mmer(signature_len); + CMmer current_signature(signature_len, norm), end_mmer(signature_len, norm); uint32 bin_no; uint32 i; @@ -820,12 +821,12 @@ CSplitter::~CSplitter() //************************************************************************************************************ //---------------------------------------------------------------------------------- // Constructor -CWSplitter::CWSplitter(CKMCParams &Params, CKMCQueues &Queues) +CWSplitter::CWSplitter(CKMCParams &Params, CKMCQueues &Queues, CMmerNorm* _norm) { pq = Queues.part_queue; bpq = Queues.bpq; pmm_fastq = Queues.pmm_fastq; - spl = new CSplitter(Params, Queues); + spl = new CSplitter(Params, Queues, _norm); spl->InitBins(Params, Queues); } @@ -883,7 +884,7 @@ CWStatsSplitter::CWStatsSplitter(CKMCParams &Params, CKMCQueues &Queues) spq = Queues.stats_part_queue; pmm_fastq = Queues.pmm_fastq; pmm_stats = Queues.pmm_stats; - spl = new CSplitter(Params, Queues); + spl = new CSplitter(Params, Queues, nullptr); signature_len = Params.signature_len; pmm_stats->reserve(stats); @@ -940,7 +941,7 @@ template CWSmallKSplitter::CWSmallKSplitte pmm_fastq = Queues.pmm_fastq; pmm_small_k = Queues.pmm_small_k_buf; kmer_len = Params.kmer_len; - spl = new CSplitter(Params, Queues); + spl = new CSplitter(Params, Queues, nullptr); } //---------------------------------------------------------------------------------- diff --git a/kmer_counter/splitter.h b/kmer_counter/splitter.h index a4f096c7..15da6c97 100644 --- a/kmer_counter/splitter.h +++ b/kmer_counter/splitter.h @@ -22,6 +22,7 @@ #include #include "small_k_buf.h" #include "bam_utils.h" +#include "mmer_compartator.h" using namespace std; @@ -54,6 +55,8 @@ class CSplitter { CSignatureMapper* s_mapper; + CMmerNorm* norm; + bool homopolymer_compressed; bool GetSeqLongRead(char *seq, uint32 &seq_size, uchar header_marker); @@ -66,7 +69,7 @@ class CSplitter { static uint32 MAX_LINE_SIZE; - CSplitter(CKMCParams &Params, CKMCQueues &Queues); + CSplitter(CKMCParams &Params, CKMCQueues &Queues, CMmerNorm* _norm); void InitBins(CKMCParams &Params, CKMCQueues &Queues); void CalcStats(uchar* _part, uint64 _part_size, ReadType read_type, uint32* _stats); bool ProcessReads(uchar *_part, uint64 _part_size, ReadType read_type); @@ -105,7 +108,7 @@ class CWSplitter { uint64 n_reads; public: - CWSplitter(CKMCParams &Params, CKMCQueues &Queues); + CWSplitter(CKMCParams &Params, CKMCQueues &Queues, CMmerNorm* _norm); void operator()(); void GetTotal(uint64 &_n_reads); ~CWSplitter(); diff --git a/makefile b/makefile index 09f4de33..14d4d50d 100644 --- a/makefile +++ b/makefile @@ -8,14 +8,15 @@ KMC_TOOLS_DIR = kmc_tools PY_KMC_API_DIR = py_kmc_api CC = g++ -CFLAGS = -Wall -O3 -m64 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++11 -CLINK = -lm -static -O3 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++11 +CFLAGS = -Wall -g -m64 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++11 +CLINK = -lm -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++11 -KMC_TOOLS_CFLAGS = -Wall -O3 -m64 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14 -KMC_TOOLS_CLINK = -lm -static -O3 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14 +KMC_TOOLS_CFLAGS = -Wall -m64 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14 +KMC_TOOLS_CLINK = -lm -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14 KMC_OBJS = \ $(KMC_MAIN_DIR)/kmer_counter.o \ +$(KMC_MAIN_DIR)/mmer_compartator.o \ $(KMC_MAIN_DIR)/mmer.o \ $(KMC_MAIN_DIR)/mem_disk_file.o \ $(KMC_MAIN_DIR)/rev_byte.o \ @@ -84,28 +85,9 @@ $(KMC_MAIN_DIR)/raduls_avx2.o: $(KMC_MAIN_DIR)/raduls_avx2.cpp $(CC) $(CFLAGS) -mavx2 -c $< -o $@ kmc: $(KMC_OBJS) $(RADULS_OBJS) - -mkdir -p $(KMC_BIN_DIR) + -mkdir $(KMC_BIN_DIR) $(CC) $(CLINK) -o $(KMC_BIN_DIR)/$@ $^ $(KMC_LIBS) -kmc_dump: $(KMC_DUMP_OBJS) $(KMC_API_OBJS) - -mkdir -p $(KMC_BIN_DIR) - $(CC) $(CLINK) -o $(KMC_BIN_DIR)/$@ $^ - -kmc_tools: $(KMC_TOOLS_OBJS) $(KMC_API_OBJS) - -mkdir -p $(KMC_BIN_DIR) - $(CC) $(KMC_TOOLS_CLINK) -o $(KMC_BIN_DIR)/$@ $^ $(KMC_TOOLS_LIBS) - -$(PY_KMC_API_DIR)/%.o: $(KMC_API_DIR)/%.cpp - $(CC) -c -fPIC -Wall -O3 -m64 -std=c++11 $^ -o $@ - -py_kmc_api: $(PY_KMC_API_OBJS) $(PY_KMC_API_OBJS) - -mkdir -p $(KMC_BIN_DIR) - $(CC) -fPIC -Wall -shared -std=c++11 -O3 $(PY_KMC_API_DIR)/py_kmc_api.cpp $(PY_KMC_API_OBJS) \ - -I $(KMC_API_DIR) \ - -I $(PY_KMC_API_DIR)/libs/pybind11/include \ - -I `python3 -c "import sysconfig;print(sysconfig.get_paths()['include'])"` \ - -o $(KMC_BIN_DIR)/$@`python3-config --extension-suffix` - clean: -rm -f $(KMC_MAIN_DIR)/*.o -rm -f $(KMC_API_DIR)/*.o diff --git a/makefile_old b/makefile_old new file mode 100644 index 00000000..9f508027 --- /dev/null +++ b/makefile_old @@ -0,0 +1,98 @@ +all: kmc kmc_dump kmc_tools py_kmc_api + +KMC_BIN_DIR = bin +KMC_MAIN_DIR = kmer_counter +KMC_API_DIR = kmc_api +KMC_DUMP_DIR = kmc_dump +KMC_TOOLS_DIR = kmc_tools +PY_KMC_API_DIR = py_kmc_api + +CC = g++ +CFLAGS = -Wall -O3 -m64 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++11 +CLINK = -lm -static -O3 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++11 + +KMC_TOOLS_CFLAGS = -Wall -O3 -m64 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14 +KMC_TOOLS_CLINK = -lm -static -O3 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14 + +KMC_OBJS = \ +$(KMC_MAIN_DIR)/kmer_counter.o \ +$(KMC_MAIN_DIR)/mmer_compartator.o \ +$(KMC_MAIN_DIR)/mmer.o \ +$(KMC_MAIN_DIR)/mem_disk_file.o \ +$(KMC_MAIN_DIR)/rev_byte.o \ +$(KMC_MAIN_DIR)/bkb_writer.o \ +$(KMC_MAIN_DIR)/cpu_info.o \ +$(KMC_MAIN_DIR)/bkb_reader.o \ +$(KMC_MAIN_DIR)/fastq_reader.o \ +$(KMC_MAIN_DIR)/timer.o \ +$(KMC_MAIN_DIR)/develop.o \ +$(KMC_MAIN_DIR)/kb_completer.o \ +$(KMC_MAIN_DIR)/kb_storer.o \ +$(KMC_MAIN_DIR)/kmer.o \ +$(KMC_MAIN_DIR)/splitter.o \ +$(KMC_MAIN_DIR)/kb_collector.o +RADULS_OBJS = \ +$(KMC_MAIN_DIR)/raduls_sse2.o \ +$(KMC_MAIN_DIR)/raduls_sse41.o \ +$(KMC_MAIN_DIR)/raduls_avx2.o \ +$(KMC_MAIN_DIR)/raduls_avx.o + +KMC_LIBS = \ +$(KMC_MAIN_DIR)/libs/libz.a \ +$(KMC_MAIN_DIR)/libs/libbz2.a + +KMC_DUMP_OBJS = \ +$(KMC_DUMP_DIR)/nc_utils.o \ +$(KMC_DUMP_DIR)/kmc_dump.o + +KMC_API_OBJS = \ +$(KMC_API_DIR)/mmer.o \ +$(KMC_API_DIR)/kmc_file.o \ +$(KMC_API_DIR)/kmer_api.o + +KMC_API_SRC_FILES = $(wildcard $(KMC_API_DIR)/*.cpp) +PY_KMC_API_OBJS = $(patsubst $(KMC_API_DIR)/%.cpp,$(PY_KMC_API_DIR)/%.o,$(KMC_API_SRC_FILES)) + +KMC_TOOLS_OBJS = \ +$(KMC_TOOLS_DIR)/kmc_header.o \ +$(KMC_TOOLS_DIR)/kmc_tools.o \ +$(KMC_TOOLS_DIR)/nc_utils.o \ +$(KMC_TOOLS_DIR)/parameters_parser.o \ +$(KMC_TOOLS_DIR)/parser.o \ +$(KMC_TOOLS_DIR)/tokenizer.o \ +$(KMC_TOOLS_DIR)/fastq_filter.o \ +$(KMC_TOOLS_DIR)/fastq_reader.o \ +$(KMC_TOOLS_DIR)/fastq_writer.o \ +$(KMC_TOOLS_DIR)/percent_progress.o + +KMC_TOOLS_LIBS = \ +$(KMC_TOOLS_DIR)/libs/libz.a \ +$(KMC_TOOLS_DIR)/libs/libbz2.a + +$(KMC_OBJS) $(KMC_DUMP_OBJS) $(KMC_API_OBJS): %.o: %.cpp + $(CC) $(CFLAGS) -c $< -o $@ + +$(KMC_TOOLS_OBJS): %.o: %.cpp + $(CC) $(KMC_TOOLS_CFLAGS) -c $< -o $@ + +$(KMC_MAIN_DIR)/raduls_sse2.o: $(KMC_MAIN_DIR)/raduls_sse2.cpp + $(CC) $(CFLAGS) -msse2 -c $< -o $@ +$(KMC_MAIN_DIR)/raduls_sse41.o: $(KMC_MAIN_DIR)/raduls_sse41.cpp + $(CC) $(CFLAGS) -msse4.1 -c $< -o $@ +$(KMC_MAIN_DIR)/raduls_avx.o: $(KMC_MAIN_DIR)/raduls_avx.cpp + $(CC) $(CFLAGS) -mavx -c $< -o $@ +$(KMC_MAIN_DIR)/raduls_avx2.o: $(KMC_MAIN_DIR)/raduls_avx2.cpp + $(CC) $(CFLAGS) -mavx2 -c $< -o $@ + +kmc: $(KMC_OBJS) $(RADULS_OBJS) + -mkdir -p $(KMC_BIN_DIR) + $(CC) $(CLINK) -o $(KMC_BIN_DIR)/$@ $^ $(KMC_LIBS) + +clean: + -rm -f $(KMC_MAIN_DIR)/*.o + -rm -f $(KMC_API_DIR)/*.o + -rm -f $(KMC_DUMP_DIR)/*.o + -rm -f $(KMC_TOOLS_DIR)/*.o + -rm -f $(PY_KMC_API_DIR)/*.o + -rm -f $(PY_KMC_API_DIR)/*.so + -rm -rf bin