对于一组基因文件中的基因序列,选取一段基因片段,作为索引,利用hash表,查找固定的基因片段。有一定的并且容忍错误。
简单讲就是自己实现一个hashtable,将选出特定字符串建立索引,便于查询。输出时可容忍一定数量的错误。
贴上代码
HashTable.h
#include <iostream> using namespace std; typedef int KeyType; #define NULLKEY -1 struct Entry { KeyType _key; string _value; Entry(KeyType key = NULLKEY, string value = "") :_key(key), _value(value) {} }; class hashTable { public: hashTable(); hashTable::hashTable(int table_size, double occupancy_get); //hashTable(int tableSize); ~hashTable(); bool find(const Entry& e); string hashTable::getValue(const KeyType& k); bool insert(const Entry& e); bool remove(const Entry& e); void clear(); Entry& operator[](KeyType key);//重载下标操作;当找不到key对应的Entry时,插入Entry(key,0) int size(); void display(); protected: int hashFunction(KeyType key);//将键值映射到对应地址 void rehash();//调整hashTable大小 bool find(const KeyType& k);//按键值查找 int nextPrime();//p(n) = n^2 - n + 41, n<41, p<1681 private: Entry *_pTable; int _pos;//当前访问元素的位置 int _size; int _capacity; int primeIndex; };
HashTable.cpp
#include "HashTable.h" static double _occupancy = 0.5; //开放定址法 hashTable::hashTable() { _capacity = 100;//初始化hashTable容量为100,便于观察rehash过程 _pTable = new Entry[_capacity]; _size = 0; primeIndex = 1; } hashTable::hashTable(int table_size,double occupancy_get) { _capacity = table_size; _occupancy = occupancy_get; _pTable = new Entry[_capacity]; _size = 0; primeIndex = 1; } hashTable::~hashTable() { clear(); } int hashTable::nextPrime() { int p = std::pow(static_cast<float>(primeIndex), 2) - primeIndex +3777; primeIndex = primeIndex << 1; if (primeIndex >= 3777) { cout << "Max capacity reached. exit!" << endl; exit(-1); } return p; } bool hashTable::find(const Entry& e) { return(find(e._key)); } bool hashTable::find(const KeyType& k) { _pos = hashFunction(k); if (_pTable[_pos]._key == NULLKEY) return false; int lastPos = _pos; while (_pTable[_pos]._key != k) { if (++_pos%_capacity == lastPos) return false; } return true; } string hashTable::getValue(const KeyType& k) { _pos = hashFunction(k); if (_pTable[_pos]._key == NULLKEY) return ""; int lastPos = _pos; while (_pTable[_pos]._key != k) { if (++_pos%_capacity == lastPos) return ""; } return _pTable[_pos]._value; } bool hashTable::insert(const Entry& e) { if ((_size*1.0) / _capacity>_occupancy) //通过occupancy判断是否需要扩容 rehash();//[OK]插入操作前,需要判断hash table是否需要扩容 if (find(e)) return false; _pTable[_pos] = e; ++_size; return true; } bool hashTable::remove(const Entry& e) { if (!find(e)) return false; _pTable[_pos]._key = NULLKEY; --_size; //rehash();//移除操作后,需要判断hash table是否需要缩容 return true; } void hashTable::clear() { delete[]_pTable; _size = _capacity = 0; } Entry& hashTable::operator[](KeyType key) { if (!find(key)) insert(Entry(key, 0)); return _pTable[_pos]; } int hashTable::size() { return _size; } int hashTable::hashFunction(KeyType key) { return key%_capacity; } void hashTable::display() { cout << "capacity = " << _capacity << ", size = " << _size << endl; for (int i = 0; i<_capacity; i++) { if (_pTable[i]._key != NULLKEY); // cout << "key=" << _pTable[i]._key << ",value=" << _pTable[i]._value << endl; } } void hashTable::rehash() { //cout << "begin rehash..." << endl; Entry *p = new Entry[_size];//用来暂存原哈希表 for (int i = 0; i<_capacity; i++) {//i<_size不对;元素散列在容量为_capacity的hashTable中 if (_pTable[i]._key != NULLKEY) *(p + i) = _pTable[i]; } delete[]_pTable; int lastSize = _size; _size = 0; _capacity = nextPrime(); _pTable = new Entry[_capacity]; for (int i = 0; i<lastSize; i++) insert(*(p + i)); delete[]p; }
miniblast_hash.cpp
#include <iostream> #include <string> #include <fstream> #include "HashTable.h" using namespace std; const int KMER_MAX = 100; //得到基因组文件中,position开始,长度为len的字符串 string getStringByPos(string file_name, long position,int str_len) { //读入数据 std::ifstream myfile(file_name); if (!myfile.is_open()) { cout << "未成功打开文件" << endl; } string s = ""; //移动文件指针到基因指定位置,因为有无用字符,需要边移动边判断 for (long i = 0; i < position; i++) { char c; myfile.get(c); //移动指针的过程中要忽略无意义字符 if (c=='/0'||c==' ') i--; } //截取指定长度的字符串 for (long i = 0; i < str_len; i++) { char c; myfile.get(c); //忽略无意义字符 if (c == '/0' || c == ' ') i--; else { switch (c) { case 'A': s.append("A"); break; case 'G': s.append("G"); break; case 'C': s.append("C"); break; case 'T': s.append("T"); break; default: break; } } } return s; } //比较检索基因片段与原始片段相差个数 int compareString(string s1, string s2, int len) { int num = 0; for (int i = 0; i < len; i++) { if (s1[i] != s2[i]) num++; } return num; } //搜索指定基因片段 void searchGenome(string file_name,hashTable *pTable,int mistakeNum,int kmer,string query_string,string out_name) { char *kmer_cut = new char[kmer]; int length = query_string.length(); int findNum = 0; int table_size = pTable->size(); //std::ofstream out(out_name, ios::app);//输出结果至文件 for (int i = 0; i <kmer; i++) { kmer_cut[i] = query_string[i]; } string kmer_string(&kmer_cut[0],&kmer_cut[kmer]); for (int i = 0; i <table_size; i++) { string get = pTable->getValue(i); if (kmer_string == get) { if (i + length > table_size) continue; string ss = getStringByPos(file_name, i, length); int num = compareString(ss, query_string,length); if (num<=mistakeNum) { findNum++; cout <<i<<" "<<num<<" "<<ss << endl; // out << i <<" "<< num <<" " << ss << endl; } } } if (findNum==0) { cout<< "No Match" << endl; // out << "No Match" << endl; } //out.close(); } //构建基因搜索用的hashtable索引 void GenomeIndex(string genome_file_name,int kmer, hashTable *pTable) { //读入数据,建立索引 std::ifstream myfile(genome_file_name); if (!myfile.is_open()) { cout << "未成功打开文件" << endl; } char *c = new char[KMER_MAX]; myfile.get(c, kmer + 1); string add(&c[0], &c[kmer]); int geno_loc = 0; pTable->insert(Entry(0, add)); //cout << add << endl; while (!myfile.eof()) { char next_char; myfile.get(next_char); //去除文件中无用字符 if (next_char != '/0'&&next_char != ' ') { for (int i = 0; i < kmer; i++) c[i] = c[i + 1]; c[kmer-1] = next_char; string add(&c[0], &c[kmer]); geno_loc++; pTable->insert(Entry(geno_loc, add)); //cout << add <<geno_loc<< endl; } } myfile.close(); } int main(int argc, char* argv[]) { /*string input_file_name =""; string out_file_name = ""; if (argc>1) { input_file_name = string(argv[1]); out_file_name = string(argv[2]); }*/ //如果不需要命令行参数执行文件,直接赋值input_file_name就好 //input_file_name = "input_small.txt"; std::freopen(argv[1], "r", stdin); std::freopen(argv[2], "w", stdout); //std::ifstream inputfile(input_file_name); hashTable *pTable=new hashTable(); /*if (!inputfile.is_open()) { cout << "未成功打开文件" << endl; }*/ //先读取命令文件,设置相关参数,初始值为随意指定,无意义 string str_order = ""; string genome_file_name = ""; int table_size =20; double occupancy =0.4; int kmer = 4; int mistakeNum = 0; string query_string; bool isIndex = false; while (str_order!="quit") { cin >> str_order; if (str_order=="genome") { cin >> genome_file_name; } else if (str_order == "table_size") { cin >> table_size; } else if (str_order == "occupancy") { cin >> occupancy; pTable = new hashTable(table_size, occupancy); } else if (str_order == "kmer") { cin >> kmer; } else if (str_order == "query") { cin >> mistakeNum >> query_string; //首次进行查询前,要建立索引hashTable if (!isIndex) { isIndex = true; GenomeIndex(genome_file_name, kmer, pTable); } //std::ofstream out(out_file_name,ios::app);//输出结果至文件 /*if (out.is_open()) { out << "Query: " << query_string <<endl; }*/ cout << "Query: " << query_string << endl; //out.close(); searchGenome(genome_file_name, pTable, mistakeNum, kmer, query_string, ""); } } delete pTable;//释放指针,防止内存泄露 //getchar();//暂停程序,观察结果 return 0; }
因为需要,代码上有详细注解,欢迎交流学习。