本博文主要介绍IO流中fstream,iterator的简单应用。
问题描述:
a):找一篇文章(The Holy Bible) ,将其所有的单词输入vector中,并统计其单词个数
b):增加统计词频功能,用结构体实现(word ,count) ,将结构体存入vector中
c):增加停用词功能。
1):从网上找 英文停用词,并命名为 stop_list.txt;
2):统计词频时,若遇到此类词直接略过。
d):计算上述各步骤所花费的时间,具体有:
1):读取文件时间;
2):排序所用时间;
3):打印所用时间。
代码如下:
#include <iostream>
#include <string>
#include <vector>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <algorithm>
#include <fstream>
#include <ctype.h>
#include <sys/time.h>
#include <stdexcept>
using namespace std ;

// One dictionary entry: a word and the number of times it has been seen.
struct Word
{
    string word_ ;
    int count_ ;
};

// Read every token of `in`, normalize it and count it in `vec`,
// skipping tokens that appear in `stop`.
void ReadFile(ifstream &in, vector<Word> &vec, const vector<string>&stop );
// Load the stop-word list from `in` into `stop`.
void ReadStopList( ifstream& in , vector<string> &stop);
// Return true when `s` is one of the stop words.
bool IsStopWord(const vector<string>&stop , const string s );
// Convert `s` to lower case in place.
void stringToLower(string&s);
// Remove every punctuation character from `s` in place.
void ErasePunct(string &s);
// Increment the count of `s` in `vec`, appending a new entry if absent.
void AddWordToDict(vector<Word> &vec ,const string s);
// Sort the words by descending frequency.
void WordSort(vector<Word> &vec );
// Print every word together with its frequency.
void printFrequency( const vector<Word>&vec ) ;
// Current wall-clock time in microseconds.
int64_t getTime();

// Pipeline: read stop list -> read and count words -> sort -> print,
// then report how long each stage took (values come from getTime()).
// Returns 0 on success and EXIT_FAILURE on a usage or I/O error.
int main(int argc, const char *argv[])
{
    if(argc < 2)
    {
        // Not an errno-style failure, so perror() would append a
        // meaningless system error string; write to stderr directly.
        fprintf(stderr, "Usage:exe srcfile\n");
        exit(EXIT_FAILURE);
    }

    try
    {
        ifstream infile(argv[1]);
        ifstream stopfile("stop_list.txt");
        if( !stopfile )
            throw std::runtime_error("stop_list does not exist!");
        if( !infile )
            throw std::runtime_error("open file failure");

        vector<Word> vec ;
        vector<string> stopList ;

        const int64_t starttime = getTime();
        ReadStopList(stopfile , stopList);
        ReadFile( infile, vec ,stopList );
        const int64_t readtime = getTime();

        WordSort( vec );
        const int64_t sorttime = getTime();

        printFrequency( vec ) ;
        // Must be 64-bit: storing the timestamp in an int truncates it
        // and makes the reported print time wrong.
        const int64_t printtime = getTime();

        cout << vec.size() << endl ;
        cout << "the time of reading file :" << (readtime - starttime) << endl ;
        cout << "the time of sorting: " << (sorttime - readtime) << endl ;
        cout << "the time of printing: " << (printtime - sorttime) << endl ;
    }
    catch(const std::exception &e)
    {
        // Report the failure instead of letting the exception escape
        // main(), which would end in std::terminate().
        fprintf(stderr, "%s\n", e.what());
        return EXIT_FAILURE ;
    }
    return 0;
}

// Note: a text file saved with Windows line endings can be converted
// with the dos2unix command before it is processed.
//
// Reads whitespace-separated tokens from `in`. Each token has its
// punctuation removed and is lower-cased; tokens that end up empty or
// that are stop words are skipped, the rest are counted in `vec`.
// `vec` is cleared first.
void ReadFile(ifstream &in,
              vector<Word> &vec,
              const vector<string>&stop )
{
    vec.clear();
    string s ;
    while( in >> s)
    {
        ErasePunct(s);
        if(s.empty())   // token consisted only of punctuation, e.g. "--"
            continue ;
        stringToLower(s);
        if(! IsStopWord(stop ,s))
        {
            AddWordToDict(vec , s);
        }
    }
}

// Reads whitespace-separated stop words from `in` into `stop`
// (cleared first). Entries are normalized exactly like the text tokens
// in ReadFile: otherwise an entry containing punctuation or upper-case
// letters (such as "don't") could never compare equal to a token.
void ReadStopList( ifstream& in , vector<string> &stop)
{
    stop.clear();
    string s ;
    while( in >> s)
    {
        ErasePunct(s);
        stringToLower(s);
        if(!s.empty())
            stop.push_back( s );
    }
}

// Lower-cases every character of `s` in place.
void stringToLower(string &s)
{
    for(string::iterator it = s.begin(); it != s.end(); ++it)
    {
        // The <ctype.h> functions require a value representable as
        // unsigned char; a negative plain char is undefined behaviour.
        *it = static_cast<char>(tolower(static_cast<unsigned char>(*it)));
    }
}

// Returns true when `s` equals one of the entries of `stop`
// (linear search).
bool IsStopWord(const vector<string>&stop , const string s )
{
    return find(stop.begin(), stop.end(), s) != stop.end();
}

// Predicate for ErasePunct: true when `c` is a punctuation character.
static bool IsPunctChar(char c)
{
    return ispunct(static_cast<unsigned char>(c)) != 0 ;
}

// Removes every punctuation character from `s` in place, in a single
// pass (erase-remove) instead of one erase() call per character.
void ErasePunct(string &s)
{
    s.erase(remove_if(s.begin(), s.end(), IsPunctChar), s.end());
}
147 148 void AddWordToDict(vector<Word> &vec ,const string s) 149 { 150 vector<Word>::iterator it = vec.begin();////顺序查找 151 while( it != vec.end()) 152 { 153 if(it->word_ == s) 154 { 155 (it->count_) ++ ; 156 break ; 157 } 158 ++ it ; 159 } 160 if(it == vec.end()) // 类似于链表操作 161 { 162 Word tmp ; 163 tmp.word_ = s ; 164 tmp.count_ = 1 ; 165 vec.push_back(tmp); 166 } 167 } 168 169 int tmp(const Word &w1 , const Word &w2) 170 { 171 // a > b 172 return (w1.count_ > w2.count_); 173 } 174 void WordSort(vector<Word> &vec ) 175 { 176 sort(vec.begin() , vec.end() ,tmp ); //库函数sort 177 } 178 179 void printFrequency( const vector<Word>&vec ) 180 { 181 for(vector<Word>::const_iterator it = vec.begin(); //注意此处的const 182 it != vec.end(); 183 ++ it) 184 printf("word :%s, frequency: %d ", it->word_.c_str(), it->count_); 185 int64_t getTime(); 186 } 187 188 189 int64_t getTime() 190 { 191 struct timeval tm ; 192 memset(&tm , 0, sizeof(tm)); 193 if(-1== gettimeofday(&tm ,NULL)) 194 throw runtime_error("gettime failure"); 195 196 int64_t t = tm.tv_usec ; 197 t += tm.tv_sec*1000*1000 ; 198 return t ; 199 }
本程序时间复杂度为O(n*n),后续文章我们将讨论时间复杂度更优的算法。