FROM:http://www.cnblogs.com/finallyliuyu/archive/2010/09/03/1817348.html
1 头文件: 2 3 4 #ifndef _Preprocess_H 5 #define _Preprocess_H 6 #include 7 #include 8 #include 9 #include 10 #include 11 #include 12 #include 13 #include 14 #include 15 #include 16 #include 17 #include 18 #include"ictclas30.h" 19 #include"boost r1 egex.hpp" 20 #include"boost/algorithm/string.hpp" 21 #include"windows.h" 22 23 //一些谓词函数 24 using namespace std; 25 26 class Preprocess 27 { 28 //typedef vector(Preprocess::*FUNCSEG)(string,set); 29 private: 30 char *bagofwordsAddress;//存放词袋子模型的位置 31 char * featurewordsAddress;//存放特征词文件的位置; 32 char *arffFileAddress;//存放ARFF文件的位置 33 char *infoFromWekaAddress;//存放调用weka后的实验结果 34 char *articleIdsAddress;//存放被聚类的文章的ID号 35 char *dbconnection;//数据库的链接字符串 36 char *dbselect;//数据库select语句 37 char *dbfield;//数据库字段 38 int beginIndex;//开始聚类的文章id 39 int endIndex;//结束聚类的文章id 40 public: 41 typedef vector(Preprocess::*FUNCSEG)(string,set); 42 Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex) 43 { 44 bagofwordsAddress=new char[c_style_stringsize]; 45 featurewordsAddress=new char[c_style_stringsize]; 46 arffFileAddress=new char[c_style_stringsize]; 47 infoFromWekaAddress=new char[c_style_stringsize]; 48 articleIdsAddress=new char[c_style_stringsize]; 49 dbconnection=new char[c_style_stringsize]; 50 dbselect=new char[c_style_stringsize]; 51 this->beginIndex=beginIndex; 52 this->endIndex=endIndex; 53 sprintf_s(bagofwordsAddress,c_style_stringsize,mydict); 54 sprintf_s(featurewordsAddress,c_style_stringsize,keywordsinfo); 55 sprintf_s(arffFileAddress,c_style_stringsize,tobeCluster); 56 sprintf_s(infoFromWekaAddress,c_style_stringsize,InfoFromWeka); 57 sprintf_s(articleIdsAddress,c_style_stringsize,artileIds); 58 sprintf_s(dbconnection,c_style_stringsize,conn); 59 sprintf_s(dbselect,c_style_stringsize,selectsql); 60 61 62 63 } 64 65 66 ~Preprocess() 67 { 68 delete []bagofwordsAddress; 69 delete []featurewordsAddress; 70 delete []arffFileAddress; 71 delete [] infoFromWekaAddress; 72 delete []articleIdsAddress; 73 delete []dbconnection; 74 delete []dbselect; 75 76 77 } 78 void trim(string &str,const string val);//去除字符串首尾空白 79 //构建倒排表: key=word,val= a list of pairs which consists of articleid,and count, count=tf 80 int ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg); 81 inline void TruncateArff() 82 { 83 ofstream ofile; 84 ofile.open(arffFileAddress,ios::trunc); 85 ofile.close(); 86 } 87 //保存词袋子到硬盘 88 void save(mapint,int> > >&mymap); 89 //从内存中加载词袋子模型 90 void load(mapint,int> > >&mymap); 91 //打印词袋子模型 92 void print(mapint,int> > >&mymap); 93 //窄字符串转化成宽字符串 94 wstring myMultibyteToWideChar(string sResult); 95 //宽字符串转化成窄字符串 96 string myWideCharToMultibyte(wstring wsResult); 97 //调用ICTclass分词 98 string ICTsplit(const char *sInput); 99 //构造停用词表 100 setMakeStopSet(); 101 //去除停用词,噪声词 102 vectorgoodWordsinPieceArticle(string rawtext,set stopwords); 103 //整数转化成字符串 104 string do_fraction(int val); 105 //浮点数转化成字符串 106 string do_fraction(double val, int decplaces=5); 107 //特征词选择算法 108 void DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold); 109 //获取最后的特征词 110 vector GetFinalKeyWords(); 111 //获取特征词的maxTF,DF 112 vectorint,int> >GetfinalKeysMaxTFDF(mapint,int>>> &mymap); 113 //文档向量模型规范化 114 vectorint,double> > NormalizationVSM(vectorint,double> > tempVSM); 115 //建立文档向量模型并且写到arff文件里 116 void VSMFormation(mapint,int>>> &mymap); 117 118 string FormatVSMtoString(vectorint,double> > tempVSM); 119 //写Arff文件头部 120 void WriteHeadArff(); 121 void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg); 122 123 124 map<</code>int,vector<</code>double> >VSMConstruction(mapint,int>>> &mymap); 125 126 map<</code>double> > GetClusters(); 127 128 double CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); 129 130 double CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); 131 132 vectorint,string> >GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters); 133 134 map<</code>int> >FetchArticlesOFClusters(map<</code>double> >&clusters,vectorint,string>>&resultInfo); 135 void RetreiveArticleInfoFromDataBase(); 136 vector mySplit(string s,set stopwords);//分割关键词 137 138 139 140 141 142 143 144 145 }; 146 147 148 149 #endif 150 151 152 Preprocess类的函数功能实现文件: 153 154 155 156 157 #include"stdafx.h" 158 #include "Preprocess.h" 159 160 #pragma comment(lib, "ICTCLAS30.lib") 161 using namespace std; 162 bool isLonger(const pairint> &pair1, const pairint> &pair2) 163 { 164 return pair1.second>pair2.second; 165 } 166 bool cntAssist(const pairint> &pair1) 167 { 168 return pair1.second<=100; 169 } 170 bool PredTF(const pair<</code>int,int>& pair1,int articleId) 171 { 172 return pair1.first==articleId; 173 174 } 175 class PredTFclass 176 { 177 private: const int m; 178 public: 179 PredTFclass(int id):m(id){}; 180 bool operator()(const pair<</code>int,int>& pair1){return PredTF(pair1,m);}; 181 }; 182 bool myCmp(const pairdouble>&pair1,const pairdouble>&pair2 ) 183 { 184 return pair1.second>=pair2.second; 185 } 186 187 void Preprocess:: trim(string &str,const string val) 188 { 189 str.erase(0,str.find_first_not_of(val)); 190 str.erase(str.find_last_not_of(val)+val.size()); 191 } 192 int Preprocess::ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg) 193 { 194 //setMakeStopSet(); 195 CoInitialize(NULL); 196 _ConnectionPtr pConn(__uuidof(Connection)); 197 _RecordsetPtr pRst(__uuidof(Recordset)); 198 pConn->ConnectionString=dbconnection; 199 pConn->Open("","","",adConnectUnspecified); 200 pRst=pConn->Execute(dbselect,NULL,adCmdText); 201 setstopwords=MakeStopSet(); 202 203 while(!pRst->rsEOF) 204 { vectorwordcollection; 205 //string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord"); 206 string rawtext=(_bstr_t)pRst->GetCollect(dbfield); 207 if(rawtext!="") 208 { 209 wordcollection=(this->*seg)(rawtext,stopwords); 210 string tempid=(_bstr_t)pRst->GetCollect("ArticleId"); 211 int articleid=atoi(tempid.c_str()); 212 for(vector::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++) 213 { 214 vectorint,int>>::iterator it; 215 if(mymap[*strit].empty()) 216 { 217 pair<</code>int,int>mytemppair=make_pair(articleid,1); 218 mymap[*strit].push_back(mytemppair); 219 220 } 221 else 222 { 223 for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++) 224 { 225 if(it->first==articleid) 226 { 227 it->second=++(it->second); 228 break; 229 } 230 231 } 232 if(it==mymap[*strit].end()) 233 { 234 pair<</code>int,int>mytemppair=make_pair(articleid,1); 235 mymap[*strit].push_back(mytemppair); 236 } 237 238 } 239 240 } 241 242 243 } 244 245 246 pRst->MoveNext(); 247 wordcollection.clear(); 248 } 249 pRst->Close(); 250 pConn->Close(); 251 pRst.Release(); 252 pConn.Release(); 253 CoUninitialize(); 254 255 return 0; 256 257 } 258 void Preprocess::save(mapint,int> > >&mymap) 259 { 260 ofstream outfile(bagofwordsAddress,ios::binary); 261 outfile<<mymap.size()<<endl; 262 mapint,int> > >::iterator it; 263 for (it=mymap.begin();it!=mymap.end();it++) 264 { outfile<<it->first<<endl; 265 vectorint,int>>::iterator subit; 266 outfile<<it->second.size()<<endl; 267 for(subit=(it->second).begin();subit!=(it->second).end();++subit) 268 { 269 outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" "; 270 } 271 outfile<<endl; 272 } 273 //outfile.write((char *)&mymap,sizeof(mymap)); 274 275 outfile.close(); 276 277 } 278 void Preprocess::load(mapint,int> > >&mymap) 279 { 280 std::locale loc1 = std::locale::global(std::locale(".936")); 281 { 282 // 在这里使用std::ifstream 或者 std::fstream 283 ifstream infile(bagofwordsAddress,ios::binary); 284 int lenMyMap;//保存词典长度 285 int lenVector;//保存每个词出现的文章数目 286 string key;//保存读出的map的键值 287 int articleId;//文章标号 288 int count;//在该文章中刚出现的数目 289 string comma; 290 string semicolon; 291 infile>>lenMyMap; 292 while(!infile.eof()) 293 { 294 infile>>key; 295 infile>>lenVector; 296 vectorint,int> >temp; 297 for (int i=0;i 298 { 299 infile>>articleId>>count>>semicolon; 300 temp.push_back(make_pair(articleId,count)); 301 } 302 mymap[key]=temp; 303 304 305 } 306 307 308 infile.close(); 309 } 310 std::locale::global(std::locale(loc1)); 311 312 } 313 void print(mapint,int> > >&mymap) 314 { 315 cout<<mymap.size()<<endl; 316 mapint,int> > >::iterator it; 317 for (it=mymap.begin();it!=mymap.end();it++) 318 { cout<<it->first<<endl; 319 vectorint,int>>::iterator subit; 320 cout<<it->second.size()<<endl; 321 for(subit=(it->second).begin();subit!=(it->second).end();++subit) 322 { 323 cout<<subit->first<<','<<subit->second<<";"; 324 } 325 cout<<endl; 326 } 327 328 } 329 set Preprocess::MakeStopSet() 330 { 331 set stopwordsSet; 332 ifstream ifile("stopwords.txt"); 333 while(!ifile.eof()) 334 { 335 string temp; 336 trim(temp," "); 337 ifile>>temp; 338 stopwordsSet.insert(temp); 339 } 340 return stopwordsSet; 341 } 342 343 string Preprocess::do_fraction(int val) 344 { 345 ostringstream out; 346 out<<val; 347 string str= out.str(); //从流中取出字符串 348 str.swap(string(str.c_str()));//删除nul之后的多余字符 349 return str; 350 351 } 352 string Preprocess::do_fraction(double val,int decplaces) 353 { 354 355 //int prec=numeric_limits::digits10; 356 char DECIMAL_POINT='.'; 357 ostringstream out; 358 //out.precision(prec); 359 out<<val; 360 string str=out.str(); 361 size_t n=str.find(DECIMAL_POINT); 362 if((n!=string::npos)&&n+decplaces 363 { 364 str[n+decplaces]='