• Kmeans文本聚类系列之全部代码


    语料数据库

    实验结果以及中间数据

    文本预处理开源框架源代码

    头文件:

    #ifndef _Preprocess_H
    #define  _Preprocess_H
    #include<algorithm>
    #include<cmath>
    #include<cstring>
    #include<fstream>
    #include<iomanip>
    #include<iostream>
    #include<limits>
    #include<map>
    #include<set>
    #include<sstream>
    #include<string>
    #include<vector>
    #include <xstring>
    #include"ictclas30.h"
    #include"boost\tr1\regex.hpp"
    #include"boost/algorithm/string.hpp"
    #include"windows.h"
    
    /************************************************************************/
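    /* The Preprocess class performs the following pipeline:
    segment the text collection -> remove stop words -> build the bag-of-words model -> select feature words -> build a VSM for each article
    -> write the data in Weka format (ARFF) -> output the clustering information */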
    /************************************************************************/
    //一些谓词函数
    using namespace std;
    
    /**
     * Text-preprocessing front end for clustering: declares the steps for word
     * segmentation, stop-word removal, bag-of-words construction, feature-word
     * selection, VSM construction, ARFF output and cluster labelling.
     *
     * The object owns private copies of every path / SQL string it is given.
     */
    class Preprocess
    {
    	//typedef  vector<string>(Preprocess::*FUNCSEG)(string,set<string>);
    	private:
    		char *bagofwordsAddress;   // location of the bag-of-words file
    		char *featurewordsAddress; // location of the feature-word file
    		char *arffFileAddress;     // location of the ARFF file
    		char *infoFromWekaAddress; // location of the result file saved from Weka
    		char *articleIdsAddress;   // location of the file listing the clustered article ids
    		char *dbconnection;        // database connection string
    		char *dbselect;            // database SELECT statement
    		char *dbfield;             // database field; not set by the constructor, kept NULL
    		int beginIndex;            // id of the first article to cluster
    		int endIndex;              // id of the last article to cluster

    		/**
    		 * Returns a freshly allocated, NUL-terminated copy of text.
    		 * The buffer holds at least minCapacity chars and is enlarged when
    		 * text is longer, so the copy is never truncated. The text is copied
    		 * verbatim: '%' characters are NOT treated as format specifiers.
    		 * A NULL text is copied as the empty string.
    		 */
    		static char *cloneCString(const char *text,int minCapacity)
    		{
    			const char *source=(text!=NULL)?text:"";
    			const size_t needed=strlen(source)+1;
    			size_t capacity=needed;
    			if(minCapacity>0&&(size_t)minCapacity>needed)
    			{
    				capacity=(size_t)minCapacity;
    			}
    			char *buffer=new char[capacity];
    			memcpy(buffer,source,needed);
    			return buffer;
    		}

    		// The destructor delete[]s the buffers above, so a member-wise copy would
    		// free them twice. Copying is disabled (declared, never defined).
    		Preprocess(const Preprocess &);
    		Preprocess &operator=(const Preprocess &);
    	public:
    		// Pointer to a member function that turns raw text into a word list.
    		typedef vector<string>(Preprocess::*FUNCSEG)(string,set<string>);

    		/**
    		 * @param c_style_stringsize minimum size of each internal string buffer
    		 * @param mydict        path of the bag-of-words file
    		 * @param keywordsinfo  path of the feature-word file
    		 * @param tobeCluster   path of the ARFF file
    		 * @param InfoFromWeka  path of the result file saved from Weka
    		 * @param artileIds     path of the article-id file
    		 * @param conn          database connection string
    		 * @param selectsql     SELECT statement
    		 * @param beginIndex    first article id
    		 * @param endIndex      last article id
    		 */
    		Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex)
    		{
    				bagofwordsAddress=cloneCString(mydict,c_style_stringsize);
    				featurewordsAddress=cloneCString(keywordsinfo,c_style_stringsize);
    				arffFileAddress=cloneCString(tobeCluster,c_style_stringsize);
    				infoFromWekaAddress=cloneCString(InfoFromWeka,c_style_stringsize);
    				articleIdsAddress=cloneCString(artileIds,c_style_stringsize);
    				dbconnection=cloneCString(conn,c_style_stringsize);
    				dbselect=cloneCString(selectsql,c_style_stringsize);
    				dbfield=NULL;
    				this->beginIndex=beginIndex;
    				this->endIndex=endIndex;
    		}
    		/*Preprocess()
    		{

    		}*/

    		~Preprocess()
    		{
    			delete []bagofwordsAddress;
    			delete []featurewordsAddress;
    			delete []arffFileAddress;
    			delete [] infoFromWekaAddress;
    			delete []articleIdsAddress;
    			delete []dbconnection;
    			delete []dbselect;
    		}
    		// Strip the characters of val from both ends of str.
    		void trim(string  &str,const string val);
    		// Build the inverted index: key=word, value=list of (article id, count) pairs, count=tf.
    		int ConstructMap(map<string,vector<pair<int,int>>>&mymap,char *dbfield,FUNCSEG seg);
    		// Truncate the ARFF file to zero length (creating it if necessary).
    		inline void TruncateArff()
    		{
    			ofstream ofile;
    			ofile.open(arffFileAddress,ios::trunc);
    			ofile.close();
    		}
    		// Save the bag of words to disk.
    		void save(map<string,vector<pair<int,int> > >&mymap);
    		// Load the bag-of-words model into memory.
    		void load(map<string,vector<pair<int,int> > >&mymap);
    		// Print the bag-of-words model.
    		void print(map<string,vector<pair<int,int> > >&mymap);
    		// Narrow string -> wide string.
    		wstring myMultibyteToWideChar(string sResult);
    		// Wide string -> narrow string.
    		string myWideCharToMultibyte(wstring wsResult);
    		// Segment text with ICTCLAS.
    		string ICTsplit(const char *sInput);
    		// Build the stop-word set.
    		set<string>MakeStopSet();
    		// Remove stop words and noise words.
    		vector<string>goodWordsinPieceArticle(string rawtext,set<string> stopwords);
    		// Integer -> string.
    		string do_fraction(int val);
    		// Floating-point number -> string.
    		string do_fraction(double val, int decplaces=5);
    		// Feature-word selection (document-frequency threshold).
    		void DFcharicteristicWordSelection(map<string,vector<pair<int,int>>> &mymap,int DFthreshold);
    		// Get the final feature words.
    		vector<string> GetFinalKeyWords();
    		// Get (maxTF, DF) for every feature word.
    		vector<pair<int,int> >GetfinalKeysMaxTFDF(map<string,vector<pair<int,int>>> &mymap);
    		// Normalise a document vector.
    		vector<pair<int,double> > NormalizationVSM(vector<pair<int,double> > tempVSM);
    		// Build the document vectors and write them to the ARFF file.
    		void VSMFormation(map<string,vector<pair<int,int>>> &mymap);
    		// Render a single document vector as a string.
    		string FormatVSMtoString(vector<pair<int,double> > tempVSM);
    		// Write the ARFF header.
    		void WriteHeadArff();
    		void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg);
    		/************************ The functions below implement the clustering step ************************/
    		// Build the document vectors without turning them into strings.
    		map<int,vector<double> >VSMConstruction(map<string,vector<pair<int,int>>> &mymap);
    		// Read the cluster centres from the Weka result file.
    		map<string,vector<double> > GetClusters();
    		// Dot product of two vectors.
    		double CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2);
    		// Cosine similarity of two vectors.
    		double CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2);
    		// Attach a cluster label to every article.
    		vector<pair<int,string> >GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters);
    		// Return the article ids belonging to each cluster.
    		map<string,vector<int> >FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string>>&resultInfo);
    		void RetreiveArticleInfoFromDataBase();
    		// Split keywords.
    		vector<string> mySplit(string s,set<string> stopwords);
    };
    
    
    
    #endif
     
    Preprocess类的函数功能实现文件:
    #include"stdafx.h"
    #include "Preprocess.h"
    
    #pragma comment(lib, "ICTCLAS30.lib")
    using namespace std;
    /************************************************************************/
    /* Helper predicates and comparators (the whitespace-trimming routine, trim(), is defined after them) */
    /************************************************************************/
    // Ordering predicate: true when the left entry's count is strictly greater
    // than the right entry's count (i.e. sorts by .second, descending).
    bool isLonger(const  pair<string,int> &pair1, const pair<string,int>  &pair2)
    {
    	const int leftCount=pair1.second;
    	const int rightCount=pair2.second;
    	return rightCount<leftCount;
    }
    // Unary predicate: true when the entry's count (.second) does not exceed 100.
    bool cntAssist(const  pair<string,int> &pair1)
    {
    	if(pair1.second>100)
    	{
    		return false;
    	}
    	return true;
    }
    // True when the first component of the pair equals articleId.
    bool PredTF(const pair<int,int>& pair1,int articleId)
    {
    	const bool sameArticle=(articleId==pair1.first);
    	return sameArticle;
    }
    // Function object binding an article id so that PredTF can be used as a
    // unary predicate with the standard algorithms.
    class PredTFclass
    {
    public:
    	PredTFclass(int id):wantedArticle(id){}
    	// Delegates to PredTF with the id captured at construction.
    	bool operator()(const pair<int,int>& posting)
    	{
    		return PredTF(posting,wantedArticle);
    	}
    private:
    	const int wantedArticle;
    };
    // Ordering predicate for sorting (label, score) pairs by score, descending.
    // The comparison must be strict: std::sort requires a strict weak ordering,
    // and a comparator returning true for equal elements (>=) is undefined
    // behaviour there.
    bool myCmp(const pair<string,double>&pair1,const pair<string,double>&pair2 )
    {
    	return pair1.second>pair2.second;
    }
    
    // Removes from both ends of str every character that occurs in val.
    // If str consists only of such characters (or is empty) it becomes empty.
    void Preprocess:: trim(string  &str,const string val)
    {
    	const string::size_type first=str.find_first_not_of(val);
    	if(first==string::npos)
    	{
    		str.clear();
    		return;
    	}
    	// Erase the tail first so that 'first' stays valid. The cut position is one
    	// past the last kept character, independent of how many characters val has.
    	const string::size_type last=str.find_last_not_of(val);
    	str.erase(last+1);
    	str.erase(0,first);
    }
    /************************************************************************/
    /* 建立词袋子模型                                                                     */
    /************************************************************************/
    // Builds the inverted index from the database.
    //
    // Opens an ADO connection with the stored connection string, executes the
    // stored SELECT statement and, for every row whose dbfield column is not
    // empty, segments the text with seg and counts each resulting word under the
    // row's "ArticleId".
    //
    // mymap   : word -> list of (article id, occurrence count); updated in place
    // dbfield : name of the column holding the text
    // seg     : member function used to turn the raw text into words
    // Returns 0.
    int Preprocess::ConstructMap(map<string,vector<pair<int,int>>>&mymap,char *dbfield,FUNCSEG seg)
    {
    	CoInitialize(NULL);
    	_ConnectionPtr pConn(__uuidof(Connection));
    	_RecordsetPtr pRst(__uuidof(Recordset));
    	pConn->ConnectionString=dbconnection;
    	pConn->Open("","","",adConnectUnspecified);
    	pRst=pConn->Execute(dbselect,NULL,adCmdText);
    	set<string>stopwords=MakeStopSet();

    	while(!pRst->rsEOF)
    	{
    		string rawtext=(_bstr_t)pRst->GetCollect(dbfield);
    		if(rawtext!="")
    		{
    			vector<string>wordcollection=(this->*seg)(rawtext,stopwords);
    			string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
    			int articleid=atoi(tempid.c_str());
    			for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();++strit)
    			{
    				// One map lookup per word; operator[] creates the entry when absent.
    				vector<pair<int,int> > &postings=mymap[*strit];
    				vector<pair<int,int> >::iterator it=postings.begin();
    				while(it!=postings.end()&&it->first!=articleid)
    				{
    					++it;
    				}
    				if(it==postings.end())
    				{
    					// First occurrence of this word in this article.
    					postings.push_back(make_pair(articleid,1));
    				}
    				else
    				{
    					++(it->second);
    				}
    			}
    		}
    		pRst->MoveNext();
    	}
    	pRst->Close();
    	pConn->Close();
    	// Release the COM objects before COM itself is shut down.
    	pRst.Release();
    	pConn.Release();
    	CoUninitialize();

    	return 0;

    }
    /************************************************************************/
    /* 保存词袋子模型到硬盘                                                                     */
    /************************************************************************/
    // Writes the bag of words to the bag-of-words file.
    // Layout: the number of words on the first line, then for every word three
    // lines: the word, the number of postings, and the postings as
    // "articleId count ; " items.
    void Preprocess::save(map<string,vector<pair<int,int> > >&mymap)
    {
    	typedef map<string,vector<pair<int,int> > > Bag;
    	ofstream sink(bagofwordsAddress,ios::binary);
    	sink<<mymap.size()<<endl;
    	for(Bag::iterator entry=mymap.begin();entry!=mymap.end();entry++)
    	{
    		vector<pair<int,int> > &postings=entry->second;
    		sink<<entry->first<<endl;
    		sink<<postings.size()<<endl;
    		for(vector<pair<int,int> >::size_type k=0;k<postings.size();++k)
    		{
    			sink<<postings[k].first<<" "<<postings[k].second<<" "<<";"<<" ";
    		}
    		sink<<endl;
    	}
    	sink.close();
    }
    /************************************************************************/
    /* 加载词典信息到内存                                                                     */
    /************************************************************************/
    // Loads the bag of words written by save() into mymap.
    //
    // The global locale is switched to ".936" while the file is read and is
    // restored afterwards. Records are read until extraction fails, so the end of
    // the file does not produce a spurious extra record (a loop on !eof() runs
    // once more after the last record and overwrites the last word with stale
    // values).
    void Preprocess::load(map<string,vector<pair<int,int> > >&mymap)
    {
    	std::locale loc1 = std::locale::global(std::locale(".936"));
    	{
    		ifstream infile(bagofwordsAddress,ios::binary);
    		int lenMyMap=0;   // number of words announced by the file; read past, not otherwise used
    		infile>>lenMyMap;

    		string key;       // the word
    		int lenVector=0;  // number of postings stored for the word
    		while(infile>>key>>lenVector)
    		{
    			vector<pair<int,int> >temp;
    			int articleId=0;  // article id of one posting
    			int count=0;      // occurrences of the word in that article
    			string semicolon; // the ";" separator token written by save()
    			for (int i=0;i<lenVector&&(infile>>articleId>>count>>semicolon);i++)
    			{
    				temp.push_back(make_pair(articleId,count));
    			}
    			// Replace whatever was stored for this word without copying the vector.
    			mymap[key].swap(temp);
    		}
    		infile.close();
    	}
    	std::locale::global(loc1);

    }
    /************************************************************************/
    /* 打印词典信息                                                         */
    /************************************************************************/
    // Prints the bag of words to standard output: the number of words, then for
    // every word its text, its number of postings and the postings as
    // "articleId,count;" items on one line.
    // NOTE(review): the Preprocess class declares a member print() with the same
    // parameter, but this definition is a free function — confirm which was intended.
    void print(map<string,vector<pair<int,int> > >&mymap)
    {
    	typedef map<string,vector<pair<int,int> > > Bag;
    	cout<<mymap.size()<<endl;
    	Bag::iterator entry=mymap.begin();
    	while(entry!=mymap.end())
    	{
    		vector<pair<int,int> > &postings=entry->second;
    		cout<<entry->first<<endl;
    		cout<<postings.size()<<endl;
    		for(vector<pair<int,int> >::size_type k=0;k<postings.size();++k)
    		{
    			cout<<postings[k].first<<','<<postings[k].second<<";";
    		}
    		cout<<endl;
    		entry++;
    	}
    }
    /************************************************************************/
    /* 构造停用词表                                                                     */
    /************************************************************************/
    // Reads the whitespace-separated words of "stopwords.txt" into a set.
    // Returns an empty set when the file cannot be opened. Looping on the
    // extraction itself (rather than on !eof()) guarantees termination in that
    // case and avoids inserting an empty word after the last token.
    set<string> Preprocess::MakeStopSet()
    {
    	set<string> stopwordsSet;
    	ifstream ifile("stopwords.txt");
    	string word;
    	// operator>> already skips surrounding whitespace, so no trimming is needed.
    	while(ifile>>word)
    	{
    		stopwordsSet.insert(word);
    	}
    	return stopwordsSet;
    }
    /************************************************************************/
    /* 将整数转化成字符串                                                   */
    /************************************************************************/
    
    // Returns the decimal text of val.
    string Preprocess::do_fraction(int val)
    {
    	ostringstream out;
    	out<<val;
    	// The stream's string contains exactly the formatted digits; no further
    	// clean-up is required (swapping with a temporary is not standard C++).
    	return out.str();
    }
    /************************************************************************/
    /* 将浮点数转化成指定精度的字符串                                       */
    /************************************************************************/
    // Returns val formatted in fixed-point notation with decplaces digits after
    // the decimal point (a negative decplaces is treated as 0).
    //
    // Fixed notation is requested explicitly: the default stream format switches
    // to scientific notation for small values (e.g. "1.23457e-05"), and cutting
    // that text at a character position would drop the exponent and corrupt the
    // value.
    string Preprocess::do_fraction(double val,int decplaces)
    {
    	if(decplaces<0)
    	{
    		decplaces=0;
    	}
    	ostringstream out;
    	out<<fixed<<setprecision(decplaces)<<val;
    	return out.str();
    }
    /************************************************************************/
    /* Convert a narrow (multibyte) string to a wide string                  */
    /************************************************************************/
    // Converts a narrow string to a wide string with MultiByteToWideChar(CP_ACP).
    // The result is built from the NUL-terminated buffer, so it ends at the first
    // NUL character in the converted text.
    wstring Preprocess::myMultibyteToWideChar(string sResult)
    {
    	// First call: only measure the converted length (terminator not included).
    	const int wideLen=MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), 0, 0 );
    	vector<wchar_t> buffer(wideLen+1);
    	// Second call: the actual conversion.
    	MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), &buffer[0], wideLen );
    	buffer[wideLen] = L'\0';
    	return wstring(&buffer[0]);
    }
    /************************************************************************/
    /* 宽字符串转窄字符串                                                                     */
    /************************************************************************/
    // Converts a wide string to a narrow string with WideCharToMultiByte.
    //
    // The same code page (CP_ACP, matching myMultibyteToWideChar) is used for the
    // size query and for the conversion; measuring with one code page and
    // converting with another can make the buffer size wrong. Returns an empty
    // string when the conversion fails.
    string Preprocess::myWideCharToMultibyte(wstring wsResult)
    {
    	// Size query; with a source length of -1 the count includes the terminator.
    	const int iLen= WideCharToMultiByte( CP_ACP, 0, wsResult.c_str(), -1, NULL, 0, NULL, NULL );
    	if(iLen<=0)
    	{
    		return string();
    	}
    	vector<char> buffer(iLen);
    	const int written=WideCharToMultiByte( CP_ACP, 0, wsResult.c_str(), -1, &buffer[0], iLen, NULL, NULL );
    	if(written<=0)
    	{
    		return string();
    	}
    	// Drop the terminator that was counted by the API.
    	return string(&buffer[0],written-1);

    }
    /************************************************************************/
    /* 调用ICTclas进行中文分词                                               */
    /************************************************************************/
    string Preprocess::ICTsplit(const char *sInput)
    {
    	if(!ICTCLAS_Init())
    	{
    		printf("ICTCLAS INIT FAILED!\n");
    		string strerr(sInput);
    		return strerr;
    	}
    	ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);
    	//导入用户词典后
    	/*printf("\n导入用户词典后:\n");
    	int nCount = ICTCLAS_ImportUserDict("userdic.txt");//覆盖以前的用户词典
    	//保存用户词典
    	ICTCLAS_SaveTheUsrDic();
    	printf("导入%d个用户词。\n", nCount);*/
    
    	const char* sResult = ICTCLAS_ParagraphProcess(sInput, 0);
    	string strresult(sResult);
    	//printf("%s\n", sResult);
    	//把字符串转化成宽字符串
    	wstring wsResult=myMultibyteToWideChar(strresult);
    	boost::wregex wreg(L"\\s+");
    	wsResult=boost::regex_replace(wsResult,wreg,wstring(L"|"));
    	strresult=myWideCharToMultibyte(wsResult);
    
    
    
    	//ofile<<str1;
    	//ofile.close();
    	//cout<<str1<<endl;
    	//ICTCLAS_FileProcess("text.txt","test_result.txt",1);
    	ICTCLAS_Exit();
    
    	return strresult;
    }
    /************************************************************************/
    /* 对每一篇文章去掉噪声词,剩下好词                                     */
    /************************************************************************/
    vector<string>Preprocess::goodWordsinPieceArticle(string rawtext,set<string> stopwords)
    {
    	vector<wstring> goodWordstemp;
    	vector<string> goodWords;
    	const char* sInput=rawtext.c_str();
    	string sResult=ICTsplit(sInput);
    	wstring wsResult=myMultibyteToWideChar(sResult);
    	boost::wregex wreg(L"\\d+");//去掉中文空格
    	wsResult=boost::regex_replace(wsResult,wreg,wstring(L""));
    	//boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);
    	boost::split(goodWordstemp,wsResult,boost::is_any_of("|"));
    
    	for(vector<wstring>::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++)
    	{
    		string temp=myWideCharToMultibyte(*it);
    		trim(temp," ");
    		if(!stopwords.count(temp)&&!temp.empty())
    		{
    			goodWords.push_back(temp);
    		}
    
    
    	}
    
    	return goodWords;
    }
    /************************************************************************/
    /* DF特征词选择法                                                       */
    /************************************************************************/
    // Document-frequency feature selection.
    // Ranks the words of mymap by the number of articles they occur in
    // (descending, stable) and writes every word whose document frequency is at
    // least DFthreshold to the feature-word file, one per line. The number of
    // selected words and the threshold are reported on standard output.
    void Preprocess::DFcharicteristicWordSelection(map<string,vector<pair<int,int>>> &mymap,int DFthreshold)
    {
    	typedef map<string,vector<pair<int,int>>> Bag;

    	// (word, document frequency) for every word of the bag.
    	vector<pair<string,int> >ranked;
    	for(Bag::iterator entry=mymap.begin();entry!=mymap.end();++entry)
    	{
    		ranked.push_back(make_pair(entry->first,(entry->second).size()));
    	}
    	stable_sort(ranked.begin(),ranked.end(),isLonger);

    	ofstream outfile(featurewordsAddress);
    	int selected=0; // how many feature words were written
    	for(vector<pair<string,int> >::size_type k=0;k<ranked.size();++k)
    	{
    		if(ranked[k].second<DFthreshold)
    		{
    			continue;
    		}
    		outfile<<ranked[k].first<<endl;
    		++selected;
    	}
    	outfile.close();
    	cout<<"最后共选择特征词"<<selected<<endl;
    	cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl;

    }
    /************************************************************************/
    /* 获得最终选定的构造文档向量模型的特征词                               */
    /************************************************************************/
    // Reads the feature words (whitespace-separated) from the feature-word file,
    // in file order. Returns an empty list when the file cannot be opened;
    // looping on the extraction itself guarantees termination in that case.
    vector<string>Preprocess::GetFinalKeyWords()
    {
    	vector<string>myKeys;
    	ifstream infile(featurewordsAddress);
    	string word;
    	// A successful extraction never yields an empty word.
    	while(infile>>word)
    	{
    		myKeys.push_back(word);
    	}
    	return myKeys;
    }
    /************************************************************************/
    /* 获得特征词的maxTF,DF                                                 */
    /************************************************************************/
    // For every feature word (in the order returned by GetFinalKeyWords) returns
    // the pair (largest count of the word in any single article, number of
    // articles containing the word).
    // Note: words are looked up with operator[], so a feature word missing from
    // mymap is inserted with an empty posting list and reported as (0, 0).
    vector<pair<int,int> >Preprocess::GetfinalKeysMaxTFDF(map<string,vector<pair<int,int>>> &mymap)
    {
    	vector<string>keys=GetFinalKeyWords();
    	vector<pair<int,int> >stats;
    	for(vector<string>::size_type k=0;k<keys.size();++k)
    	{
    		vector<pair<int,int> > &postings=mymap[keys[k]];
    		int peak=0;
    		for(vector<pair<int,int> >::size_type p=0;p<postings.size();++p)
    		{
    			if(peak<postings[p].second)
    			{
    				peak=postings[p].second;
    			}
    		}
    		int documentFrequency=postings.size();
    		stats.push_back(make_pair(peak,documentFrequency));
    	}
    	return stats;
    }
    /************************************************************************/
    /* 文档向量模型归一化                                                                     */
    /************************************************************************/
    // Scales the weights (.second) of a document vector to unit Euclidean length
    // and returns the scaled vector. A vector whose squared length is zero is
    // returned unchanged, because dividing by a zero norm would turn every
    // component into NaN.
    vector<pair<int,double> >Preprocess::NormalizationVSM(vector<pair<int,double> > tempVSM)
    {
    	double sumOfSquares=0;
    	for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
    	{
    		sumOfSquares+=vsmit->second*vsmit->second;
    	}
    	if(sumOfSquares<=0)
    	{
    		return tempVSM;
    	}
    	// The norm is the same for every component: compute it once.
    	const double norm=sqrt(sumOfSquares);
    	for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
    	{
    		vsmit->second/=norm;
    	}
    	return tempVSM;

    }
    /************************************************************************/
    /*              单个文档向量模型字符串化                                                        */
    /************************************************************************/
    // Renders a document vector as "{index value,index value,...}".
    // Indices are formatted with do_fraction(int) and weights with
    // do_fraction(double, 8); an empty vector gives "{}".
    string Preprocess::FormatVSMtoString(vector<pair<int,double> > tempVSM)
    {
    	string ret("{");
    	for(vector<pair<int,double> >::size_type k=0;k<tempVSM.size();++k)
    	{
    		if(k!=0)
    		{
    			// Separator between consecutive entries (none after the last one).
    			ret+=",";
    		}
    		ret+=do_fraction(tempVSM[k].first)+" "+do_fraction(tempVSM[k].second,8);
    	}
    	ret+="}";
    	return ret;
    }
    /************************************************************************/
    /* 写Arff头文件                                                                     */
    /************************************************************************/
    // Creates (overwrites) the ARFF file and writes its header: the @relation
    // line, one "@attribute '<word>' real" line per feature word and the @data
    // marker.
    void Preprocess::WriteHeadArff()
    {
    	ofstream ofile(arffFileAddress,ios::binary);
    	ofile<<"@relation aticle"<<endl;
    	ofile<<"\n";
    	vector<string> keys=GetFinalKeyWords();
    	for(vector<string>::size_type k=0;k<keys.size();++k)
    	{
    		// The feature word is wrapped in single quotes.
    		string line("@attribute ");
    		line+="'";
    		line+=keys[k];
    		line+="'";
    		line+=" real";
    		ofile<<line<<endl;
    	}
    	ofile<<"\n"<<endl;
    	ofile<<"@data"<<endl;
    	ofile.close();
    }
    /************************************************************************/
    /* 将实验数据写成arff @data格式                                                                     */
    /************************************************************************/
    // Builds one sparse TF-IDF vector per article id in [beginIndex, endIndex]
    // and appends it to the ARFF file; the ids of the articles that produced a
    // vector are written, in the same order, to the article-id file.
    //
    // Weight of feature word j in article i, when the word occurs in the article:
    //     (0.5 + 0.5 * tf / maxTF_j) * log(N / DF_j)
    // where tf is the count stored for (word, article) in mymap and
    // N = endIndex - beginIndex + 1. A word that does not occur in the article
    // contributes nothing to the vector. (Counting matching postings with
    // count_if only yields 0 or 1, never the stored term frequency, and gives
    // absent words the non-zero weight 0.5 * idf.)
    void Preprocess::VSMFormation(map<string,vector<pair<int,int>>> &mymap)
    {
    	const int corpus_N=endIndex-beginIndex+1;
    	ofstream ofile1(articleIdsAddress,ios::binary);        // article ids
    	ofstream ofile2(arffFileAddress,ios::binary|ios::app); // appended after the header

    	vector<string> myKeys=GetFinalKeyWords();
    	vector<pair<int,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
    	for(int i=beginIndex;i<=endIndex;i++)
    	{
    		vector<pair<int,double> >tempVSM;
    		for(vector<string>::size_type j=0;j<myKeys.size();j++)
    		{
    			const int maxTF=maxTFandDF[j].first;
    			const int DF=maxTFandDF[j].second;
    			if(maxTF<=0||DF<=0)
    			{
    				// Feature word unknown to the bag of words: avoid dividing by zero.
    				continue;
    			}
    			const vector<pair<int,int> > &postings=mymap[myKeys[j]];
    			vector<pair<int,int> >::const_iterator hit=find_if(postings.begin(),postings.end(),PredTFclass(i));
    			if(hit==postings.end())
    			{
    				continue; // the word does not occur in article i
    			}
    			double weight=0.5+0.5*(double)hit->second/maxTF;
    			weight*=log((double)corpus_N/DF);
    			if(weight!=0)
    			{
    				tempVSM.push_back(make_pair((int)j,weight));
    			}
    		}
    		if(!tempVSM.empty())
    		{
    			tempVSM=NormalizationVSM(tempVSM);
    			string vsmStr=FormatVSMtoString(tempVSM);
    			ofile1<<i<<endl;
    			ofile2<<vsmStr<<endl;
    		}
    	}
    	ofile1.close();
    	ofile2.close();


    }
    // Produces the complete ARFF file.
    // The bag of words is either loaded from disk (isbagOfWordsExist == true) or
    // built from the database with ConstructMap and then saved; afterwards the
    // feature words are selected with DFthreshold, the ARFF header is written and
    // the document vectors are appended. Progress messages go to standard output.
    void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg)
    {
    	map<string,vector<pair<int,int>>> mymap;
    	if(isbagOfWordsExist)
    	{
    		load(mymap);
    	}
    	else
    	{
    		ConstructMap(mymap,dbfield,seg);
    		save(mymap);
    		cout<<"词袋子信息已经保存到硬盘"<<endl;
    	}

    	DFcharicteristicWordSelection(mymap,DFthreshold);
    	WriteHeadArff();
    	VSMFormation(mymap);
    	cout<<"arff文件已经形成"<<endl;

    	// Tell the user where the Weka result is expected to be saved.
    	string wekaResultPath(infoFromWekaAddress);
    	cout<<"请您将使用weka聚类,并保存为"<<wekaResultPath<<endl;
    }
    /*****************以下函数辅助完成聚类功能*********************************************************************8**********************/
    /************************************************************************/
    /* 建立文档向量模型                                                                     */
    /************************************************************************/
    // Builds the dense, normalised TF-IDF vector of every article id in
    // [beginIndex, endIndex]; component j belongs to the j-th feature word.
    //
    // Weight of feature word j in article i, when the word occurs in the article:
    //     (0.5 + 0.5 * tf / maxTF_j) * log(N / DF_j)
    // (the same formula VSMFormation uses for the ARFF file), otherwise 0.
    // tf is the count stored for (word, article) in mymap and
    // N = endIndex - beginIndex + 1. Articles without any non-zero component are
    // left out of the result, as they cannot be normalised.
    map<int,vector<double> > Preprocess::VSMConstruction(map<string,vector<pair<int,int>>> &mymap)
    {
    	const int corpus_N=endIndex-beginIndex+1;
    	map<int,vector<double>> vsmMatrix;
    	vector<string> myKeys=GetFinalKeyWords();
    	vector<pair<int,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
    	for(int i=beginIndex;i<=endIndex;i++)
    	{
    		vector<pair<int,double> >tempVSM;
    		bool hasFeature=false; // becomes true once a non-zero weight is seen
    		for(vector<string>::size_type j=0;j<myKeys.size();j++)
    		{
    			double weight=0.0;
    			const int maxTF=maxTFandDF[j].first;
    			const int DF=maxTFandDF[j].second;
    			// Zero maxTF/DF means the word is unknown to the bag: keep weight 0
    			// instead of dividing by zero.
    			if(maxTF>0&&DF>0)
    			{
    				const vector<pair<int,int> > &postings=mymap[myKeys[j]];
    				vector<pair<int,int> >::const_iterator hit=find_if(postings.begin(),postings.end(),PredTFclass(i));
    				if(hit!=postings.end())
    				{
    					weight=0.5+0.5*(double)hit->second/maxTF;
    					weight*=log((double)corpus_N/DF);
    				}
    			}
    			if(weight!=0)
    			{
    				hasFeature=true;
    			}
    			tempVSM.push_back(make_pair((int)j,weight));
    		}
    		if(!hasFeature)
    		{
    			continue;
    		}
    		tempVSM=NormalizationVSM(tempVSM);
    		vector<double> &row=vsmMatrix[i];
    		for(vector<pair<int,double> >::iterator it=tempVSM.begin();it!=tempVSM.end();++it)
    		{
    			row.push_back(it->second);
    		}
    	}
    	return vsmMatrix;

    }
    /************************************************************************/
    /* 获得Weka提供的聚类信息                                                                     */
    /************************************************************************/
    map<string,vector<double> > Preprocess::GetClusters()
    {
    
    	map<string,vector<double> >clusters;
    	ifstream ifile(infoFromWekaAddress);
    	string temp;
    	while(getline(ifile,temp))
    	{   boost::smatch matchcluster;
    	boost::regex regcluster("Cluster\\s+\\d+",boost::regex::icase);
    	if(boost::regex_search(temp,matchcluster,regcluster))	
    	{   
    		string clustertmp=matchcluster[0].str();
    		string ordinates="";
    		getline(ifile,ordinates);
    		boost::regex regordinates("\\d+(\\.\\d{1,4})?");
    		boost::smatch matchordinates;
    		std::string::const_iterator it=ordinates.begin();  
    		std::string::const_iterator end=ordinates.end();
    		while (boost::regex_search(it,end,matchordinates,regordinates)) 
    		{       
    			string digitstemp=matchordinates[0].str();
    			double digitval=0.0;
    			std::stringstream ss;
    			ss<<digitstemp;
    			ss>>digitval;
    			clusters[clustertmp].push_back(digitval);
    			it=matchordinates[0].second; 
    		}
    
    
    
    
    
    	}
    	}
    	return clusters;
    }
    /**计算向量内积*/
    // Returns the dot product of the two vectors. When the lengths differ, only
    // the components present in both are used, so the shorter vector is never
    // read past its end.
    double Preprocess::CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2)
    {
    	const vector<double>::size_type common=
    		(vector1.size()<vector2.size())?vector1.size():vector2.size();
    	double result = 0.0;
    	for (vector<double>::size_type i = 0; i < common; ++i)
    		result += vector1[i] * vector2[i];
    	return result;
    }
    /**计算向量余弦相似度*/
    // Returns the cosine similarity of the two vectors:
    //     dot(v1, v2) / sqrt(dot(v1, v1) * dot(v2, v2)).
    // Returns 0 when the denominator is not a positive number (for example when
    // one vector is all zeros), instead of dividing by zero.
    double Preprocess::CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2)
    {
    	const double numerator=CalDotProductOfVectors(vector1,vector2);
    	const double denominator=sqrt(CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2));
    	if(!(denominator>0))
    	{
    		return 0.0;
    	}
    	return numerator/denominator;
    }
    /**为每篇文章打上个类别标签*/
    // Labels every article with the cluster whose centre has the highest cosine
    // similarity to the article's vector.
    //
    // vsmMatrix : article id -> document vector
    // clusters  : cluster label -> centre
    // Returns (article id, cluster label) pairs in the iteration order of
    // vsmMatrix. The result is empty when there are no clusters. On equal
    // similarity the cluster that comes first in clusters wins.
    vector<pair<int,string> > Preprocess::GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters)
    {
    	vector<pair<int,string> >resultInfo;
    	if(clusters.empty())
    	{
    		return resultInfo; // nothing to assign to
    	}
    	for(map<int,vector<double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();++it)
    	{
    		// A single pass finds the best cluster; no sorting is needed.
    		map<string,vector<double> >::iterator best=clusters.begin();
    		double bestSimilarity=CalCosineofVectors(it->second,best->second);
    		map<string,vector<double> >::iterator clusterit=best;
    		for(++clusterit;clusterit!=clusters.end();++clusterit)
    		{
    			const double similarity=CalCosineofVectors(it->second,clusterit->second);
    			if(similarity>bestSimilarity)
    			{
    				best=clusterit;
    				bestSimilarity=similarity;
    			}
    		}
    		resultInfo.push_back(make_pair(it->first,best->first));
    	}
    	return  resultInfo;

    }
    /************************************************************************/
    /* 获取每个类别所包含的文章ID                                           */
    /************************************************************************/
    // Groups the labelled articles by cluster.
    //
    // clusters   : cluster label -> centre; only labels present here are kept
    // resultInfo : (article id, cluster label) pairs
    // Returns cluster label -> article ids, in the order of resultInfo.
    map<string,vector<int> > Preprocess::FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string>>&resultInfo)
    {
    	map<string,vector<int>> articlesInfo;

    	for(vector<pair<int,string>>::iterator retit=resultInfo.begin();retit!=resultInfo.end();++retit)
    	{
    		// Labels are unique keys of the map, so a direct lookup replaces the scan
    		// over all clusters.
    		if(clusters.find(retit->second)!=clusters.end())
    		{
    			articlesInfo[retit->second].push_back(retit->first);
    		}
    	}

    	return articlesInfo;


    }
    // Assigns every article to a cluster and reports, per cluster, the title and
    // the "class" column of its articles.
    //
    // Steps: load the bag of words, build the document vectors, read the cluster
    // centres saved from Weka, label the articles, then query the News table for
    // the articles of each cluster. Every cluster label and every article line is
    // written both to standard output and to F:\cluster\ArticlesInPerCluster.txt.
    //
    // The SELECT statement is assembled in a std::string, so it has no fixed
    // length limit and nothing has to be freed by hand.
    void Preprocess::RetreiveArticleInfoFromDataBase()
    {
    	map<string,vector<pair<int,int>>> mymap;
    	vector<pair<int,string>>resultInfo;
    	map<string,vector<double> >clusters;
    	map<int,vector<double> >vsmMatrix;
    	map<string,vector<int>> articlesInfo;
    	ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt");
    	load(mymap);
    	vsmMatrix=VSMConstruction(mymap);
    	clusters=GetClusters();
    	resultInfo=GenerateClusterInfo(vsmMatrix,clusters);
    	articlesInfo=FetchArticlesOFClusters(clusters,resultInfo);

    	for(map<string,vector<int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();++it)
    	{
    		CoInitialize(NULL);
    		_ConnectionPtr pConn(__uuidof(Connection));
    		_RecordsetPtr pRst(__uuidof(Recordset));
    		pConn->ConnectionString=dbconnection;
    		pConn->Open("","","",adConnectUnspecified);
    		cout <<it->first<<endl;
    		ofile<<it->first<<endl;

    		// "(id1,id2,...)" — the ids are integers, so no quoting is required.
    		const vector<int> &ids=it->second;
    		ostringstream out;
    		out<<"(";
    		for(vector<int>::size_type i=0;i<ids.size();++i)
    		{
    			if(i!=0)
    			{
    				out<<",";
    			}
    			out<<ids[i];
    		}
    		out<<")";

    		string selectsql("Select ArticleTitle,class from News Where ArticleId in ");
    		selectsql+=" ";
    		selectsql+=out.str();

    		pRst=pConn->Execute(selectsql.c_str(),NULL,adCmdText);
    		while(!pRst->rsEOF)
    		{
    			string title=(_bstr_t)pRst->GetCollect("ArticleTitle");
    			string categorization=(_bstr_t)pRst->GetCollect("class");
    			cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
    			ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;

    			pRst->MoveNext();
    		}
    		pRst->Close();
    		pConn->Close();
    		// Release the COM objects before COM itself is shut down.
    		pRst.Release();
    		pConn.Release();
    		CoUninitialize();

    	}

    	ofile.close();

    }
    /********按空白把关键词分割开*****************/
    // Splits s at runs of spaces and returns the non-empty pieces in order.
    // An empty or all-space input yields an empty list (no empty-string token).
    // stopwords is not consulted; NOTE(review): the parameter presumably exists so
    // that the function matches the FUNCSEG signature — confirm.
    vector<string>Preprocess:: mySplit(string s,set<string> stopwords)
    {
    	(void)stopwords;
    	vector<string> wordCollection;

    	string::size_type tokenBegin=s.find_first_not_of(' ');
    	while(tokenBegin!=string::npos)
    	{
    		const string::size_type tokenEnd=s.find(' ',tokenBegin);
    		if(tokenEnd==string::npos)
    		{
    			// Last token: runs to the end of the string.
    			wordCollection.push_back(s.substr(tokenBegin));
    			break;
    		}
    		wordCollection.push_back(s.substr(tokenBegin,tokenEnd-tokenBegin));
    		tokenBegin=s.find_first_not_of(' ',tokenEnd);
    	}

    	return wordCollection;

    }
  • 相关阅读:
    samba 4.11 or newer version enable SMB1
    linux下使用mv将递归的文件从多个目录移动到一个目录中
    【转载】修改Windows下键盘按键对应功能的一些方案
    ACR122U读卡器在win7以上系统使用过程中的设置项
    恢复Chrome 78以上版本的地址栏https和www显示
    配置sshd的免密码登录
    1234
    MarkDown 语法记录
    语法测试
    解决 Linux grep 不高亮显示
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1817348.html
Copyright © 2020-2023  润新知