目的:从数据库中抽取文章关键词,并统计这些关键词在哪些文章中出现、出现多少次(算是词袋子模型吧),然后对每篇文章形成VSM模型,写成weka的数据格式,最后调用weka对文章聚类。
目前“形成词袋子模型”这一块已经完毕。
其中词袋子的数据结构如下:
map<string,vector<pair<int,int>>>(键为关键词,值为若干(文章编号, 出现次数)对),
目前已经完成此部分的serialize(save/load)以及print功能。
#include "stdafx.h"
#include<iostream>
#include<map>
#include<vector>
#include<string>
#include<iomanip>
#include<fstream>
//#include<boost/tokenizer.hpp>
using namespace std;
形成词袋子模型
nt ConstructMap(map<string,vector<pair<int,int>>>&mymap)
{
vector<string> mySplit(string s);
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=xxx;Persist Security Info=True; User ID=sa;Initial Catalog=ArticleCollection";
pConn->Open("","","",adConnectUnspecified);
pRst=pConn->Execute("select CKeyWord,ArticleId from Article order by ArticleId",NULL,adCmdText);
while(!pRst->rsEOF)
{ vector<string>wordcollection;
string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
if(keywordstr!="")
{
wordcollection=mySplit(keywordstr);
string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
int articleid=atoi(tempid.c_str());
for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
{
vector<pair<int,int>>::iterator it;
if(mymap[*strit].empty())
{
pair<int,int>mytemppair=make_pair(articleid,1);
mymap[*strit].push_back(mytemppair);
}
else
{
for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
{
if(it->first==articleid)
{
it->second=++(it->second);
break;
}
}
if(it==mymap[*strit].end())
{
pair<int,int>mytemppair=make_pair(articleid,1);
mymap[*strit].push_back(mytemppair);
}
}
}
}
pRst->MoveNext();
wordcollection.clear();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize();
return 0;
}
{
vector<string> mySplit(string s);
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=xxx;Persist Security Info=True; User ID=sa;Initial Catalog=ArticleCollection";
pConn->Open("","","",adConnectUnspecified);
pRst=pConn->Execute("select CKeyWord,ArticleId from Article order by ArticleId",NULL,adCmdText);
while(!pRst->rsEOF)
{ vector<string>wordcollection;
string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
if(keywordstr!="")
{
wordcollection=mySplit(keywordstr);
string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
int articleid=atoi(tempid.c_str());
for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
{
vector<pair<int,int>>::iterator it;
if(mymap[*strit].empty())
{
pair<int,int>mytemppair=make_pair(articleid,1);
mymap[*strit].push_back(mytemppair);
}
else
{
for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
{
if(it->first==articleid)
{
it->second=++(it->second);
break;
}
}
if(it==mymap[*strit].end())
{
pair<int,int>mytemppair=make_pair(articleid,1);
mymap[*strit].push_back(mytemppair);
}
}
}
}
pRst->MoveNext();
wordcollection.clear();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize();
return 0;
}
加载词袋子模型
保存词袋子模型
// Load the bag-of-words model from the text file c:\mydict.dat.
//
// Expected layout (whitespace separated):
//   <number of keywords>
//   then, for every keyword:
//     <keyword> <number of entries> { <articleId> <count> ; } ...
//
// mymap  in/out: keyword -> list of (article id, occurrence count).
//        It is not cleared first; an entry read from the file replaces any
//        existing entry with the same keyword.
//
// If the file cannot be opened or is empty, mymap is left untouched.
// Reading stops at the first malformed or truncated record, and that
// partial record is not stored.
// Keywords are read with operator>>, so a keyword containing whitespace
// cannot be read back as one token.
void load(map<string,vector<pair<int,int> > >&mymap)
{
    ifstream infile("c:\\mydict.dat",ios::binary);
    if(!infile)
    {
        return; // nothing to load
    }

    int lenMyMap=0; // keyword count from the header; read only to skip it
    if(!(infile>>lenMyMap))
    {
        return;
    }

    string key;      // keyword of the current record
    int lenVector=0; // number of (articleId, count) entries for that keyword

    // Test the extraction itself rather than eof(): the file ends with
    // whitespace, so eof() is still false after the last record, and it
    // never becomes true if the stream is in a failed state.
    while(infile>>key>>lenVector)
    {
        vector<pair<int,int> > temp;
        bool complete=true;
        for(int i=0;i<lenVector;++i)
        {
            int articleId=0; // article number
            int count=0;     // occurrences of the keyword in that article
            string semicolon; // the ";" separator token, discarded
            if(!(infile>>articleId>>count>>semicolon))
            {
                complete=false;
                break;
            }
            temp.push_back(make_pair(articleId,count));
        }
        if(!complete)
        {
            break; // truncated record: drop it and stop reading
        }
        mymap[key]=temp;
    }
}
// Write the bag-of-words model to the text file c:\mydict.dat.
//
// Layout produced:
//   <number of keywords>
//   then, for every keyword in map order:
//     <keyword>
//     <number of entries>
//     <articleId> <count> ; <articleId> <count> ; ...
// Every entry is followed by " ; " and every record ends with a newline.
//
// mymap  keyword -> list of (article id, occurrence count); not modified.
//
// If the file cannot be opened nothing is written.
void save(map<string,vector<pair<int,int> > >&mymap)
{
    typedef vector<pair<int,int> > PostingList;
    typedef map<string,PostingList> WordMap;

    ofstream outfile("c:\\mydict.dat",ios::binary);
    if(!outfile)
    {
        return;
    }

    // '\n' instead of endl: same bytes in the file, no flush per line.
    outfile<<mymap.size()<<'\n';
    for(WordMap::const_iterator it=mymap.begin();it!=mymap.end();++it)
    {
        const PostingList&postings=it->second;
        outfile<<it->first<<'\n';
        outfile<<postings.size()<<'\n';
        for(PostingList::const_iterator subit=postings.begin();subit!=postings.end();++subit)
        {
            outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" ";
        }
        outfile<<'\n';
    }
    // The ofstream destructor flushes and closes the file.
}
打印词袋子模型
// Print the bag-of-words model to standard output.
//
// Output:
//   <number of keywords>
//   then, for every keyword in map order:
//     <keyword>
//     <number of entries>
//     <articleId>,<count>;<articleId>,<count>;...
//
// mymap  keyword -> list of (article id, occurrence count); not modified.
void print(map<string,vector<pair<int,int> > >&mymap)
{
    typedef vector<pair<int,int> > PostingList;
    typedef map<string,PostingList> WordMap;

    // '\n' instead of endl: identical text, no flush after every line.
    cout<<mymap.size()<<'\n';
    for(WordMap::const_iterator it=mymap.begin();it!=mymap.end();++it)
    {
        const PostingList&postings=it->second;
        cout<<it->first<<'\n';
        cout<<postings.size()<<'\n';
        for(PostingList::const_iterator subit=postings.begin();subit!=postings.end();++subit)
        {
            cout<<subit->first<<','<<subit->second<<";";
        }
        cout<<'\n';
    }
}