作者:finallyliuyu 出处:博客园
通过《C++处理reuters21578(一)》中的代码,已初步生成两张数据表,分别存放训练语料库和测试语料库。由于这两个语料库中的个别类别不一致,需要先求出两者类别的交集,再据此形成最终用于文本分类的训练语料库和测试语料库。以下主函数完成此功能。
// Unary predicate that tests whether a string is equal to a stored value.
// Meant to be handed to standard algorithms such as count_if / find_if.
class GT_clss
{
public:
    // Keeps a private copy of s as the value to compare against.
    // Taking a const reference lets callers pass const strings and temporaries.
    GT_clss(const string &s) : comparepart(s) {}

    // Returns true when elem compares equal to the stored value.
    // Declared const so that const predicate objects remain callable.
    bool operator()(const string &elem) const
    {
        return elem == comparepart;
    }

private:
    string comparepart; // value every candidate is compared with
};
获取数据库指定表中出现的所有类别(去重)
vector<string>GetLabels(string tablename)
{ vector<string>labels;
char * selectbySpecificId=new char [1000];
memset(selectbySpecificId,0,1000);
sprintf_s(selectbySpecificId,1000,"select Categorization from %s ",tablename.c_str());
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
pConn->Open("","","",adConnectUnspecified);
pRst=pConn->Execute(selectbySpecificId,NULL,adCmdText);
while(!pRst->rsEOF)
{
string label=(_bstr_t)pRst->GetCollect("Categorization");
if (!count_if(labels.begin(),labels.end(),GT_clss(label)))
{
labels.push_back(label);
}
pRst->MoveNext();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize();
delete []selectbySpecificId;
return labels;
}
{ vector<string>labels;
char * selectbySpecificId=new char [1000];
memset(selectbySpecificId,0,1000);
sprintf_s(selectbySpecificId,1000,"select Categorization from %s ",tablename.c_str());
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
pConn->Open("","","",adConnectUnspecified);
pRst=pConn->Execute(selectbySpecificId,NULL,adCmdText);
while(!pRst->rsEOF)
{
string label=(_bstr_t)pRst->GetCollect("Categorization");
if (!count_if(labels.begin(),labels.end(),GT_clss(label)))
{
labels.push_back(label);
}
pRst->MoveNext();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize();
delete []selectbySpecificId;
return labels;
}
主函数
int _tmain(int argc, _TCHAR* argv[])
{
int end;
//set<string>labels;
vector<string>labelsTrain=GetLabels("ReteursTrain");
vector<string>labelsTest=GetLabels("ReteursTest");
vector<string>finalLabels;
for (vector<string>::iterator it=labelsTrain.begin();it!=labelsTrain.end();it++)
{
trim(*it," ");
}
for(vector<string>::iterator it=labelsTest.begin();it!=labelsTest.end();it++)
{
trim(*it," ");
}
for (vector<string>::iterator it=labelsTrain.begin();it!=labelsTrain.end();it++)
{
if (count_if(labelsTest.begin(),labelsTest.end(),GT_clss(*it)))
{
finalLabels.push_back(*it);
}
}
char * selectbySpecificId=new char [1000];
memset(selectbySpecificId,0,1000);
sprintf_s(selectbySpecificId,1000,"select CArticleName,CAbstract,Categorization from ReteursTest");
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
_ConnectionPtr pConn2(__uuidof(Connection));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
pConn2->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=FinallyCorpus";
pConn->Open("","","",adConnectUnspecified);
pConn2->Open("","","",adConnectUnspecified);
pRst=pConn->Execute(selectbySpecificId,NULL,adCmdText);
while(!pRst->rsEOF)
{
string label=(_bstr_t)pRst->GetCollect("Categorization");
trim(label," ");
if (count_if(finalLabels.begin(),finalLabels.end(),GT_clss(label)))
{
string ArticleTitle=(_bstr_t)pRst->GetCollect("CArticleName");
string ArticleText=(_bstr_t)pRst->GetCollect("CAbstract");
ArticleTitle=ProcessforMSSQL(ArticleTitle);
ArticleText=ProcessforMSSQL(ArticleText);
char *sqlInsert=new char[1000000];
_variant_t RecordsAffected;
memset(sqlInsert,0,1000000);
sprintf_s(sqlInsert,1000000,"insert into ReteursTestingCorpus(CArticleName,CAbstract,Categorization) values('%s','%s','%s')",ArticleTitle.c_str(),ArticleText.c_str(),label.c_str());
pConn2->Execute(sqlInsert,&RecordsAffected,-1);
delete []sqlInsert;
}
pRst->MoveNext();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
pConn2->Close();
pConn2.Release();
CoUninitialize();
delete []selectbySpecificId;
cout<<"两标签集交集为"<<endl;
cout<<finalLabels.size()<<endl;
//DictionaryToDataBase();
//FindFile(L"E:\\新闻语料\\reuters21578");
//pRst=pConn->Execute(,NULL,adCmdText);
cout<<"finish"<<endl;
cin>>end;
}
{
int end;
//set<string>labels;
vector<string>labelsTrain=GetLabels("ReteursTrain");
vector<string>labelsTest=GetLabels("ReteursTest");
vector<string>finalLabels;
for (vector<string>::iterator it=labelsTrain.begin();it!=labelsTrain.end();it++)
{
trim(*it," ");
}
for(vector<string>::iterator it=labelsTest.begin();it!=labelsTest.end();it++)
{
trim(*it," ");
}
for (vector<string>::iterator it=labelsTrain.begin();it!=labelsTrain.end();it++)
{
if (count_if(labelsTest.begin(),labelsTest.end(),GT_clss(*it)))
{
finalLabels.push_back(*it);
}
}
char * selectbySpecificId=new char [1000];
memset(selectbySpecificId,0,1000);
sprintf_s(selectbySpecificId,1000,"select CArticleName,CAbstract,Categorization from ReteursTest");
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
_ConnectionPtr pConn2(__uuidof(Connection));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
pConn2->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=FinallyCorpus";
pConn->Open("","","",adConnectUnspecified);
pConn2->Open("","","",adConnectUnspecified);
pRst=pConn->Execute(selectbySpecificId,NULL,adCmdText);
while(!pRst->rsEOF)
{
string label=(_bstr_t)pRst->GetCollect("Categorization");
trim(label," ");
if (count_if(finalLabels.begin(),finalLabels.end(),GT_clss(label)))
{
string ArticleTitle=(_bstr_t)pRst->GetCollect("CArticleName");
string ArticleText=(_bstr_t)pRst->GetCollect("CAbstract");
ArticleTitle=ProcessforMSSQL(ArticleTitle);
ArticleText=ProcessforMSSQL(ArticleText);
char *sqlInsert=new char[1000000];
_variant_t RecordsAffected;
memset(sqlInsert,0,1000000);
sprintf_s(sqlInsert,1000000,"insert into ReteursTestingCorpus(CArticleName,CAbstract,Categorization) values('%s','%s','%s')",ArticleTitle.c_str(),ArticleText.c_str(),label.c_str());
pConn2->Execute(sqlInsert,&RecordsAffected,-1);
delete []sqlInsert;
}
pRst->MoveNext();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
pConn2->Close();
pConn2.Release();
CoUninitialize();
delete []selectbySpecificId;
cout<<"两标签集交集为"<<endl;
cout<<finalLabels.size()<<endl;
//DictionaryToDataBase();
//FindFile(L"E:\\新闻语料\\reuters21578");
//pRst=pConn->Execute(,NULL,adCmdText);
cout<<"finish"<<endl;
cin>>end;
}