• Bag of words in C++


    #include <algorithm>
    #include <cctype>
    #include <cmath>
    #include <cstddef>
    #include <fstream>
    #include <functional>
    #include <iostream>
    #include <limits>
    #include <numeric>
    #include <set>
    #include <sstream>
    #include <string>
    #include <vector>
    
    /// Reads a whitespace-separated table of values from a text file.
    ///
    /// Each line read becomes one row holding every value that `operator>>`
    /// could extract from that line (extraction stops at the first token that
    /// does not parse as DataType). Reading ends at the first empty line or at
    /// end of file. `lines_feat` is cleared up front, so it stays empty when the
    /// file cannot be opened.
    ///
    /// @tparam DataType   element type; must be extractable from a std::istream.
    /// @param filename    path of the file to read.
    /// @param lines_feat  output: one vector per line read, in file order.
    template <class DataType>
    void ReadMatFromFile(std::string &filename, std::vector<std::vector<DataType> > &lines_feat) {
        lines_feat.clear();

        std::ifstream vm_info(filename.c_str());
        std::string line;

        // Loop on the read itself instead of on eof(): a failed read then ends
        // the loop immediately rather than re-processing a stale buffer.
        while(std::getline(vm_info, line)) {
            if(line.empty())
                break;  // an empty line terminates the table

            std::istringstream fields(line);
            std::vector<DataType> row;
            DataType value;
            while(fields >> value) {
                row.push_back(value);
            }
            lines_feat.push_back(row);
        }
    }
    
    /// Appends every whitespace-separated token of a text file to `in_string`.
    ///
    /// Lines are read in order and split on whitespace. Reading ends at the first
    /// empty line or at end of file. Existing contents of `in_string` are kept;
    /// new tokens are appended after them. Nothing is appended when the file
    /// cannot be opened.
    ///
    /// @param filename   path of the file to read.
    /// @param in_string  output: receives the tokens, in file order.
    void ReadStringFromFile(std::string &filename, std::vector<std::string> &in_string) {
        std::ifstream vm_info(filename.c_str());
        std::string line;

        // Loop on the read itself instead of on eof(): a failed read then ends
        // the loop immediately rather than re-processing a stale buffer.
        while(std::getline(vm_info, line)) {
            if(line.empty())
                break;  // an empty line terminates the input

            std::istringstream words(line);
            std::string token;
            while(words >> token) {
                in_string.push_back(token);
            }
        }
    }
    
    /// Returns a copy of `s` with every character passed through std::tolower.
    ///
    /// The conversion follows the current C locale. Characters that have no
    /// lowercase mapping are copied unchanged.
    std::string lowerCase(const std::string& s) {
        std::string lower(s);
        for(std::string::size_type i=0;i<lower.size();++i) {
            // tolower is only defined for values representable as unsigned char
            // (or EOF), so go through unsigned char before widening to int.
            lower[i]=static_cast<char>(std::tolower(static_cast<unsigned char>(lower[i])));
        }
        return lower;
    }
    
    /// Normalises a token: keeps only ASCII letters and digits, lower-casing the
    /// letters, and drops every other character.
    ///
    /// @param s  input token.
    /// @return   the filtered, lower-cased token; empty when `s` contains no
    ///           ASCII letter or digit.
    std::string letters(const std::string& s) {
        std::string letter;
        letter.reserve(s.size());

        for(std::string::size_type i=0;i<s.size();++i) {
            const char ch=s[i];
            if(ch>='A' && ch<='Z') {
                // Upper-case ASCII letter: store its lower-case counterpart.
                letter.push_back(static_cast<char>(ch-'A'+'a'));
            }
            else if((ch>='a' && ch<='z') || (ch>='0' && ch<='9')) {
                letter.push_back(ch);
            }
            // Anything else (punctuation, whitespace, non-ASCII bytes) is skipped.
        }

        // No terminator is appended: std::string tracks its own length, and an
        // embedded NUL would make the result compare unequal to the same word
        // written as a plain literal.
        return letter;
    }
    
    /// Computes the matrix product MatOut = Mata * Matb.
    ///
    /// Matrices are stored row-major as vectors of rows. MatOut is replaced by
    /// the result and sized to Mata.size() x Matb[0].size(); it does not need to
    /// be pre-sized by the caller. Sums are accumulated in T1, so floating-point
    /// products are not truncated to int.
    ///
    /// @param Mata    left operand; every row must have Matb.size() elements.
    /// @param Matb    right operand.
    /// @param MatOut  output: receives the product; left untouched on failure.
    /// @return 0 on success; -1 (after printing "not match!" to std::cout) when
    ///         a row of Mata does not have exactly Matb.size() elements.
    /// @throws std::out_of_range if a row of Matb is shorter than its first row.
    template <class T1, class T2>
    int MatMultiply(const std::vector<std::vector<T1> > &Mata, const std::vector<std::vector<T2> > &Matb, std::vector<std::vector<T1> > &MatOut) {
        const std::size_t inner=Matb.size();

        // Check every row, not just the first, so a ragged Mata cannot make the
        // multiplication read past the end of a row.
        for(std::size_t i=0; i<Mata.size(); ++i) {
            if(Mata[i].size() != inner) {
                std::cout<<"not match!\n";
                return -1;
            }
        }

        const std::size_t cols=Matb.empty() ? 0 : Matb.front().size();

        // Build into a local matrix so the result stays correct even when MatOut
        // refers to the same object as Mata.
        std::vector<std::vector<T1> > product(Mata.size(), std::vector<T1>(cols));
        for(std::size_t i=0; i<Mata.size(); ++i) {
            for(std::size_t j=0; j<cols; ++j) {
                T1 sum=T1();
                for(std::size_t k=0; k<inner; ++k) {
                    sum+=Mata[i][k]*Matb.at(k).at(j);
                }
                product[i][j]=sum;
            }
        }

        MatOut.swap(product);
        return 0;
    }
    
    /// Appends the outer product of two vectors to `out`, one row per element of
    /// `inst1`: appended row i holds inst2[j] * inst1[i] for every j.
    ///
    /// Rows already present in `out` are kept. Each product is evaluated with the
    /// operands' own types and then converted to T3, so mixed integer and
    /// floating-point inputs are not truncated before the multiplication.
    ///
    /// @param inst1  vector that determines the number of appended rows.
    /// @param inst2  vector that determines the length of each appended row.
    /// @param out    output: receives inst1.size() additional rows.
    template <class T1, class T2, class T3>
    void outer_product(const std::vector<T1> &inst1, const std::vector<T2> &inst2, std::vector<std::vector<T3> > &out) {
        out.reserve(out.size()+inst1.size());

        // A plain loop replaces transform + bind2nd: the binder was removed from
        // the standard library in C++17.
        std::vector<T3> temp_row(inst2.size());
        for(std::size_t i=0;i<inst1.size();++i) {
            for(std::size_t j=0;j<inst2.size();++j) {
                temp_row[j]=static_cast<T3>(inst2[j]*inst1[i]);
            }
            out.push_back(temp_row);
        }
    }
    
    /// Reads a text file line by line into `lines_feat`.
    ///
    /// `lines_feat` is cleared first. Each line is stored without its trailing
    /// newline. Reading ends at the first empty line or at end of file, and the
    /// vector stays empty when the file cannot be opened.
    ///
    /// @param filename    path of the file to read.
    /// @param lines_feat  output: the lines read, in file order.
    void ReadDataFromFile(std::string &filename, std::vector<std::string> &lines_feat) {
        lines_feat.clear();

        std::ifstream vm_info(filename.c_str());
        std::string line;

        // Loop on the read itself instead of on eof(): a failed read then ends
        // the loop immediately rather than re-pushing a stale buffer.
        while(std::getline(vm_info, line)) {
            if(line.empty())
                break;  // an empty line terminates the input

            lines_feat.push_back(line);
        }
    }
    
    /// Splits `s` into the pieces separated by `delimiter`.
    ///
    /// Adjacent delimiters yield empty pieces. A delimiter at the very end of
    /// the input does not produce a trailing empty piece, and an empty input
    /// yields an empty vector.
    std::vector<std::string> split(const std::string& s, char delimiter) {
        std::vector<std::string> pieces;
        std::string::size_type begin=0;

        while(begin<s.size()) {
            const std::string::size_type cut=s.find(delimiter, begin);
            if(cut==std::string::npos) {
                // No further delimiter: the remainder is the last piece.
                pieces.push_back(s.substr(begin));
                break;
            }
            pieces.push_back(s.substr(begin, cut-begin));
            begin=cut+1;
        }
        return pieces;
    }
    
    /// Parses a leading decimal integer from `s` using stream extraction.
    ///
    /// @return the parsed value, or 0 when extraction fails.
    int stringtoint(const std::string& s) {
        int parsed=0;
        std::istringstream reader(s);
        if(reader>>parsed) {
            return parsed;
        }
        return 0;
    }
    
    /// Prints every address of a range written as "a.b.c.START-END", one per
    /// line on std::cout, as "a.b.c.N" for N from START to END inclusive.
    ///
    /// The part before the first '-' is split on '.'; its first three segments
    /// form the printed prefix and its last segment is START. END is the text
    /// after the last '-'. Input without a '-' is treated as a single address
    /// (END equals START). Segments that do not parse as integers count as 0.
    /// Nothing is printed when START is greater than END.
    ///
    /// Input with fewer than three dot-separated segments is rejected with a
    /// message on std::cerr, because the prefix cannot be formed from it.
    void printip(const std::string& s) {
        const std::vector<std::string> range=split(s, '-');
        if(range.empty()) {
            std::cerr<<"printip: empty input\n";
            return;
        }

        const std::vector<std::string> ip_segment=split(range.front(), '.');
        if(ip_segment.size()<3) {
            std::cerr<<"printip: expected a dotted address, got '"<<range.front()<<"'\n";
            return;
        }

        const int start=stringtoint(ip_segment.back());
        const int end=range.size()>1 ? stringtoint(range.back()) : start;

        // Signed counter: comparing an unsigned counter against a negative bound
        // would turn the bound into a huge value and loop almost forever.
        for(int last=start; last<=end; ++last) {
            std::cout<<ip_segment[0]<<"."<<ip_segment[1]<<"."<<ip_segment[2]<<"."<<last<<"\n";
            if(last==end)
                break;  // stop before ++last could overflow at the int maximum
        }
    }
    
    /// Prints a vector of vectors to std::cout.
    ///
    /// Each inner vector is written on its own line with a single space after
    /// every element, followed by a summary line giving the number of inner
    /// vectors. The summary line ends with std::endl, which flushes the stream.
    ///
    /// @tparam T  element type; must be streamable to a std::ostream.
    /// @param vv  the rows to print; not modified.
    template <class T>
    void Display2DVector(std::vector<std::vector<T> > &vv) {
        for(std::size_t i=0;i<vv.size();++i) {
            const std::vector<T> &row=vv[i];
            for(typename std::vector<T>::const_iterator it=row.begin();it!=row.end();++it) {
                std::cout<<*it<<" ";
            }
            std::cout<<"\n";
        }
        std::cout<<"--------the total of the 2DVector is "<<vv.size()<<std::endl;
    }
    
    /// Reads whitespace-separated tokens from the file "data", echoes them on
    /// one line, then prints on a second line the set of tokens after each has
    /// been passed through letters(). The set prints in its sorted order and
    /// holds each distinct result once.
    int main() {
        std::string filename("data");
        std::vector<std::string> v_string;
        ReadStringFromFile(filename, v_string);

        // First line: the raw tokens. The normalised form of each one is
        // collected along the way.
        std::set<std::string> s_string;
        for(std::size_t i=0; i<v_string.size(); ++i) {
            const std::string &raw=v_string[i];
            std::cout<<raw<<" ";
            s_string.insert(letters(raw));
        }
        std::cout<<std::endl;

        // Second line: the distinct normalised tokens.
        std::set<std::string>::const_iterator entry=s_string.begin();
        while(entry!=s_string.end()) {
            std::cout<<*entry<<" ";
            ++entry;
        }
        std::cout<<std::endl;

        return 0;
    }

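    The bag-of-words model ignores grammar and word order and keeps only the words themselves; the program above collects them into a set, so each distinct word is listed once.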

    运行结果如下,第一行为原始数据,第二行为提取后的数据:

  • 相关阅读:
    如何让自己的app尽量不被系统杀死
    linux常用命令-权限管理命令
    linux常用命令-用户管理命令
    linux常用命令-文件处理命令
    npm命令
    新技术新框架新工具选型原则
    tomcat启动命令行中文乱码
    docker命令
    tinkpad e450c 进入 BIOS
    基于Java服务的前后端分离解决跨域问题
  • 原文地址:https://www.cnblogs.com/donggongdechen/p/10766901.html
Copyright © 2020-2023  润新知