• 包含停用词的词频统计(map,set非class版本)<< 0919


    
    #include <sys/time.h>
    #ifndef __STDC_FORMAT_MACROS
    #define __STDC_FORMAT_MACROS
    #endif /* __STDC_FORMAT_MACROS */
    #include <inttypes.h>//加上上面的一句,可以直接使用int64_t,使用“%“PRId64”来打印
    #include <cctype>
    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <exception>
    #include <fstream>
    #include <functional>
    #include <iostream>
    #include <map>   // 使用map 的头文件
    #include<set>       //使用set的头文件
    #include<sstream>//string IO流头文件
    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>
    using namespace std;
    // ERR_EXIT(m): print m followed by the description of the current errno
    // (via perror) and terminate the process with EXIT_FAILURE.
    // Every line of the definition ends in a backslash so that the whole
    // do { ... } while (0) body belongs to the macro; that wrapper lets the
    // macro be used as a single statement, e.g. in an unbraced if/else.
    #define ERR_EXIT(m) \
        do { \
            perror(m); \
            exit(EXIT_FAILURE); \
        } while (0)
    
    // Remove every punctuation character (as classified by ispunct) from
    // word, in place. The remaining characters keep their original order.
    //
    // word: string to clean; may become empty if it held only punctuation.
    void del_punct(std::string &word)
    {
        // Single-pass compaction: ix reads, kept is the next write position.
        // This avoids the repeated erase() calls, each of which shifts the tail.
        std::string::size_type kept = 0;
        for (std::string::size_type ix = 0; ix != word.size(); ++ix)
        {
            // ispunct requires a value representable as unsigned char (or EOF).
            // A plain char holding a non-ASCII byte can be negative, so convert
            // before classifying.
            if (!std::ispunct(static_cast<unsigned char>(word[ix])))
            {
                word[kept] = word[ix];
                ++kept;
            }
        }
        word.resize(kept);
    }
    
    // Load a stop list from a text file into a set.
    //
    // The file is read line by line; each whitespace-separated token has its
    // punctuation removed with del_punct and is then inserted into str.
    // A token consisting only of punctuation is inserted as an empty string.
    //
    // filename: path of the stop-list file.
    // str:      set receiving the cleaned tokens (existing entries are kept).
    // Throws runtime_error("open file failed!") if the file cannot be opened.
    void read_del_punct(const char* filename, std::set<std::string> &str)
    {
        std::ifstream input(filename);
        if (!input)
            throw std::runtime_error("open file failed!");

        std::string text_line;
        while (std::getline(input, text_line))
        {
            // Tokenise the current line on whitespace.
            std::istringstream tokens(text_line);
            for (std::string token; tokens >> token; )
            {
                del_punct(token);
                str.insert(token);
            }
        }
        // The stream closes itself when it goes out of scope.
    }
    
    // Report whether word is an element of the stop list str.
    // The comparison is the set's own (exact, case-sensitive) string ordering.
    bool in_stoplist(const std::set<std::string> &str, const std::string &word)
    {
        // A set holds each key at most once, so count() is 0 or 1.
        return str.count(word) != 0;
    }
    
    // Read a text file word by word, strip punctuation from each token, and
    // append every token that is not in the stop list to words.
    //
    // filename: path of the text file to read.
    // str:      stop list; tokens found in it are skipped.
    // words:    output vector; surviving words are appended in file order.
    // Throws runtime_error("open file failed!") if the file cannot be opened.
    void read_del_punct(const char* filename, const std::set<std::string> &str, std::vector<std::string> &words)
    {
        std::ifstream infile(filename);
        if (!infile)
            throw std::runtime_error("open file failed!");

        std::string word;
        while (infile >> word)
        {
            del_punct(word);
            // A token made only of punctuation (e.g. "--") is empty after
            // stripping; it is not a word and must not be counted.
            if (word.empty())
                continue;
            if (!in_stoplist(str, word))
                words.push_back(word);
        }
    }
    
    // Count word occurrences.
    //
    // words: the words to count.
    // M:     word -> count map; each occurrence in words adds one to the
    //        word's entry (a missing entry starts from 0).
    void map_creat(std::vector<std::string> &words, std::map<std::string,int> &M)
    {
        typedef std::vector<std::string>::size_type index_type;
        for (index_type i = 0; i < words.size(); ++i)
        {
            // operator[] value-initialises a missing count to 0 before the increment.
            ++M[words[i]];
        }
    }
    
    // Print every entry of N to standard output, one per line, formatted as
    // "<value> : <key>". Entries appear in the map's own order, which for
    // greater<int> means the largest key first.
    void map_print(std::map<int, std::string, std::greater<int> > &N)
    {
        // The iterator type must come from the same map specialisation as N,
        // comparator included; map<int,string>::iterator is a different type
        // that is only interchangeable on some library implementations.
        typedef std::map<int, std::string, std::greater<int> > freq_map;
        for (freq_map::const_iterator it = N.begin(); it != N.end(); ++it)
        {
            // '\n' rather than endl: no need to flush the stream after each line.
            std::cout << it->second << " : " << it->first << '\n';
        }
    }
    
    // Transpose a word -> count map into a count -> words map.
    //
    // M: source map, word -> count.
    // N: destination map, ordered by count descending. When several words
    //    share a count they are stored under that count as one string,
    //    joined with ", " in M's iteration order, so no word is dropped.
    //    A count already present in N is appended to in the same way.
    void map_trans(std::map<std::string,int> &M, std::map<int, std::string, std::greater<int> > &N)
    {
        typedef std::map<int, std::string, std::greater<int> > freq_map;

        for (std::map<std::string,int>::const_iterator it = M.begin();
                it != M.end();
                ++it)
        {
            // insert() leaves an existing entry untouched and reports that via
            // the bool; plain N[count] = word would overwrite the earlier word.
            std::pair<freq_map::iterator, bool> slot =
                N.insert(freq_map::value_type(it->second, it->first));
            if (!slot.second)
            {
                slot.first->second += ", ";
                slot.first->second += it->first;
            }
        }
    }
    
    // Return the current wall-clock time from gettimeofday as microseconds
    // since the Epoch.
    // Throws runtime_error("gettimeofday") if gettimeofday reports failure (-1).
    int64_t gettime ()
    {
        struct timeval now = {0, 0};
        if (gettimeofday(&now, NULL) == -1)
            throw std::runtime_error("gettimeofday");
        // Widen the seconds to 64 bits before scaling: multiplying in the width
        // of tv_sec would overflow on platforms where that type is 32 bits.
        int64_t t = static_cast<int64_t>(now.tv_sec) * 1000 * 1000;
        t += now.tv_usec;
        return t;
    }
    // Entry point: word-frequency count of a text file, ignoring stop words.
    //
    // Usage: <program> filename stoplist
    //   argv[1]  text file whose words are counted
    //   argv[2]  stop-list file; words listed there are ignored
    //
    // Prints the time spent in each phase (microseconds) followed by the
    // count table. Returns 0 on success and EXIT_FAILURE on a usage error or
    // when a helper throws (e.g. a file cannot be opened).
    int main(int argc, const char *argv[])
    {
        if (argc < 3)
        {
            // The format string must be one literal ending in an escaped newline.
            fprintf(stderr, "Usage : %s filename stoplist\n", argv[0]);
            return EXIT_FAILURE;
        }

        try
        {
            int64_t start,end;
            std::vector<std::string> words;
            std::set<std::string> str;

            start = gettime();
            read_del_punct (argv[2],str);        // load the stop list
            read_del_punct (argv[1],str,words);  // load the text, minus stop words
            end = gettime();
            std::cout << "读取文件花费 " << end - start  << " us" << std::endl;

            // A map is ordered by its key only. The counts are therefore first
            // accumulated in M (keyed by word) and afterwards transposed into N
            // (keyed by count), instead of trying to keep a single map sorted by
            // a value that is still changing.
            std::map<std::string, int> M;
            start = gettime();
            map_creat (words,M);
            end = gettime();
            std::cout << "读取单词入map花费 " << end - start  << " us" << std::endl;

            std::map<int, std::string, std::greater<int> > N;
            start = gettime();
            map_trans(M,N);
            end = gettime();
            std::cout << "转置map花费 " << end - start  << " us" << std::endl;
            map_print(N);
        }
        catch (const std::exception &e)
        {
            // Report the failure instead of letting the exception escape main,
            // which would abort the program without a readable message.
            fprintf(stderr, "%s: %s\n", argv[0], e.what());
            return EXIT_FAILURE;
        }
        return 0;
    }
    
    
  • 相关阅读:
    nowcoderD Xieldy And His Password
    Codeforces681D Gifts by the List
    nowcoder80D applese的生日
    Codeforces961E Tufurama
    Codeforces957 Mahmoud and Ehab and yet another xor task
    nowcoder82E 无向图中的最短距离
    nowcoder82B 区间的连续段
    Codeforces903E Swapping Characters
    Codeforces614C Peter and Snow Blower
    Codeforces614D Skills
  • 原文地址:https://www.cnblogs.com/sunstars/p/3991953.html
Copyright © 2020-2023  润新知