c++实现之 -- 文章TF-IDF值的计算

首先，是关键词的选取：

好吧这个我这模型实在是太简单了，但还是讲一讲比较好呢。。。

我们现在手头有的是一堆百度百科词条w的DF(w, c)值，其中c是整个百科词条集合（即语料库）。。。原因是。。。方便嘛~（而且人家现成的只有介个了啦~）

我们发现有830W+的词条数目，都存下来显然是不理智、不科学、不魔法的。所以选取一部分作为关键词。

如何选取关键词呢？我选择了DF值在[100, 5000]之间的词。虽然也很不理智、不科学、不魔法，但是比直接存下来理智、科学、魔法多了，恩！

于是就全读进来，然后找到需要的词语，顺便计算下IDF值什么的输出到新的文件里去。

 1 #include <cstdio>
 2 #include <iostream>
 3 #include <iomanip>
 4 #include <cmath>
 5 #include <string>
 6 #include <algorithm>
 7 
 8 using namespace std;
 9 typedef double lf;
10 const int cnt_id = 9000005;
11 const lf tot_file = 4500000;
12 const lf eps = 1e-8;
13 
14 struct data {
15     int id;
16     lf IDF;
17     string st;
18     
19     data() {}
20     data(int _id, lf _IDF, string _st) : id(_id), IDF(_IDF), st(_st) {}
21     
22     inline bool operator < (const data &a) const {
23         return IDF > a.IDF;
24     }
25 } a[cnt_id];
26 inline bool cmp_id(data a, data b) {
27     return a.id < b.id;
28 }
29 
30 
31 string st;
32 int id, cnt;
33 int St, Ed;
34 lf NUM_max, NUM_min;
35 
36 inline lf calc(int x) {
37     return (lf) log((lf) tot_file / (x + eps));
38 }
39 
40 int main() {
41     int i, DF;
42     freopen("data", "r", stdin);
43     freopen("data_new", "w", stdout);
44     ios::sync_with_stdio(true);
45     while (cin >> id >> st >> DF)
46         a[++cnt] = data(id, (lf) calc(DF), st);
47     sort(a + 1, a + cnt + 1);
48     
49     NUM_max = calc(100), NUM_min = calc(5000);
50     for (i = 1; i <= cnt; ++i)
51         if (a[i].IDF < NUM_max) break;
52     St = i;
53     for ( ; i <= cnt; ++i)
54         if (a[i].IDF < NUM_min) break;
55     Ed = i;
56     
57     sort(a + St, a + Ed, cmp_id);
58     cout << Ed - St << endl;
59     for (i = St; i < Ed; ++i)
60         cout << a[i].id << ' ' << a[i].st << ' ' << setprecision(10) << a[i].IDF << endl;
61     return 0;
62 }

View Code

这样子我们就选出来了339,896个词作为关键词，占全部词条的4.1%。数量减少之后，可以大幅提高后续程序的效率。

（p.s. 这里使用了一个小技巧，就是setprecision(x)：它设定cout输出浮点数时的精度——默认格式下表示输出x位有效数字，和fixed一起用时才表示小数点后x位）

好了，关键词选取完毕，接下来就是读入文章（已分词），并且计算出TF-IDF值啦！

我们可以边读边做，顺便达到节省空间且提高效率的目的。（data和passage两个map可以只剩下一个）

具体实现甚是蛋疼，各种搞不定。最后搞定了也不知道是怎么搞定的。。。反正现在是没什么问题，以后有没有就布吉岛了

 1 #include <cstdio>
 2 #include <iostream>
 3 #include <cmath>
 4 #include <string>
 5 #include <cstring>
 6 #include <algorithm>
 7 #include <map>
 8 
 9 using namespace std;
10 typedef double lf;
11 const int mod1 = 19997;
12 const int mod2 = 30001;
13 const int bin = 1 << 9;
14 
15 struct TF_IDF {
16     int TF;
17     lf IDF, TF_IDF;
18 };
19 
20 struct Word {
21     string st;
22     int h1, h2;
23 
24     inline bool operator < (const Word &x) const {
25         return h1 == x.h1 ? h2 < x.h2 : h1 < x.h1;
26     }
27     inline bool operator == (const Word &x) const {
28         return h1 == x.h1 && h2 == x.h2;
29     }
30 
31     #define x (int) st[i]
32     #define Weight 3001
33     inline void calc_hash() {
34         int len = st.length(), tmp, i;
35         for (i = tmp = 0; i < len; ++i)
36             ((tmp *= Weight) += (x < 0 ? x + bin : x)) %= mod1;
37         h1 = tmp;
38         for (i = tmp = 0; i < len; ++i)
39             ((tmp *= Weight) += (x < 0 ? x + bin : x)) %= mod2;
40         h2 = tmp;
41     }
42     #undef x
43     #undef Weight
44 } w;
45 
46 typedef map <Word, TF_IDF> map_for_words;
47 typedef map_for_words :: iterator iter_for_words;
48 
49 map_for_words passage;
50 
51 void read_in_passage() {
52     Word w;
53     freopen("E:\test\test.in", "r", stdin);
54     while (cin >> w.st) {
55         w.calc_hash();
56         passage[w].TF += 1;
57     }
58     fclose(stdin);
59 }
60 
61 void read_in_IDF_and_work() {
62     int id, tot = 1, i;
63     lf IDF;
64     string st;
65     Word w;
66     iter_for_words it;
67     freopen("E:\test\new.dat", "r", stdin);
68     ios::sync_with_stdio(false);
69     cin >> tot;
70     for (i = 1; i <= tot; ++i) {
71         cin >> id >> w.st >> IDF;
72         w.calc_hash();
73         it = passage.find(w);
74         if (it != passage.end()) {
75             it -> second.IDF = IDF;
76             it -> second.TF_IDF = (lf) it -> second.TF * it -> second.IDF;
77         }
78     }
79     fclose(stdin);
80 }
81 
82 void print() {
83     iter_for_words it;
84     cout << passage.size() << endl;
85     for (it = passage.begin(); it != passage.end(); ++it)
86         cout << it -> first.st << ' ' << it -> second.TF << ' ' << it -> second.IDF << ' ' << it -> second.TF_IDF << endl;
87 }
88 
89 int main() {
90     freopen("E:\test\test.out", "w", stdout);
91     read_in_passage();
92     read_in_IDF_and_work();
93     print();
94     return 0;
95 }

View Code

特别被坑死的点：

第一次打开test.in不能加上"ios::sync_with_stdio(false);"，但是第二次必须加上"ios::sync_with_stdio(false);"

否则第二次是可以打开文件的，但是什么都读不到= =

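谁能告诉我这是什么坑货？、、、跪求巨神解答。。。

（补充说明：比较可能的原因是，第一次把cin读到文件末尾之后，cin上留下了eofbit/failbit；freopen只是重定向了底层的stdin，并不会清除C++流的错误状态，所以之后的读取全部直接失败。在libstdc++里，sync_with_stdio(false)会给cin重新设置缓冲区，顺带把状态位清掉，所以“碰巧”能用。更稳妥的做法是在第二次freopen之后调用cin.clear()，并且不要在freopen之前fclose(stdin)。）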

By Xs酱~ 转载请说明博客地址：http://www.cnblogs.com/rausen

相关阅读:
Tomcat详解系列(3)
Tomcat详解系列(2)
Tomcat详解系列(1)
常用开发库
 单元测试
 [MongoDB知识体系] 一文全面总结MongoDB知识体系
 问题记录：net::ERR_CERT_AUTHORITY_INVALID
CSS+DIV特色开关按钮
 Jquery的Ajax简易优化思路
 CSS+DIV简易灯泡案例
原文地址：https://www.cnblogs.com/rausen/p/4148719.html