参考源 (reference): http://www.ruanyifeng.com/blog/2013/03/tf-idf.html — 写的很明了 (a clear explanation of TF-IDF).
package com.data.text.tfidf;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
public class TF_IDF {
/** Total number of documents in the corpus, read from the first line of the model file. */
private double NUM_DOCS;
/** term -> number of documents containing the term (document frequency). */
private Map<String, Integer> idf_map;
/**
 * Loads a precomputed document-frequency model.
 * File format: the first line is the total document count; every following
 * line is "word : documentFrequency".
 *
 * @param fileName path to the model file
 */
public TF_IDF(String fileName){
idf_map = new HashMap<String, Integer>();
File file = new File(fileName);
// try-with-resources closes the reader on every path, replacing the old
// manual close() plus empty catch in a finally block.
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
// 第一行为 Num_docs — first line holds the document count.
String tempString = reader.readLine();
NUM_DOCS = (double) Integer.parseInt(tempString.trim());
// Read "word : df" lines until EOF.
while ((tempString = reader.readLine()) != null) {
String[] arr = tempString.split(" : ");
if (arr.length < 2) {
continue; // skip malformed lines instead of throwing AIOOBE
}
idf_map.put(arr[0], Integer.parseInt(arr[1].trim()));
}
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Computes tf-idf weights for one document.
 *
 * @param tf_map term -> raw occurrence count within the current document
 * @return features sorted by descending weight; empty list for an empty document
 */
public List<Feature> cacu(Map<String, Integer> tf_map) {
// 统计总词数 — total number of word occurrences in the document.
int word_num_sum = 0;
for (Entry<String, Integer> entry : tf_map.entrySet()) {
word_num_sum += entry.getValue();
}
List<Feature> list_fea = new ArrayList<Feature>();
if (word_num_sum == 0) {
return list_fea; // avoid division by zero on an empty document
}
// 计算 tf-idf
for (Entry<String, Integer> entry : tf_map.entrySet()) {
String word = entry.getKey();
Integer num = entry.getValue();
double tf = (double) num / word_num_sum;
// Words missing from the model previously caused an NPE (auto-unboxing
// a null Integer). Treat an unseen word as appearing in one document.
Integer df = idf_map.get(word);
double idf = Math.log(NUM_DOCS / (df == null ? 1 : df) + 1); // +1 平滑 (smoothing), 逆文档频率 (idf)
list_fea.add(new Feature(word, num, tf * idf));
}
// 根据权重排序 — sort by descending weight (Feature's natural order).
Collections.sort(list_fea);
return list_fea;
}
public static void main(String[] args) {
// No demo driver yet.
}
}
package com.data.text.tfidf;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
public class StopWord {
/**
 * Loads the stop-word set from the default file "stopwords.txt"
 * in the working directory.
 *
 * @return the set of stop words; empty if the file cannot be read
 */
public static Set<String> GetStopWords(){
return readwords("stopwords.txt");
}
/**
 * 读取停用词表 — reads one stop word per line, trimming surrounding whitespace.
 *
 * @param fileName path of the stop-word file
 * @return set of trimmed non-empty lines; empty on I/O failure
 */
private static Set<String> readwords(String fileName){
Set<String> set = new HashSet<String>();
// try-with-resources closes the reader on every path, replacing the old
// manual close() plus empty catch in a finally block.
try (BufferedReader reader = new BufferedReader(new FileReader(new File(fileName)))) {
// 一次读入一行, 直到读入 null 为文件结束 — read line by line until EOF.
String line;
while ((line = reader.readLine()) != null) {
String word = line.trim();
if (!word.isEmpty()) {
set.add(word); // previously blank lines put "" into the set
}
}
} catch (IOException e) {
e.printStackTrace();
}
return set;
}
}
package com.data.text.tfidf;
/**
 * 特征词 — a feature word extracted from a document, carrying its raw
 * occurrence count and its tf-idf weight.
 *
 * <p>Natural ordering is by DESCENDING weight, so sorting a list puts the
 * most important features first. Note: ordering is not consistent with
 * equals (equals/hashCode are not overridden).
 *
 * @author root
 */
public class Feature implements Comparable<Feature> {
private String word;   // the term itself
private Integer num;   // raw occurrence count in the document
private double weight; // tf-idf weight
public Feature(String word, Integer num, double weight) {
this.word = word;
this.num = num;
this.weight = weight;
}
public String getWord() {
return word;
}
public Integer getNum() {
return num;
}
public double getWeight() {
return weight;
}
/**
 * Orders by descending weight. Double.compare replaces the previous
 * ==/&gt; chain so NaN and -0.0 are handled consistently.
 */
@Override
public int compareTo(Feature o) {
return Double.compare(o.getWeight(), this.getWeight());
}
@Override
public String toString(){
return this.word + " freq: " + num + " weight: " + weight;
}
}
__author__ = 'dell'
import math
import re
from operator import itemgetter
class TfIdf:
    """Simple tf-idf scorer backed by an optional precomputed corpus file.

    Corpus file format: the first line is the total document count; every
    following line is "term:document_frequency".
    """

    def __init__(self, corpus_filename=None, stopword_filename=None, DEFAULT_IDF=1.5):
        self.num_docs = 0
        self.term_num_docs = {}  # term -> number of documents containing the term
        self.stopwords = []
        self.idf_default = DEFAULT_IDF
        if corpus_filename:
            # 'with' guarantees the handle is closed; the old code leaked it.
            with open(corpus_filename, 'r') as corpus_file:
                # First line holds the number of documents.
                self.num_docs = int(corpus_file.readline())
                # Remaining lines are "term:frequency".
                for line in corpus_file:
                    tokens = line.split(':')
                    term = tokens[0].strip()
                    frequency = int(tokens[1].strip())
                    self.term_num_docs[term] = frequency
        if stopword_filename:
            with open(stopword_filename) as stopword_file:
                self.stopwords = [line.strip() for line in stopword_file]

    def get_tokens(self, str):
        """Lowercase and tokenize: keeps <a ...>..</a> anchors, other tags,
        and word tokens (letters/digits/underscore plus ' @ #).

        The original pattern was [w'@#]+ — the backslash of \\w had been
        lost, so it only matched the literal characters w, ', @ and #.
        """
        return re.findall(r"<a.*?/a>|<[^>]*>|[\w'@#]+", str.lower())

    def add_input_document(self, input):
        """Add one document to the corpus stats (each term counted at most once)."""
        self.num_docs += 1
        for word in set(self.get_tokens(input)):
            self.term_num_docs[word] = self.term_num_docs.get(word, 0) + 1

    def get_num_docs(self):
        return self.num_docs

    def get_idf(self, term):
        """Smoothed idf: log((1 + N) / (1 + df)).

        Stopwords score 0; terms unseen in the corpus get the default idf.
        """
        if term in self.stopwords:
            return 0
        if term not in self.term_num_docs:
            return self.idf_default
        return math.log(float(1 + self.get_num_docs()) / (1 + self.term_num_docs[term]))

    def get_doc_keywords(self, curr_doc):
        """Return (term, tf*idf) pairs for curr_doc, sorted by descending score."""
        tokens = self.get_tokens(curr_doc)
        if not tokens:
            return []
        total = float(len(tokens))
        # Count each token in one O(n) pass instead of calling tokens.count()
        # for every distinct word (O(n^2) in the original).
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
        # The original computed float(count / total), which truncated to 0
        # under Python 2 integer division; dividing by a float fixes that.
        tfidf = {word: (n / total) * self.get_idf(word) for word, n in counts.items()}
        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)