    http://www.ruanyifeng.com/blog/2013/03/tf-idf.html 写得很明了
    package com.data.text.tfidf;
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Map.Entry;
    public class TF_IDF {
        private  double NUM_DOCS;
        private  Map<String, Integer> idf_map;
        public TF_IDF(String fileName){
            idf_map = new HashMap<String, Integer>();
            File file = new File(fileName);
            BufferedReader reader = null;
            try {
                reader = new BufferedReader(new FileReader(file));
                String tempString = null;
                tempString = reader.readLine();
                NUM_DOCS = (double)Integer.parseInt(tempString);
                // 一次读入一行,直到读入null为文件结束
                while ((tempString = reader.readLine()) != null) {
                    String[] arr = tempString.split(" : ");
                    String key = arr[0];
                    Integer value = Integer.parseInt(arr[1]);
                    idf_map.put(key, value);
            } catch (IOException e) {
            } finally {
                if (reader != null) {
                    try {
                    } catch (IOException e1) {
        public List<Feature> cacu(Map<String, Integer> tf_map) {
            // 统计总词数
            Integer word_num_sum = 0;
            for (Entry<String, Integer> entry : tf_map.entrySet()) {
                word_num_sum += entry.getValue();
            List<Feature> list_fea = new ArrayList<Feature>();
            for (Entry<String, Integer> entry : tf_map.entrySet()) {
                String word = entry.getKey();
                Integer num = entry.getValue();
                double tf = (double) num / word_num_sum;
                double idf = Math.log(NUM_DOCS / idf_map.get(word) + 1);//+1平滑 逆文档频率
                double weight = tf * idf;
                list_fea.add(new Feature(word, num, weight));
            return list_fea;
        public static void main(String[] args) {
            // TODO Auto-generated method stub
    package com.data.text.tfidf;
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Set;
    public class StopWord {
        /**
         * Loads the stop-word list from the file {@code stopwords.txt}, given as a
         * relative path.
         *
         * @return the distinct stop words; empty if the file cannot be read
         */
        public static Set<String> GetStopWords(){
            String fileName = "stopwords.txt";
            return readwords(fileName);
        }

        /**
         * Reads a stop-word list, one word per line.
         *
         * <p>Surrounding whitespace is trimmed and blank lines are skipped. An
         * {@link IOException} is reported on standard error and the words read so far
         * are returned.
         *
         * @param fileName path of the stop-word file
         * @return the distinct words found in the file
         */
        private static Set<String> readwords(String fileName){
            Set<String> set = new HashSet<String>();
            File file = new File(fileName);
            BufferedReader reader = null;
            try {
                reader = new BufferedReader(new FileReader(file));
                String tempString = null;
                // Read one line at a time until readLine() returns null (end of file).
                while ((tempString = reader.readLine()) != null) {
                    String word = tempString.trim();
                    if (word.length() > 0) {
                        set.add(word);
                    }
                }
            } catch (IOException e) {
                // Best effort: report the failure rather than swallowing it silently.
                e.printStackTrace();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e1) {
                        e1.printStackTrace();
                    }
                }
            }
            return set;
        }
    }
    package com.data.text.tfidf;
    /**
     * Feature term: a word together with its occurrence count and TF-IDF weight.
     * @author root
     */
    public class Feature implements Comparable<Feature> {
        private String word;
        private Integer num;
        private double weight;
        public Feature(String word, Integer num, double weight) {
            this.word = word;
            this.num = num;
            this.weight = weight;
        public String getWord() {
            return word;
        public Integer getNum() {
            return num;
        public double getWeight() {
            return weight;
        public int compareTo(Feature o) {
            if(this.getWeight() == o.getWeight()){
                return 0;
            }else if(this.getWeight() > o.getWeight()){
                return -1;
                return 1;
        public String toString(){
            return this.word + " freq: " + num + " weight: " + weight;
    __author__ = 'dell'
    import math
    import re
    from operator import itemgetter
    class TfIdf:
        """Incremental TF-IDF keyword scorer.

        Keeps the number of documents seen (``num_docs``) and, per term, the number
        of documents containing it (``term_num_docs``). Both can be preloaded from a
        corpus file and extended with ``add_input_document``.
        """

        def __init__(self, corpus_filename = None, stopword_filename = None, DEFAULT_IDF = 1.5):
            """Create a scorer, optionally preloading corpus statistics and stop words.

            corpus_filename: optional path; the first line is the document count and
                each following non-blank line is ``term: document_count``.
            stopword_filename: optional path with one stop word per line.
            DEFAULT_IDF: IDF returned for terms that are not in the corpus statistics.

            Raises ValueError if the document count or a term count is not an integer,
            or if a non-blank corpus line has no ':' separator.
            """
            self.num_docs = 0
            self.term_num_docs = {}
            self.stopwords = []
            self.idf_default = DEFAULT_IDF

            if corpus_filename:
                # 'with' guarantees the handle is closed even if parsing fails.
                with open(corpus_filename, 'r') as corpus_file:
                    # The first line holds the number of documents.
                    self.num_docs = int(corpus_file.readline())
                    # Each subsequent line holds "term: frequency".
                    for line in corpus_file:
                        if not line.strip():
                            continue  # tolerate blank lines such as a trailing newline
                        # Split on the LAST colon so terms that contain ':' still parse.
                        term, sep, frequency = line.rpartition(':')
                        if not sep:
                            raise ValueError("expected 'term: count' but found: %r" % line)
                        self.term_num_docs[term.strip()] = int(frequency.strip())

            if stopword_filename:
                with open(stopword_filename) as stopword_file:
                    self.stopwords = [line.strip() for line in stopword_file]

        def get_tokens(self, str):
            """Split text into lower-cased tokens.

            A token is a whole ``<a ...>...</a>`` element, any other HTML tag, or a
            run of word characters, apostrophes, '@' and '#'.
            """
            return re.findall(r"<a.*?/a>|<[^>]*>|[\w'@#]+", str.lower())

        def add_input_document(self, input):
            """Add one document to the statistics.

            Increments the document count, and the per-term document count of every
            distinct token in ``input``.
            """
            self.num_docs += 1
            # A set, so that each term is counted once per document.
            words = set(self.get_tokens(input))
            for word in words:
                if word in self.term_num_docs:
                    self.term_num_docs[word] += 1
                else:
                    self.term_num_docs[word] = 1

        def get_num_docs(self):
            """Return the number of documents seen so far."""
            return self.num_docs

        def get_idf(self, term):
            """Return the inverse document frequency of ``term``.

            Stop words score 0, terms without statistics score the configured default,
            and every other term scores ``log((1 + num_docs) / (1 + docs_with_term))``.
            """
            if term in self.stopwords:
                return 0

            if term not in self.term_num_docs:
                return self.idf_default

            return math.log(float(1 + self.get_num_docs()) / (1 + self.term_num_docs[term]))

        def get_doc_keywords(self, curr_doc):
            """Score every distinct token of ``curr_doc`` by TF-IDF.

            Term frequency is the token's count divided by the total number of tokens
            in the document. Returns a list of ``(token, score)`` pairs sorted by
            descending score; an empty document yields an empty list.
            """
            tokens = self.get_tokens(curr_doc)

            # Count all tokens in one pass instead of calling tokens.count() per term.
            counts = {}
            for word in tokens:
                counts[word] = counts.get(word, 0) + 1

            # Float denominator so the ratio is a true division on Python 2 as well.
            total = float(len(tokens))

            tfidf = {}
            for word, count in counts.items():
                tfidf[word] = (count / total) * self.get_idf(word)

            return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
  • 相关阅读:
    算法竞赛入门经典 训练指南 之 图论(完全版持续更新)
    uva 11324 The Largest Clique 强连通分量求缩点构造DAG
    hdu 4288 Coder 一个很水的版本 >_<
    hoj 2939 Coin Question
    成都网络赛 1002 Control 1005 Food
  • 原文地址:https://www.cnblogs.com/i80386/p/3240601.html
Copyright © 2020-2023  润新知