• A Java implementation of TF-IDF


    One remaining issue: the library used here for word segmentation still has problems.

    See this post for reference:

    http://www.cnblogs.com/ywl925/archive/2013/08/26/3275878.html

    The code follows below.

    (The original copy of the code lives on my old computer running Linux.)

    The following class does two things: (1) lists all file names under a directory, and (2) reads a given file.

    package com.bobo.paper.util;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.ArrayList;
    import java.util.List;
    
    public class FileUtil {
     
        public static ArrayList<String> FileList = new ArrayList<String>(); // accumulated list of file paths
        /**
         * List all files under a directory and its subdirectories.
         * @param filepath the directory path
         * @return the list of all file names under that path and its subpaths
         * @throws FileNotFoundException
         * @throws IOException
         */
        public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException
        {
            try
            {
                File file = new File(filepath);
                if(!file.isDirectory())
                {
                    System.out.println("输入的不是目錄名称;");
                    System.out.println("filepath:" + file.getAbsolutePath());
                }
                else
                {
                    String[] flist = file.list();
                    for(int i = 0; i < flist.length; i++)
                    {
                        File newfile = new File(filepath + "/" + flist[i]);
                        if(!newfile.isDirectory())
                        {
                            FileList.add(newfile.getAbsolutePath());
                        }
                        else if(newfile.isDirectory()) // if the entry is a directory, recurse via readDirs
                        {
                            readDirs(filepath + "/" + flist[i]);
                        }                    
                    }
                }
            }catch(FileNotFoundException e)
            {
                System.out.println(e.getMessage());
            }
            return FileList;
        }
        /**
         * Read a file's content and return it as a string.
         * @param file the name of the file to read
         * @return the file content as a string, with lines separated by "\n"
         * @throws FileNotFoundException
         * @throws IOException
         */
        public static String readFile(String file) throws FileNotFoundException, IOException
        {
            StringBuffer strSb = new StringBuffer(); // String is immutable; StringBuffer can be appended to
            InputStreamReader inStrR = new InputStreamReader(new FileInputStream(file), "gbk"); // decode the byte stream as GBK text
            BufferedReader br = new BufferedReader(inStrR);
            String line = br.readLine();
            while(line != null){
                strSb.append(line).append("\n");
                line = br.readLine();
            }
            br.close();
            return strSb.toString();
        }
        
        
    
    }
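
    As an aside, on Java 8 and later the same recursive walk can be written without manual recursion using java.nio.file. A minimal sketch (the class name NioFileUtil is mine, not part of the original project):

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.List;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    public class NioFileUtil {
        // Recursively collect the absolute paths of all regular files under dir.
        public static List<String> listFiles(String dir) throws IOException {
            try (Stream<Path> paths = Files.walk(Paths.get(dir))) {
                return paths.filter(Files::isRegularFile)
                            .map(p -> p.toAbsolutePath().toString())
                            .collect(Collectors.toList());
            }
        }
    }

    Unlike the static FileList field above, this returns a fresh list on every call, so repeated invocations do not accumulate stale paths.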

    The following class handles word segmentation.

    package com.bobo.paper.util;
    
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    public class CutWordsUtil {
    
        /**
         * Segment the content of a file into words.
         * @param file the file to segment
         * @return the list of tokens produced by IK Analyzer
         * @throws IOException
         */
        public static ArrayList<String> cutWords(String file) throws IOException{
    
            ArrayList<String> words = new ArrayList<String>();
            String text = FileUtil.readFile(file);
            // IK Analyzer plugs into Lucene, so lucene-core must also be on the
            // classpath -- this is the further jar dependency noted as missing
            // in the original post. The sketch below assumes a Lucene 4.x-era
            // IK Analyzer; the TokenStream API differs slightly across versions.
            IKAnalyzer analyzer = new IKAnalyzer();
            TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                words.add(term.toString()); // collect each segmented word
            }
            ts.end();
            ts.close();
            return words;
        }
    }
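
    If the IK Analyzer jar (or its Lucene dependency) is not at hand, a throwaway tokenizer makes it possible to test the rest of the pipeline on space-delimited text such as English. SimpleCutWords below is a hypothetical stand-in, not part of the original project, and it cannot segment Chinese:

    import java.util.ArrayList;
    import java.util.Locale;

    public class SimpleCutWords {
        // Naive tokenizer: lowercase the text and split on any run of
        // characters that is neither a letter nor a digit. Good enough for
        // quick tests on English; useless for Chinese, which has no spaces.
        public static ArrayList<String> cutWords(String text) {
            ArrayList<String> words = new ArrayList<String>();
            for (String token : text.toLowerCase(Locale.ROOT).split("[^\\p{L}\\p{N}]+")) {
                if (!token.isEmpty()) {
                    words.add(token);
                }
            }
            return words;
        }
    }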

    The following class implements the TF-IDF algorithm itself.

    package com.bobo.paper.athology;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import com.bobo.paper.util.CutWordsUtil;
    import com.bobo.paper.util.FileUtil;
    
    public class TfIdfAthology {
     
        /**
         * Count the occurrences of each word in a token list.
         * @param cutwords the token list produced by segmentation
         * @return a HashMap whose keys are words and whose values are occurrence counts
         */
        public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords){
            HashMap<String, Integer> resTF = new HashMap<String, Integer>();
            
            for(String word : cutwords){
                if(resTF.get(word) == null){
                    resTF.put(word, 1);
                }
                else{
                    resTF.put(word, resTF.get(word) + 1);
                }
                System.out.println(word); // trace each token as it is counted
            }
            return resTF;
        }
        /**
         * Compute the term frequency (tf) of each word.
         * @param cutwords the token list produced by segmentation
         * @return a HashMap whose keys are words and whose values are the words' frequencies
         */
        public static HashMap<String, Float> tf(ArrayList<String> cutwords){
            HashMap<String, Float> resTF = new HashMap<String, Float>();
            
            int wordLen = cutwords.size();
            HashMap<String, Integer> intTF = normalTF(cutwords); 
            
            // normalize each raw count by the total number of tokens
            for(Map.Entry<String, Integer> entry : intTF.entrySet()){
                float freq = entry.getValue().floatValue() / wordLen;
                resTF.put(entry.getKey(), freq);
                System.out.println(entry.getKey() + " = " + freq);
            }
            return resTF;
        } 
        /**
         * Segment all files under a directory and return a HashMap<String, HashMap<String, Integer>>; the outer key is the file name, the inner key is the word, and the value is that word's occurrence count.
         * @param dirc the directory
         * @return
         * @throws IOException
         */
        public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException{
            HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String,Integer>>();
            List<String> filelist = FileUtil.readDirs(dirc);
            for(String file : filelist){
                HashMap<String, Integer> dict = new HashMap<String, Integer>();
                ArrayList<String> cutwords = CutWordsUtil.cutWords(file); //get cut word for one file
                
                dict =  normalTF(cutwords);
                allNormalTF.put(file, dict);
            }    
            return allNormalTF;
        }
        /**
         * Compute the term frequency of every word in every file under a directory.
         * @param dirc the directory name
         * @return a HashMap<String,HashMap<String, Float>> whose first key is the file name, second key is the word, and value is that word's frequency in the file
         * @throws IOException
         */
        public static HashMap<String,HashMap<String, Float>> tfAllFiles(String dirc) throws IOException{
            HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
            List<String> filelist = FileUtil.readDirs(dirc);
            
            for(String file : filelist){
                HashMap<String, Float> dict = new HashMap<String, Float>();
                ArrayList<String> cutwords = CutWordsUtil.cutWords(file); //get cut words for one file
                
                dict = tf(cutwords);
                allTF.put(file, dict);
            }
            return allTF;
        }
        /**
         * Compute each word's idf value: idf_i = log(|D| / |{j : t_i in d_j}|), where |D| is the total number of files in the corpus and |{j : t_i in d_j}| is the number of files containing word t_i. If a word might not occur in the corpus at all, that denominator would be zero, so the smoothed form log(|D| / (1 + |{j : t_i in d_j}|)) is generally used; here every counted word occurs in at least one file, so the plain form is computed.
         * @param all_tf HashMap<String,HashMap<String, Float>> whose first key is the file name, second key is the word, and value is the word's tf in that file
         * @return a HashMap from each word to its idf value
         */
        public static HashMap<String, Float> idf(HashMap<String,HashMap<String, Float>> all_tf){
            HashMap<String, Float> resIdf = new HashMap<String, Float>();
            // dict maps each word to the number of documents containing it
            HashMap<String, Integer> dict = new HashMap<String, Integer>();
            int docNum = FileUtil.FileList.size();
            // loop over every file in the corpus
            for(int i = 0; i < docNum; i++){
                // all_tf holds, for this file, the tf of every word in it
                HashMap<String, Float> temp = all_tf.get(FileUtil.FileList.get(i));
                // each key of temp is a word occurring in this file,
                // so increment that word's document count
                for(String word : temp.keySet()){
                    if(dict.get(word) == null){
                        dict.put(word, 1);
                    }else {
                        dict.put(word, dict.get(word) + 1);
                    }
                }
            }
            System.out.println("IDF for every word is:");
            Iterator iter_dict = dict.entrySet().iterator();
            while(iter_dict.hasNext()){
                Map.Entry entry = (Map.Entry)iter_dict.next();
                float value = (float)Math.log(docNum / Float.parseFloat(entry.getValue().toString()));
                resIdf.put(entry.getKey().toString(), value);
                System.out.println(entry.getKey().toString() + " = " + value);
            }
            return resIdf;
        }
        /**
         * Compute the tf-idf value of every word in every file (tf-idf = tf * idf).
         * @param all_tf the map of all tf values; the first key is the file name, the second key is the word
         * @param idfs the map of all idf values, keyed by word
         */
        public static void tf_idf(HashMap<String,HashMap<String, Float>> all_tf,HashMap<String, Float> idfs){
            HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
            int docNum = FileUtil.FileList.size();
            for(int i = 0; i < docNum; i++){
                String filepath = FileUtil.FileList.get(i);
                HashMap<String, Float> tfidf = new HashMap<String, Float>();
                HashMap<String, Float> temp = all_tf.get(filepath);
                for(Map.Entry<String, Float> entry : temp.entrySet()){
                    String word = entry.getKey();
                    Float value = entry.getValue() * idfs.get(word); // tf * idf
                    tfidf.put(word, value);
                }
                resTfIdf.put(filepath, tfidf);
            }
            System.out.println("TF-IDF for Every file is :");
            DisTfIdf(resTfIdf);
        }
        // Print the final tf-idf values computed for every file.
        public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf){
            for(Map.Entry<String, HashMap<String, Float>> entrys : tfidf.entrySet()){
                System.out.println("FileName: " + entrys.getKey());
                System.out.print("{");
                HashMap<String, Float> temp = entrys.getValue();
                for(Map.Entry<String, Float> entry : temp.entrySet()){
                    System.out.print(entry.getKey() + " = " + entry.getValue() + ", ");
                }
                System.out.println("}");
            }
        }
    }
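
    To sanity-check the formulas with a small illustrative example: suppose the corpus contains 3 files, and some word occurs 2 times among the 10 tokens of one file while appearing in exactly 1 of the 3 files. Then tf = 2/10 = 0.2, idf = log(3/1) ≈ 1.0986 (Math.log is the natural logarithm), and tf-idf ≈ 0.2 × 1.0986 ≈ 0.22.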

    Finally, everything is wired together as follows:

    package com.bobo.paper;
    
    import java.io.IOException;
    import java.util.HashMap;
    
    import com.bobo.paper.athology.TfIdfAthology;
    
    public class Welcome {
    
        /**
         * @param args
         */
        public static void main(String[] args) {
            String file = "D:/testfiles";
    
            HashMap<String, HashMap<String, Float>> all_tf;
            try {
                // tf of every word in every file under the directory
                all_tf = TfIdfAthology.tfAllFiles(file);
                System.out.println();
                // idf of every word across the corpus
                HashMap<String, Float> idfs = TfIdfAthology.idf(all_tf);
                System.out.println();
                // combine them and print the tf-idf of every word per file
                TfIdfAthology.tf_idf(all_tf, idfs);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    
    }
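
    One caveat worth knowing before running this: FileUtil.FileList is a static field that readDirs only ever appends to, so calling tfAllFiles (or normalTFAllFiles) more than once in the same run would leave duplicate paths in the list and skew the document count used by idf. The demo above is safe because it traverses the directory exactly once.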