• One MapReduce job produces the base data needed for computing the word probabilities


    Step 1: compute the word frequencies that the probability calculation needs, the number of distinct words, and the total word count per class (classes are identified by folder name). The input corpus has already been segmented and preprocessed, one word per line.
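
    For orientation, a minimal sketch of the input layout this job assumes (the folder and file names are the ones used in the code and comments below; point otherArgs[0] at the real corpus root):

        /user/hadoop/input/NBCorpus/Country/
            AFRICA/
                484017newsML.txt    <- one word per line, already segmented
                487141newsML.txt
            ALB/
                ...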

    package org.lukey.hadoop.classifyBayes;
    
    import java.io.IOException;
    import java.net.URI;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataOutputStream;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    
    import org.apache.hadoop.mapreduce.Counter;
    import org.apache.hadoop.mapreduce.Counters;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
    
    /**
     * A single job that collects all the needed statistics into the
     * corresponding output files, e.g. "AFRICA 484017newsML.txt afford 1".
     *
     * Output records handed to the later steps:
     *
     * 1. "AFRICA 484017newsML.txt", "AFRICA 487141newsML.txt"
     *    -> number of documents per class, used for the prior probability
     *       (handled separately); the total document count over all classes
     *       follows from these and also feeds the prior.
     *
     * 2. "AFRICA afford 1", "AFRICA boy 3"
     *    -> count of each word in each class, used for the per-class word
     *       probabilities.
     *
     * 3. "AFRICA 768"
     *    -> total words in a class; sum the third field of the records in (2)
     *       that share the same class key.
     *
     * 4. "AllWORDS 12345"
     *    -> number of distinct words over all classes; merge the word keys
     *       from the records and count them.
     */
    
    public class MyWordCount {
    
        private static MultipleOutputs<Text, IntWritable> mos;
        static String baseOutputPath = "/user/hadoop/test_out";
    
        // Two maps: one tracks the documents seen per class, the other the
        // resulting document count per class.
        private static Map<String, List<String>> fileCountMap = new HashMap<String, List<String>>();
        private static Map<String, Integer> fileCount = new HashMap<String, Integer>();

        static enum WordsNature {
            CLASS_NUMBER, CLASS_WORDS, TOTALWORDS
        }
    
        public static void main(String[] args) throws Exception {
    
            Configuration conf = new Configuration();
    
            // Paths for the different output files.
            // Where the per-class document counts and priors will be written:
            String priorProbality = "hdfs://192.168.190.128:9000/user/hadoop/output/priorP/priorProbality.txt";
            conf.set("priorProbality", priorProbality);
    
            // Note: the output directory must match the input the Probability job reads later
            String[] otherArgs = { "/user/hadoop/input/NBCorpus/Country", "/user/hadoop/mid/wordsFrequence" };
    
            Job job = new Job(conf, "file count");
    
            job.setJarByClass(MyWordCount.class);
    
            job.setMapperClass(First_Mapper.class);
            job.setReducerClass(First_Reducer.class);
    
            // Skip classes with 10 or fewer documents (see getSecondDir below)
            List<Path> inputPaths = getSecondDir(conf, otherArgs[0]);
            for (Path path : inputPaths) {
                FileInputFormat.addInputPath(job, path);
            }
    
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
    
            int exitCode = job.waitForCompletion(true) ? 0 : 1;
    
            // Read back the distinct-word counter after the job finishes
            Counters counters = job.getCounters();
            Counter c1 = counters.findCounter(WordsNature.TOTALWORDS);
            System.out.println("-------------->>>>: " + c1.getDisplayName() + ":" + c1.getName() + ": " + c1.getValue());
    
            // Write the distinct-word count to a file for the second job to read
            Path totalWordsPath = new Path("/user/hadoop/output/totalwords.txt");
            FileSystem fs = FileSystem.get(conf);
            FSDataOutputStream outputStream = fs.create(totalWordsPath);
            outputStream.writeBytes(c1.getDisplayName() + ":" + c1.getValue());
            IOUtils.closeStream(outputStream);

            // TODO: next time, try passing the total distinct-word count to the
            // probability job through the Configuration instead of a side file:
            // conf.set("TOTALWORDS", totalWords.toString());
    
            System.exit(exitCode);
    
        }
    
        // Mapper
        static class First_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    
            private final static IntWritable one = new IntWritable(1);
            private final static IntWritable zero = new IntWritable(0);
    
            private Text className = new Text();
            private Text countryName = new Text();
    
            @Override
            protected void cleanup(Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                    throws IOException, InterruptedException {
                Configuration conf = context.getConfiguration();
                String file = conf.get("priorProbality");
                FileSystem fs = FileSystem.get(URI.create(file), conf);
                Path priorPath = new Path(file);
                FSDataOutputStream priorStream = fs.create(priorPath);
                for (Map.Entry<String, List<String>> entry : fileCountMap.entrySet()) {
                    fileCount.put(entry.getKey(), entry.getValue().size());
                    priorStream.writeBytes(entry.getKey() + "\t" + entry.getValue().size() + "\n");
                }
    
                // Total number of documents across all classes
                int fileSum = 0;
                for (Integer num : fileCount.values()) {
                    fileSum += num;
                }
                System.out.println("fileSum = " + fileSum);

                // Compute each class's prior probability and append it to the same file
                for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
                    double p = (double) entry.getValue() / fileSum;
                    priorStream.writeBytes(entry.getKey() + ":" + p + "\n");
                }
                IOUtils.closeStream(priorStream);
    
            }
    
            
            @Override
            protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                    throws IOException, InterruptedException {
                FileSplit fileSplit = (FileSplit) context.getInputSplit();
    
                // File name of the current split
                String fileName = fileSplit.getPath().getName();

                // Parent folder name, i.e. the class label
                String dirName = fileSplit.getPath().getParent().getName();

                className.set(dirName + "\t" + value.toString());
                countryName.set(dirName + "\t" + fileName + "\t" + value.toString()); // set but not written below
    
                // Record the file name per class so document counts can be derived
                // (a separate run used this mainly to drop classes with too few documents)
                if (fileCountMap.containsKey(dirName)) {
                    if (!fileCountMap.get(dirName).contains(fileName)) {
                        fileCountMap.get(dirName).add(fileName);
                    }
                } else {
                    List<String> oneList = new ArrayList<String>();
                    oneList.add(fileName);
                    fileCountMap.put(dirName, oneList);
                }
    
                context.write(className, one); // per-class word count, e.g. "AFRICA hello 1"
                context.write(new Text(dirName), one); // total words per class, e.g. "AFRICA 1"
                context.write(value, zero); // bare word, value 0: feeds the distinct-word count
    
            }
        }
    
        // Reducer
        static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    
            // result holds the summed count for the current key
            IntWritable result = new IntWritable();
    
            @Override
            protected void reduce(Text key, Iterable<IntWritable> values,
                    Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                            throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable value : values) {
                    sum += value.get();
                }
    
                // sum == 0 means the key is a bare word (the mapper emitted it with
                // value 0), so it counts toward the number of distinct words.
                if (sum == 0) {
                    context.getCounter(WordsNature.TOTALWORDS).increment(1);
                } else { // otherwise decide by the number of tab-separated fields in the key
                    String[] temp = key.toString().split("\t");
                    if (temp.length == 2) { // "class\tword": per-class word count
                        result.set(sum);
                        context.write(key, result);
                    } else { // bare class name: total words in that class
                        result.set(sum);
                        mos.write(key, result, "wordsInClass");
                    }
    
                }
    
            }
    
            @Override
            protected void cleanup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                    throws IOException, InterruptedException {
                mos.close();
            }
    
            @Override
            protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                    throws IOException, InterruptedException {
                mos = new MultipleOutputs<Text, IntWritable>(context);
            }
    
        }
        
        
        // Collect the second-level directories (one per class) under the given folder
        static List<Path> getSecondDir(Configuration conf, String folder) throws Exception {
            FileSystem fs = FileSystem.get(conf);
            Path path = new Path(folder);
            FileStatus[] stats = fs.listStatus(path);
            List<Path> folderPath = new ArrayList<Path>();
            for (FileStatus stat : stats) {
                if (stat.isDir()) {
                    if (fs.listStatus(stat.getPath()).length > 10) { // keep only classes with more than 10 files
                        folderPath.add(stat.getPath());
                    }
                }
            }
            return folderPath;
        }
    
    
    }
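
    For orientation, the first job leaves behind several artifacts that the second job and the final classification step consume. The lines below are illustrative, assembled from the formats the code writes (the counter's display name "TOTALWORDS", the "wordsInClass" named output, the regular part files, and the prior file); real contents depend on the corpus:

        /user/hadoop/output/totalwords.txt                      TOTALWORDS:12345
        /user/hadoop/mid/wordsFrequence/wordsInClass-r-00000    AFRICA\t768
        /user/hadoop/mid/wordsFrequence/part-r-00000            AFRICA\tafford\t1
        /user/hadoop/output/priorP/priorProbality.txt           AFRICA\t<docCount> ... AFRICA:<prior>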

    Step 2: compute the probability of each word in each class. The per-class word totals and the overall number of distinct words must be available beforehand; both can be passed in via configuration.set, or read in setup() before any map() call runs.
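
    The mapper applies add-one (Laplace) smoothing when turning counts into probabilities. As a minimal self-contained sketch of that formula, using the illustrative numbers from the first job's comments (not real corpus values):

    public class SmoothingSketch {
        public static void main(String[] args) {
            int countInClass = 3;       // "AFRICA boy 3" from the word-count job
            int wordsInClass = 768;     // "AFRICA 768": total words in the class
            int vocabularySize = 12345; // "AllWORDS 12345": distinct words overall
            // Same formula as in WordsOfClassCountMapper below:
            // p(word | class) = (count + 1) / (wordsInClass + vocabularySize)
            double p = (countInClass + 1.0) / (wordsInClass + vocabularySize);
            System.out.println("p(boy | AFRICA) = " + p); // about 3.05e-4
        }
    }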

    package org.lukey.hadoop.classifyBayes;
    
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URI;
    import java.util.HashMap;
    import java.util.Map;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.DoubleWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
    
    public class Probability {
    
        private static final Log LOG = LogFactory.getLog(Probability.class);
        public static int total = 0;
        private static MultipleOutputs<Text, DoubleWritable> mos;
    
        // Client
        public static void main(String[] args) throws Exception {
    
            Configuration conf = new Configuration();
            conf.set("mapred.job.tracker", "192.168.190.128:9001");
            conf.set("mapred.jar", "probability.jar");
            // Read the total distinct-word count and store it in the Configuration
            String totalWordsPath = "hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt";
            String wordsInClassPath = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence/wordsInClass-r-00000";
    
            conf.set("wordsInClassPath", wordsInClassPath);

            // First read the number of distinct words written by the first job
            FileSystem fs = FileSystem.get(URI.create(totalWordsPath), conf);
            FSDataInputStream inputStream = fs.open(new Path(totalWordsPath));
            BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));
            String strLine = buffer.readLine();
            String[] temp = strLine.split(":");
            if (temp.length == 2) {
                // temp[0] is "TOTALWORDS"
                conf.set(temp[0], temp[1]);
            }
            buffer.close();
    
            total = Integer.parseInt(conf.get("TOTALWORDS"));
            LOG.info("------>total = " + total);
    
            System.out.println("total ==== " + total);

            Job job = new Job(conf, "file count");
    
            job.setJarByClass(Probability.class);
    
            job.setMapperClass(WordsOfClassCountMapper.class);
            job.setReducerClass(WordsOfClassCountReducer.class);
    
            String input = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence";
            String output = "hdfs://192.168.190.128:9000/user/hadoop/output/probability/";
    
            FileInputFormat.addInputPath(job, new Path(input));
            FileOutputFormat.setOutputPath(job, new Path(output));
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(DoubleWritable.class);
    
            System.exit(job.waitForCompletion(true) ? 0 : 1);
    
        }
    
        // Mapper
        static class WordsOfClassCountMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
    
            private static DoubleWritable number = new DoubleWritable();
            private static Text className = new Text();
    
            // Total word count per class, loaded from wordsInClassPath in setup()
            private static Map<String, Integer> filemap = new HashMap<String, Integer>();
    
            protected void map(LongWritable key, Text value,
                    Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                            throws IOException, InterruptedException {
                Configuration conf = context.getConfiguration();
                int tot = Integer.parseInt(conf.get("TOTALWORDS"));
    
                System.out.println("total = " + total);
                System.out.println("tot = " + tot);
    
                // Input lines look like:
                //   ALB weekend 1
                //   ALB weeks 3
                // baseMap is local to this map() call, so it only ever holds the
                // current record: Map<class, Map<word, count>>
                Map<String, Map<String, Integer>> baseMap = new HashMap<String, Map<String, Integer>>();

                String[] temp = value.toString().split("\t");
                // Store the parsed record in baseMap
                if (temp.length == 3) {
                    // temp[0] is the folder (class) name
                    if (baseMap.containsKey(temp[0])) {
                        baseMap.get(temp[0]).put(temp[1], Integer.parseInt(temp[2]));
                    } else {
                        Map<String, Integer> oneMap = new HashMap<String, Integer>();
                        oneMap.put(temp[1], Integer.parseInt(temp[2]));
                        baseMap.put(temp[0], oneMap);
                    }
    
                } // record parsed into baseMap
    
                int allWordsInClass = 0;

                for (Map.Entry<String, Map<String, Integer>> entries : baseMap.entrySet()) { // iterate classes
                    allWordsInClass = filemap.get(entries.getKey());
                    for (Map.Entry<String, Integer> entry : entries.getValue().entrySet()) { // word counts -> probabilities
                        // Add-one (Laplace) smoothing:
                        // p(word | class) = (count + 1) / (wordsInClass + totalDistinctWords)
                        double p = (entry.getValue() + 1.0) / (allWordsInClass + tot);

                        className.set(entries.getKey() + "\t" + entry.getKey());
                        number.set(p);
                        LOG.info("------>p = " + p);

                        context.write(className, number);
                    }
                }
    
            }
    
            protected void cleanup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                    throws IOException, InterruptedException {
                mos.close();
            }
    
            protected void setup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                    throws IOException, InterruptedException {
                Configuration conf = context.getConfiguration();
                mos = new MultipleOutputs<Text, DoubleWritable>(context);
                String filePath = conf.get("wordsInClassPath");
                FileSystem fs = FileSystem.get(URI.create(filePath), conf);
                FSDataInputStream inputStream = fs.open(new Path(filePath));
                BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));
                String strLine = null;
                while ((strLine = buffer.readLine()) != null) {
                    String[] temp = strLine.split("\t");
                    filemap.put(temp[0], Integer.parseInt(temp[1]));
                }
            }
    
        }
    
        // Reducer
        static class WordsOfClassCountReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
    
            // result holds the summed probability for the current class/word key
            DoubleWritable result = new DoubleWritable();
    
            protected void reduce(Text key, Iterable<DoubleWritable> values,
                    Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
                            throws IOException, InterruptedException {
    
                double sum = 0.0;
                for (DoubleWritable value : values) {
                    sum += value.get();
                }
                result.set(sum);
    
                context.write(key, result);
            }
    
        }
    
    }
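
    Assuming the two classes above are packaged into probability.jar (the jar name the second driver registers via mapred.jar), a run of the whole pipeline might look like the following; the class names are the real ones above, but the jar name and cluster paths are whatever your setup uses:

    hadoop jar probability.jar org.lukey.hadoop.classifyBayes.MyWordCount
    hadoop jar probability.jar org.lukey.hadoop.classifyBayes.Probability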
  • Original article: https://www.cnblogs.com/luolizhi/p/4944760.html