• WordCount examples


    Scala WordCount example

    package com.wondersgroup.myscala
    
    import scala.actors.{Actor, Future}
    import scala.collection.mutable.ListBuffer
    import scala.io.Source
    
    //First count the word frequencies in each file, then aggregate the per-file results
    case class SubmitTask(f:String)
    case object StopTask

    //Actor that counts how many times each word appears in a single file
    
    
    class ActorTest3 extends Actor{
    
      override def act() :Unit = {
        while (true) {
          receive{
            case SubmitTask(f) => {
              //Read the file; each line becomes one element of the list
              val lines = Source.fromFile(f).getLines().toList
              //Split each line so that every word becomes one element of the list
              val words = lines.flatMap(_.split(" "))
              print("----------"+words)
              println("================"+words.map((_,1)))
              //Build a map from each word in this file to the number of times it occurs
              println("++++++"+words.map((_,1)).groupBy(_._1))
              val result = words.map((_,1)).groupBy(_._1).mapValues(_.size)
              println("&&&&&&&&&&&&&&&&"+result)
    
              sender ! result
    
            }
    
            case StopTask => exit()
          }
        }
      }
    
    }
    
    object ActorTest3{
      def main(args: Array[String]): Unit = {
        //Submit one text-analysis task per file, each to its own actor
        val replys = new ListBuffer[Future[Any]]
        val results = new ListBuffer[Map[String,Int]]
        val files = Array("src/wordcount.txt","src/wordcount1.txt")
        for(f <- files) {
          val actor = new ActorTest3
          actor.start()
          val reply = actor !! SubmitTask(f)
          //Collect the reply future in replys
          replys += reply
        }
    
        //Gather the results from the individual files
        while (replys.size > 0) {
          //Keep only the futures whose results are ready
          val done = replys.filter(_.isSet)
          print("@@@@@@@@@@@"+done)
          for(res <- done) {
            results += res.apply().asInstanceOf[Map[String,Int]]
            replys -= res
          }
          Thread.sleep(5000)
        }
    
        //Merge the per-file counts into a single global count
        val res2 = results.flatten.groupBy(_._1).mapValues(_.foldLeft(0)(_+_._2))
        println("******************"+res2)
    
      }
    }  

    Output

    @@@@@@@@@@@ListBuffer()----------List(python, is, a, very, brief, language, It, is, also, a, shell, language, we, like, python)================List((python,1), (is,1), (a,1), (very,1), (brief,1), (language,1), (It,1), (is,1), (also,1), (a,1), (shell,1), (language,1), (we,1), (like,1), (python,1))
    ----------List(python, java, go, python, c++, c++, java, ruby, c, javascript, c++)================List((python,1), (java,1), (go,1), (python,1), (c++,1), (c++,1), (java,1), (ruby,1), (c,1), (javascript,1), (c++,1))
    ++++++Map(java -> List((java,1), (java,1)), c++ -> List((c++,1), (c++,1), (c++,1)), go -> List((go,1)), python -> List((python,1), (python,1)), c -> List((c,1)), ruby -> List((ruby,1)), javascript -> List((javascript,1)))
    ++++++Map(is -> List((is,1), (is,1)), shell -> List((shell,1)), a -> List((a,1), (a,1)), also -> List((also,1)), language -> List((language,1), (language,1)), brief -> List((brief,1)), python -> List((python,1), (python,1)), It -> List((It,1)), very -> List((very,1)), we -> List((we,1)), like -> List((like,1)))
    &&&&&&&&&&&&&&&&Map(is -> 2, shell -> 1, a -> 2, also -> 1, language -> 2, brief -> 1, python -> 2, It -> 1, very -> 1, we -> 1, like -> 1)
    &&&&&&&&&&&&&&&&Map(java -> 2, c++ -> 3, go -> 1, python -> 2, c -> 1, ruby -> 1, javascript -> 1)
    @@@@@@@@@@@ListBuffer(<function0>, <function0>)******************Map(is -> 2, shell -> 1, a -> 2, java -> 2, c++ -> 3, go -> 1, also -> 1, language -> 2, brief -> 1, python -> 4, It -> 1, c -> 1, ruby -> 1, very -> 1, we -> 1, like -> 1, javascript -> 1)
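    The contents of src/wordcount.txt and src/wordcount1.txt are not shown in the original post; judging from the word lists in the output above, the two input files presumably look something like this (a hypothetical reconstruction, one file per line):

    python is a very brief language It is also a shell language we like python

    python java go python c++ c++ java ruby c javascript c++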
    

    Spark WordCount

    import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

    object WordCount {
    
      def main(args: Array[String]): Unit = {
    
        val spark: SparkSession = SparkSession.builder()
        .appName("wordCount")
        .master("local[*]")
        .getOrCreate()
    
        //Read the data: each line of the file becomes one element of the Dataset
        val ds: Dataset[String] = spark.read.textFile("path/to/word.txt")
        //Import the Spark implicits, otherwise flatMap() on a Dataset will not compile
        import spark.implicits._
        //Clean the data: split each line into words and flatten
        val ds1: Dataset[String] = ds.flatMap(_.split(" "))
        //Register a temporary view
        ds1.createTempView("word")
        //Run the SQL statement, ordering the result by count in descending order
        val df: DataFrame = spark.sql("select value,count(*) count from word group by value order by count desc")
        //Show the result
        df.show()
        //Stop the SparkSession
        spark.stop()
      }
    
    }
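
    The same aggregation can also be expressed without SQL, directly against the Dataset/DataFrame API. A minimal sketch, assuming the same spark session and the ds1 Dataset from the code above ("value" is the default column name of a Dataset[String], which is why the SQL version groups by it):

    import org.apache.spark.sql.functions.col

    //group by the word column, count each group, then sort by the count descending
    val counts: DataFrame = ds1.groupBy("value").count().orderBy(col("count").desc)
    counts.show()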
    

      

    MapReduce WordCount

    mapper

    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    /**
     * Input key:    LongWritable - byte offset of the line within the file
     * Input value:  Text - the content of one line
     * Output key:   Text - a word
     * Output value: IntWritable - the count for that word
     * @author lenovo
     *
     */
    public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
    	
    	Text k =new Text();
    	IntWritable v = new IntWritable(1);
    //	@SuppressWarnings("unused")
    	@Override
    	protected void map(LongWritable key, Text value, Context context)
    			throws IOException, InterruptedException {
    		
        // 1. Convert the line's content to a String
        String line = value.toString();

        // 2. Split the line into words
        String[] words = line.split(" ");
    		
        // 3. Loop over the words and emit each one to the next stage
        for (String word : words) {

            k.set(word);
            context.write(k, v); //emit (word, 1)
        }
    	}
    }  
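
    For an input line such as "hello hello world", this mapper emits the pairs (hello,1), (hello,1) and (world,1); the framework then groups the pairs by key, so the reducer below receives hello -> [1, 1] and world -> [1] (this matches the "hello 1" comments at the top of the reducer).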

    reducer

    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class WordCountReducer extends Reducer<Text, IntWritable, Text,IntWritable>{
    	
    	// hello 1
    	// hello 1
    	
    	@Override
    	//All values that share the same key come in together
    	protected void reduce(Text key, Iterable<IntWritable> values,Context context) 
    			throws IOException, InterruptedException {
        // 1. Sum up the total number of occurrences of this word
    		int sum = 0;
    		for (IntWritable count : values) {
    			sum +=count.get();
    		}
    		
        // 2. Emit the word's total count
    		
    		context.write(key, new IntWritable(sum));
    	}
    }  

    driver

    import java.io.IOException;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.compress.BZip2Codec;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.DefaultCodec;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    
    public class WordCountDriver {
    	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    
        // 1. Set up the job configuration
    		Configuration configuration = new Configuration();
    		
        // Enable compression of the map output
        configuration.setBoolean("mapreduce.map.output.compress", true);
        // Set the codec used to compress the map output
        //configuration.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
        configuration.setClass("mapreduce.map.output.compress.codec", DefaultCodec.class, CompressionCodec.class);
    
    		Job job = Job.getInstance(configuration);
    
        // 2. Set the jar by locating the driver class
    
    		job.setJarByClass(WordCountDriver.class);
    
        // 3. Associate the mapper and the reducer
    		job.setMapperClass(WordCountMapper.class);
    		job.setReducerClass(WordCountReducer.class);
    
        // 4. Set the map output key/value types
    		job.setMapOutputKeyClass(Text.class);
    		job.setMapOutputValueClass(IntWritable.class);
    
        // 5. Set the final output key/value types
    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(IntWritable.class);
    
    		
        // 9. Optionally add a combiner: it merges map output locally before it reaches the reducer.
        //    Not every job can use one; the operation has to meet the requirements (summing counts does).
        //job.setCombinerClass(WordcountCombiner.class);
    		
    		
        // 8. Optionally set the InputFormat used to read and split the input.
        //    CombineTextInputFormat is one way to handle many small files; the default is TextInputFormat.

        //job.setInputFormatClass(CombineTextInputFormat.class);
        //CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);
        //CombineTextInputFormat.setMinInputSplitSize(job, 2097152);
        // 6. Set the input and output paths
    		FileInputFormat.setInputPaths(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));
    		
        // Enable compression of the final (reduce) output
    		FileOutputFormat.setCompressOutput(job, true);
        // Set the output compression codec
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        //FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    		
        // 7. Submit the job and wait for it to finish
    		
    		boolean result = job.waitForCompletion(true);
    		System.exit(result?0:1);
    	}
    }  
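
    One common way to package and run the job, sketched here with a placeholder jar name and placeholder HDFS paths (none of these appear in the original post); args[0] is the input path, args[1] is the output path, and the output directory must not exist before the job runs:

    # submit the job; WordCountDriver is the main class compiled into wordcount.jar
    hadoop jar wordcount.jar WordCountDriver /input/word.txt /output/wordcount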

    combiner

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class WordcountCombiner extends Reducer<Text, IntWritable, Text, IntWritable>{
    	
    	@Override
    	protected void reduce(Text key, Iterable<IntWritable> values,
    			Context context) throws IOException, InterruptedException {
        // 1. Sum up the counts
    		int sum = 0;
    		for (IntWritable value : values) {
    			sum += value.get();
    		}
    		
        // 2. Emit the combined count
    		context.write(key, new IntWritable(sum));
    	}
    } 
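
    To actually use this combiner, uncomment the corresponding line in the driver shown earlier; combining is safe for word count because summing partial counts is associative and commutative:

    job.setCombinerClass(WordcountCombiner.class);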
  • Original article: https://www.cnblogs.com/snow-wolf-1/p/11827088.html