• 三、Spark 入门:找出文本中最常用的 5 个单词,并排除常用停用词


    package com.yl.wordcount

    import java.io.File

    import org.apache.spark.{SparkConf, SparkContext}

    import scala.collection.Iterator
    import scala.io.Source

    /**
      * Word count over a text file: splits lines on single spaces, drops empty
      * tokens and stop words, counts the remaining words and writes the
      * (count, word) pairs ordered by descending count.
      */
    object WordCountStopWords {

      /**
        * Loads the stop words from `file`; each line is treated as one stop word.
        *
        * A missing file yields an empty set (no word is filtered) instead of
        * failing, and the underlying reader is always closed.
        *
        * @param file the stop-word file
        * @return the distinct lines of the file, or an empty set if it does not exist
        */
      private def loadStopWords(file: File): Set[String] =
        if (file.exists()) {
          val source = Source.fromFile(file)
          // Materialise the lines before closing: getLines is a lazy iterator.
          try source.getLines().toSet
          finally source.close()
        } else {
          Console.err.println(s"Stop-word file not found, nothing will be filtered: ${file.getPath}")
          Set.empty[String]
        }

      /**
        * Runs the job against the hard-coded input, stop-word and output paths.
        *
        * @param args command-line arguments (unused)
        */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("spark://localhost:7077").setAppName("wordcount")
        val sc = new SparkContext(conf)

        try {
          val outFile = "/Users/admin/spark/sparkoutput"
          // A Set gives constant-time membership checks inside the per-token filter.
          val stopWordSet = loadStopWords(new File("/Users/admin/src" + "/tingyongci.txt"))

          val textFile = sc.textFile("/Users/admin/spark/spark-1.5.1-bin-hadoop2.4/README.md")
          val result = textFile
            .flatMap(_.split(" "))
            .filter(_.nonEmpty)
            .filter(word => !stopWordSet.contains(word))
            .map((_, 1))
            .reduceByKey(_ + _)
            .map(_.swap) // (word, count) -> (count, word) so the count becomes the sort key
            .sortByKey(ascending = false)

          // NOTE(review): the full ranking is written, not only the first few entries.
          result.saveAsTextFile(outFile)
        } finally {
          // Release the context even when the job fails.
          sc.stop()
        }
      }

    }
    http://www.cnblogs.com/ylcoder/
  • 相关阅读:
    JAVA 基本数据类型长度
    字符编码详解
    几种编码方式
    Java1.5泛型指南中文版(Java1.5 Generic Tutorial)
    java泛型小问题
    java中的equals()方法
    Java泛型中E、T、K、V等的含义
    数据库的基本操作
    Mysql数据类型简介(大概了解)
    [BZOJ 2007] 海拔
  • 原文地址:https://www.cnblogs.com/ylcoder/p/5730947.html
Copyright © 2020-2023  润新知