// 创建 Spark 运行配置对象 val sparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount") // 创建 Spark 上下文环境对象(连接对象) val sc : SparkContext = new SparkContext(sparkConf) // 读取文件数据 val fileRDD: RDD[String] = sc.textFile("input/word.txt") // 将文件中的数据进行分词 val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") ) // 转换数据结构 word => (word, 1) val word2OneRDD: RDD[(String, Int)] = wordRDD.map((_,1)) // 将转换结构后的数据按照相同的单词进行分组聚合 val word2CountRDD: RDD[(String, Int)] = word2OneRDD.reduceByKey(_+_) // 将数据聚合结果采集到内存中 val word2Count: Array[(String, Int)] = word2CountRDD.collect() // 打印结果 word2Count.foreach(println) //关闭 Spark 连接 sc.stop()
执行过程中,会产生大量的执行日志,如果为了能够更好的查看程序的执行结果,可以在项
目的 resources 目录中创建 log4j.properties 文件,并添加日志配置信息:
log4j.rootCategory=ERROR, console 尚硅谷大数据技术之 Spark ————————————————————————————— 更多 Java –大数据 –前端 –python 人工智能资料下载,可百度访问:尚硅谷官网 8 log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.target=System.err log4j.appender.console.layout=org.apache.log4j.PatternLayout log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n # Set the default spark-shell log level to ERROR. When running the spark-shell, the # log level for this class is used to overwrite the root logger's log level, so that # the user can have different defaults for the shell and regular Spark apps. log4j.logger.org.apache.spark.repl.Main=ERROR # Settings to quiet third party logs that are too verbose log4j.logger.org.spark_project.jetty=ERROR log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR log4j.logger.org.apache.parquet=ERROR log4j.logger.parquet=ERROR # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR