运行
mport org.apache.log4j.{Level, Logger} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} /** * Created by Lee_Rz on 2017/8/30. */ object SparkDemo { def main(args: Array[String]) { Logger.getLogger("org.apache.spark").setLevel(Level.OFF) val sc: SparkContext = new SparkContext(new SparkConf().setAppName(this.getClass().getName()).setMaster("local[2]")) val rdd1: RDD[String] = sc.textFile("C:\Users\166\Desktop\text.txt") //一行一行的读数据 //懒算子 val key: RDD[(String, Int)] = rdd1.flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_) println(key.collect().toBuffer)//收集到Driver } }
报错
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 17/09/02 13:01:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 17/09/02 13:01:17 INFO Slf4jLogger: Slf4jLogger started 17/09/02 13:01:17 INFO Remoting: Starting remoting 17/09/02 13:01:17 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.0.166:51388] 17/09/02 13:01:18 ERROR Shell: Failed to locate the winutils binary in the hadoop binary path java.io.IOException: Could not locate executable nullinwinutils.exe in the Hadoop binaries. at org.apache.hadoop.util.Shell.getQualifiedBinPath(Shell.java:278) at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:300) at org.apache.hadoop.util.Shell.<clinit>(Shell.java:293) at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:76) at org.apache.hadoop.mapred.FileInputFormat.setInputPaths(FileInputFormat.java:362) at org.apache.spark.SparkContext$$anonfun$hadoopFile$1$$anonfun$33.apply(SparkContext.scala:1015) at org.apache.spark.SparkContext$$anonfun$hadoopFile$1$$anonfun$33.apply(SparkContext.scala:1015) at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:176) at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:176) at scala.Option.map(Option.scala:145) at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:176) at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:195) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) at scala.Option.getOrElse(Option.scala:120) at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) at scala.Option.getOrElse(Option.scala:120) at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) at scala.Option.getOrElse(Option.scala:120) at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) at scala.Option.getOrElse(Option.scala:120) at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) at org.apache.spark.Partitioner$.defaultPartitioner(Partitioner.scala:65) at org.apache.spark.rdd.PairRDDFunctions$$anonfun$reduceByKey$3.apply(PairRDDFunctions.scala:331) at org.apache.spark.rdd.PairRDDFunctions$$anonfun$reduceByKey$3.apply(PairRDDFunctions.scala:331) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111) at org.apache.spark.rdd.RDD.withScope(RDD.scala:316) at org.apache.spark.rdd.PairRDDFunctions.reduceByKey(PairRDDFunctions.scala:330) at zx.SparkDemo$.main(SparkDemo.scala:15) at zx.SparkDemo.main(SparkDemo.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at com.intellij.rt.execution.application.AppMain.main(AppMain.java:140) 17/09/02 13:01:19 INFO FileInputFormat: Total input paths to process : 1 17/09/02 13:01:19 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id 17/09/02 13:01:19 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id 17/09/02 13:01:19 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap 17/09/02 13:01:19 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition 17/09/02 13:01:19 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id ArrayBuffer((are,2), (hello,1), (any,1), (ok,4), (world,1), (me,1), (alone,1), (you,2), (no,1), (believie,1), (more,1)) 17/09/02 13:01:19 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon. Process finished with exit code 0
检查发现hadoop下bin目录下已经存在winutils.exe,检查hadoop的path路径,发现没有严格按照格式创建hadoop的path,真确的格式是HADOOP_HOME=......,因为在hadoop的生态圈中很多框架都是依赖hadoop的,所以他们的配置文件中,默认的export的hadoop路径是格式是HADOOP_HOME