Example 1:
teacher.log
http://bigdata.baidu.cn/zhangsan
http://bigdata.baidu.cn/zhangsan
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/wangwu
http://bigdata.baidu.cn/wangwu
http://javaee.baidu.cn/xiaoxu
http://javaee.baidu.cn/xiaoxu
http://javaee.baidu.cn/laoyang
http://javaee.baidu.cn/laoyang
http://javaee.baidu.cn/laoyang
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/wangwu
http://bigdata.baidu.cn/wangwu
http://javaee.baidu.cn/xiaoxu
http://javaee.baidu.cn/xiaoxu
http://javaee.baidu.cn/laoyang
http://javaee.baidu.cn/laoyang
http://javaee.baidu.cn/laoyang
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/lisi
http://bigdata.baidu.cn/wangwu
http://bigdata.baidu.cn/wangwu
http://javaee.baidu.cn/xiaoxu
http://javaee.baidu.cn/xiaoxu
http://javaee.baidu.cn/laoyang
http://javaee.baidu.cn/laoyang
http://javaee.baidu.cn/laoyang
http://php.baidu.cn/laoli
http://php.baidu.cn/laoliu
http://php.baidu.cn/laoli
http://php.baidu.cn/laoli
Requirements: global top-N (top 3 teachers overall) and per-group top-N (top 2 teachers within each subject).
Code:
package dayo1

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object teacher2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[1]")
    val sc = new SparkContext(conf)

    // Backslashes in Windows paths must be escaped in Scala string literals
    val lines = sc.textFile("E:\\teacher.log")

    // Parse each URL into ((teacher, subject), 1)
    val overAll: RDD[((String, String), Int)] = lines.map(tp => {
      val teacher: String = tp.split("/").last
      val host = new URL(tp).getHost
      val subject = host.substring(0, host.indexOf("."))
      ((teacher, subject), 1)
    })

    // Top 3 over all subjects and teachers
    overAll.reduceByKey(_ + _).sortBy(-_._2).take(3).foreach(println)

    // Top 2 teachers within each subject
    overAll.reduceByKey(_ + _)
      .groupBy(_._1._2)
      .mapValues(_.toList.sortBy(-_._2).take(2))
      .foreach(println)

    sc.stop()
  }
}
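One caveat on the per-subject step: groupBy materializes every (teacher, count) pair of a subject in memory at once, which can become a bottleneck when one group is very large. As a sketch of a more scalable alternative (not part of the original example; it reuses the overAll RDD defined above, and top2PerSubject is a hypothetical name), aggregateByKey can keep only a running top 2 per subject:

// Sketch: keep only the top 2 counts per subject while aggregating,
// so no full group is ever materialized on a single executor.
val top2PerSubject = overAll
  .reduceByKey(_ + _)                             // ((teacher, subject), count)
  .map { case ((teacher, subject), cnt) => (subject, (teacher, cnt)) }
  .aggregateByKey(List.empty[(String, Int)])(
    (acc, v) => (v :: acc).sortBy(-_._2).take(2), // fold one value into the running top 2
    (a, b) => (a ++ b).sortBy(-_._2).take(2)      // merge two partial top-2 lists
  )
top2PerSubject.foreach(println)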
Example 2:
Requirement: deduplicate the records of the two files.
file1:
2012-3-1 a
2012-3-2 b
2012-3-3 c
2012-3-4 d
2012-3-5 a
2012-3-6 b
2012-3-7 c
2012-3-3 c

file2:
2012-3-1 b
2012-3-2 a
2012-3-3 b
2012-3-4 d
2012-3-5 a
2012-3-6 c
2012-3-7 d
2012-3-3 c
Code:
package dayo1

import org.apache.spark.{SparkConf, SparkContext}

object distinct {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[1]")
    val sc = new SparkContext(conf)

    val file1 = sc.textFile("E:\\file1.txt")
    val file2 = sc.textFile("E:\\file2.txt")

    // Union the two files, drop duplicate lines, and sort the result
    val list = file1.union(file2).distinct().sortBy(tp => tp)
    list.foreach(println)

    sc.stop()
  }
}
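For intuition, RDD.distinct() is essentially a shuffle in disguise: each line is paired with a dummy value, duplicate keys are collapsed with reduceByKey, and only the keys are kept. A hand-rolled sketch of what union + distinct does above (assuming the same file1 and file2 RDDs; deduped is a hypothetical name):

// Rough equivalent of file1.union(file2).distinct(): pair each line with a
// dummy value, collapse duplicate keys, then keep only the keys.
val deduped = file1.union(file2)
  .map(line => (line, null))
  .reduceByKey((a, _) => a)
  .keys
  .sortBy(identity)
deduped.foreach(println)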
Example 3:
temperature.txt
0067011990999991950051507004888888889999999N9+00001+9999999999999999999999
0067011990999991950051512004888888889999999N9+00221+9999999999999999999999
0067011990999991950051518004888888889999999N9-00111+9999999999999999999999
0067011990999991949032412004888888889999999N9+01111+9999999999999999999999
0067011990999991950032418004888888880500001N9+00001+9999999999999999999999
0067011990999991950051507004888888880500001N9+00781+9999999999999999999999
Requirement: find the maximum temperature for each year.
Code:
package dayo1

import org.apache.spark.{SparkConf, SparkContext}

/**
 * 0067011990999991950051507004888888889999999N9+00001+9999999999999999999999
 * 0067011990999991950051512004888888889999999N9+00221+9999999999999999999999
 * 0067011990999991950051518004888888889999999N9-00111+9999999999999999999999
 * 0067011990999991949032412004888888889999999N9+01111+9999999999999999999999
 * 0067011990999991950032418004888888880500001N9+00001+9999999999999999999999
 * 0067011990999991950051507004888888880500001N9+00781+9999999999999999999999
 *
 * 12345678911234567892123456789312345678941234567895123456789612345678971234  (character-position ruler)
 *
 * Requirement: find the maximum temperature for each year.
 * Data layout:
 *   - characters 15-19 hold the year
 *   - characters 45-50 hold the temperature: '+' means above zero, '-' below zero,
 *     and a value of 9999 marks an invalid record
 *   - the character at position 50 is a quality code and can only be 0, 1, 4, 5 or 9
 */
object temperature {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("E:\\temperature.txt")

    val yearAndTemp = lines.filter(tp => {
      // quality code at index 50 must be one of 0, 1, 4, 5, 9
      val quality = tp.charAt(50).toString
      // String.toInt parses a leading '+' or '-', so both signs are handled here
      val temp = tp.substring(45, 50).toInt
      temp != 9999 && quality.matches("[01459]")
    }).map(tp => {
      val year = tp.substring(15, 19)
      val temp = tp.substring(45, 50).toInt
      (year, temp)
    })

    // keep the larger temperature for each year
    val res = yearAndTemp.reduceByKey((x, y) => if (x > y) x else y)
    res.foreach(tp => println("year:" + tp._1 + " temp:" + tp._2))

    sc.stop()
  }
}
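To sanity-check the fixed-position parsing without touching the file system, the same pipeline can be fed from an in-memory collection. A minimal sketch, assuming the SparkContext from the code above and using two of the sample records from the header comment (the printed values follow from the positions described there):

// Quick local check of the parsing logic against two sample records.
val sample = sc.parallelize(Seq(
  "0067011990999991950051507004888888889999999N9+00001+9999999999999999999999",
  "0067011990999991949032412004888888889999999N9+01111+9999999999999999999999"
))
val parsed = sample.map(tp => (tp.substring(15, 19), tp.substring(45, 50).toInt))
parsed.reduceByKey(math.max(_, _)).collect().foreach(println)
// prints (1950,0) and (1949,111)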