• 大数据学习——spark运营案例


    iplocation需求

    在互联网中,我们经常会见到城市热点图这样的报表数据,例如在百度统计中,会统计今年的热门旅游城市、热门报考学校等,会将这样的信息显示在热点图中。

     

     因此,我们需要通过日志信息(运营商或者网站自己生成)和城市ip段信息来判断用户的ip段,统计热点经纬度。

    练习数据 

    链接:https://pan.baidu.com/s/14IA1pzUWEnDK_VCH_LYRLw
    提取码:pnwv


    package org.apache.spark
    
    import org.apache.spark.broadcast.Broadcast
    import org.apache.spark.rdd.RDD
    import java.io.{BufferedReader, FileInputStream, InputStreamReader}
    import java.sql.{Connection, DriverManager, PreparedStatement}
    
    import org.apache.spark.{SparkConf, SparkContext}
    
    import scala.collection.mutable.ArrayBuffer
    
    /**
      * Created by Administrator on 2019/6/12.
      */
    object IPLocation {

      /** Convert a dotted-quad IPv4 address ("1.2.3.4") to its 32-bit value.
        * Each octet is OR-ed into the accumulator after shifting it left 8 bits.
        */
      def ip2Long(ip: String): Long = {
        val octets: Array[String] = ip.split("[.]")
        var ipNum = 0L
        for (octet <- octets) {
          ipNum = octet.toLong | ipNum << 8L
        }
        ipNum
      }

      /** Binary-search the sorted IP-range table for the row whose
        * [startIp, endIp] interval contains ipNum.
        *
        * @param ipNum numeric IPv4 value produced by [[ip2Long]]
        * @param value rows of (startIp, endIp, location, longitude, latitude);
        *              startIp/endIp are numeric strings, sorted ascending
        * @return index of the matching row, or -1 if no range contains ipNum
        *
        * BUGFIX: the original advanced with `start = middle` / `end = middle`,
        * which never shrinks a two-element window and loops forever whenever
        * ipNum falls in a gap between ranges; use middle +/- 1 so the search
        * window strictly shrinks every iteration.
        */
      def binarySearch(ipNum: Long, value: Array[(String, String, String, String, String)]): Int = {
        var start = 0
        var end = value.length - 1
        while (start <= end) {
          val middle = (start + end) / 2
          if (ipNum >= value(middle)._1.toLong && ipNum <= value(middle)._2.toLong) {
            return middle
          }
          if (ipNum > value(middle)._2.toLong) {
            start = middle + 1
          } else {
            end = middle - 1
          }
        }
        -1
      }

      /** Insert one partition's (location, count) pairs into MySQL.
        *
        * BUGFIX: the original prepared a new statement for every row and closed
        * only the last one, leaking PreparedStatements; prepare once and reuse.
        */
      def data2MySQL(iterator: Iterator[(String, Int)]): Unit = {
        var conn: Connection = null
        var ps: PreparedStatement = null
        val sql = "INSERT INTO location_count (location, total_count) VALUES (?, ?)"
        try {
          conn = DriverManager.getConnection("jdbc:mysql://192.168.74.100:3306/test", "root", "123456")
          ps = conn.prepareStatement(sql)
          iterator.foreach { case (location, count) =>
            ps.setString(1, location)
            ps.setInt(2, count)
            ps.executeUpdate()
          }
        } catch {
          case e: Exception => println(e)
        } finally {
          if (ps != null)
            ps.close()
          if (conn != null)
            conn.close()
        }
      }

      def main(args: Array[String]) {
        val conf = new SparkConf().setAppName("iplocation").setMaster("local[5]")
        val sc = new SparkContext(conf)
        // IP-range table: fields 2/3 are the numeric start/end of the range,
        // fields 4-9 describe the location, fields 13/14 are longitude/latitude.
        // BUGFIX: "E:\ip.txt" and split("\|") contained invalid escape
        // sequences (\i, \|) that do not compile; backslashes must be doubled.
        val jizhanRDD = sc.textFile("E:\\ip.txt").map(_.split("\\|")).map(x => (x(2), x(3), x(4) + "-" + x(5) + "-" + x(6) + "-" + x(7) + "-" + x(8) + "-" + x(9), x(13), x(14)))
        // Materialize the (small) range table on the driver ...
        val jizhanPartRDDToArray: Array[(String, String, String, String, String)] = jizhanRDD.collect()
        // ... and broadcast it so every task reads one shared read-only copy
        // instead of shipping the array with each closure.
        val jizhanRDDToArray: Broadcast[Array[(String, String, String, String, String)]] = sc.broadcast(jizhanPartRDDToArray)

        // Access log: field 1 (pipe-separated) is the client IP.
        val IPS = sc.textFile("E:\\20090121000132.394251.http.format").map(_.split("\\|")).map(x => x(1))

        // Convert each IP to a Long, find its range by binary search, and emit
        // ((longitude, latitude), 1) for the word-count style aggregation.
        val result = IPS.mapPartitions(it => {
          val value: Array[(String, String, String, String, String)] = jizhanRDDToArray.value
          it.flatMap(ip => {
            val ipNum: Long = ip2Long(ip)
            val index: Int = binarySearch(ipNum, value)
            // BUGFIX: binarySearch returns -1 for IPs outside every range; the
            // original indexed the array with -1 and crashed. Skip such records.
            if (index >= 0) Some(((value(index)._4, value(index)._5), 1)) else None
          })
        })

        // Sum the hits per (longitude, latitude).
        val resultFinnal: RDD[((String, String), Int)] = result.reduceByKey(_ + _)

        // Persist each partition over a single JDBC connection.
        resultFinnal.map(x => (x._1._1 + "-" + x._1._2, x._2)).foreachPartition(data2MySQL _)
        sc.stop()

      }

    }

     

    PV案例

    package org.apache.spark
    
    import org.apache.spark.rdd.RDD
    
    /**
      * Created by Administrator on 2019/6/12.
      */
    //PV(Page View)访问量, 即页面浏览量或点击量
    object PV {
      /** PV (Page View): total number of request lines in access.log. */
      def main(args: Array[String]) {
        val conf = new SparkConf().setAppName("pv").setMaster("local[2]")
        val sc = new SparkContext(conf)
        // BUGFIX: "e:\access.log" contained the invalid escape sequence \a,
        // which does not compile; backslashes must be doubled.
        val file: RDD[String] = sc.textFile("e:\\access.log")
        // Every line counts as one page view: map to ("pv", 1).
        val pvAndOne: RDD[(String, Int)] = file.map(x => ("pv", 1))
        // Sum all the ones under the single "pv" key.
        val result = pvAndOne.reduceByKey(_ + _)
        result.foreach(println)
        // Release the SparkContext (missing in the original).
        sc.stop()
      }
    }

     

    UV 

    package org.apache.spark
    
    import org.apache.spark.rdd.RDD
    
    /**
      * Created by Administrator on 2019/6/12.
      */
    //UV(Unique Visitor)独立访客,统计1天内访问某站点的用户数(以cookie为依据);访问网站的一台电脑客户端为一个访客
    object UV {
      /** UV (Unique Visitor): number of distinct client identifiers (first
        * space-separated field of each log line) in access.log.
        */
      def main(args: Array[String]) {
        // BUGFIX: appName was copy-pasted as "pv"; this job computes UV.
        val conf = new SparkConf().setAppName("uv").setMaster("local[2]")
        val sc = new SparkContext(conf)
        // BUGFIX: "e:\access.log" contained the invalid escape sequence \a,
        // which does not compile; backslashes must be doubled.
        val file: RDD[String] = sc.textFile("e:\\access.log")

        // Extract the first field (client IP), de-duplicate, then count.
        val uvAndOne = file.map(_.split(" ")).map(x => x(0)).distinct().map(x => ("uv", 1))

        // Sum all the ones under the single "uv" key.
        val result = uvAndOne.reduceByKey(_ + _)
        result.foreach(println)
        // Release the SparkContext (missing in the original).
        sc.stop()
      }
    }

    pv uv环比分析

    package org.apache.spark
    
    import scala.collection.mutable.ArrayBuffer
    
    /**
      * Created by Administrator on 2019/6/12.
      */
    object Pvbi {
      // NOTE: SparkConf/SparkContext live on the object so the helper methods
      // below can reach them; both are created when Pvbi is first accessed.
      val conf = new SparkConf().setAppName("pv").setMaster("local[7]")
      val sc = new SparkContext(conf)
      // Per-day ("pv", count) / ("uv", count) results, in processing order.
      val PVArr = ArrayBuffer[(String, Int)]()
      val UVArr = ArrayBuffer[(String, Int)]()

      // One access log per day, 2014-08-24 .. 2014-08-30.
      // BUGFIX: the original literals "e:\access/..." contained the invalid
      // escape sequence \a, which does not compile; backslashes are doubled,
      // and the 14 copy-pasted calls are replaced by a loop over the dates.
      private val dailyLogs = (24 to 30).map(day => s"e:\\access/tts7access201408$day.log")

      def main(args: Array[String]) {
        dailyLogs.foreach(computePVOneDay)
        println(PVArr)
        dailyLogs.foreach(computeUVOneDay)
        println(UVArr)
        // Release the SparkContext (missing in the original).
        sc.stop()
      }

      /** Append one day's page-view count ("pv", n) to PVArr. */
      def computePVOneDay(filePath: String): Unit = {
        val file = sc.textFile(filePath)
        val pvTupleOne = file.map(x => ("pv", 1)).reduceByKey(_ + _)
        // BUGFIX: collect() is empty for an empty log; the original indexed
        // element 0 and crashed. Record a zero count instead.
        PVArr += pvTupleOne.collect().headOption.getOrElse(("pv", 0))
      }

      /** Append one day's unique-visitor count ("uv", n) — distinct first
        * space-separated field per line — to UVArr.
        */
      def computeUVOneDay(filePath: String): Unit = {
        val lines = sc.textFile(filePath)
        val distinctVisitors = lines.map(x => x.split(" ")(0)).distinct
        val uvTuple = distinctVisitors.map(x => ("uv", 1)).reduceByKey(_ + _)
        // Same empty-file guard as computePVOneDay.
        UVArr += uvTuple.collect().headOption.getOrElse(("uv", 0))
      }
    }

    TopK

    package org.apache.spark
    
    import org.apache.spark.rdd.RDD
    
    /**
      * Created by Administrator on 2019/6/12.
      */
    object TopK {
      /** Count occurrences of the referrer field (column 10 of the space-split
        * log line) and print the three most frequent entries.
        */
      def main(args: Array[String]) {
        // Configure the application name and local master.
        val conf = new SparkConf().setAppName("topk").setMaster("local[2]")
        // Create the SparkContext from the configuration.
        val sc = new SparkContext(conf)
        // BUGFIX: "e:\access.log" contained the invalid escape sequence \a,
        // which does not compile; backslashes must be doubled.
        val file: RDD[String] = sc.textFile("e:\\access.log")
        // (referrer, 1) per request. BUGFIX: guard against malformed lines
        // with fewer than 11 fields, which made x(10) throw in the original.
        val refUrlAndOne: RDD[(String, Int)] = file.map(_.split(" ")).filter(_.length > 10).map(x => (x(10), 1))

        // Aggregate, sort by count descending, keep the top 3.
        val result: Array[(String, Int)] = refUrlAndOne.reduceByKey(_ + _).sortBy(_._2, false).take(3)

        println(result.toList)
        // Release the SparkContext (missing in the original).
        sc.stop()
      }
    }

     https://web.umeng.com/main.php?c=flow&a=frame&siteid=1254552353#!/1560332550346/flow/trend/1/1254552353/2019-06-12/2019-06-12

    mobile_location案例

    需要的数据

    链接:https://pan.baidu.com/s/1JbGxnrgxcy05LFUmVo8AUQ 
    提取码:h7io 

    package org.apache.spark
    
    import org.apache.spark.rdd.RDD
    
    import scala.collection.mutable.Map
    
    object MobileLocation {

      def main(args: Array[String]) {
        // Run locally with 5 worker threads.
        val conf = new SparkConf().setAppName("UserLocation").setMaster("local[5]")
        val sc = new SparkContext(conf)

        // CDR line format: phone,yyyyMMddHHmmss,stationId,flag
        // (e.g. 18688888888,20160327081200,CC0710CC94ECC657A8561DE549D940E0,1)
        // Office-hours records: 08:00:00 <= HHmmss <= 18:00:00.
        // BUGFIX: "e:\ce\*.log" contained invalid escape sequences (\c) that
        // do not compile; backslashes must be doubled.
        val officetime = sc.textFile("e:\\ce\\*.log")
          .map(_.split(",")).filter(x => (x(1).substring(8, 14) >= "080000" && (x(1).substring(8, 14) <= "180000")))

        // Home-hours records: 18:00:00 < HHmmss <= 24:00:00.
        val hometime = sc.textFile("e:\\ce\\*.log")
          .map(_.split(",")).filter(x => (x(1).substring(8, 14) > "180000" && (x(1).substring(8, 14) <= "240000")))

        // Base-station table: stationId,longitude,latitude,signalType
        // (e.g. 9F36407EAD0629FC166F14DDE7970F68,116.304864,40.050645,6)
        val rdd20 = sc.textFile("e:\\ce\\loc_info.txt")
          .map(_.split(",")).map(x => (x(0), (x(1), x(2))))

        // Count duplicated connect/disconnect pairs per phone_station so the
        // over-counted time can be subtracted later.
        val map1Result = computeCount(officetime)
        val map2Result = computeCount(hometime)
        val mapBro1 = sc.broadcast(map1Result)
        val mapBro2 = sc.broadcast(map2Result)

        // Dwell time during office hours.
        computeOfficeTime(officetime, rdd20, "c://out/officetime", mapBro1.value)

        // Dwell time during home hours.
        computeHomeTime(hometime, rdd20, "c://out/hometime", mapBro2.value)
        sc.stop()
      }

      /**
        * Count the surplus occurrences per "phone_stationId".
        *
        * 1. Group by "phone_stationId_yyyyMMdd": an even-sized group means the
        *    phone produced paired enter/leave events at the same station within
        *    the same day-window (8-18 or 18-24), i.e. the interval was counted
        *    twice and must be subtracted.
        * 2. Accumulate (groupSize / 2) under the "phone_stationId" key; over
        *    several months the surplus for each key keeps adding up.
        *
        * NOTE(review): the returned map keeps its "init" -> 0 seed entry;
        * callers only do getOrElse lookups, so it is harmless but unused.
        */
      def computeCount(rdd1: RDD[Array[String]]): Map[String, Int] = {
        var map = Map(("init", 0))
        // Group records that share "phone_stationId_yyyyMMdd".
        for ((k, v) <- rdd1.groupBy(x => x(0) + "_" + x(2) + "_" + x(1).substring(0, 8)).collect()) {
          // Strip the trailing "_yyyyMMdd" (9 chars) to get "phone_stationId".
          val tmp = map.getOrElse(k.substring(0, k.length() - 9), 0)
          if (v.size % 2 == 0) {
            // Store the number of duplicated pairs under "phone_stationId".
            map += (k.substring(0, k.length() - 9) -> (tmp + v.size / 2))
          }
        }
        map
      }

      /**
        * Compute per-phone dwell time in the home window (18-24), pick each
        * phone's dominant station, and print it joined with the station's
        * longitude/latitude.
        */
      def computeHomeTime(rdd1: RDD[Array[String]], rdd2: RDD[(String, (String, String))], outDir: String, map: Map[String, Int]) {

        // (phone_stationId, hours): connect events (flag 1) count 24 - HH,
        // disconnect events count HH - 20.
        // NOTE(review): "- 20" looks inconsistent with the 18:00 window start
        // — confirm against the original requirement.
        val rdd3 = rdd1.map(x => ((x(0) + "_" + x(2), if (x(3).toInt == 1) 24 - Integer.parseInt(x(1).substring(8, 14)) / 10000
        else Integer.parseInt(x(1).substring(8, 14)) / 10000 - 20)))

        // Total hours per phone_stationId, minus 4h for every duplicated
        // connect/disconnect pair found by computeCount.
        val rdd4 = rdd3.reduceByKey(_ + _).map {
          case (telPhone_zhanId, totalTime) => {
            (telPhone_zhanId, totalTime - (Math.abs(map.getOrElse(telPhone_zhanId, 0)) * 4))
          }
        }

        // Sort by total time, descending.
        val rdd5 = rdd4.sortBy(_._2, false)

        // Split the key into (phone, (stationId, totalTime)).
        val rdd6 = rdd5.map {
          case (telphone_zhanId, totalTime) => (telphone_zhanId.split("_")(0), (telphone_zhanId.split("_")(1), totalTime))
        }

        // Keep each phone's first grouped entry as its dominant station.
        // NOTE(review): this relies on groupByKey preserving the sort order of
        // rdd5, which Spark does not guarantee — confirm before relying on it.
        val rdd7 = rdd6.groupByKey.map {
          case (telphone, buffer) => (telphone, buffer.head)
        }.map {
          case (telphone, (zhanId, totalTime)) => (telphone, zhanId, totalTime)
        }

        // Join with the station table to attach longitude/latitude.
        val rdd8 = rdd7.map {
          case (telphon, zhanId, time) => (zhanId, (telphon, time))
        }.join(rdd2).map {
          // join yields (stationId, ((phone, time), (longitude, latitude)))
          case (zhanId, ((telphon, time), (jingdu, weidu))) => (telphon, zhanId, jingdu, weidu)
        }
        rdd8.foreach(println)
        //rdd8.saveAsTextFile(outDir)
      }

      /**
        * Compute per-phone dwell time in the office window (8-18), pick each
        * phone's dominant station, and print it joined with the station's
        * longitude/latitude. Mirrors [[computeHomeTime]] with the 8-18 bounds.
        */
      def computeOfficeTime(rdd1: RDD[Array[String]], rdd2: RDD[(String, (String, String))], outDir: String, map: Map[String, Int]) {

        // (phone_stationId, hours): connect events (flag 1) count 18 - HH,
        // disconnect events count HH - 8.
        val rdd3 = rdd1.map(x => ((x(0) + "_" + x(2), if (x(3).toInt == 1) 18 - Integer.parseInt(x(1).substring(8, 14)) / 10000
        else Integer.parseInt(x(1).substring(8, 14)) / 10000 - 8)))

        // Total hours per phone_stationId, minus 10h for every duplicated
        // connect/disconnect pair found by computeCount.
        val rdd4 = rdd3.reduceByKey(_ + _).map {
          case (telPhone_zhanId, totalTime) => {
            (telPhone_zhanId, totalTime - (Math.abs(map.getOrElse(telPhone_zhanId, 0)) * 10))
          }
        }

        // Sort by total time, descending.
        val rdd5 = rdd4.sortBy(_._2, false)

        // Split the key into (phone, (stationId, totalTime)).
        val rdd6 = rdd5.map {
          case (telphone_zhanId, totalTime) => (telphone_zhanId.split("_")(0), (telphone_zhanId.split("_")(1), totalTime))
        }

        // Keep each phone's first grouped entry as its dominant station.
        // NOTE(review): same groupByKey ordering caveat as computeHomeTime.
        val rdd7 = rdd6.groupByKey.map {
          case (telphone, buffer) => (telphone, buffer.head)
        }.map {
          case (telphone, (zhanId, totalTime)) => (telphone, zhanId, totalTime)
        }

        // Join with the station table to attach longitude/latitude.
        val rdd8 = rdd7.map {
          case (telphon, zhanId, time) => (zhanId, (telphon, time))
        }.join(rdd2).map {
          case (zhanId, ((telphon, time), (jingdu, weidu))) => (telphon, zhanId, jingdu, weidu)
        }
        rdd8.foreach(println)
        //rdd8.saveAsTextFile(outDir)
      }

    }

  • 相关阅读:
    VS2015中SharedProject与可移植类库(PCL)项目
    Windows.Web.Http.HttpClient.GetStringAsync 总是返回相同的结果
    博客园新闻WP8.1客户端
    Webpack 2 视频教程 001
    快速零配置迁移 API 适配 iOS 对 IPv6 以及 HTTPS 的要求
    免费的 Vue.js 入门与进阶视频教程
    Webpack 3 中的新特性
    使用可视化图表对 Webpack 2 的编译与打包进行统计分析
    React.js 开发参见问题 Q&A
    官方 React 快速上手脚手架 create-react-app
  • 原文地址:https://www.cnblogs.com/feifeicui/p/11009366.html
Copyright © 2020-2023  润新知