• 实际业务代码开发


    数据清洗

    时间工具类开发: DateUtils.scala

    package com.imooc.utils
    
    import java.util.Date
    import org.apache.commons.lang3.time.FastDateFormat
    
    /**
      * 日期时间工具类
      */
    /**
      * Date/time helper for log cleaning: converts timestamps from the
      * raw-log format ("yyyy-MM-dd HH:mm:ss") to the compact target
      * format ("yyyyMMddHHmmss"). Uses FastDateFormat because, unlike
      * SimpleDateFormat, it is thread-safe.
      */
    object DateUtils {

      // Source pattern, e.g. "2019-03-31 06:00:00"
      val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
      // Target pattern, e.g. "20190331060000"
      val TARGET_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss")

      /** Parses a "yyyy-MM-dd HH:mm:ss" string to epoch milliseconds. */
      def getTime(time: String): Long =
        YYYYMMDDHHMMSS_FORMAT.parse(time).getTime

      /** Reformats a "yyyy-MM-dd HH:mm:ss" string as "yyyyMMddHHmmss". */
      def parseToMinute(time: String): String =
        TARGET_FORMAT.format(new Date(getTime(time)))

      // Quick manual smoke test.
      def main(args: Array[String]): Unit =
        println(parseToMinute("2019-03-31 06:12:00"))
    }
    

    定义数据结果

    ClickLog.scala

    package com.imooc
    /**
      * One cleaned access-log record (see the raw sample line below).
      * @param time time of the access, reformatted as "yyyyMMddHHmmss"
      * @param ip client IP address that made the access
      * @param courseId id of the hands-on course page accessed (0 if the URL is not a /class page)
      * @param statusCode HTTP status code of the access
      * @param referer referer of the access
      */
    case class ClickLog(time: String, ip: String, courseId: Int, statusCode: Int, referer: String)
    //    10.25.63.72   2019-03-31 06:17:01 "GET /class/153.html HTTP/1.1   500 https://cn.bing.com/search?q=Spark Streaming实战
    

    主代码:

    ImoocStatStreamingApp.scala

    package com.imooc
    
    import com.imooc.utils.DateUtils
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.kafka.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    
    /**
      * 测试Kafka对接Spark Streaming
      */
    /**
      * Spark Streaming job that consumes raw access logs from Kafka,
      * parses them into ClickLog records, and keeps only hits on
      * hands-on course pages (/class/...).
      *
      * Args: <zkQuorum> <groupId> <topics> <numThreads>
      */
    object ImoocStatStreamingApp {

      def main(args: Array[String]): Unit = {
        if (args.length != 4) {
          System.err.println("Usage: <zkQuorum> <groupId> <topics> <numThreads>")
          System.exit(1)
        }

        val Array(zkQuorum, groupId, topics, numThreads) = args

        val conf = new SparkConf()
          .setAppName("ImoocStatStreamingApp")
          .setMaster("local[2]")
        // 60-second micro-batches
        val ssc = new StreamingContext(conf, Seconds(60))

        // Same receiver-thread count for every subscribed topic.
        val topicMap = topics.split(",").map(topic => (topic, numThreads.toInt)).toMap
        val messages = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)

        // Raw line is tab-separated, e.g.:
        // 10.25.63.72  2019-03-31 06:17:01  "GET /class/153.html HTTP/1.1"  500  https://cn.bing.com/search?q=...
        val cleaned = messages
          .map(_._2) // drop the Kafka message key, keep the log line
          .map { line =>
            val fields = line.split("\t")

            // fields(2) looks like "GET /class/153.html HTTP/1.1";
            // the second token is the requested URL, e.g. /class/153.html
            val url = fields(2).split(" ")(1)

            // Extract the hands-on course number from /class/<id>.html;
            // non-course URLs keep 0 and are filtered out below.
            val courseId =
              if (url.startsWith("/class")) {
                val page = url.split("/")(2)
                page.substring(0, page.lastIndexOf(".")).toInt
              } else {
                0
              }

            // Final cleaned record, e.g.:
            // 20190331065001,29.198.72.55,153,500,https://search.yahoo.com/search?p=...
            ClickLog(DateUtils.parseToMinute(fields(1)), fields(0), courseId, fields(3).toInt, fields(4))
          }
          .filter(_.courseId != 0)

        cleaned.print()

        ssc.start()
        ssc.awaitTermination()
      }
    }
    
  • 相关阅读:
    Fragment中获取Activity的Context (转)
    raw cannot be resolved or is not a field解决办法
    商业分析07_06分布情况分析
    商业分析07_04漏斗分析
    商业分析07_03数据涨跌异动如何处理
    商业分析07_02多维度拆解
    商业分析07_01对比分析
    商业分析07_00概述 数据分析
    商业分析06选择数据工具
    商业分析05如何选取数据指标
  • 原文地址:https://www.cnblogs.com/suixingc/p/shi-ji-ye-wu-dai-ma-kai-fa.html
Copyright © 2020-2023  润新知