• Spark Streaming: computing each platform's real-time registration revenue for the last minute (time period, platform, amount, order count)


    Sample data:

    __clientip=10.10.9.153&paymentstatus=0&__opip=&memberid=89385239&iamount=1&itype=16&oper_res=1&channeltype=8&__timestamp=1457252427&productid=112&selectbank=&icount=0&ordersrc=web&paymentip=61.159.104.134&orderdate=2016-03-06 16:19:55&subjecttype=zheanaiMessenger&oper_type=1&paydate=&orderamount=259.0&paymentchannel=16&oper_time=2016-03-06 16:20:27&orderid=127145727&iunit=month&bussinessid=80125727&isuse=0
    __clientip=10.10.9.175&paymentstatus=0&__opip=&memberid=89378034&iamount=12&itype=17&oper_res=1&channeltype=75&__timestamp=1457252429&productid=124&selectbank=&icount=0&ordersrc=100&paymentip=59.37.137.119&orderdate=2016-03-06 16:20:29&subjecttype=zheanaiMessenger&oper_type=0&paydate=&orderamount=388.0&paymentchannel=1028&oper_time=2016-03-06 16:20:29&orderid=127145736&iunit=month&bussinessid=8012580&isuse=0
    __clientip=10.10.9.153&paymentstatus=0&__opip=&memberid=75372899&iamount=12&itype=16&oper_res=1&channeltype=&__timestamp=1457252286&productid=131&selectbank=&icount=0&ordersrc=web&paymentip=113.226.244.206&orderdate=2016-03-06 16:18:06&subjecttype=zheanaiMessenger&oper_type=0&paydate=&orderamount=99.0&paymentchannel=307&oper_time=2016-03-06 16:18:06&orderid=127145700&iunit=month&bussinessid=80125477&isuse=0
    __clientip=10.10.9.175&paymentstatus=0&__opip=&memberid=87634711&iamount=1&itype=16&oper_res=1&channeltype=8&__timestamp=1457252432&productid=129&selectbank=&icount=0&ordersrc=web&paymentip=114.246.35.251&orderdate=2016-03-06 16:19:05&subjecttype=zheanaiMessenger&oper_type=1&paydate=&orderamount=19.0&paymentchannel=16&oper_time=2016-03-06 16:20:32&orderid=127145713&iunit=month&bussinessid=66213022&isuse=0
    __clientip=10.10.9.153&paymentstatus=0&__opip=&memberid=89172717&iamount=12&itype=17&oper_res=1&channeltype=77&__timestamp=1457252371&productid=124&selectbank=&icount=0&ordersrc=4&paymentip=111.126.43.83&orderdate=2016-03-06 16:19:31&subjecttype=zheanaiMessenger&oper_type=0&paydate=&orderamount=388.0&paymentchannel=1116&oper_time=2016-03-06 16:19:31&orderid=127145723&iunit=month&bussinessid=8012568&isuse=0

    The Spark processing flow is as follows:

    1. Read: the receiver built into ssc consumes from Kafka, and the valueSplit method parses each message into key-value form (see the parsed example after this list).

    2. Filter: filterRegex acts like a SQL WHERE clause, dropping data we do not need, e.g. keeping only completed payments and discarding mere order placements.
    3. Transform: getPlatform and getFormatDate act like CASE WHEN expressions.
    4. A class named result, with an overridden toString, holds every field we need from the processed Kafka data.
    5. Write to MySQL: insertIntoMySQL is invoked inside each partition.
    The code also uses getOrCreate so the job can recover from a checkpoint, and an accumulator keeps a simple running count of valid records.
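    For example, after step 1 the first sample record above reduces to a plain key-value map (a sketch showing only the fields used later; keys prefixed with "__" are dropped by valueSplit):

        Map("oper_type" -> "1", "paymentstatus" -> "0", "paymentip" -> "61.159.104.134",
            "orderdate" -> "2016-03-06 16:19:55", "ordersrc" -> "web", "itype" -> "16",
            "orderid" -> "127145727", "memberid" -> "89385239", "iamount" -> "1",
            "orderamount" -> "259.0")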

    The code is as follows:

    package com.homed.stream

    /**
     * Created by hadoop on 2016/12/17.
     *
     */
    import java.sql.Connection
    import java.text.SimpleDateFormat
    import java.util.Date

    import org.apache.log4j.PropertyConfigurator
    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.kafka.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
    import org.apache.spark.{SparkConf, SparkContext}
    import org.joda.time.DateTime
    import org.slf4j.LoggerFactory

    import scala.collection.mutable.Map

    object KafkaStreaming {

    val logger = LoggerFactory.getLogger(this.getClass)
    PropertyConfigurator.configure(System.getProperty("user.dir") + "/src/log4j.properties")

    case class result(ftime: String, hour: String, orderid: Long, memberid: Long, platform: String, iamount: Double, orderamount: Double) extends Serializable {
      override def toString: String = "%s %s %d %d %s %.2f %.2f".format(ftime, hour, orderid, memberid, platform, iamount, orderamount)
    }


    def getFormatDate(date: Date, format: SimpleDateFormat): String = {
      format.format(date)
    }
    def stringFormatTime(time: String, simpleformat: SimpleDateFormat): Date = {
      simpleformat.parse(time)
    }

    // Parse a Kafka message value into a Map
    def valueSplit(value: String): Map[String, String] = {
      val x = value.split("&")
      val valueMap: Map[String, String] = Map()
      x.foreach { kvs =>
        if (!kvs.startsWith("__")) {
          val kv = kvs.split("=")
          if (kv.length == 2) {
            valueMap += (kv(0) -> kv(1))
          }
        }
      }
      valueMap
    }

    // Acts like a WHERE clause; tip: apply the most selective conditions first to cut down later work
    def filterRegex(map: Map[String, String]): Boolean = {
      // Filter on operation type: keep payment operations only
      val oper_type = map.getOrElse("oper_type", "-1")
      if (!oper_type.equals("2") && !oper_type.equals("3"))
        return false
      // Drop records that were not paid successfully
      if (!map.getOrElse("paymentstatus", "0").equals("1"))
        return false
      // Drop invalid payment IPs; default to "" instead of null to avoid an NPE
      val paymentip = map.getOrElse("paymentip", "")
      if (paymentip.startsWith("10.10") || paymentip.startsWith("183.62.134") || paymentip.contains("127.0.0.1"))
        return false
      true
    }
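    // Note: all five sample records above carry paymentstatus=0 and oper_type 0/1,
    // so none of them would survive this filter; only records with oper_type 2 or 3
    // (presumably payment and refund) that were paid successfully from a
    // non-internal payment IP reach the transform step.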
    // Acts like a CASE WHEN; the reported p field is not necessarily numeric
    def getPlatform(p: String, x: Int): String = {
      val platformname = (p, x) match {
        case (p, x) if Array[String]("1", "2", "3").contains(p) => "wap"
        case (p, x) if Array[String]("4", "8").contains(p) && x != 18 => "android"
        case (p, x) if Array[String]("5", "7", "51", "100").contains(p) && x != 18 => "ios"
        case _ => "pc"
      }
      platformname
    }
    // Write one row to the database
    def insertIntoMySQL(con: Connection, sql: String, data: result): Unit = {
      // println(data.toString)
      try {
        val ps = con.prepareStatement(sql)
        ps.setString(1, data.ftime)
        ps.setString(2, data.hour)
        ps.setLong(3, data.orderid)
        ps.setLong(4, data.memberid)
        ps.setString(5, data.platform)
        ps.setDouble(6, data.iamount)
        ps.setDouble(7, data.orderamount)
        ps.executeUpdate()
        ps.close()
      } catch {
        case exception: Exception =>
          // log the message together with the full stack trace
          logger.error("Error in execution of query: " + exception.getMessage, exception)
      }
    }
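    // A hypothetical target table matching the insert statement used below
    // (the real DDL is not shown in the original post):
    //   CREATE TABLE t_ssc_toufang_result_mi (
    //     ftime       VARCHAR(8),
    //     hour        VARCHAR(5),
    //     orderid     BIGINT,
    //     memberid    BIGINT,
    //     platform    VARCHAR(16),
    //     iamount     DECIMAL(12,2),
    //     orderamount DECIMAL(12,2)
    //   );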
    def createContext(zkqurm:String,topic:scala.Predef.Map[String,Int],checkPointDir:String): StreamingContext ={


    val simpleformat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val timeFormat = new SimpleDateFormat("HH:mm")

    val sql ="insert into t_ssc_toufang_result_mi(ftime,hour,orderid,memberid,platform,iamount,orderamount) values(?,?,?,?,?,?,?);"


    val conf = new SparkConf()
    conf.setAppName("Scala Streaming read kafka")
    // VM option -Dspark.master=local
    // conf.setMaster("local[4]")
    val sc = new SparkContext(conf)

    val totalcounts = sc.accumulator(0L,"Total count")

    val ssc = new StreamingContext(sc,Seconds(60))
    ssc.checkpoint(checkPointDir) // checkpointing must be enabled for getOrCreate to recover the context
    // Per-platform real-time registration revenue over the last minute: time period, platform, amount, order count
    val lines = KafkaUtils.createStream(ssc, zkqurm, "mytopic_local",topic).map(_._2)

    val filterRecord = lines.filter(x => !x.isEmpty).map(valueSplit).filter(filterRegex).map { x =>
      val orderdate = stringFormatTime(x.getOrElse("orderdate", null), simpleformat)
      val day = getFormatDate(orderdate, dateFormat)
      val hour = getFormatDate(orderdate, timeFormat)
      var orderamount = x.getOrElse("orderamount", "0").toDouble
      // oper_type is a String, so compare against "3" (presumably a refund), which negates the amount
      if (x.getOrElse("oper_type", "-1") == "3")
        orderamount = -1 * orderamount
      val res = result(
        day
        , hour
        , x.getOrElse("orderid", null).toLong
        , x.getOrElse("memberid", null).toLong
        , getPlatform(x.getOrElse("ordersrc", null), x.getOrElse("itype", "0").toInt)
        , x.getOrElse("iamount", "0").toDouble
        , orderamount
      )
      res
    }
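    // For a hypothetical record that passes the filter (oper_type=2,
    // orderdate=2016-03-06 16:19:55, ordersrc=web, itype=16, orderid=127145727,
    // memberid=89385239, iamount=1, orderamount=259.0) this yields:
    //   result("20160306", "16:19", 127145727, 89385239, "pc", 1.0, 259.0)
    // whose toString prints "20160306 16:19 127145727 89385239 pc 1.00 259.00"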

    filterRecord.foreachRDD((x: RDD[result], time: Time) => {
      if (!x.isEmpty()) {
        // Print this batch's processing time and the running count of valid records
        // (the accumulator is read before the partitions are processed, so the
        // current batch is not yet included)
        println("--" + new DateTime(time.milliseconds).toString("yyyy-MM-dd HH:mm:ss") + "--totalcounts:" + totalcounts.value + "-----")
        x.foreachPartition { res =>
          if (!res.isEmpty) {
            val connection = ConnectionPool.getConnection.orNull
            res.foreach { r: result =>
              totalcounts.add(1L)
              insertIntoMySQL(connection, sql, r)
            }
            ConnectionPool.closeConnection(connection)
          }
        }
      }
    })

    ssc
    }
    // Main entry point =================================================================

    def main(args:Array[String]): Unit ={
          val zkqurm = "10.10.10.177:2181,10.10.10.175:2181,10.10.10.179:2181"

          val topic = scala.Predef.Map("t_fw_00015"->30)
          val checkPointDir ="/user/root/sparkcheck"
          val ssc = StreamingContext.getOrCreate(checkPointDir,
                 () => {
                       createContext(zkqurm, topic,checkPointDir)
               })
          ssc.start()
          ssc.awaitTermination()
         }
    }
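    The per-minute, per-platform totals named in the title (time period, platform, amount, order count) can then be produced on the MySQL side. A hypothetical aggregation query over the rows this job writes, using the table and columns from the insert statement above:

        val aggSql =
          """SELECT ftime, hour, platform,
            |       SUM(orderamount)        AS total_amount,
            |       COUNT(DISTINCT orderid) AS order_count
            |FROM t_ssc_toufang_result_mi
            |GROUP BY ftime, hour, platform""".stripMargin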

    The connection pool code is as follows:

    package com.homed.stream

    /**
    * Created by hadoop on 2016/12/17.
    */

    import java.sql.Connection

    import com.jolbox.bonecp.{BoneCP, BoneCPConfig}
    import org.slf4j.LoggerFactory

    object ConnectionPool {

    val logger = LoggerFactory.getLogger(this.getClass)
    private val connectionPool = {
      try {
        Class.forName("com.mysql.jdbc.Driver")
        val config = new BoneCPConfig()
        config.setJdbcUrl("jdbc:mysql://localhost:3306/test")
        config.setUsername("etl")
        config.setPassword("xxxxx")
        config.setLazyInit(true)

        config.setMinConnectionsPerPartition(3)
        config.setMaxConnectionsPerPartition(5)
        config.setPartitionCount(5)
        config.setCloseConnectionWatch(true)
        config.setLogStatementsEnabled(false)

        Some(new BoneCP(config))
      } catch {
        case exception: Exception =>
          // log the message together with the full stack trace
          logger.warn("Error in creation of connection pool", exception)
          None
      }
    }

    def getConnection: Option[Connection] = {
      connectionPool match {
        case Some(connPool) => Some(connPool.getConnection)
        case None => None
      }
    }

    def closeConnection(connection: Connection): Unit = {
      // guard against null: getConnection.orNull may have produced no connection
      if (connection != null && !connection.isClosed) {
        connection.close()
      }
    }
    }
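    A quick standalone sketch of how the pool is used (a hypothetical test harness named PoolCheck, assuming the same MySQL instance is reachable):

        // in package com.homed.stream, alongside ConnectionPool
        object PoolCheck {
          def main(args: Array[String]): Unit = {
            ConnectionPool.getConnection match {
              case Some(con) =>
                // run a trivial query to verify the pooled connection works
                val ps = con.prepareStatement("SELECT 1")
                val rs = ps.executeQuery()
                if (rs.next()) println("pool OK: " + rs.getInt(1))
                rs.close()
                ps.close()
                ConnectionPool.closeConnection(con)
              case None =>
                println("no connection available")
            }
          }
        }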
