• Counting the number of GET requests in a time range from a web access log


    Log data:

    0:0:0:0:0:0:0:1 - - [11/Nov/2016:14:41:31 +0800] "GET /CloudDocLib/portal/deamon/manage.jsp HTTP/1.1" 200 13821
    0:0:0:0:0:0:0:1 - - [11/Nov/2016:14:41:32 +0800] "GET /CloudDocLib/xng/xngAction!listDeamons.action?page=0&count=10&sort=SYMBOL&order=asc&query=STYPE%3AEQA%3BCINDUSTRY.STYLE%3A009%3BCINDUSTRY.STYLECODE%3AZC7&jobListType=1&host=unknown HTTP/1.1" 200 332
    0:0:0:0:0:0:0:1 - - [11/Nov/2016:14:41:40 +0800] "POST /CloudDocLib/xng/xngAction!startDeamon.action HTTP/1.1" 200 132
    **Requirement: count how many GET requests occur in each hour.**
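    Both approaches below split each line on spaces and then cut an hour key out of the timestamp field. A minimal plain-Scala sketch of that step (illustrative only; the `sample`, `fields` and `hourKey` names are not part of the original code):

    // Illustrative sketch: what the split + substring step extracts from one sample line.
    val sample = "0:0:0:0:0:0:0:1 - - [11/Nov/2016:14:41:31 +0800] \"GET /CloudDocLib/portal/deamon/manage.jsp HTTP/1.1\" 200 13821"
    val fields = sample.split(" ")
    val ts = fields(3)                  // "[11/Nov/2016:14:41:31"
    val hourKey = ts.substring(ts.lastIndexOf("/") + 1, ts.lastIndexOf("/") + 8)
    println(hourKey)                    // "2016:14" -> year plus hour, one key per hour of the day
    println(fields(5))                  // "GET with a leading double quote, because split(" ") keeps it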
    The first approach uses Spark SQL.
    Scala code:
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.{SparkConf, SparkContext}

    /**
      * Created by xiaopengpeng on 2016/12/15.
      */
    class countget {

    }

    object countget {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("countget").setMaster("local[*]")
        val spark = SparkSession
          .builder()
          .config(conf)
          .getOrCreate()
        import spark.implicits._
        // Sample line: 0:0:0:0:0:0:0:1 - - [11/Nov/2016:14:41:31 +0800] "GET /CloudDocLib/portal/deamon/manage.jsp HTTP/1.1" 200 13821
        val logDF = spark.sparkContext.textFile("D:\\Program\\apache-tomcat-7.0.72\\logs\\localhost_access_log.2016-11-11.txt")
          // field 3 is "[11/Nov/2016:14:41:31", field 5 is the method token ("GET, "POST, ...)
          .map(line => line.split(" "))
          // keep year plus hour ("2016:14") as the time key, together with the method token
          .map(list => (list(3).substring(list(3).lastIndexOf("/") + 1, list(3).lastIndexOf("/") + 8), list(5)))
          .toDF("time", "method")
        logDF.show()
        logDF.createOrReplaceTempView("log")
        // the method token still carries the log's leading double quote, so match '"GET'
        spark.sql("SELECT time, COUNT(method) FROM log WHERE method = '\"GET' GROUP BY time").show()
      }
    }
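    For comparison, the same count can be written with the DataFrame API instead of registering a temp view. A minimal sketch, assuming the `logDF` built above is still in scope (this variant is not in the original post):

    // Illustrative sketch: the same aggregation via the DataFrame API.
    logDF.filter($"method" === "\"GET")   // the method token still carries the leading double quote
      .groupBy("time")
      .count()
      .orderBy("time")
      .show()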
    The second approach is implemented with plain Scala RDD operations.
    Code:
    import org.apache.spark.SparkConf
    import org.apache.spark.sql.SparkSession

    /**
      * Created by root on 2016/12/15.
      */
    class CountGetByScala {

    }

    object CountGetByScala {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("countget").setMaster("local[*]")
        val spark = SparkSession
          .builder()
          .config(conf)
          .getOrCreate()
        import spark.implicits._
        // Sample line: 0:0:0:0:0:0:0:1 - - [11/Nov/2016:14:41:31 +0800] "GET /CloudDocLib/portal/deamon/manage.jsp HTTP/1.1" 200 13821
        val logLine = spark.sparkContext.textFile("D:\\Program\\apache-tomcat-7.0.72\\logs\\localhost_access_log.2016-11-11.txt")
          .map(line => line.split(" "))
          .map(list => (list(3).substring(list(3).lastIndexOf("/") + 1, list(3).lastIndexOf("/") + 8), list(5)))
        // the method token keeps the log's leading double quote, so compare against "\"GET"
        val filter = logLine.filter(y => y._2.equals("\"GET"))

        // group the (time, method) pairs by the hour key and count each group
        val group = filter.groupBy(line => line._1)
        val result = group.map(g => (g._1, g._2.toList.size))
        result.foreach(x => println(x))
      }
    }
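    A note on the design choice: `groupBy` pulls every record of a group together before counting, which is wasteful on a large log. A more typical RDD pattern is `reduceByKey`. A minimal sketch, assuming the `filter` RDD from above (this variant is not in the original post):

    // Illustrative sketch: count per hour without materializing whole groups.
    val hourlyCounts = filter
      .map { case (time, _) => (time, 1) }   // one count per GET record
      .reduceByKey(_ + _)                    // sum the counts per hour key
    hourlyCounts.foreach(println)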
     
• Original post: https://www.cnblogs.com/itboys/p/6860772.html