• Shell编程之文本处理


    cut 截取指定列

    可以按照某个字符进行分割,然后取出其中的指定列:

    [root@iz8vbbqbnh4ug2q9so5jflz logs]# cat  localhost_access_log.2017-12-02.txt
    140.205.201.30 - - [02/Dec/2017:00:15:24 +0800] "GET / HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:17:51 +0800] "GET /rs-status HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:19:09 +0800] "GET /ganglia/index.php HTTP/1.1" 404 -
    164.132.91.1 - - [02/Dec/2017:00:22:21 +0800] "GET / HTTP/1.1" 404 -
    114.215.45.101 - - [02/Dec/2017:00:23:43 +0800] "GET / HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:32:41 +0800] "GET /index.php HTTP/1.1" 404 -
    140.205.201.30 - - [02/Dec/2017:00:39:08 +0800] "GET /jobs/ HTTP/1.1" 404 -
    [root@iz8vbbqbnh4ug2q9so5jflz logs]# cat  localhost_access_log.2017-12-02.txt |cut -d ' ' -f 6
    "GET
    "GET
    "GET
    "POST
    "GET
    "POST
    "GET
    "POST
    "GET
    "POST
    "GET
    "POST
    "GET
    "POST
    "GET
    "POST
    "GET
    "POST
    "GET
    "POST
    "GET
    "POST
    "GET
    "GET
    "GET
    "GET
    "GET

    可以指定更多的列:

    [root@iz8vbbqbnh4ug2q9so5jflz logs]# cat  localhost_access_log.2017-12-02.txt |cut -d ' ' -f 2,3,4
    - - [02/Dec/2017:00:15:24
    - - [02/Dec/2017:00:17:51
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:06
    - - [02/Dec/2017:00:19:07
    - - [02/Dec/2017:00:19:07
    - - [02/Dec/2017:00:19:07
    - - [02/Dec/2017:00:19:07
    - - [02/Dec/2017:00:19:07
    - - [02/Dec/2017:00:19:09
    - - [02/Dec/2017:00:22:21
    - - [02/Dec/2017:00:23:43
    - - [02/Dec/2017:00:32:41
    - - [02/Dec/2017:00:39:08
    [root@iz8vbbqbnh4ug2q9so5jflz logs]# cat  localhost_access_log.2017-12-02.txt |cut -d ' ' -f 2,3,6-
    - - "GET / HTTP/1.1" 404 -
    - - "GET /rs-status HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /phpmyadmin/ HTTP/1.1" 404 -
    - - "POST /phpmyadmin/ HTTP/1.1" 404 -
    - - "GET /ganglia/index.php HTTP/1.1" 404 -
    - - "GET / HTTP/1.1" 404 -
    - - "GET / HTTP/1.1" 404 -
    - - "GET /index.php HTTP/1.1" 404 -
    - - "GET /jobs/ HTTP/1.1" 404 -

     sort 对列进行排序

    例如,对tomcat访问日志,按响应返回大小(第10列)进行排序:

    cat localhost_access_log.2017-12-01.txt |sort -t ' ' -k 10

    -t : 指定分隔符

    -k : 指定排序的列

    114.241.108.197 - - [01/Dec/2017:09:03:45 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
    114.241.108.197 - - [01/Dec/2017:11:45:30 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
    114.241.108.197 - - [01/Dec/2017:14:41:04 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
    223.72.82.98 - - [01/Dec/2017:15:26:10 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
    59.108.217.106 - - [01/Dec/2017:09:35:17 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
    59.108.217.106 - - [01/Dec/2017:13:08:46 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
    114.241.108.197 - - [01/Dec/2017:09:03:32 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
    114.241.108.197 - - [01/Dec/2017:11:28:29 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
    114.241.108.197 - - [01/Dec/2017:14:40:51 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
    223.72.82.98 - - [01/Dec/2017:15:26:03 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
    59.108.217.106 - - [01/Dec/2017:09:35:01 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
    59.108.217.106 - - [01/Dec/2017:09:35:10 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
    59.108.217.106 - - [01/Dec/2017:13:08:52 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
    114.241.108.197 - - [01/Dec/2017:12:00:15 +0800] "GET /interview/detail.do?manageKey=15ba76c6fbeeccd2f8df875379ac88e9&targetPanel=dialog HTTP/1.1" 200 9952
    59.108.217.106 - - [01/Dec/2017:16:44:53 +0800] "GET /interview/detail.do?manageKey=15ba76c6fbeeccd2f8df875379ac88e9&targetPanel=dialog HTTP/1.1" 200 9952
    59.108.217.106 - - [01/Dec/2017:16:44:57 +0800] "GET /interview/detail.do?manageKey=15ba76c6fbeeccd2f8df875379ac88e9&targetPanel=dialog HTTP/1.1" 200 9952

    排序是有方向的,默认是升序排序,如果要降序排列,可以在列号后面增加一个r:

    cat localhost_access_log.2017-12-01.txt |sort -t ' ' -k 10r

    最后要注意的是,这里的排序默认是按字符串的字典顺序排列的,如果要按其数值排序,则需要增加一个n:

     cat localhost_access_log.2017-12-01.txt |sort -t ' ' -k 10n
    114.241.108.197 - - [01/Dec/2017:09:03:28 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
    114.241.108.197 - - [01/Dec/2017:11:28:29 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
    114.241.108.197 - - [01/Dec/2017:14:40:49 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
    223.72.82.98 - - [01/Dec/2017:15:25:59 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
    59.108.217.106 - - [01/Dec/2017:09:34:56 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
    59.108.217.106 - - [01/Dec/2017:09:35:06 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
    59.108.217.106 - - [01/Dec/2017:13:08:43 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
    112.65.193.14 - - [01/Dec/2017:11:28:44 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
    114.241.108.197 - - [01/Dec/2017:09:03:30 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
    114.241.108.197 - - [01/Dec/2017:11:28:33 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
    114.241.108.197 - - [01/Dec/2017:14:40:49 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
    223.72.82.98 - - [01/Dec/2017:15:26:01 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
    59.108.217.106 - - [01/Dec/2017:09:34:56 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
    59.108.217.106 - - [01/Dec/2017:09:35:06 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
    59.108.217.106 - - [01/Dec/2017:13:08:43 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844

     由此可见,此网站最大的静态资源是这个jquery-ui.min.js文件。

    uniq去重

    uniq只会去除相邻的重复行,因此通常要先用sort排序,再交给uniq去重:

     cat localhost_access_log.2017-12-01.txt |cut -d ' ' -f 1,10 |sort -t ' ' -k 2n,1|uniq
    223.72.82.98 61692
    59.108.217.106 61692
    114.241.108.197 95786
    223.72.82.98 95786
    59.108.217.106 95786
    114.241.108.197 116060
    223.72.82.98 116060
    59.108.217.106 116060
    112.65.193.14 284394
    114.241.108.197 284394
    223.72.82.98 284394
    59.108.217.106 284394
    114.241.108.197 394554
    223.72.82.98 394554
    59.108.217.106 394554
    112.65.193.14 435844
    114.241.108.197 435844
    223.72.82.98 435844
    59.108.217.106 435844

    wc统计

    [root@iZ25klm6k7uZ logs]# wc -l localhost_access_log.2017-12-01.txt  统计行数
    1967 localhost_access_log.2017-12-01.txt
    [root@iZ25klm6k7uZ logs]# wc -w localhost_access_log.2017-12-01.txt  统计词数
    19670 localhost_access_log.2017-12-01.txt
    [root@iZ25klm6k7uZ logs]# wc -m localhost_access_log.2017-12-01.txt  统计字符数
    219011 localhost_access_log.2017-12-01.txt
    [root@iZ25klm6k7uZ logs]# 

    sed正则查找

    用sed来查找500的日志信息:

    [root@iZ25klm6k7uZ logs]# sed -n '/500/p' localhost_access_log.2017-12-01.txt
    119.127.17.97 - - [01/Dec/2017:14:23:18 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:23:24 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:24:12 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:31:11 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:49:51 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:49:57 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:55:45 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:58:03 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    59.108.217.106 - - [01/Dec/2017:15:00:22 +0800] "POST /interview/add.do HTTP/1.1" 500 19582

    注意:-n选项和p命令配合,表示只打印匹配的行(-n关闭默认输出,p打印匹配到的行)。

    awk正则匹配

    用awk来查找500日志信息:

    awk '($9 ~ /500/)' localhost_access_log.2017-12-01.txt 

    输出和上面的sed一样。

    awk有默认的分隔符,比如制表符、空格等。如果要指定分隔符可以用-F。

    awk的强大之处在于它支持编程,格式如下:

    awk 'pattern { action }' file

    例如上面的查找500日志可以完整表达如下:

    [root@iZ25klm6k7uZ logs]# awk -F ' ' '($9 ~ /500/){print }' localhost_access_log.2017-12-01.txt 
    119.127.17.97 - - [01/Dec/2017:14:23:18 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:23:24 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:24:12 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:31:11 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:49:51 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:49:57 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:55:45 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    119.127.17.97 - - [01/Dec/2017:14:58:03 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
    59.108.217.106 - - [01/Dec/2017:15:00:22 +0800] "POST /interview/add.do HTTP/1.1" 500 19582

    同时查找500和404的日志:

    awk -F ' ' '($9 ~ /500/ || $9 ~ /404/){print $1,$6,$7,$9}' localhost_access_log.2017-12-01.txt

    或者

    awk -F ' ' '($9 ~ /500|404|400/){print $1,"-",$4,"-",$6,"-",$9}' localhost_access_log.2017-12-01.txt
  • 相关阅读:
    silverlight 跨域文件位置
    Asp.net弹出新窗口,获得返回值
    开通CNblog咯。
    访问IIS元数据库失败
    li可以显示多列
    [转]vs2005 sp1 下载地址、安装方法更新非常慢
    英特尔未来教育核心课程
    很好用的软键盘
    CMD里显示彩色文字
    将输入的中文按要求翻译成拼音
  • 原文地址:https://www.cnblogs.com/at0x7c00/p/7945275.html
Copyright © 2020-2023  润新知