• Search keyword statistics


    #!/bin/bash
    # Count search keywords by analyzing the word logs on hosts 171 and 173
    
    sourceDir="/export/manager/kmsearch/log/wordlog"
    tmpDateFile="/tmp/search_wordlog_tmp.txt"
    tmpSearchWordlog="/tmp/search_wordlog"
    
    
    # make sure the local download directories exist
    mkdir -p $tmpSearchWordlog/171 $tmpSearchWordlog/173

    # build the list of daily log file names to fetch
    startDate="2015-05-04"
    startTimeStamp=`date -d "$startDate" +%s`
    
    endDate="2015-12-31"
    endTimeStamp=`date -d "$endDate" +%s`
    
    echo "" > $tmpDateFile 
    for((i=$startTimeStamp; i<=$endTimeStamp; i=i+86400))
    do
    dateStr=`date -d @$i  "+%Y-%m-%d"`
    echo "$dateStr.txt" >> $tmpDateFile
    done
    
    # download the daily log files from host 171
    echo "downloading from 171..."
    dateArr=$(cat $tmpDateFile)
    for tmpStr in ${dateArr[@]}
    do
        scp root@10.15.200.171:$sourceDir/$tmpStr $tmpSearchWordlog/171/
    done
    
    # download the daily log files from host 173
    echo "downloading from 173..."
    dateArr=$(cat $tmpDateFile)
    for tmpStr in ${dateArr[@]}
    do
        scp root@10.15.200.173:$sourceDir/$tmpStr $tmpSearchWordlog/173/
    done
    
    
    # merge everything into a single file
    echo "combine all data... "
    : > $tmpSearchWordlog/alldata.txt   # truncate the merged file
    dateArr=$(cat $tmpDateFile)
    for tmpStr in ${dateArr[@]}
    do
        cat $tmpSearchWordlog/171/$tmpStr >> $tmpSearchWordlog/alldata.txt
        cat $tmpSearchWordlog/173/$tmpStr >> $tmpSearchWordlog/alldata.txt
    done
    
    
    # statistics over the whole data set in one pass (left commented out)
    #cat $tmpSearchWordlog/alldata.txt | awk -F ',' '{print $2","$6}' | sort | uniq | awk -F ',' '{print $1}'| sort | uniq -c | sort -rn | head -100 | awk '{print $1"	"$2" "$3}' > $tmpSearchWordlog/allTop.txt
    #exit
    
    
    # split the merged file into chunks of 3,000,000 lines (<200 MB each)
    # and count each chunk in a background job
    cd $tmpSearchWordlog
    find . -name 'part.alldata.txt*' | xargs rm -rf
    split -l3000000 alldata.txt part.alldata.txt
    allPartFiles=`find . -name "part.alldata.txt*"`
    for tmpStr in $allPartFiles
    do
        cat $tmpStr | awk -F ',' '{print $2","$6}' | sort | uniq | awk -F ',' '{print $1}' | sort | uniq -c | sort -rn | head -900 | awk '{print $1"\t"$2" "$3}' > ${tmpStr}_Tops.txt &
    done
    
    # wait for all background counting jobs to finish
    echo 'waiting for the background jobs to finish...'
    wait
    
    # merge the per-chunk top lists: swap to "keyword<TAB>count", lowercase, sort
    find . -name 'part.alldata.txt*_Tops.txt' | xargs cat | awk '{print $2"\t"$1}' | tr '[A-Z]' '[a-z]' | sort > partsAllTops.txt
    
    # sum the counts per keyword, rank descending, and drop URL-like entries
    awk '{a[$1]+=$2;}END{for(i in a){print i,a[i];}}' partsAllTops.txt | awk '{print $2"\t"$1}' | sort -rn | grep -v 'www.' | grep -v 'http:' > statistic.result
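
For reference, here is a minimal, self-contained sketch of the per-chunk counting pipeline on fabricated sample data. It assumes the log lines are comma-separated with the search keyword in field 2 and a per-request identifier (e.g. user or IP) in field 6 used for de-duplication; the real log layout is not shown above, so adjust the field numbers to match it.

    # fabricated sample log: field 2 = keyword, field 6 = source identifier (assumed layout)
    printf '%s\n' \
        't1,linux,x,x,x,10.0.0.1' \
        't2,linux,x,x,x,10.0.0.1' \
        't3,linux,x,x,x,10.0.0.2' \
        't4,awk,x,x,x,10.0.0.2' > /tmp/sample_wordlog.txt

    # 1) project each line to a "keyword,source" pair
    # 2) sort | uniq   -> each source counts a keyword at most once
    # 3) keep only the keyword, then count and rank it
    awk -F ',' '{print $2","$6}' /tmp/sample_wordlog.txt | sort | uniq \
        | awk -F ',' '{print $1}' | sort | uniq -c | sort -rn
    # expected ranking: linux counted 2 times, awk 1 time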
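
The final aggregation sums the per-chunk counts with an awk associative array. A small sketch with fabricated input in the same "keyword<TAB>count" shape the merge step produces:

    # fabricated merged per-chunk results: keyword<TAB>count
    printf 'linux\t5\nlinux\t3\nawk\t2\n' > /tmp/partsAllTops.sample

    # sum the counts per keyword, then print as "count<TAB>keyword", highest first
    awk '{a[$1]+=$2} END {for (i in a) print i, a[i]}' /tmp/partsAllTops.sample \
        | awk '{print $2"\t"$1}' | sort -rn
    # expected result: 8<TAB>linux, then 2<TAB>awk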
• Original post: https://www.cnblogs.com/bandbandme/p/5156947.html