5715问答卡展现query获取:
处理脚本run.sh
#!/bin/bash
#
# run.sh - from one day of merged AE logs, list the queries for which the
# srcid 5715 card was shown (log line contains "res_answerId") and the queries
# for which it was not.
#
# Usage: ./run.sh <date>
#   <date> is the day directory under /app/dt/minos/3/70061577/ on HDFS
#   (the batch launcher passes values such as 20210409).
#
# Files written to the current directory:
#   5715-<date>.log                     every line containing "srcid[5715"
#   5715unshow-<date>.log               those lines without "res_answerId"
#   5715unshow-query-<date>.log         orgquery value of each such line
#   5715unshow-query-sort-uniq-<date>.log  "count query", most frequent first
#   5715show-<date>.log                 those lines with "res_answerId"
#   5715show-query-answerid-<date>.log  "query answerId" per shown line
# The merged raw log medbeauty-<date>.log is temporary and always removed.
#
# Exit status: 0 on success, 1 on a bad argument or any failed step.

set -euo pipefail

readonly HDFS_LOG_ROOT='/app/dt/minos/3/70061577'

# Path of the merged raw log while it exists on disk; emptied once removed.
RAW_LOG=''

say() { printf '%s\n' "$*"; }
die() { printf '%s\n' "$*" >&2; exit 1; }

# Remove the (potentially very large) raw log on every exit path.
cleanup() {
  if [[ -n "${RAW_LOG}" ]]; then
    rm -f -- "${RAW_LOG}"
  fi
}
trap cleanup EXIT

# awk helper: between(s, lhs, rhs) returns the text after the first lhs in s
# up to the first rhs that follows it. It sets FOUND to 1/0 and REST to the
# text after rhs. Plain substring search is used on purpose: the field markers
# contain "[" and "]", which are regex metacharacters, and awk regexes have no
# non-greedy quantifier to stop at the first closing marker.
readonly AWK_BETWEEN='
function between(s, lhs, rhs,    i, j, tail) {
  FOUND = 0
  REST = ""
  i = index(s, lhs)
  if (i == 0) return ""
  tail = substr(s, i + length(lhs))
  j = index(tail, rhs)
  if (j == 0) return ""
  FOUND = 1
  REST = substr(tail, j + length(rhs))
  return substr(tail, 1, j - 1)
}'

main() {
  local day="${1:-}"
  # The date becomes both an HDFS path component and part of every file name,
  # so reject empty values and anything containing "/", blanks or glob chars.
  if [[ ! "${day}" =~ ^[0-9A-Za-z_-]+$ ]]; then
    die "usage: ${0##*/} <date>  (e.g. ${0##*/} 20210409)"
  fi

  local card_log="5715-${day}.log"
  local unshown_log="5715unshow-${day}.log"
  local unshown_queries="5715unshow-query-${day}.log"
  local unshown_ranked="5715unshow-query-sort-uniq-${day}.log"
  local shown_log="5715show-${day}.log"
  local shown_pairs="5715show-query-answerid-${day}.log"
  RAW_LOG="medbeauty-${day}.log"

  say "跑日期为:${day}的5715文章卡展和未展现query数据"
  say "全量日志保存至${RAW_LOG}中"

  # Fetch the whole day. The glob is quoted so that hadoop, not the local
  # shell, expands it.
  hadoop fs -getmerge "${HDFS_LOG_ROOT}/${day}/*/*/" "./${RAW_LOG}" \
    || die "hadoop fs -getmerge failed for ${day}"
  say "全天AE日志拉取成功"

  # Keep only the srcid 5715 lines, then drop the raw log right away to free
  # disk space.
  say "5715日志保存至${card_log}中"
  awk 'index($0, "srcid[5715")' "${RAW_LOG}" > "${card_log}"
  say "5715日志抽取成功"
  rm -f -- "${RAW_LOG}"
  RAW_LOG=''

  # Lines where the card was not shown (no res_answerId field).
  say "5715未展现日志保存至${unshown_log}中..."
  awk '!index($0, "res_answerId")' "${card_log}" > "${unshown_log}"
  say "5715未展现日志抽取成功"

  # Query of every not-shown line; lines without an orgquery field are skipped
  # so they do not show up as a blank entry in the ranking.
  say "5715日志的query保存至${unshown_queries}中..."
  awk "${AWK_BETWEEN}"'
  {
    query = between($0, "orgquery[", "] from")
    if (FOUND) print query
  }' "${unshown_log}" > "${unshown_queries}"
  say "5715未展现日志的query抽取成功"

  # Count identical queries, most frequent first. LC_ALL=C forces byte-wise
  # comparison so distinct non-ASCII queries are never merged by collation.
  say "query合并&排序保存至${unshown_ranked}中..."
  LC_ALL=C sort -- "${unshown_queries}" \
    | LC_ALL=C uniq -c \
    | LC_ALL=C sort -n -r > "${unshown_ranked}"
  say "5715query合并&排序成功"
  say "未展现部分完成"
  say "---------------------"

  # Lines where the card was shown.
  say "5715展现日志保存至${shown_log}中..."
  awk 'index($0, "res_answerId")' "${card_log}" > "${shown_log}"
  say "5715展现日志抽取成功"

  # Query and answerId of every shown line; the answerId field is looked up
  # after the query field, matching the order the original pattern required.
  say "5715展现日志query&answerid保存至${shown_pairs}中..."
  awk "${AWK_BETWEEN}"'
  {
    query = between($0, "orgquery[", "] from")
    if (!FOUND) next
    answer = between(REST, "answerId[", "] resultCode")
    if (FOUND) print query, answer
  }' "${shown_log}" > "${shown_pairs}"
  say "5715展现日志query&answerid保存至完成"
  say "done!!!!!!!!!!!"
}

main "$@"
批量运行batch.sh
# Launch run.sh for each day as a detached background job; each job's stdout
# and stderr are captured in myout-<date>.file.
for day in 20210409 20210410 20210411; do
  nohup ./run.sh "${day}" > "myout-${day}.file" 2>&1 &
done
命中反爬duedge的ip排序提取:
run.sh
#!/bin/bash
#
# run.sh - from one day of merged AE logs, rank the client IPs whose
# duedge_spider score is greater than 9.
#
# Usage: ./run.sh <date>
#   <date> is the day directory under /app/dt/minos/3/70061577/ on HDFS.
#
# Files written to the current directory:
#   duedge-<date>.log       "riskscore ipv4 ipv6" for every log line carrying
#                           user_ip[..], user_ip_v6[..] and a duedge_spider[..]
#                           value that starts with 1-9, in that order
#   duedge-<date>-sort.log  "count ipv4 ipv6" for scores > 9, highest count first
# The merged raw log medbeauty-<date>.log is temporary and always removed.
#
# Exit status: 0 on success, 1 on a bad argument or any failed step.

set -euo pipefail

readonly HDFS_LOG_ROOT='/app/dt/minos/3/70061577'

# Path of the merged raw log while it exists on disk; emptied once removed.
RAW_LOG=''

say() { printf '%s\n' "$*"; }
die() { printf '%s\n' "$*" >&2; exit 1; }

# Remove the (potentially very large) raw log on every exit path.
cleanup() {
  if [[ -n "${RAW_LOG}" ]]; then
    rm -f -- "${RAW_LOG}"
  fi
}
trap cleanup EXIT

# awk helper: between(s, lhs, rhs) returns the text after the first lhs in s
# up to the first rhs that follows it. It sets FOUND to 1/0 and REST to the
# text after rhs. Plain substring search is used on purpose: the field markers
# contain "[" and "]", which are regex metacharacters, and awk regexes have no
# non-greedy quantifier to stop at the first closing marker.
readonly AWK_BETWEEN='
function between(s, lhs, rhs,    i, j, tail) {
  FOUND = 0
  REST = ""
  i = index(s, lhs)
  if (i == 0) return ""
  tail = substr(s, i + length(lhs))
  j = index(tail, rhs)
  if (j == 0) return ""
  FOUND = 1
  REST = substr(tail, j + length(rhs))
  return substr(tail, 1, j - 1)
}'

main() {
  local day="${1:-}"
  # The date becomes both an HDFS path component and part of every file name,
  # so reject empty values and anything containing "/", blanks or glob chars.
  if [[ ! "${day}" =~ ^[0-9A-Za-z_-]+$ ]]; then
    die "usage: ${0##*/} <date>  (e.g. ${0##*/} 20210409)"
  fi

  local duedge_log="duedge-${day}.log"
  local duedge_ranked="duedge-${day}-sort.log"
  RAW_LOG="medbeauty-${day}.log"

  say "跑日期为:${day}的duedgef爬ip排序数据"
  say "全量日志保存至${RAW_LOG}中"

  # Fetch the whole day. The glob is quoted so that hadoop, not the local
  # shell, expands it.
  hadoop fs -getmerge "${HDFS_LOG_ROOT}/${day}/*/*/" "./${RAW_LOG}" \
    || die "hadoop fs -getmerge failed for ${day}"
  say "全天AE日志拉取成功"

  # Extract score and addresses. Lines lacking any of the three fields are
  # skipped here, so no separate blank-line filter or temp file is needed.
  # NOTE(review): an empty user_ip[] or user_ip_v6[] is accepted, as in the
  # original pattern; the output line then has fewer columns and the ranking
  # step below sees the remaining address in column 2 - confirm this is wanted.
  say "反爬日志保存至${duedge_log}中,格式riskscore,ipv4,ipv6"
  awk "${AWK_BETWEEN}"'
  {
    v4 = between($0, "user_ip[", "]")
    if (!FOUND || v4 !~ /^[0-9.]*$/) next
    v6 = between(REST, "user_ip_v6[", "]")
    if (!FOUND || v6 !~ /^[0-9a-zA-Z:]*$/) next
    score = between(REST, "duedge_spider[", "] upModule")
    if (!FOUND || score !~ /^[1-9]/) next
    print score, v4, v6
  }' "${RAW_LOG}" > "${duedge_log}"
  say "反爬日志抽取成功"
  rm -f -- "${RAW_LOG}"
  RAW_LOG=''

  # Keep scores above 9 and count identical address pairs, highest count
  # first. LC_ALL=C keeps sort/uniq comparisons byte-wise and reproducible.
  awk '$1 > 9 { print $2, $3 }' "${duedge_log}" \
    | LC_ALL=C sort \
    | LC_ALL=C uniq -c \
    | LC_ALL=C sort -n -r > "${duedge_ranked}"
  say "保存完成"
  say "done!!!!!!!!!!!"
}

main "$@"