###############################################################################
#
Name : Mahavairocana
#
Author : Mahavairocana
#
QQ : 10353512
#
WeChat : shenlan-qianlan
#
Blog : http://www.cnblogs.com/Mahavairocana/
#
Description : You are welcome to reprint this article; please include a
#
hyperlink to the source of the article, as well as the author
information.
###############################################################################
1、菜鸟版
1、查看主机网卡流量
#!/bin/bash
# network
# Prints the inbound/outbound rate of eth0 in an endless loop (stop with Ctrl-C).
while :; do
  time=$(date +%m"-"%d" "%k":"%M)
  # NOTE(review): line 8 of `ifconfig eth0` is assumed to be the
  # "RX bytes:N ... TX bytes:N" line of the legacy net-tools layout -- confirm
  # on the target host.
  rx_before=$(ifconfig eth0 | sed -n "8"p | awk '{print $2}' | cut -c7-)
  tx_before=$(ifconfig eth0 | sed -n "8"p | awk '{print $6}' | cut -c7-)
  sleep 2
  rx_after=$(ifconfig eth0 | sed -n "8"p | awk '{print $2}' | cut -c7-)
  tx_after=$(ifconfig eth0 | sed -n "8"p | awk '{print $6}' | cut -c7-)
  # Byte delta over the 2-second window divided by 256
  # (bytes * 8 / 1024 / 2 = bytes / 256), printed with a kbps label.
  rx_result=$(( (rx_after - rx_before) / 256 ))
  tx_result=$(( (tx_after - tx_before) / 256 ))
  echo "$time Now_In_Speed: ${rx_result}kbps Now_OUt_Speed: ${tx_result}kbps"
  sleep 2
done

2、系统状况监控
#!/bin/sh
# systemstat.sh
# Appends one snapshot of CPU, memory, disk and connection figures to the
# text files under ./temp.
ip=192.168.1.227 # not referenced anywhere below
# The output directory must exist before the redirections below can append.
mkdir -p ./temp
# -b (batch mode) makes top emit plain text when stdout is not a terminal.
top -b -n 2 | grep "Cpu" >> ./temp/cpu.txt
free -m | grep "Mem" >> ./temp/mem.txt
df -k | grep "sda1" >> ./temp/drive_sda1.txt
#df -k | grep sda2 >> ./temp/drive_sda2.txt
df -k | grep "/mnt/storage_0" >> ./temp/mnt_storage_0.txt
df -k | grep "/mnt/storage_pic" >> ./temp/mnt_storage_pic.txt
time=$(date +%m"."%d" "%k":"%M)
# Number of netstat lines that mention the watched endpoint.
connect=$(netstat -na | grep -c "219.238.148.30:80")
echo "$time $connect" >> ./temp/connect_count.txt

3、监控主机的磁盘空间,当使用空间超过90%就通过发mail来发警告
#!/bin/bash
# monitor available disk space
# Usage percentage of the root filesystem, without the trailing "%".
SPACE=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
if [ "$SPACE" -ge 90 ]
then
  # NOTE(review): the mail command itself was lost in the original text (only
  # the recipient survived); the subject and body here are a reconstruction.
  echo "Disk space on / is at ${SPACE}% used" | mail -s "Disk warning" fty89@163.com
fi

4、监控CPU和内存的使用情况
#!/bin/bash
# script to capture system statistics
# Appends one CSV row "date,time,users,load,free,idle" to $OUTFILE.
OUTFILE=/home/xu/capstats.csv
DATE=$(date +%m/%d/%Y)
# %M and %S are minute and second (%m and %s would be month and epoch seconds).
TIME=$(date +%k:%M:%S)
TIMEOUT=$(uptime)
VMOUT=$(vmstat 1 2)
# The user count is the field right before "user(s),"; its position shifts
# with the length of the uptime string, so it is located by name.
USERS=$(printf '%s\n' "$TIMEOUT" |
  awk '{ for (i = 2; i <= NF; i++) if ($i ~ /^users?,?$/) { print $(i - 1); exit } }')
# Second-to-last field of uptime (the middle load-average figure), comma removed.
LOAD=$(printf '%s\n' "$TIMEOUT" | awk '{ print $(NF - 1) }' | sed 's/,//')
# Last line of `vmstat 1 2` is the second sample; quoting keeps the line breaks.
FREE=$(printf '%s\n' "$VMOUT" | tail -n 1 | awk '{ print $4 }')
IDLE=$(printf '%s\n' "$VMOUT" | tail -n 1 | awk '{ print $15 }')
echo "$DATE,$TIME,$USERS,$LOAD,$FREE,$IDLE" >> "$OUTFILE"

5、全方位监控主机
#!/bin/bash
# check_xu.sh
# 0 * * * * /home/check_xu.sh
# Starts top, sar, vmstat and iostat in the background; each takes COUNT
# samples DELAY seconds apart and logs into DIR.
DAT="$(date +%Y%m%d)"
HOUR="$(date +%H)"
DIR="/home/oslog/host_${DAT}/${HOUR}"
DELAY=60
COUNT=60
# create the target directory if it does not exist yet
if ! test -d "${DIR}"
then
  /bin/mkdir -p "${DIR}"
fi
# general check
export TERM=linux
/usr/bin/top -b -d ${DELAY} -n ${COUNT} > "${DIR}/top_${DAT}.log" 2>&1 &
# cpu check
/usr/bin/sar -u ${DELAY} ${COUNT} > "${DIR}/cpu_${DAT}.log" 2>&1 &
#/usr/bin/mpstat -P 0 ${DELAY} ${COUNT} > ${DIR}/cpu_0_${DAT}.log 2>&1 &
#/usr/bin/mpstat -P 1 ${DELAY} ${COUNT} > ${DIR}/cpu_1_${DAT}.log 2>&1 &
# memory check
/usr/bin/vmstat ${DELAY} ${COUNT} > "${DIR}/vmstat_${DAT}.log" 2>&1 &
# I/O check
/usr/bin/iostat ${DELAY} ${COUNT} > "${DIR}/iostat_${DAT}.log" 2>&1 &
# network check
/usr/bin/sar -n DEV ${DELAY} ${COUNT} > "${DIR}/net_${DAT}.log" 2>&1 &
#/usr/bin/sar -n EDEV ${DELAY} ${COUNT} > ${DIR}/net_edev_${DAT}.log 2>&1 &
2、进阶版
#!/bin/bash
#################################################
# Host health monitoring script
# (monitors: memory, CPU, disk, network interface)
#
# V1.0 Written by: MR.G Date:2012-03-20
##################################################
# Reads a whitespace-separated host list from config.ini, runs every check
# below on each host over ssh, and appends one status line per check to
# <YYYY-MM-DD>.log in the current directory.
# bash is required: [[ ]], &>, arrays and process substitution are used.
export LANG=C

# Administrator mailbox that receives alerts
Email=zhangxiaogang@8tgame.com
# Timestamp written in front of every log line of this run
time=$(date "+%Y-%m-%d %H:%M:%S")
# Log file (one per day)
log=$(date +%Y-%m-%d).log
# Path of the configuration file (list of hosts)
config=config.ini
# Mail alerts are disabled by default; set to 1 to send them.
mail_enabled=0

# ---------------------------------------------------------------------------
# Generic helpers
# ---------------------------------------------------------------------------

# Append "<time> <message>" to the log file.  $1 - message
log_line() {
  printf '%s %s\n' "$time" "$1" >> "$log"
}

# Mail an alert to $Email when mail_enabled is 1.  $1 - subject, $2 - body
send_alert() {
  [[ "$mail_enabled" -eq 1 ]] || return 0
  printf '%s\n' "$2" | mail -s "$1" "$Email"
}

# Print the integer percentage $1 * 100 / $2; prints 0 when $2 is 0 or empty.
percent_of() {
  local part=${1:-0} whole=${2:-0}
  if (( whole == 0 )); then
    echo 0
  else
    echo $(( part * 100 / whole ))
  fi
}

# Succeed when decimal $1 >= decimal $2 (bc is used because values may be
# non-integers).
ge_decimal() {
  [[ "$(echo "$1 >= $2" | bc)" == 1 ]]
}

# Print the idle CPU percentage found in `top -b` output on stdin.
# Accepts both the "99.5%id," and the "99.5 id," summary layouts.
cpu_idle_from_top() {
  awk '/Cpu/ {
    gsub(/,/, " ")
    n = split($0, f, " ")
    for (i = 1; i <= n; i++) {
      if (f[i] == "id" && i > 1) { print f[i - 1]; exit }
      if (f[i] ~ /%id$/) { sub(/%id$/, "", f[i]); print f[i]; exit }
    }
  }'
}

# Print the logged-in user count found in `uptime` output on stdin.
# The count is located by the "user(s)" word after it, because its field
# position changes with the length of the uptime string.
users_from_uptime() {
  awk '{ for (i = 2; i <= NF; i++) if ($i ~ /^users?,?$/) { print $(i - 1); exit } }'
}

# Print the 15-minute load average (always the last field of `uptime`).
load15_from_uptime() {
  awk '{ print $NF }'
}

# Succeed when host $1 has a root-owned httpd process whose parent is PID 1.
httpd_master_running() {
  ssh "$1" ps -ef |
    awk '/httpd/ && $1 == "root" && $3 == 1 { found = 1 } END { exit !found }'
}

# Print the status code of the last HTTP response wget logs for http://$1/.
# -O /dev/null keeps wget from saving the page into the working directory.
http_status() {
  wget -o /dev/stdout -O /dev/null "http://$1/" |
    awk '/HTTP/ { code = $6 } END { print code }'
}

# Succeed when host $1 has a socket listening on port 3306.
mysql_listening() {
  ssh "$1" netstat -na | grep "LISTEN" | grep -Eq '[:.]3306[[:space:]]'
}

# Print a byte counter of an interface on a remote host.
# $1 - host, $2 - interface, $3 - "rx" or "tx"
nic_bytes() {
  ssh "$1" cat "/sys/class/net/$2/statistics/${3}_bytes"
}

# ---------------------------------------------------------------------------
# Individual checks; every function takes the host as $1
# ---------------------------------------------------------------------------

# Synchronise the host's system and hardware clocks with an NTP server.
sync_clock() {
  local ip=$1
  # NTP server to use
  local server="ntp.fudan.edu.cn"
  ssh "$ip" /usr/sbin/ntpdate -s "$server"
  ssh "$ip" /usr/sbin/hwclock --systohc
}

# Network check: the host must be able to ping $site (hosts behind an ICMP
# filter will fail this check).
check_network() {
  local ip=$1
  # Site used for the reachability test
  local site=www.baidu.com
  if ssh "$ip" ping -c3 "$site" > /dev/null; then
    log_line "$ip 的网络状态检查完毕,状态:Ok."
  else
    send_alert "$ip 网络状态警告" "主机:${ip}无法ping通,请及时处理!"
    log_line "$ip 的网络状态检查完毕,状态:Failed."
  fi
}

# Disk check: flag every /dev filesystem whose usage reaches the threshold.
check_disk() {
  local ip=$1
  # Threshold (percent used)
  local space_warn=90
  local space_name space_avail space_per
  # -P keeps each filesystem on one line, -m reports MiB; column 4 is the
  # available space, column 5 the usage percentage.
  ssh "$ip" df -Pm |
    awk '/^\/dev/ { sub(/%/, "", $5); print $1, $4, $5 }' |
    while read -r space_name space_avail space_per; do
      if (( space_per >= space_warn )); then
        send_alert "$ip 磁盘空间警告" \
          "主机:${ip}的${space_name}分区仅剩下${space_avail} M,使用率为${space_per},已超过指定阀值,请及时处理!"
        log_line "$ip 的${space_name}分区检查完毕,状态:Failed."
      else
        log_line "$ip 的${space_name}分区检查完毕,状态:Ok."
      fi
    done
}

# Memory check: flag the host when swap usage reaches the threshold.
check_swap() {
  local ip=$1
  # Threshold (percent of swap used)
  local swap_warn=80
  local swap_total swap_used swap_free swap_per
  # Columns of the Swap line of `free -m` are: total used free.
  read -r _ swap_total swap_used swap_free _ < <(ssh "$ip" free -m | grep "Swap")
  # Multiply before dividing so integer arithmetic keeps the percentage.
  swap_per=$(percent_of "$swap_used" "$swap_total")
  if (( swap_per >= swap_warn )); then
    send_alert "$ip 内存使用警告" \
      "主机:${ip}的Swap交换分区仅剩下${swap_free} M,使用率为${swap_per},已超过指定阀值,请及时处理!"
    log_line "${ip}的Swap分区检查完毕,状态:Failed."
  else
    # Also reached when no swap is in use, so every run logs a result.
    log_line "${ip}的Swap分区检查完毕,状态:Ok."
  fi
}

# CPU check: flag the host when CPU usage reaches the threshold.
check_cpu() {
  local ip=$1
  # Threshold (percent used)
  local cpu_warn=80
  local cpu_free cpu_used
  cpu_free=$(ssh "$ip" top -b -n 1 | cpu_idle_from_top)
  cpu_used=$(echo "100 - $cpu_free" | bc)
  if ge_decimal "$cpu_used" "$cpu_warn"; then
    send_alert "$ip CPU使用警告" "主机:${ip}的CPU使用率为${cpu_used}%,已超过指定阀值,请及时处理!"
    log_line "$ip 的CPU状态检查完毕,状态:Failed."
  else
    log_line "$ip 的CPU状态检查完毕,状态:Ok."
  fi
}

# Login check: flag the host when the number of logged-in users reaches
# users_max.
check_users() {
  local ip=$1
  # Threshold
  local users_max=4
  local users_now
  users_now=$(ssh "$ip" uptime | users_from_uptime)
  if (( ${users_now:-0} >= users_max )); then
    send_alert "$ip 用户数报警" "${ip}登陆的用户数已经达到了${users_now}个,已超过指定的阀值,请及时处理!"
    log_line "${ip}的用户数检查完毕,状态:Failed."
  else
    log_line "$ip 的用户数检查完毕,状态:Ok."
  fi
}

# Load check: flag the host when the 15-minute load average per core reaches
# the threshold.
check_load() {
  local ip=$1
  # Threshold (per core)
  local load_warn=0.7
  local cpu_num load_num load_average
  cpu_num=$(ssh "$ip" cat /proc/cpuinfo | grep -c "model name")
  # Avoid a division by zero when no "model name" line was found.
  (( ${cpu_num:-0} > 0 )) || cpu_num=1
  load_num=$(ssh "$ip" uptime | load15_from_uptime)
  load_average=$(echo "scale=2;$load_num/$cpu_num" | bc)
  if ge_decimal "$load_average" "$load_warn"; then
    send_alert "$ip 平均负载报警" "$ip 15分钟单核的平均负载已经达到${load_average},已超过指定的阀值,请及时处理!"
    log_line "$ip 的平均负载检查完毕,状态:Failed."
  else
    log_line "$ip 的平均负载检查完毕,状态:Ok."
  fi
}

# Connection check: flag the host when the established TCP connection count
# reaches the threshold.
check_connections() {
  local ip=$1
  # Threshold
  local conns_warn=8000
  local ip_conns
  ip_conns=$(ssh "$ip" netstat -an | grep tcp | grep -c EST)
  if (( ${ip_conns:-0} >= conns_warn )); then
    send_alert "$ip IP连接数" "$ip 的IP连接数已经达到${ip_conns},已超过指定的阀值,请及时处理!"
    log_line "$ip 的IP连接数检查完毕,状态:Failed."
  else
    log_line "$ip 的IP连接数检查完毕,状态:Ok."
  fi
}

# Apache check: restart httpd once when its master process is missing, then
# require an HTTP 200 from the host.
check_apache() {
  local ip=$1
  if ! httpd_master_running "$ip"; then
    log_line "$ip Apache状态异常,尝试重启进程……"
    ssh "$ip" /etc/init.d/httpd restart &> /dev/null
    # Give httpd time to come up before looking again.
    ssh "$ip" sleep 100
    if ! httpd_master_running "$ip"; then
      # Still no master process after the restart: report the failure.
      send_alert "$ip Apache服务警告" "主机:$ip 的Apache服务已经没有响应,请及时处理!"
      log_line "$ip 的Apache状态检查完毕,状态:Failed."
      return
    fi
  fi
  if [[ "$(http_status "$ip")" == "200" ]]; then
    log_line "$ip 的Apache状态检查完毕,状态:Ok."
  else
    send_alert "$ip Apache服务警告" "主机:$ip 的Apache服务已经没有响应,请及时处理!"
    log_line "$ip 的Apache状态检查完毕,状态:Failed."
  fi
}

# MySQL check: port 3306 must be listening; restart mysqld once otherwise
# (locked tables are not considered).
check_mysql() {
  local ip=$1
  if mysql_listening "$ip"; then
    log_line "$ip 的MySQL状态检查完毕,状态:Ok."
    return
  fi
  log_line "$ip MySQL状态异常,尝试重启进程……"
  ssh "$ip" /etc/init.d/mysqld restart &> /dev/null
  if mysql_listening "$ip"; then
    log_line "$ip 的MySQL状态检查完毕,状态:Ok."
  else
    send_alert "$ip MySQL服务警告" "主机:$ip 的MySQL服务已经没有响应,请及时处理!"
    log_line "$ip 的MySQL状态检查完毕,状态:Failed."
  fi
}

# NIC check: flag the host when the send or receive rate of eth0, measured
# over one second, reaches the threshold.
check_nic_speed() {
  local ip=$1
  # Threshold in KB/s
  local speed_warn=10240
  local nic=eth0
  local send_before recv_before send_after recv_after send_speed recv_speed
  # Counters are read on the monitored host, not on the machine running this
  # script.
  send_before=$(nic_bytes "$ip" "$nic" tx)
  recv_before=$(nic_bytes "$ip" "$nic" rx)
  sleep 1
  send_after=$(nic_bytes "$ip" "$nic" tx)
  recv_after=$(nic_bytes "$ip" "$nic" rx)
  send_speed=$(( (${send_after:-0} - ${send_before:-0}) / 1024 ))
  recv_speed=$(( (${recv_after:-0} - ${recv_before:-0}) / 1024 ))
  if (( send_speed >= speed_warn || recv_speed >= speed_warn )); then
    send_alert "$ip 网卡流速报警" \
      "$ip 的网卡流速为$send_speed Kb/s(上行)/$recv_speed Kb/s(下行),已超过指定的阀值,请及时处理!"
    log_line "$ip 的网卡流速检查完毕,状态:Failed."
  else
    log_line "$ip 的网卡流速检查完毕,状态:Ok."
  fi
}

# ---------------------------------------------------------------------------
# Main: run every check on every configured host
# ---------------------------------------------------------------------------
if [[ -f $config && -s $config ]]; then
  # Hosts may be separated by any whitespace (spaces, tabs or newlines).
  hosts=()
  read -r -d '' -a hosts < "$config" || true
  for ip in "${hosts[@]}"; do
    # All other checks run over ssh, so a host that cannot be reached is
    # reported once and skipped.
    if ! ssh "$ip" pwd &> /dev/null; then
      send_alert "$ip SSH状态异常" "主机:${ip}的SSH无法登陆,请及时处理!"
      log_line "$ip 的SSH状态检查完毕,状态:Failed."
      continue
    fi
    log_line "$ip 的SSH状态检查完毕,状态:Ok."
    sync_clock "$ip"
    check_network "$ip"
    check_disk "$ip"
    check_swap "$ip"
    check_cpu "$ip"
    check_users "$ip"
    check_load "$ip"
    check_connections "$ip"
    check_apache "$ip"
    check_mysql "$ip"
    check_nic_speed "$ip"
  done
else
  echo "配置文件不存在或内容为空,请检查!"
fi
3、日志监控
需要准备环境
1: rsync 安装包 (yum 安装 编译安装均可)
2:防火墙开放相应端口
3:sendemail 客户端 上传到/usr/bin/后添加执行权限 (sendEmail附件放在附件)

服务端执行命令
# The secrets file holds "user:password" and must not be readable by others.
# Its name has to match the "secrets file" entries in rsyncd.conf below.
echo "work:work" > /etc/rsync.pas && chmod 600 /etc/rsync.pas

1,编辑配置文件/etc/rsyncd.conf如下
[global]
uid = root
gid = root
use chroot = yes
max connections = 50
pid file = /var/run/rsyncd.pid
lock file = /var/run/rsyncd.lock
log file = /var/log/rsyncd.log
transfer logging = yes
log format = %t %a %m %f %b
syslog facility = local3
timeout = 300

# "auth users" must name the account stored in the secrets file (work),
# which is also the account the client command logs in with.
[1.1]
read only = false
write only = yes
path = /$path
comment = log
auth users = work
secrets file = /etc/rsync.pas
hosts allow = 10.1.1.1

[1.2]
read only = false
write only = yes
path = /$path
comment = log
auth users = work
secrets file = /etc/rsync.pas
hosts allow = 10.1.1.2

[1.21]
read only = false
write only = yes
path = /$path
comment = log
auth users = work
secrets file = /etc/rsync.pas
hosts allow = 10.1.1.21

[1.22]
read only = false
write only = yes
path = /$path
comment = log
auth users = work
secrets file = /etc/rsync.pas
hosts allow = 10.1.1.22

巡检脚本
#!/bin/bash
# Daily inspection: probes port 80 of every host, filters the synced logs
# under $Path<host>/ for today's and yesterday's non-INFO lines, and mails
# whatever was collected to $Contacts.
Path=/var/log/
Time=$(date "+%Y-%m-%d")
Ytime=$(date -d yesterday "+%Y-%m-%d")
Tlmip="10.1.8.1 10.1.8.2"
Tpip="10.1.8.21 10.1.8.22"
ID=tomcatserver.pid
Tlmlog="tlm.log tlm-trace.log catalina-${Ytime}.out catalina-${Time}.out"
Tplog="tp.log tp-trace.log catalina-${Ytime}.out catalina-${Time}.out"
Errorlog="$Path""$Time-error.log"
# Mail recipients, separated by ASCII commas
Contacts="a@vmware.com,b@vmware.com"

# Inspect one group of hosts.
# $1 - space-separated host list, $2 - space-separated log file names
inspect_hosts() {
  local hosts=$1 logs=$2
  local host logfile pid fd_count
  # The lists are space-separated strings, so word splitting is intended.
  for host in $hosts; do
    # NOTE(review): the PID is read from a file under the per-host log
    # directory, but /proc is looked up on the machine running this script --
    # confirm the PID really refers to a local process.
    pid=$(cat "/$Path$host/$ID" 2> /dev/null)
    fd_count=$(ls "/proc/$pid/fd" 2> /dev/null | wc -l)
    if [ "$fd_count" -gt 4000 ]; then
      echo "$host 连接数超过4000,请查看!!!" >> "$Errorlog"
    fi
    for logfile in $logs; do
      # Not every log exists every day (e.g. the dated catalina files).
      [ -f "$Path$host/$logfile" ] || continue
      # Keep today's/yesterday's lines, dropping INFO lines and stack-trace
      # noise.
      grep -E "$Time|$Ytime" "$Path$host/$logfile" |
        grep -v -e INFO -e vmext -e "at com" -e "at sun" -e "at org" \
          -e "at java" -e "Caused by" -e "more" >> "$Errorlog"
    done
  done
}

############################## service check ##############################
for I in $Tlmip $Tpip; do
  if ! nc -v -w 10 -z "$I" 80; then
    echo "$I service abnormity" >> "$Errorlog"
  fi
done

############################## inspect TLM ################################
inspect_hosts "$Tlmip" "$Tlmlog"

############################## inspect TP #################################
inspect_hosts "$Tpip" "$Tplog"

############## mail the collected errors to the contacts ##################
# Only send when something was collected; the error file is the message body.
# -f takes the sender (From) address. The Chinese words are placeholders to
# fill in: sender address, SMTP host, sender account, sender password.
if [ -s "$Errorlog" ]; then
  /usr/bin/sendEmail -t "$Contacts" -f 发件箱地址 -s smtp地址以及端口:25 -xu 发件箱账号 -xp 发件箱密码 -o message-file="$Errorlog" -u "巡检报错信息"
fi
rm -f -- "$Errorlog"

# Run once on the server to schedule the inspection script (this line is not
# part of the script itself, otherwise every run would add another entry).
echo "55 8 * * * /path/*.sh" >> /var/spool/cron/root

客户端分别执行命令
# The client-side file given to --password-file contains the password only.
echo "work" > /etc/rsync.pas && chmod 600 /etc/rsync.pas
vim /path/rsync.sh
#!/bin/sh
# 模块 (module) is one of the module names defined in the server's
# rsyncd.conf: 1.1 1.2 1.21 1.22
rsync --port=服务端口 -aP --bwlimit 3000 /opt/vmware/instances/myserver/logs/ work@跳板机IP::模块 --password-file=/etc/rsync.pas
chmod a+x /path/rsync.sh
echo "50 8 * * * /path/rsync.sh" >> /var/spool/cron/root