|
|
发表于:
2013-1-4 14:03:13 |
[全部帖]
[楼主帖]
|
楼主
|
|
|
|
1 crontab
- crontab -l
# Monitoring jobs. Every job runs once a minute and appends both stdout and
# stderr to the shared log /root/xianshang_sh/cron_log.

# ping: network check (chk_ping.sh)
*/1 * * * * /root/xianshang_sh/chk_ping.sh >> /root/xianshang_sh/cron_log 2>&1
# mysql_replicate: MySQL replication check (chk_mysql_replicate.sh)
*/1 * * * * /root/xianshang_sh/chk_mysql_replicate.sh >> /root/xianshang_sh/cron_log 2>&1
# df: disk usage check (chk_df.sh)
*/1 * * * * /root/xianshang_sh/chk_df.sh >> /root/xianshang_sh/cron_log 2>&1
# load: system load check (chk_load.sh)
*/1 * * * * /root/xianshang_sh/chk_load.sh >> /root/xianshang_sh/cron_log 2>&1
# web: web server check (chk_web.sh)
*/1 * * * * /root/xianshang_sh/chk_web.sh >> /root/xianshang_sh/cron_log 2>&1
2 配置文件
- cat CONFIG
# CONFIG - shared settings, read with `source` by the chk_*.sh scripts.
# Every list below is a space-separated string.

# Mobile numbers for SMS alerts.
MOBILES="13xxxxxxxxx 18xxxxxxxxx 13xxxxxxxxx"
# Mail recipients for alert and recovery messages.
MAILS="dongnan@xxx.com"

# Host groups; each check script picks the groups it applies to.
ESXI_HOSTS="192.168.57.91 192.168.57.93"
PHYSICAL_HOSTS="192.168.57.112 192.168.0.1 192.168.57.99"
LINUX_WEB_HOSTS="192.168.57.82 192.168.57.70 10.0.100.72 10.0.100.73 10.0.100.75 10.0.100.76 10.0.100.77 10.0.100.78"
WIN_WEB_HOSTS="10.0.100.81 10.0.100.83"
DB_SLAVE_HOSTS="10.0.100.82"

# Union of all groups above.
ALLHOSTS="$ESXI_HOSTS $PHYSICAL_HOSTS $LINUX_WEB_HOSTS $WIN_WEB_HOSTS $DB_SLAVE_HOSTS"
3 检查网络
- cat chk_ping.sh
#!/bin/bash
#
# chk_ping.sh - use the ping command to check the network.
#
# Flood-pings every host in ALLHOSTS (from CONFIG) and mails $MAILS when a
# host gives no reply, or when its average delay / packet loss reaches the
# limits below. Flag files under $crondir/log make sure each incident
# produces one alert mail and one recovery mail:
#   <host>.ping        an alert is outstanding for the host
#   <host>.ping.fatal  the outstanding alert was "no reply at all"
#
# Sample output this script parses:
#
#   $ ping -f -c 10 10.0.100.1
#   PING 10.0.100.1 (10.0.100.1) 56(84) bytes of data.
#   .........E
#   --- 10.0.100.1 ping statistics ---
#   10 packets transmitted, 0 received, +3 errors, 100% packet loss, time 120ms
#   , pipe 3
#   $ echo $?
#   1        # non-zero exit status
#
#   $ ping -f -c 10 10.0.100.71
#   PING 10.0.100.71 (10.0.100.71) 56(84) bytes of data.
#
#   --- 10.0.100.71 ping statistics ---
#   10 packets transmitted, 10 received, 0% packet loss, time 0ms
#   rtt min/avg/max/mdev = 0.003/0.009/0.056/0.015 ms, ipg/ewma 0.029/0.019 ms
#

# variables
ping=/bin/ping
DELAY_LIMIT=100   # alert when the average rtt (whole ms) reaches this value
LOSS_LIMIT=20     # alert when the packet loss (whole %) reaches this value
sh_dir=/root/xianshang_sh/
crondir=${sh_dir}crontab

#######################################
# Extract the integer part of the average round-trip time.
# Inputs:  ping output on stdin
# Outputs: the value in ms on stdout; nothing if there is no "rtt min" line
#######################################
parse_ping_delay() {
  awk -F'=' '
    /rtt min/ {
      split($2, rtt, "/")
      avg = rtt[2]
      sub(/\..*/, "", avg)
      if (avg ~ /^[0-9]+$/) print avg
      exit
    }'
}

#######################################
# Extract the integer part of the packet-loss percentage.
# The value is located by its trailing "%" rather than by field position,
# because an optional "+N errors," field shifts the columns.
# Inputs:  ping output on stdin
# Outputs: the percentage on stdout; nothing if it cannot be found
#######################################
parse_ping_loss() {
  awk '
    /packet loss/ {
      for (i = 1; i <= NF; i++) {
        if ($i ~ /^[0-9]+(\.[0-9]+)?%$/) {
          pct = $i
          sub(/[.%].*/, "", pct)
          print pct
          exit
        }
      }
    }'
}

#######################################
# Check every host once and send alert / recovery mails.
# Globals: ping, DELAY_LIMIT, LOSS_LIMIT, sh_dir, crondir (read);
#          ALLHOSTS, MAILS (read, defined by CONFIG)
# Returns: 1 if CONFIG cannot be read, 0 otherwise
#######################################
main() {
  local config="${sh_dir}CONFIG"
  if [[ ! -r "$config" ]]; then
    printf 'chk_ping: cannot read %s\n' "$config" >&2
    return 1
  fi
  # shellcheck source=/dev/null
  source "$config"

  local error_log="$crondir/log/ping_error.log"
  local host output delay loss flag_ping_file flag_ping_fatal

  # ALLHOSTS and MAILS are space-separated lists: word splitting is intended.
  for host in $ALLHOSTS; do
    flag_ping_file="$crondir/log/$host.ping"
    flag_ping_fatal="$crondir/log/$host.ping.fatal"

    # The output is kept in a variable instead of shared scratch files, so
    # overlapping cron runs cannot overwrite each other's results.
    if output=$("$ping" -f -c 50 "$host"); then
      # Exit status 0: derive delay and loss from the statistics.
      delay=$(parse_ping_delay <<<"$output")
      loss=$(parse_ping_loss <<<"$output")
      if [[ -z "$delay" || -z "$loss" ]]; then
        printf '%s: cannot parse ping statistics\n' "$host" >&2
        continue
      fi

      if (( 10#$delay >= DELAY_LIMIT || 10#$loss >= LOSS_LIMIT )); then
        # Delay or loss reached its limit: send the alert mail once.
        if [[ ! -f "$flag_ping_file" ]]; then
          # SMS alert (disabled):
          #for mobile in $MOBILES;do
          #echo "Monitor to $HOST delay $delay'ms;$loss% packet loss" | /usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
          #done
          echo "Monitor to $host delay $delay'ms;$loss% packet loss" \
            | mail -s "$host delay" $MAILS
          date +'%F %T' >>"$error_log"
          printf '%s\n' "$output" >>"$error_log"
          # Create the flag file that marks the alert as outstanding.
          echo "$host network error" >"$flag_ping_file"
        fi
      elif [[ -f "$flag_ping_file" ]]; then
        # Both values are below their limits again: send the recovery mail.
        echo "$host network ok" | mail -s "$host nk ok" $MAILS
        rm -f "$flag_ping_file"
        if [[ -f "$flag_ping_fatal" ]]; then
          # SMS recovery (disabled):
          #for mobile in $MOBILES;do
          #echo "$HOST network ok"|/usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
          #done
          # Remove the flag file of the fatal alert.
          rm -f "$flag_ping_fatal"
        fi
      fi
    else
      # Non-zero exit status: treat as a failure and send the alert mail once.
      if [[ ! -f "$flag_ping_fatal" ]]; then
        # SMS alert (disabled):
        #for mobile in $MOBILES;do
        #echo "Monitor to $HOST fatal;100% packet loss" | /usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
        #done
        echo "Monitor to $host fatal;100% packet loss" \
          | mail -s "$host loss" $MAILS
        date +'%F %T' >>"$error_log"
        printf '%s\n' "$output" >>"$error_log"
        echo "$host network fatal" >"$flag_ping_fatal"
        echo "$host network error" >"$flag_ping_file"
      fi
    fi
  done
}

main "$@"
4 检查mysql 主从状态
- cat chk_mysql_replicate.sh
#!/bin/bash
#
# chk_mysql_replicate.sh - check MySQL replication on the slave hosts.
#
# Runs "show slave status\G" on every host in DB_SLAVE_HOSTS (from CONFIG)
# over ssh and expects both replication threads to report "Yes":
#
#   Slave_IO_Running: Yes
#   Slave_SQL_Running: Yes
#
# The flag file $crondir/log/<host>.mysql marks an outstanding alert, so each
# incident produces one alert mail and one recovery mail.
#
# NOTE(review): the MySQL root password is hard-coded in the remote command
# line below and is visible in the process list; consider a client option
# file on the slave instead.
#

# variables
ssh=/usr/bin/ssh
sh_dir=/root/xianshang_sh/
crondir=${sh_dir}crontab

#######################################
# Summarise the state of the two replication threads.
# The fields are selected by name, not by line position, so a different
# column layout in the status output does not change the result.
# Inputs:  "show slave status\G" output on stdin
# Outputs: " <io-state> <sql-state>" on stdout (" Yes Yes" when healthy);
#          nothing if neither field is present
#######################################
slave_threads_state() {
  awk '$1 == "Slave_IO_Running:" || $1 == "Slave_SQL_Running:" {
         printf " %s", $2
       }'
}

#######################################
# Check every slave once and send alert / recovery mails.
# Globals: ssh, sh_dir, crondir (read);
#          DB_SLAVE_HOSTS, MAILS (read, defined by CONFIG)
# Returns: 1 if CONFIG cannot be read, 0 otherwise
#######################################
main() {
  local config="${sh_dir}CONFIG"
  if [[ ! -r "$config" ]]; then
    printf 'chk_mysql_replicate: cannot read %s\n' "$config" >&2
    return 1
  fi
  # shellcheck source=/dev/null
  source "$config"

  local host state flag_file

  # DB_SLAVE_HOSTS and MAILS are space-separated lists: splitting is intended.
  for host in $DB_SLAVE_HOSTS; do
    flag_file="${crondir}/log/$host.mysql"
    state=$("$ssh" "root@$host" \
      "/usr/local/mysql/bin/mysql -uroot -pdongnan -e 'show slave status\G' -ss" \
      | slave_threads_state)

    if [[ "$state" == " Yes Yes" ]]; then
      # Slave is running: send the recovery mail if an alert is outstanding.
      if [[ -f "$flag_file" ]]; then
        # SMS recovery (disabled):
        #for mobile in $MOBILES;do
        #echo ""$HOST"."$i" replicate ok" | /usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
        #done
        echo "$host replicate ok" | mail -s "$host replicate ok" $MAILS
        rm -f "$flag_file"
      fi
    else
      # Slave is not running (or could not be queried): alert once.
      if [[ ! -f "$flag_file" ]]; then
        # SMS alert (disabled):
        #for mobile in $MOBILES;do
        #echo ""$HOST"."$i" replicate error" | /usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
        #done
        echo "$host replicate error" | mail -s "$host replicate error" $MAILS
        echo "replicate error" >"$flag_file"
      fi
    fi
  done
}

main "$@"
5 检查磁盘
- cat chk_df.sh
#!/bin/bash
#
# chk_df.sh - check the disk usage via ssh.
#
# Runs df on every host in PHYSICAL_HOSTS, LINUX_WEB_HOSTS and DB_SLAVE_HOSTS
# (from CONFIG) and mails $MAILS when any filesystem line containing "/dev/"
# reaches $dflimit percent. The flag file $crondir/log/<host>.disk marks an
# outstanding alert, so each incident produces one alert mail and one
# recovery mail.
#

# variables
ssh=/usr/bin/ssh
dflimit=90        # alert when usage (%) reaches this value
sh_dir=/root/xianshang_sh/
crondir=${sh_dir}crontab

#######################################
# Find the highest usage percentage among the "/dev/" lines of df output.
# Inputs:  df output on stdin (one filesystem per line, usage in column 5)
# Outputs: the highest percentage on stdout; nothing if no usable line exists
#######################################
max_disk_usage() {
  awk '
    /\/dev\// {
      used = $5
      sub(/%/, "", used)
      if (used ~ /^[0-9]+$/) {
        if (!found || used + 0 > max) max = used + 0
        found = 1
      }
    }
    END { if (found) print max }'
}

#######################################
# Check every host once and send alert / recovery mails.
# Globals: ssh, dflimit, sh_dir, crondir (read);
#          PHYSICAL_HOSTS, LINUX_WEB_HOSTS, DB_SLAVE_HOSTS, MAILS
#          (read, defined by CONFIG)
# Returns: 1 if CONFIG cannot be read, 0 otherwise
#######################################
main() {
  local config="${sh_dir}CONFIG"
  if [[ ! -r "$config" ]]; then
    printf 'chk_df: cannot read %s\n' "$config" >&2
    return 1
  fi
  # shellcheck source=/dev/null
  source "$config"

  local log="$crondir/log/disk_error.log"
  local host max_used flag_disk_file

  # The host lists and MAILS are space-separated: word splitting is intended.
  for host in $PHYSICAL_HOSTS $LINUX_WEB_HOSTS $DB_SLAVE_HOSTS; do
    flag_disk_file="$crondir/log/$host.disk"

    # df -P keeps every filesystem on a single line, so column 5 is always
    # the usage even when the device name is long.
    max_used=$("$ssh" "root@$host" "df -P" | max_disk_usage)

    # No usable result (ssh failed, unexpected output): skip the host rather
    # than reading the empty result as "disk ok".
    if [[ -z "$max_used" ]]; then
      echo "$host is null"
      continue
    fi

    if (( max_used >= dflimit )); then
      # Disk usage reached the limit: send the alert mail once.
      if [[ ! -f "$flag_disk_file" ]]; then
        # SMS alert (disabled):
        #for mobile in $MOBILES;do
        #echo "$HOST disk will full" | /usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
        #done
        echo "$host disk will full" | mail -s "$host disk will full" $MAILS
        date +'%F %T' >>"$log"
        echo "$host disk will full" >>"$log"
        echo "disk_error" >"$flag_disk_file"
      fi
    elif [[ -f "$flag_disk_file" ]]; then
      # Disk usage is back to normal: send the recovery mail.
      # SMS recovery (disabled, like the matching alert above; $MOBILES must
      # stay unquoted so that every number gets its own message):
      #for mobile in $MOBILES;do
      #echo "$HOST disk ok"|/usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
      #done
      echo "$host disk ok" | mail -s "$host disk ok" $MAILS
      date +'%F %T' >>"$log"
      echo "$host disk ok" >>"$log"
      rm -f "$flag_disk_file"
    fi
  done
}

main "$@"
6 检查系统负载
- cat chk_load.sh
#!/bin/bash
#
# chk_load.sh - check the system load through ssh.
#
# Reads /proc/loadavg on every host in PHYSICAL_HOSTS, LINUX_WEB_HOSTS and
# DB_SLAVE_HOSTS (from CONFIG) and mails $MAILS when the integer part of the
# first load value reaches $loadlimit. The flag file $crondir/log/<host>.load
# marks an outstanding alert, so each incident produces one alert mail and
# one recovery mail.
#

# variables
ssh=/usr/bin/ssh
loadlimit=5       # alert when the load reaches this value
sh_dir=/root/xianshang_sh/
crondir=${sh_dir}crontab
log=${crondir}/log/system_load.log

#######################################
# Extract the integer part of the first load value.
# Inputs:  the content of /proc/loadavg on stdin
# Outputs: the integer on stdout; nothing if the first field of the first
#          line is not a number
#######################################
parse_load() {
  awk 'NR == 1 && $1 ~ /^[0-9]+(\.[0-9]+)?$/ {
         load = $1
         sub(/\..*/, "", load)
         print load
       }'
}

#######################################
# Check every host once and send alert / recovery mails.
# Globals: ssh, loadlimit, sh_dir, crondir, log (read);
#          PHYSICAL_HOSTS, LINUX_WEB_HOSTS, DB_SLAVE_HOSTS, MAILS
#          (read, defined by CONFIG)
# Returns: 1 if CONFIG cannot be read, 0 otherwise
#######################################
main() {
  local config="${sh_dir}CONFIG"
  if [[ ! -r "$config" ]]; then
    printf 'chk_load: cannot read %s\n' "$config" >&2
    return 1
  fi
  # shellcheck source=/dev/null
  source "$config"

  local host load flag_load_file

  # The host lists and MAILS are space-separated: word splitting is intended.
  for host in $PHYSICAL_HOSTS $LINUX_WEB_HOSTS $DB_SLAVE_HOSTS; do
    flag_load_file="$crondir/log/$host.load"
    load=$("$ssh" "root@$host" "/bin/cat /proc/loadavg" | parse_load)

    # Empty means ssh failed or returned something that is not a load value.
    # Skip the host: a non-numeric value must not be read as "overload".
    if [[ -z "$load" ]]; then
      echo "$host is null"
      continue
    fi

    if (( 10#$load < loadlimit )); then
      # Load is OK: if the flag file exists, send the recovery mail.
      if [[ -f "$flag_load_file" ]]; then
        # SMS recovery (disabled):
        #for mobile in $MOBILES;do
        # echo "$HOST load ok"|/usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
        #done
        echo "$host load ok" | mail -s "$host load ok" $MAILS
        # Remove the flag file.
        rm -f "$flag_load_file"
        date +'%F %T' >>"$log"
        echo "$host load ok" >>"$log"
      fi
    else
      # Host is overloaded: if there is no flag file yet, send the alert mail.
      if [[ ! -f "$flag_load_file" ]]; then
        # SMS alert (disabled):
        #for mobile in $MOBILES;do
        # echo ""$HOST" overload"|/usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
        #done
        echo "$host overload" | mail -s "$host overload" $MAILS
        # Create the flag file.
        echo "overload" >"$flag_load_file"
        date +'%F %T' >>"$log"
        echo "$host overload" >>"$log"
      fi
    fi
  done
}

main "$@"
7 检查web server
- cat chk_web.sh
#!/bin/bash
#
# chk_web.sh - check the web servers on HTTP port 80.
#
# Requests http://<host>/check.html with curl from every host in
# LINUX_WEB_HOSTS and WIN_WEB_HOSTS (from CONFIG), retrying a few times, and
# mails $MAILS when no attempt ends with status 200. The flag file
# $crondir/log/<host>.web marks an outstanding alert, so each incident
# produces one alert mail and one recovery mail.
#

# variables
curl=/usr/bin/curl
max_tries=4       # attempts per host before it is reported as down
retry_delay=0.3   # seconds to wait after a failed attempt
sh_dir=/root/xianshang_sh/
crondir=${sh_dir}crontab

#######################################
# Decide whether an HTTP exchange ended with status 200.
# Only "HTTP/..." status lines are inspected, so a "200" appearing in another
# header (Content-Length, ETag, ...) cannot produce a false positive. When
# redirects were followed, the last status line decides.
# Inputs:  response headers (the output of curl -I) on stdin
# Returns: 0 if the last status line carries code 200, 1 otherwise
#######################################
final_status_is_200() {
  awk '
    /^HTTP\/[0-9.]+ [0-9][0-9][0-9]/ { code = substr($2, 1, 3) }
    END { exit (code == "200" ? 0 : 1) }'
}

#######################################
# Check every host once and send alert / recovery mails.
# Globals: curl, max_tries, retry_delay, sh_dir, crondir (read);
#          LINUX_WEB_HOSTS, WIN_WEB_HOSTS, MAILS (read, defined by CONFIG)
# Returns: 1 if CONFIG cannot be read, 0 otherwise
#######################################
main() {
  local config="${sh_dir}CONFIG"
  if [[ ! -r "$config" ]]; then
    printf 'chk_web: cannot read %s\n' "$config" >&2
    return 1
  fi
  # shellcheck source=/dev/null
  source "$config"

  local log="$crondir/log/apache_error.log"
  local host try web_ok flag_apache_file

  # The host lists and MAILS are space-separated: word splitting is intended.
  for host in $LINUX_WEB_HOSTS $WIN_WEB_HOSTS; do
    flag_apache_file="$crondir/log/$host.web"

    web_ok=0
    for (( try = 1; try <= max_tries; try++ )); do
      if "$curl" -sIL -m 2 "http://$host/check.html" 2>&1 \
        | final_status_is_200; then
        web_ok=1
        break
      fi
      sleep "$retry_delay"
    done

    if (( web_ok )); then
      # The server answered: send the recovery mail if an alert is outstanding.
      if [[ -f "$flag_apache_file" ]]; then
        # SMS recovery (disabled):
        #for mobile in $MOBILES ;do
        #echo "$HOST 80 port ok"|/usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
        #done
        echo "$host 80 port ok" | mail -s "$host 80 port ok" $MAILS
        rm -f "$flag_apache_file"
      fi
    elif [[ ! -f "$flag_apache_file" ]]; then
      # Every attempt failed: send the alert mail once.
      # SMS alert (disabled):
      #for mobile in $MOBILES ;do
      #echo "$HOST 80 port error"|/usr/local/bin/gammu --sendsms TEXT "$mobile" -unicode
      #done
      echo "$host 80 port error" | mail -s "$host 80 port error" $MAILS
      date +'%F %T' >>"$log"
      echo "$host apache error" >>"$log"
      echo "apache_error" >"$flag_apache_file"
    fi
  done
}

main "$@"
本文出自 “dongnan” 博客
|
|
|