阿里雲環境遷移記錄 - 服務監控及報警
Part1 郵件服務搭建
安裝mailx
yum -y install mailx
############################
##qq個人郵箱配置
############################
vim /etc/mail.rc
添加如下配置:
set from=xxxxxx@qq.com
# ↑ 你的 QQ 郵箱地址
set smtp=smtps://smtp.qq.com:465
set smtp-auth-user=xxxxxx@qq.com
# ↑ 你的 QQ 郵箱地址
set smtp-auth-password=你的 QQ 郵箱授權碼
# ↑ 授權碼:登錄qq郵箱到賬戶設置中,打開smtp服務時提示的驗證碼,此驗證碼非登錄密碼。
set smtp-auth=login
#set smtp-use-starttls
# ↑ 這裏是不需要配置的,很多地方沒說明,配置了反而會驗證失敗,所以我註釋掉。
set ssl-verify=ignore
set nss-config-dir=/root/.certs
##創建證書
mkdir -p /root/.certs/
cd /root/.certs/
echo -n | openssl s_client -connect smtp.qq.com:465 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' > ~/.certs/qq.crt
certutil -A -n "GeoTrust SSL CA" -t "C,," -d ~/.certs -i ~/.certs/qq.crt
certutil -A -n "GeoTrust Global CA" -t "C,," -d ~/.certs -i ~/.certs/qq.crt
certutil -L -d /root/.certs
##認證
certutil -A -n "GeoTrust SSL CA - G3" -t "Pu,Pu,Pu" -d ./ -i qq.crt
#返回如下提示即可:
Notice: Trust flag u is set automatically if the private key is present.
#發送主題為“郵箱測試”,內容為當前目錄下 message_file.txt 文件內容到 [email protected] 郵箱。
mailx -s "郵箱測試" [email protected] < message_file.txt
############################
##qq企業郵箱配置
############################
vim /etc/mail.rc
添加如下配置:
set from=xxxxxx@example.com
# ↑ 你的企業郵箱地址
set smtp=smtps://smtp.exmail.qq.com:465
set smtp-auth-user=xxxxxx@example.com
# ↑ 你的企業郵箱地址
set smtp-auth-password=*****
# ↑ 登錄密碼,不同於個人郵箱的授權碼
set smtp-auth=login
set ssl-verify=ignore
set nss-config-dir=/etc/pki/nssdb/
cd /etc/pki/nssdb/
#生成證書
echo -n | openssl s_client -connect smtp.qq.com:465 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' > /etc/pki/nssdb/qq.crt
certutil -A -n "GeoTrust SSL CA" -t "C,," -d /etc/pki/nssdb/ -i /etc/pki/nssdb/qq.crt
certutil -A -n "GeoTrust Global CA" -t "C,," -d /etc/pki/nssdb/ -i /etc/pki/nssdb/qq.crt
certutil -L -d /etc/pki/nssdb/
certutil -A -n "GeoTrust SSL CA - G3" -t "Pu,Pu,Pu" -d ./ -i qq.crt #認證
同樣,認證完會返回如下提示:
Notice: Trust flag u is set automatically if the private key is present.
##測試
echo "this email come from centos 172.26.27.71" | mail -v -s "mysql check test" [email protected]
Part2 監控腳本準備
- mysql監控腳本
大致想法:mysql監控腳本分別運行在兩個實例上,如果當前實例宕機,則重啟本機mysql服務,如果其他服務器上的mysql連接不上則郵件通知。
#!/bin/bash
# Comma-separated list of alert recipients handed to mail(1).
# The value must be wrapped in ASCII quotes; typographic quotes are not shell
# quoting and leave the variable unset.
# NOTE(review): the addresses look redacted ("[email protected]") - replace
# them with the real recipients before use.
notify_addr='[email protected],[email protected]'
# File that the checks below append their log lines and command errors to.
error_log="/opt/script/logs/check_mysql.err"
### Probe a MySQL instance by running a trivial query against it.
# Appends a timestamped "checking" line to the log, then runs "select 1;"
# through the mysql client on the fixed port 30468.
# Globals:   error_log (read) - file receiving the log line and the client's
#            stderr
# Arguments: $1 - host/IP of the MySQL instance to probe
# Outputs:   the client's stdout is not redirected
# Returns:   exit status of the mysql client
# NOTE(review): the root password is hard-coded and passed on the command
# line; consider moving it to a protected option file.
function excute_query {
  local host=$1
  echo -e "$(date "+%F %H:%M:%S") -----checking mysql instance ${host} by querying -----" >> "${error_log}"
  # Quoted so an empty or odd host value cannot shift the remaining options.
  /usr/local/mysql/bin/mysql -uroot -p88gongxiangMYSQL -h "${host}" --port 30468 -e "select 1;" 2>> "${error_log}"
}
### Handler for "query failed and the mysql service itself is unhealthy".
# Logs the event, restarts mysql.service on this machine and mails the
# recipients that the instance is being restarted.
# Globals:   error_log (read)   - log file, also receives mail's stderr
#            notify_addr (read) - alert recipients
# Arguments: $1 - host/IP used to label the instance in the mail
# Returns:   status of the final log write
function service_error {
  local instance=$1
  echo -e "$(date "+%F %H:%M:%S") -----mysql service error,notify manager now-----" >> "${error_log}"
  systemctl restart mysql.service
  # ${notify_addr} stays unquoted on purpose: a space-separated recipient list
  # must still expand to one argument per recipient.
  echo "${instance} 無法連接並被重啟" | mail -s "MYSQL ${instance} 實例正在被重啟, 請及時登錄查看狀態!" ${notify_addr} 2>> "${error_log}"
  echo -e "\n---------------------------------------------------------\n" >> "${error_log}"
}
### Handler for "query failed but the mysql service reports healthy".
# Waits 30 seconds, probes the instance once more with excute_query and mails
# the recipients only if the second probe fails as well.
# Globals:   error_log (read)   - log file, also receives mail's stderr
#            notify_addr (read) - alert recipients
# Arguments: $1 - host/IP of the MySQL instance
# Returns:   status of the last command executed in the taken branch
function query_error {
  local instance=$1
  echo -e "$(date "+%F %H:%M:%S") -----mysql instance ${instance} query error, retry after 30s-----" >> "${error_log}"
  sleep 30
  if ! excute_query "${instance}"; then
    echo -e "$(date "+%F %H:%M:%S") -----mysql instance ${instance} still can‘t execute query-----" >> "${error_log}"
    # ${notify_addr} stays unquoted on purpose: a space-separated recipient
    # list must still expand to one argument per recipient.
    echo "mysql isntance ${instance} is down" | mail -s "MYSQL ${instance} 無法連接查詢, 請及時處理!from(172.26.27.70)" ${notify_addr} 2>> "${error_log}"
  else
    # The message now states the real retry delay (it used to say 10s).
    echo -e "$(date "+%F %H:%M:%S") -----mysql instance ${instance} query ok after 30s-----" >> "${error_log}"
    echo -e "\n---------------------------------------------------------\n" >> "${error_log}"
  fi
}
### Check the MySQL instance on this machine (172.26.27.70).
# Probe succeeds          -> log "ok".
# Probe fails, unit ok    -> query_error (retry, then mail).
# Probe fails, unit not ok -> service_error (restart and mail).
if excute_query 172.26.27.70; then
  echo -e "\n-----------mysql instance 172.26.27.70 is ok for query-------------\n" >> ${error_log}
elif systemctl status mysql.service &>/dev/null; then
  query_error 172.26.27.70
else
  service_error 172.26.27.70
fi

### Check the standby MySQL instance (172.26.27.71).
# It runs on another machine, so there is no restart here: retry, then mail.
if excute_query 172.26.27.71; then
  echo -e "\n-----------mysql instance 172.26.27.71 is ok for query-------------\n" >> ${error_log}
else
  query_error 172.26.27.71
fi
- mongo監控腳本
大致思想:通過mongo命令登錄或者mongostat判斷節點是否正常運行。
#!/bin/bash
# Comma-separated list of alert recipients handed to mail(1).
# The value must be wrapped in ASCII quotes; typographic quotes are not shell
# quoting and leave the variable unset.
# NOTE(review): the addresses look redacted ("[email protected]") - replace
# them with the real recipients before use.
notify_addr='[email protected],[email protected]'
# File that the checks below append their log lines and command errors to.
error_log="/opt/script/logs/check_mongo.err"
### Probe a MongoDB node by logging in with the mongo shell.
# Appends a timestamped "checking" line to the log, then pipes
# "db.serverStatus().mem" into the mongo shell on the fixed port 20467.
# Globals:   error_log (read) - file receiving the log line and the shell's
#            stderr
# Arguments: $1 - host/IP of the MongoDB node
# Outputs:   the shell's stdout is not redirected
# Returns:   exit status of the mongo shell
# NOTE(review): the root password is hard-coded and passed on the command
# line.
function connect_db {
  local node=$1
  echo -e "$(date "+%F %H:%M:%S") -----checking mongo instance ${node} by login -----" >> "${error_log}"
  echo "db.serverStatus().mem" | /usr/local/mongodb/bin/mongo admin -uroot -p88gongxiangds --host "${node}" --port 20467 2>> "${error_log}"
}
### Probe a MongoDB node with mongostat.
# Appends a timestamped "checking" line to the log, then asks mongostat for a
# single row of statistics from the node on the fixed port 20467.
# Globals:   error_log (read) - file receiving the log line and mongostat's
#            stderr
# Arguments: $1 - host/IP of the MongoDB node
# Outputs:   mongostat's stdout is not redirected
# Returns:   exit status of mongostat
# NOTE(review): this URI logs in as "suroot" while connect_db uses "root" -
# confirm which account is intended.
function replication_stat_query {
  local node=$1
  echo -e "$(date "+%F %H:%M:%S") -----checking mongo instance ${node} by mongostat -----" >> "${error_log}"
  # --rowcount=1: without a row count mongostat keeps printing forever and
  # the check would never return.
  /usr/local/mongodb/bin/mongostat --uri="mongodb://suroot:88gongxiangds@${node}:20467/admin" --rowcount=1 2>> "${error_log}"
}
### Handler for an unreachable MongoDB node.
# Logs the event and mails the recipients; it does not restart anything.
# Globals:   error_log (read)   - log file, also receives mail's stderr
#            notify_addr (read) - alert recipients
# Arguments: $1 - host/IP of the failing node
# Returns:   status of the final log write
function service_error {
  local node=$1
  echo -e "$(date "+%F %H:%M:%S") -----mongo service ${node} error,notify manager now-----" >> "${error_log}"
  # Automatic shutdown is disabled:
  ##/usr/local/mongo/bin/mongod -f /etc/mongo.conf --shutdown
  # ${notify_addr} stays unquoted on purpose: a space-separated recipient list
  # must still expand to one argument per recipient.
  echo "${node} mongo連接失敗,請及時處理" | mail -s "Mongo ${node} 實例無法連接, 請及時登錄處理!from(172.26.27.70)" ${notify_addr} 2>> "${error_log}"
  echo -e "\n---------------------------------------------------------\n" >> "${error_log}"
}
### Check one MongoDB node and alert when it cannot be reached.
# Calls connect_db; on failure hands over to service_error, otherwise logs
# that the connection is fine. The "ok" line used to sit inside the failure
# branch, so it was logged right after an alert and never on success.
# Globals:   error_log (read) - log file
# Arguments: $1 - host/IP of the MongoDB node
# Returns:   status of service_error or of the log write
function monitor_node {
  local node=$1
  if ! connect_db "${node}"; then
    service_error "${node}"
  else
    # The extra mongostat check (replication_stat_query) remains disabled.
    echo -e "\n-----------mongo connection to node ${node} is ok! -------------\n" >> "${error_log}"
  fi
}
### Check each MongoDB node in turn.
for node_ip in 172.26.27.70 172.26.27.71 172.26.27.72; do
  monitor_node "${node_ip}"
done
- redis監控腳本
大致思想: 通過redis-cli登錄並檢索clusterinfo是否enable來判斷該節點及集群是否正常工作。
#!/bin/bash
# Comma-separated list of alert recipients handed to mail(1).
# The value must be wrapped in ASCII quotes; typographic quotes are not shell
# quoting and leave the variable unset.
# NOTE(review): the addresses look redacted ("[email protected]") - replace
# them with the real recipients before use.
notify_addr='[email protected],[email protected]'
# File that the checks below append their log lines and command errors to.
error_log="/opt/script/logs/check_redis.err"
### Handler for a Redis node that failed its check.
# Logs the event and mails the recipients; it does not restart anything.
# Globals:   error_log (read)   - log file, also receives mail's stderr
#            notify_addr (read) - alert recipients
# Arguments: $1 - host/IP of the failing node
#            $2 - port of the failing node
# Returns:   status of the final log write
function service_error {
  local host=$1 port=$2
  echo -e "$(date "+%F %H:%M:%S") -----redis service ${host}:${port} error,notify manager now-----" >> "${error_log}"
  # ${notify_addr} stays unquoted on purpose: a space-separated recipient list
  # must still expand to one argument per recipient.
  echo "${host} redis連接異常,請及時處理" | mail -s "Redis ${host}:${port} 節點連接失敗, 請及時登錄處理!(from 172.26.27.70)" ${notify_addr} 2>> "${error_log}"
  echo -e "\n---------------------------------------------------------\n" >> "${error_log}"
}
### Check one Redis node and alert when it does not answer.
# Runs "info" through redis-cli and looks for the cluster_enabled field. If
# the field is missing, service_error is called; otherwise the node is logged
# as ok. The "ok" line used to be written in the failure branch only.
# Globals:   error_log (read) - log file
# Arguments: $1 - host/IP of the Redis node
#            $2 - port of the Redis node
# Outputs:   the matching cluster_enabled line goes to stdout
# Returns:   status of service_error or of the log write
# NOTE(review): the password is hard-coded and passed with -a on the command
# line.
function monitor_node {
  local host=$1 port=$2
  echo -e "$(date "+%F %H:%M:%S") -----checking redis ${host}:${port} by cli -----" >> "${error_log}"
  if /usr/local/bin/redis-cli -h "${host}" -p "${port}" -a 88gongxiangrds info | grep cluster_enabled; then
    echo -e "\n-----------redis connection to node ${host}:${port} is ok! -------------\n" >> "${error_log}"
  else
    service_error "${host}" "${port}"
  fi
}
### Check both Redis ports (6239 and 6339) on each of the three hosts.
for redis_host in 172.26.27.70 172.26.27.71 172.26.27.72; do
  for redis_port in 6239 6339; do
    monitor_node "${redis_host}" "${redis_port}"
  done
done
- rabbitmq監控腳本
通過rabbitmqctl查看集群狀態或者節點狀態
#!/bin/bash
# Comma-separated list of alert recipients handed to mail(1).
# The value must be wrapped in ASCII quotes; typographic quotes are not shell
# quoting and leave the variable unset.
# NOTE(review): the addresses look redacted ("[email protected]") - replace
# them with the real recipients before use.
notify_addr='[email protected],[email protected]'
# Own log file for this script; it used to point at check_redis.err, which
# mixed RabbitMQ entries into the Redis log.
error_log="/opt/script/logs/check_rabbitmq.err"
### Handler for a failed RabbitMQ status check.
# Logs the event and mails the recipients; the automatic restart is disabled.
# Globals:   error_log (read)   - log file, also receives mail's stderr
#            notify_addr (read) - alert recipients
# Arguments: $1 - host/IP used to label the node in the mail
#            $2 - address shown as "(from ...)" in the mail subject
# Returns:   status of the final log write
function service_error {
  local node=$1 reporter=$2
  echo -e "$(date "+%F %H:%M:%S") -----rabbitmq service error,notify manager now-----" >> "${error_log}"
  # Disabled automatic restart:
  #ps -ef | grep ^rabbitmq | awk '{print $2}' | xargs kill -9
  #service rabbitmq-server start
  # ${notify_addr} stays unquoted on purpose: a space-separated recipient list
  # must still expand to one argument per recipient.
  echo "${node} rabbitmq服務異常, 請及時處理" | mail -s " ${node} RabbitMQ服務異常, 請及時登錄處理!(from ${reporter})" ${notify_addr} 2>> "${error_log}"
  echo -e "\n---------------------------------------------------------\n" >> "${error_log}"
}
### Check the RabbitMQ cluster status and alert when it cannot be read.
# Runs "rabbitmqctl cluster_status" and looks for "cluster_name" in its
# output. If it is missing, service_error is called; otherwise the node is
# logged as ok. The "ok" line used to be written in the failure branch only,
# and the log texts wrongly mentioned redis.
# rabbitmqctl is invoked without a node argument; $1 and $2 only label the
# log lines and the alert.
# Globals:   error_log (read) - log file
# Arguments: $1 - host/IP used to label the node
#            $2 - address passed on to service_error as the alert's origin
# Outputs:   the matching cluster_name line goes to stdout
# Returns:   status of service_error or of the log write
function monitor_node {
  local node=$1 reporter=$2
  echo -e "$(date "+%F %H:%M:%S") -----checking rabbitmq ${node} by rabbitmqctl -----" >> "${error_log}"
  #/usr/lib/rabbitmq/bin/rabbitmqctl cluster_status |grep cluster_name
  if /usr/sbin/rabbitmqctl cluster_status | grep cluster_name; then
    echo -e "\n-----------rabbitmq cluster status on node ${node} is ok! -------------\n" >> "${error_log}"
  else
    service_error "${node}" "${reporter}"
  fi
}
# The same address is passed twice: once as the node label and once as the
# second argument that monitor_node forwards to service_error.
rabbit_node=172.26.27.72
monitor_node "${rabbit_node}" "${rabbit_node}"
Part3 定時任務配置
crontab -e
# Each job writes to its own output file. With one shared target and '>'
# truncation, jobs firing in the same minute overwrote each other's output.
*/1 * * * * /opt/script/check_mysql.sh > /opt/script/logs/cron_check_mysql.log 2>&1
*/3 * * * * /opt/script/check_mongo.sh > /opt/script/logs/cron_check_mongo.log 2>&1
*/5 * * * * /opt/script/check_redis.sh > /opt/script/logs/cron_check_redis.log 2>&1
*/5 * * * * /opt/script/check_rabbitmq.sh > /opt/script/logs/cron_check_rabbitmq.log 2>&1
阿里雲環境遷移記錄 - 服務監控及報警