在多个运维技术分享中都会谈及“告警风暴”这个词,即在大规模网络发生异常的时候,报警量激增,运维伙伴的手机在1个多小时中不断地接收报警通知,这样有可能会造成“狼来了”等一系列的问题,不但对运维人员造成了困扰,也给排查问题带来了不小的难度。某族同样也被困扰过,但在建设了告警平台、对告警进行合并等操作之后,月报警量由原来的8000+条降低至800条。
一、收敛架构图
①所有产生告警均由zabbix调用脚本推入缓存redis当中
②分析系统每隔规定时间(1分钟)去redis中拉取数据,根据定义好的一系列规则进行合并、分析或直接丢弃,并存入分析平台数据库,以供历史查询。
③根据预先定义好的规则将报警通过定义好的方式发送给相关人员。
二、对zabbix 进行设置
1.zabbix Actions
对Actions进行特殊设置,Default subject极为重要,是识别收敛的标识。
Default subject
{EVENT.ID}_1
Default message
triggervalue|{TRIGGER.VALUE}#hostname|{HOSTNAME1}#ipaddress|{IPADDRESS}#hostgroup|{TRIGGER.HOSTGROUP.NAME}#triggernseverity|{TRIGGER.NSEVERITY}#triggername|{TRIGGER.NAME}#triggerkey|{TRIGGER.KEY1}#triggeritems|{ITEM.NAME}#itemvalue|{ITEM.VALUE}#eventid|{EVENT.ID}
Recovery subject
{EVENT.ID}_0
Recovery message
triggervalue|{TRIGGER.VALUE}#hostname|{HOSTNAME1}#ipaddress|{IPADDRESS}#hostgroup|{TRIGGER.HOSTGROUP.NAME}#triggernseverity|{TRIGGER.NSEVERITY}#triggername|{TRIGGER.NAME}#triggerkey|{TRIGGER.KEY1}#triggeritems|{ITEM.NAME}#itemvalue|{ITEM.VALUE}#eventid|{EVENT.ID}
2.Media types
这里只需要传递subject 参数就可以了。
police.py--报警脚本:zabbix告警时调用此脚本,将告警的subject(事件id加状态后缀,如 {EVENT.ID}_1)推入redis
redis 安装查看第三部分
#!/usr/bin/env python
# coding: utf-8
# Deployed as /usr/local/zabbix/share/zabbix/alertscripts/police.py
"""Zabbix alert script: record the alert subject in Redis.

The first command-line argument (the alert subject) is stored in Redis
as a key whose value is the subject itself.
"""
import sys

import redis

# The subject is the only argument this script reads.
event_subject = sys.argv[1]

# Replace the host placeholder with the address of the Redis server.
redis_client = redis.StrictRedis(host='**.**.**.**', port=6379)
redis_client.set(event_subject, event_subject)
三、操作部分
1.环境安装
pip安装
cd /usr/local/src
wget https://bootstrap.pypa.io/get-pip.py --no-check-certificate -O ./get-pip.py
python get-pip.py
# pip更换国内源地址
cd
mkdir .pip
vim .pip/pip.conf
# ===========================================
[global]
index-url = http://pypi.douban.com/simple
[install]
trusted-host = pypi.douban.com
redis安装
# 编译安装
cd /usr/local/src/
wget http://download.redis.io/releases/redis-3.2.2.tar.gz
tar xzf redis-3.2.2.tar.gz
cd redis-3.2.2
make
cd src
make install
# 配置管理
mkdir -p /usr/local/redis/bin
mkdir -p /usr/local/redis/etc
# 复制配置文件
cd /usr/local/src/redis-3.2.2
mv redis.conf /usr/local/redis/etc/
# 复制命令文件
cd /usr/local/src/redis-3.2.2/src
cp mkreleasehdr.sh redis-benchmark redis-check-aof redis-check-rdb redis-cli redis-sentinel redis-server redis-trib.rb /usr/local/redis/bin/
# 修改后台启动
vim /usr/local/redis/etc/redis.conf
将 daemonize no
改为daemonize yes
# 启动
/usr/local/redis/bin/redis-server /usr/local/redis/etc/redis.conf
# 安装redis的python api
pip install redis
2.脚本部署
①dbread.py--数据库查询函数:接收事件id参数,将zabbix数据库内的数据切割分片并返回告警信息
#!/usr/bin/python
# coding: utf-8
# Places marked ***** must be edited for your environment.
"""dbread.py -- look up a Zabbix alert and return its message as a dict."""
import MySQLdb
import datetime,time
import sys


def _parse_message(message):
    """Split a 'key|value#key|value' alert message into a dict.

    Only the first '|' of each fragment separates key from value, so a
    value may itself contain '|'. A fragment without any '|' is treated
    as the continuation of the previous value (which contained a '#').
    """
    fields = {}
    last_key = None
    for part in message.split('#'):
        key, sep, value = part.partition('|')
        if sep:
            fields[key] = value
            last_key = key
        elif last_key is not None:
            fields[last_key] += '#' + part
    return fields


def alerts_eventid(actionid,subject):
    """Fetch one alert by action id and subject and return its fields.

    Args:
        actionid: id of the Zabbix action, compared against alerts.actionid.
        subject: alert subject, compared against alerts.subject.

    Returns:
        A dict built from the alert's message text, or None when the
        query fails (the MySQL error is printed) or no row matches.
    """
    try:
        # host: Zabbix database IP; user/passwd: database credentials;
        # db: name of the Zabbix database.
        conn = MySQLdb.connect(host='*****', user='*****', passwd='******',
                               db='*****', port=3306)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("SET NAMES utf8")
                # Parameterized so that the subject cannot alter the query.
                cursor.execute(
                    "SELECT message FROM alerts "
                    "WHERE actionid = %s AND subject = %s LIMIT 1",
                    (actionid, subject))
                row = cursor.fetchone()
            finally:
                cursor.close()
        finally:
            conn.close()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        return None
    if row is None:
        return None
    return _parse_message(row[0])
②operation.py --操作函数:接收dbread.py返回的告警信息,进行告警合并,告警压缩处理。并返回处理结果
#!/usr/bin/python
# coding: utf-8
"""operation.py -- merge alerts by trigger key and compress them into messages."""
import datetime
import time

# Groups smaller than this produce no merged message.
_MIN_HOSTS = 3
# Groups larger than this get the short "mass event" message without a host list.
_MAX_LISTED_HOSTS = 6
_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def _group_by_triggerkey(originallist, triggervalue):
    """Group the alerts whose 'triggervalue' matches, by 'triggerkey'.

    Groups appear in first-seen key order; alerts keep their input order.
    """
    groups = {}
    key_order = []
    for alert in originallist:
        if alert['triggervalue'] != triggervalue:
            continue
        key = alert['triggerkey']
        if key not in groups:
            groups[key] = []
            key_order.append(key)
        groups[key].append(alert)
    return [groups[key] for key in key_order]


def _compress(alarminfo, title, bulk_notice, host_label, item_label):
    """Build one text message per sufficiently large alert group.

    Args:
        alarminfo: list of groups, each a list of alert dicts with the keys
            'triggername', 'hostname', 'ipaddress' and 'hostgroup'.
        title: first line of every message.
        bulk_notice: extra text used only for groups above _MAX_LISTED_HOSTS.
        host_label: label placed before the host count.
        item_label: label placed before the trigger name.

    Returns:
        List of message strings; groups below _MIN_HOSTS yield nothing.
    """
    currenttime = time.strftime(_TIME_FORMAT)
    messagelist = []
    for info in alarminfo:
        infonum = len(info)
        if infonum < _MIN_HOSTS:
            continue
        # The name reported for the group is that of its last alert.
        triggername = info[-1]['triggername']
        count_line = host_label + str(infonum) + '台\n'
        tail = item_label + triggername + '\n' + '分析时间:\n' + currenttime
        if infonum > _MAX_LISTED_HOSTS:
            messagelist.append(title + bulk_notice + count_line + tail)
            continue
        hostlist = ''.join(
            host['hostname'] + ':' + host['ipaddress'] + '\n' for host in info)
        # Compare whole group names: a substring test would drop "web"
        # once "webservers" had been listed.
        seen_groups = []
        for host in info:
            group = host['hostgroup']
            if group and group not in seen_groups:
                seen_groups.append(group)
        hostgroup = ''.join(group + '\n' for group in seen_groups)
        messagelist.append(
            title + count_line + hostlist + '涉及主机组:\n' + hostgroup + tail)
    return messagelist


# Merge problem alerts
def mergeproblem(originallist):
    """Return alerts with triggervalue '1', grouped by trigger key."""
    return _group_by_triggerkey(originallist, '1')


# Merge recovery alerts
def mergenormal(originallist):
    """Return alerts with triggervalue '0', grouped by trigger key."""
    return _group_by_triggerkey(originallist, '0')


# Compress problem alerts
def compressproblem(alarminfo):
    """Turn grouped problem alerts into problem notification messages."""
    return _compress(
        alarminfo,
        '告警◕﹏◕\n',
        '当前存在大量相同告警项,可能发生网络故障!\n详情请查看云警系统!\n',
        '告警主机:',
        '告警项目:\n')


# Compress recovery alerts
def compressnormal(alarminfo):
    """Turn grouped recovery alerts into recovery notification messages."""
    return _compress(
        alarminfo,
        '恢复◕‿◕\n',
        '大量主机已经恢复!\n详情请查看监控系统!\n',
        '恢复主机:',
        '恢复项目:\n')
③weixin.py--发送微信告警通知函数:将处理结果发送至指定运维人员
#!/usr/bin/python
# coding: utf-8
# jipeng 2016.8.11
# Python 2 script that sends Zabbix alert text to WeChat.
# Places marked ***** must be edited for your environment.
"""weixin.py -- send a text message through the WeChat enterprise API."""
import urllib,urllib2
import json
import sys


def gettoken():
    """Request an access token from the WeChat enterprise API.

    Returns:
        The 'access_token' value from the JSON reply.

    Raises:
        RuntimeError: if the reply contains no 'access_token'; the decoded
            reply is included so the cause is visible in the log.
    """
    CropID='*****'
    Secret='*****'
    # URL-encode the credentials so special characters cannot break the query.
    query = urllib.urlencode([('corpid', CropID), ('corpsecret', Secret)])
    GURL = "https://qyapi.weixin.qq.com/cgi-bin/gettoken?" + query
    token_file = urllib2.urlopen(GURL)
    try:
        token_data = token_file.read().decode('utf-8')
    finally:
        token_file.close()
    token_json = json.loads(token_data)
    if 'access_token' not in token_json:
        raise RuntimeError("gettoken failed: %r" % (token_json,))
    return token_json['access_token']


def senddata(access_token,user,content):
    """Send *content* as a text message to *user* and print the API reply.

    Args:
        access_token: token returned by gettoken().
        user: recipient account, placed in the 'touser' field.
        content: message text.
    """
    PURL = "https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=" + access_token
    send_values = {
        # User account in the enterprise account, configured in the Zabbix
        # user's Media; per the original author, a misconfigured value makes
        # the message go out by department instead.
        "touser": user,
        # "toparty": "2",   # department id in the enterprise account
        "msgtype": "text",      # message type
        "agentid": "*****",     # id of the application in the enterprise account
        "text": {
            "content": content
        },
        "safe": "0"
    }
    send_data = json.dumps(send_values, ensure_ascii=False)
    # json.dumps may return unicode when an input is unicode; the request
    # body has to be a byte string.
    if not isinstance(send_data, bytes):
        send_data = send_data.encode('utf-8')
    send_request = urllib2.Request(PURL, send_data)
    reply = urllib2.urlopen(send_request)
    try:
        response = json.loads(reply.read())
    finally:
        reply.close()
    print(str(response))


if __name__ == '__main__':
    user = str(sys.argv[1])      # first argument passed by Zabbix
    content = str(sys.argv[3])   # third argument passed by Zabbix
    accesstoken = gettoken()
    senddata(accesstoken, user, content)
④allpolice.py--综合函数:将①②③整合起来,定时每1分钟执行一次
#!/usr/bin/env python
# coding: utf-8
"""allpolice.py -- pull queued alert subjects from Redis, merge them, notify.

Each run reads every key from Redis, deletes those keys, looks up the
matching alerts, merges and compresses them, and sends the resulting
messages to the configured users.
"""
import MySQLdb
import redis
import sys
from dbread import *
from operation import *
from weixin import *
import datetime,time

# WeChat accounts of the operators who receive the merged messages.
users=['*****']
# Id of the Zabbix action used for alert convergence (shown in the Zabbix
# UI). Replace the placeholder inside the quotes; it is passed on as a string.
actionid='*****'


def main():
    """Run one convergence pass."""
    sendtime = time.strftime("%Y-%m-%d %H:%M:%S")

    # Connect to Redis and read all queued subjects.
    r = redis.StrictRedis(host='*****', port=6379)
    subjectlist = r.keys()
    if not subjectlist:
        return
    # Keys are removed before processing, so a later failure does not
    # cause the same alerts to be picked up again on the next run.
    r.delete(*subjectlist)

    # Load the alert details; alerts_eventid may return None, and such
    # entries must not reach the merge step.
    originallist = []
    for subject in subjectlist:
        alert = alerts_eventid(str(actionid), subject)
        if alert is not None:
            originallist.append(alert)

    # Problem messages first, then recovery messages.
    messagelist = compressproblem(mergeproblem(originallist))
    messagelist += compressnormal(mergenormal(originallist))
    if not messagelist:
        return

    # Fetch the token only when there is something to send.
    accesstoken = gettoken()
    for content in messagelist:
        print(sendtime)
        for user in users:
            senddata(accesstoken, user, content)


if __name__ == '__main__':
    main()
# 文件分布
mkdir -p /data/police
├── police
│ ├── dbread.py
│ ├── allpolice.py
│ ├── operation.py
│ ├── send.log
│ └── weixin.py
四、定时任务
crontab -e
#告警收敛定时检测#
*/1 * * * * python /data/police/allpolice.py >> /data/police/send.log
五、最终效果
发布者:LJH,转发请注明出处:https://www.ljh.cool/41215.html