redis基础监控
redis监控
redis server 监控:
redis存活判断:ping判断,如果指定时间返回PONG表示存活,否则redis不能响应请求,可能阻塞或死亡
机器端口检查:nc 判断端口是否正常。
连接数:connected_clients >5000 时告警
连接数使用率:connected_clients/maxclients >90% 告警
被 list 阻塞命令(如 BLPOP/BRPOP)阻塞的连接个数:blocked_clients > 0 时告警
redis内存使用率:used_memory / maxmemory(未设置 maxmemory 时以 total_system_memory 为分母)> 80% 时告警
因为最大内存容量限制而被驱逐(evict)的键数量:evicted_keys > 0 时告警,说明内存使用已达到设置的最大内存
因为最大客户端数量限制而被拒绝的连接请求数量:rejected_connections > 0 时告警
请求键的命中率:keyspace_hits(查找数据库键成功的次数)/ 总查询次数(keyspace_hits + keyspace_misses)< 50% 时告警(脚本中该项检查目前处于注释状态)
redis_cluster 监控
集群健康状态:cluster_state不为OK则告警
集群的节点数:cluster_known_nodes 表示集群中redis节点的个数,与 nodes_ip.txt 中配置的节点数不一致时告警。
检测下线的数据槽slots个数:集群正常运行时,cluster_slots_fail 应该为0. 如果大于0说明集群有slot存在故障
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 功能说明: redis监控
# 创建者: zhouwj
# 创建时间: 2019/12/03
# 修改历史:
import redis
import sys
import subprocess
import json
import logging
import time
import requests
import logging.handlers
# Module-level list; the __main__ loop appends [key, value] pairs parsed from
# the CLUSTER INFO reply to it.
data=[]
# Path of the log file handed to the rotating handler in loghandler().
log_filename='/home/zhouwj/zhouwj/bin/redis_monitor/redis_monitor.log'
# Earlier logging.basicConfig-based setup, left disabled:
#logging.basicConfig(
# filename='/home/zhouwj/zhouwj/bin/redis_monitor/redis_monitor.log',
# level=logging.INFO,
# format='%(levelname)s:%(asctime)s:%(message)s')
# Earlier node-list loading, left disabled (nodes_ip() reads the file now):
#with open('/home/zhouwj/zhouwj/bin/redis_monitor/nodes_ip.txt','rt') as f:
# ip_list=f.read().splitlines()
# Enter redis cluster mode; on an exception, record it in the log and stop
# the script.
# NOTE(review): this comment does not describe the assignments below -- it
# looks like a leftover from an earlier layout; confirm before relying on it.
# Timestamp prefix for alarm messages; evaluated once, when the module loads.
localtime = time.strftime("[%H:%M:%S]", time.localtime())
# Passed to send_alarm(); the HTTP POST that would use it is commented out.
headers={"Content-Type": "application/json"}
# Separator placed between the timestamp and the alarm text.
dl="\n-------------------------------------\n"
# logging
def loghandler(name):
    """Return the logger called *name*, writing to a daily-rotated log file.

    The logger is set to INFO and gets one ``TimedRotatingFileHandler`` on
    the module-level ``log_filename`` that rolls over once a day and keeps
    seven old files.  Calling the function again for the same name returns
    the already-configured logger instead of stacking a second file handler
    (which would write every record twice).

    Args:
        name: Logger name; it also appears in every formatted record.

    Returns:
        The configured ``logging.Logger``.
    """
    import re  # local import: only needed for the rotation pattern below

    # Initialise logging (adds a default handler to the root logger if it
    # has none yet).
    logging.basicConfig()
    log = logging.getLogger(name)
    # Set the log level.
    log.setLevel(logging.INFO)

    for existing in log.handlers:
        if isinstance(existing, logging.handlers.TimedRotatingFileHandler):
            return log

    # Rotate once per day and keep 7 old log files.
    timefilehandler = logging.handlers.TimedRotatingFileHandler(
        filename=log_filename, when='D', interval=1, backupCount=7)
    timefilehandler.suffix = "%Y-%m-%d.log"
    # The handler only prunes old files whose suffix matches ``extMatch``.
    # The suffix above carries a ".log" tail, so the pattern has to accept
    # it as well; otherwise backupCount may never delete anything.
    timefilehandler.extMatch = re.compile(r"^\d{4}-\d{2}-\d{2}\.log$")

    # Output format of each log record.
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s: %(name)s %(message)s')
    timefilehandler.setFormatter(formatter)

    # Attach the handler to the logger.
    log.addHandler(timefilehandler)
    return log
# Call the function to enable log output.
# Module-wide logger (named 'redis') used by the alarm and check functions.
log=loghandler('redis')
def nodes_ip(path='/home/zhouwj/zhouwj/bin/redis_monitor/nodes_ip.txt'):
    """Yield the whitespace-separated fields of every node entry in *path*.

    Blank lines are skipped: they used to be yielded as empty lists, which
    broke the four-way unpacking (ip, mode, password, node count) done by
    the main loop.

    Args:
        path: Node list file; defaults to the location the script has
            always used.

    Yields:
        One list of string fields per non-empty line.
    """
    # Read everything first so the file is closed before the (slow) checks
    # that consume this generator start running.
    with open(path, 'rt') as f:
        rows = [line.split() for line in f]
    for fields in rows:
        if fields:
            yield fields
# Connection count; alarm when > 5000.
def redis_connections():
    """Return ``connected_clients`` from the module-level ``info`` snapshot.

    Returns 0 when the value cannot be read.
    """
    try:
        clients = info['connected_clients']
    except Exception:
        return 0
    return clients
# Redis connection usage rate.
def redis_connections_usage():
    """Return connected clients as a percentage of ``maxclients``.

    The result is always a string formatted with two decimals.  The failure
    path used to return the integer 0, which made the caller's
    ``.strip("%")`` raise AttributeError; it now returns ``"0.00"`` so both
    paths have the same type.

    Returns:
        ``"<percent>"`` such as ``"12.50"``, or ``"0.00"`` when the rate
        cannot be computed (missing config value, zero limit, ...).
    """
    try:
        curr_connections = redis_connections()
        max_clients = parse_config('maxclients')
        rate = float(curr_connections) / float(max_clients)
    except Exception:
        return "0.00"
    return "%.2f" % (rate * 100)
# Redis memory in use.
def redis_used_memory():
    """Print and return ``used_memory`` from the module-level ``info``.

    Returns 0 when the value cannot be read.
    """
    try:
        used = info['used_memory']
        print(used)
        return used
    except Exception:
        return 0
# Redis memory usage rate.
def redis_memory_usage():
    """Return used memory as a percentage string with two decimals.

    The denominator is ``maxmemory`` when it is set (non-zero); otherwise
    ``total_system_memory``.  ``total_system_memory`` is only looked up when
    it is actually needed, so a snapshot without that field no longer
    discards a perfectly valid ``maxmemory``-based result.

    Returns:
        ``"<percent>"`` such as ``"42.10"``, or ``"0.00"`` when the rate
        cannot be computed.  The fallback is a string so both paths have the
        same type; callers convert it with ``float()``.
    """
    try:
        used_memory = info['used_memory']
        max_memory = info['maxmemory']
        if max_memory:
            limit = max_memory
        else:
            limit = info['total_system_memory']
        rate = float(used_memory) / float(limit)
    except Exception:
        return "0.00"
    return "%.2f" % (rate * 100)
# Number of rejected connections.
def rejected_connections():
    """Return ``rejected_connections`` from the module-level ``info``.

    Returns 999 when the value cannot be read, which is above the alarm
    threshold used by ``alarm``.
    """
    try:
        rejected = info['rejected_connections']
    except Exception:
        return 999
    return rejected
# Number of keys removed (evicted) since the server started.
def evicted_keys():
    """Return ``evicted_keys`` from the module-level ``info`` snapshot.

    Returns 999 when the value cannot be read, which is above the alarm
    threshold used by ``alarm``.
    """
    try:
        evicted = info['evicted_keys']
    except Exception:
        return 999
    return evicted
# Number of clients currently waiting on a blocking call.
def blocked_clients():
    """Return ``blocked_clients`` from the module-level ``info`` snapshot.

    Returns 0 when the value cannot be read.
    """
    try:
        blocked = info['blocked_clients']
    except Exception:
        return 0
    return blocked
# Redis OPS: the server's near-real-time count of commands run per second.
def ops(self=None):
    """Return ``instantaneous_ops_per_sec`` from the module-level ``info``.

    Args:
        self: Unused.  This is a module-level function, so the parameter is
            kept only for callers that already pass something; it now
            defaults to None so a plain ``ops()`` call works instead of
            raising TypeError.

    Returns:
        The reported operations-per-second value, or 0 when it cannot be
        read.
    """
    try:
        return info['instantaneous_ops_per_sec']
    except Exception:
        return 0
# Key lookup hit rate; alarm when it drops below 50%.
def hitRate():
    """Return the keyspace hit rate as a percentage string with two decimals.

    Computed as ``keyspace_hits / (keyspace_hits + keyspace_misses)`` from
    the module-level ``info`` snapshot.  The fallback is the string
    ``"0.00"`` rather than the integer 0, so the result has one type on
    every path and supports the ``.strip("%")`` call used by the (currently
    disabled) hit-rate check.

    Returns:
        ``"<percent>"`` such as ``"75.00"``, or ``"0.00"`` when the rate
        cannot be computed (missing fields, or no lookups recorded yet).
    """
    try:
        misses = info['keyspace_misses']
        hits = info['keyspace_hits']
        rate = float(hits) / float(int(hits) + int(misses))
    except Exception:
        return "0.00"
    return "%.2f" % (rate * 100)
# Fetch the configured maximum number of connections.
def parse_config(type):
    """Return the server configuration value named *type*.

    The value is fetched with ``config_get`` on the module-level
    ``redisconn`` connection.  Returns None on any failure.
    """
    try:
        settings = redisconn.config_get(type)
        value = settings[type]
    except Exception:
        return None
    return value
def send_alarm(localtime, dl, headers, param, ip):
    """Log an alarm and pass it to the external ``msalarm`` command.

    Args:
        localtime: Timestamp prefix for the logged message.
        dl: Separator inserted between the timestamp and the alarm text.
        headers: Unused; kept for interface compatibility.  It belonged to
            a WeChat Work webhook POST that was already commented out.
        param: Alarm detail text, passed to ``msalarm -p``.
        ip: Host the alarm refers to, passed to ``msalarm -h``.

    Returns:
        None.
    """
    MSG = localtime + dl + "DCR-db_error:" + param
    alarm_name = 'DCR_db_error'

    # The webhook delivery was disabled in the original script; its unused
    # URL/key/payload locals are dropped here (the URL path was also
    # misspelled "webhok").  If it is re-enabled, load the key from
    # configuration and build the payload with json.dumps.

    # Logged exactly as before so existing log searches keep working.
    alarm_cmd = 'msalarm -h %s -n %s -p %s' % (ip, alarm_name, param)
    log.error(MSG)
    log.error(alarm_cmd)

    # Run with an argument list instead of a shell string: the detail text
    # then reaches msalarm as a single argument even if it contains spaces
    # or shell metacharacters.
    argv = ['msalarm', '-h', ip, '-n', alarm_name, '-p', param]
    try:
        subprocess.call(argv)
    except OSError as err:
        # Without a shell a missing binary raises instead of returning an
        # exit status; keep alarm delivery best-effort as it was.
        log.error("msalarm could not be started: %s", err)
def _report(ip, metric, value, breached):
    """Raise an alarm for *metric* when *breached*, else log it as normal."""
    if breached:
        param = ip + ":" + metric + ":" + str(value)
        send_alarm(localtime, dl, headers, param, ip)
        log.error(param)
    else:
        log.info("%s " + metric + ": normal", ip)


def alarm(ip):
    """Run every single-node health check for *ip* and alarm on breaches.

    Reads the module-level ``redisconn`` connection and ``info`` snapshot
    (through the metric helper functions).  Checks, in order: ping, TCP port
    12201, connection count (> 5000), blocked clients (> 0), connection
    usage (> 90%), memory usage (> 80%), evicted keys (> 0) and rejected
    connections (> 0).  Each metric is evaluated once, so the value in the
    alarm text is the value that was compared.

    Args:
        ip: Address of the node being checked; used in messages and for the
            port probe.
    """
    # A ping that raises is reported as a failed ping rather than aborting
    # the remaining checks.
    try:
        pong = redisconn.ping()
    except Exception:
        pong = False
    _report(ip, 'redis_ping', pong, str(pong) != 'True')

    _report(ip, 'check_alive', 'port_fail', check_alive(ip, 12201) != 0)

    connections = redis_connections()
    _report(ip, 'redis_connections', connections, connections > 5000)

    blocked = blocked_clients()
    _report(ip, 'blocked_clients', blocked, blocked > 0)

    # str() guards against a non-string fallback value, on which .strip()
    # would raise AttributeError.
    usage = redis_connections_usage()
    _report(ip, 'redis_connections_usage', usage,
            float(str(usage).strip("%")) > 90)

    memory = redis_memory_usage()
    _report(ip, 'redis_memory_usage', memory, float(memory) > 80)

    evicted = evicted_keys()
    _report(ip, 'evicted_keys', evicted, evicted > 0)

    rejected = rejected_connections()
    _report(ip, 'rejected_connections', rejected, rejected > 0)

    # Hit-rate check, disabled in the original script:
    # if float(hitRate().strip("%")) < 50 :
    #     param=ip+":"+'hitRate:'+str(hitRate())
    #     send_alarm(localtime,dl,headers,param,ip)
    #     log.error(param)
    # else:
    #     log.info("%s is single_mode: normal",ip)
def check_alive(host, port, timeout=3):
    """Probe a TCP port with ``nc -z`` and return the exit status.

    The command is run from an argument list, not a shell string, so *host*
    and *port* are never interpreted by a shell.  A connect timeout is
    passed so an unresponsive host cannot stall the monitor indefinitely.

    Args:
        host: Host name or address to probe.
        port: TCP port to probe.
        timeout: Seconds passed to ``nc -w``.

    Returns:
        0 when ``nc`` reports the port reachable, a non-zero status
        otherwise (127 when ``nc`` itself cannot be started, mirroring the
        shell's "command not found" status).
    """
    import os  # local import: only needed for os.devnull

    argv = ['nc', '-z', '-w', str(timeout), str(host), str(port)]
    with open(os.devnull, 'w') as devnull:
        try:
            return subprocess.call(argv, stdout=devnull, stderr=devnull)
        except OSError:
            return 127
# Helper functions that interpret fields of the cluster info output.
def clusterstatus(var):
    """Map a ``cluster_state`` value to 0 (``'ok'``) or 1 (anything else)."""
    return 0 if var == 'ok' else 1


def clusterslotsfail(var):
    """Return the ``cluster_slots_fail`` value unchanged."""
    return var


def clusterknownnodes(var):
    """Return the ``cluster_known_nodes`` value unchanged."""
    return var
def _parse_cluster_info(raw):
    """Turn a raw CLUSTER INFO reply into a dict of field name -> value."""
    # On Python 3 the reply may arrive as bytes; on Python 2 bytes is str
    # and this branch is skipped.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        raw = raw.decode('utf-8')
    fields = {}
    for line in raw.splitlines():
        key, sep, value = line.partition(':')
        if sep:
            fields[key.strip()] = value.strip()
    return fields


def _check_cluster(ip, nodes, fields):
    """Alarm on cluster state, failed slots and known-node count for *ip*."""
    # A missing field is treated as a failure instead of reusing a value
    # left over from a previously checked node.
    clusters_status = clusterstatus(fields.get('cluster_state'))
    if str(clusters_status) == '0':
        log.info("%s clusters_status: normal ", ip)
    else:
        param = 'clusters_status:' + ip + ":" + str(clusters_status)
        send_alarm(localtime, dl, headers, param, ip)

    clusters_lotsfail = clusterslotsfail(
        fields.get('cluster_slots_fail', 'unknown'))
    if clusters_lotsfail == '0':
        log.info("%s clusters_lotsfail: normal ", ip)
    else:
        param = 'clusters_lotsfail:' + ip + ":" + str(clusters_lotsfail)
        send_alarm(localtime, dl, headers, param, ip)

    clusters_knownnodes = clusterknownnodes(
        fields.get('cluster_known_nodes', 'unknown'))
    if clusters_knownnodes == nodes:
        log.info("%s clusters_knownnodes:normal ", ip)
    else:
        param = 'clusters_knownnodes:' + ip + ":" + str(clusters_knownnodes)
        send_alarm(localtime, dl, headers, param, ip)


# Entry point: check every node listed by nodes_ip().  ``redisconn`` and
# ``info`` are assigned at module level on purpose -- the metric helper
# functions read them as globals.
if __name__ == '__main__':
    for x in nodes_ip():
        if len(x) != 4:
            log.error("skipping malformed nodes_ip.txt entry: %r", x)
            continue
        ip, mode, keyword, nodes = x

        # Creating the client does not open a connection; the first command
        # does.  info() therefore belongs inside the try so an unreachable
        # node raises the alarm instead of aborting the whole run.
        try:
            redisconn = redis.StrictRedis(host=ip, port=12201,
                                          password=keyword,
                                          socket_connect_timeout=1)
            info = redisconn.info()
        except Exception:
            param = ip + ":" "连接失败"
            send_alarm(localtime, dl, headers, param, ip)
            continue

        alarm(ip)

        if mode != 'cluster':
            continue

        try:
            cluster_fields = _parse_cluster_info(
                redisconn.execute_command('cluster', 'info'))
        except Exception:
            param = ip + ":" "集群查询失败"
            send_alarm(localtime, dl, headers, param, ip)
            continue

        _check_cluster(ip, nodes, cluster_fields)
nodes_ip.txt 格式(每行四列,以空白分隔;脚本按 ip、模式、密码、节点数 依次读取):
ip single/cluster 密码 null/集群节点数
1.1.1.1 single <密码> null
2.2.2.2 cluster <密码> 100
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!