Redis版本:4.0.9
运行环境:Linux 3.10.107 x86_64 gcc_version:4.8.5
结论:这是Redis的一个BUG,已由作者antirez在4.0.11版本中修复
现象:
1) top显示
2) 执行Redis的info命令时直接卡住,没有任何响应
3) 集群通讯端口大量的“CLOSE_WAIT”
4) 日志文件大量的“Bad message length or signature received from Cluster bus”
5) 物理内存和虚拟内存均占用不高(配置最大内存为5G,实际占用物理内存才4M多)
6) 通过其它正常节点查看,该故障节点处于“fail”状态
推测:发生了死循环。
GDB分析:
(gdb) bt
#0 je_malloc (size=size@entry=47) at src/jemalloc.c:1425
#1 0x00000000004329ee in zmalloc (size=size@entry=47) at zmalloc.c:98
#2 0x000000000043cea9 in createEmbeddedStringObject (ptr=0x7f5da721b101 "dog:cgi:proj_trans_query_hb", len=27) at object.c:85
#3 0x000000000043cf65 in createStringObject (ptr=ptr@entry=0x7f5da721b101 "dog:cgi:proj_trans_query_hb", len=<optimized out>) at object.c:119
#4 0x000000000044141b in dbRandomKey (db=0x7f5da72d5000) at db.c:236
#5 0x00000000004414c2 in randomkeyCommand (c=0x7f5da6dd9e00) at db.c:498
#6 0x000000000042c03e in call (c=c@entry=0x7f5da6dd9e00, flags=flags@entry=15) at server.c:2229
#7 0x000000000042c6e7 in processCommand (c=0x7f5da6dd9e00) at server.c:2510
#8 0x000000000043b745 in processInputBuffer (c=0x7f5da6dd9e00) at networking.c:1354
#9 0x00000000004267f0 in aeProcessEvents (eventLoop=eventLoop@entry=0x7f5da723a050, flags=flags@entry=11) at ae.c:440
#10 0x0000000000426adb in aeMain (eventLoop=0x7f5da723a050) at ae.c:498
#11 0x00000000004238ef in main (argc=<optimized out>, argv=0x7ffc0451ab58) at server.c:3894
(gdb) f 5
#5 0x00000000004414c2 in randomkeyCommand (c=0x7f5da6dd9e00) at db.c:498
498 db.c: No such file or directory.
(gdb) p *c $7 = {id = 144, fd = 78, db = 0x7f5da72d5000, name = 0x0, querybuf = 0x7f5da70ae285 "", pending_querybuf = 0x7f5da7216743 "", querybuf_peak = 0, argc = 1, argv = 0x7f5da72167f0, cmd = 0x741820 <redisCommandTable+7520>, lastcmd = 0x741820 <redisCommandTable+7520>, reqtype = 2, multibulklen = 0, bulklen = -1, reply = 0x7f5da6d20ea0, reply_bytes = 0, sentlen = 0, ctime = 1542039740, lastinteraction = 1542039740, obuf_soft_limit_reached_time = 0, flags = 0, authenticated = 0, replstate = 0, repl_put_online_on_ack = 0, repldbfd = -1496183376, repldboff = 0, repldbsize = 0, replpreamble = 0x5bdb29d9 <Address 0x5bdb29d9 out of bounds>, read_reploff = 0, reploff = 0, repl_ack_off = 0, repl_ack_time = 0, psync_initial_offset = 1090943882, replid = " 00 00 00 00]177 00 00 00 00 00 00 00 00 00 00216 00 00 00 00 00 00 00377377377377 00 00 00 00 00P-247]177 00 00", slave_listening_port = 0, slave_ip = " 00 00 00 00 00 00 00 00243k!247]177", ' 00' <repeats 14 times>, "]177 00 00@g!247]177 00 00 00 00 00 00 00", slave_capa = 0, mstate = {commands = 0x0, count = 0, minreplicas = -1, minreplicas_timeout = 140040207470240}, btype = 0, bpop = {timeout = 0, keys = 0x7f5da72b2aa0, target = 0x0, numreplicas = 0, reploffset = 0, module_blocked_handle = 0x0}, woff = 0, watched_keys = 0x7f5da6d20e70, pubsub_channels = 0x7f5da72b2b00, pubsub_patterns = 0x7f5da6d20ed0, peerid = 0x0, bufpos = 0, buf = ' 00' <repeats 20 times>, " 23.Kp 00 00 00 00240 16322246]177 00 00 00++247]177 00 00320 16322246]177", ' 00' <repeats 14 times>, "$-1 ", ' 00' <repeats 39 times>, "240*+247]177", ' 00' <repeats 34 times>, "242;223< |