#!/bin/bash while [ 1 ] do job_error_no=`kubectl get pod -n weifeng |grep -i "job"|grep -ci error` if [ $job_error_no -gt 0 ];then ps -fe|grep k8s_job_status_monitor|grep -v grep|awk '{print $2}'|xargs kill -9 echo "k8s job running is not stable " >> /tmp/k8s_job_error_no.log fi sleep 60 done
若k8s集群job状态出现error , 脚本自动kill 掉自己的 montior进程, 通过阿里云的云监控进程监控来触发报警
阿里云云监控 进程监控 文档 https://www.cnblogs.com/weifeng1463/p/11591796.html