• openshift 容器云从入门到崩溃之九《容器监控-报警》


    容器状态监控

    主要是监控POD的状态,包括重启、不健康等等,这些状态k8s api本身会报出来,再配合zabbix报警

    导入zabbix模板关联上oc master主机

    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Zabbix export (format version 3.2) that defines the "OC Pods" template:
         one discovery rule with three item prototypes and two trigger prototypes
         per discovered {#POD_NAME}. Every key is of the form oc.pod.status[...],
         which the article maps to oc_pod_monitor.sh through a UserParameter. -->
    <zabbix_export>
        <version>3.2</version>
        <date>2019-02-27T07:33:05Z</date>
        <groups>
            <group>
                <name>Templates</name>
            </group>
        </groups>
        <templates>
            <template>
                <template>OC Pods</template>
                <name>OC Pods</name>
                <description/>
                <groups>
                    <group>
                        <name>Templates</name>
                    </group>
                </groups>
                <!-- Applications referenced by the item prototypes below. -->
                <applications>
                    <application>
                        <name>restartCount</name>
                    </application>
                    <application>
                        <name>RunningStatus</name>
                    </application>
                </applications>
                <!-- No static items: everything is created through discovery. -->
                <items/>
                <discovery_rules>
                    <!-- Discovery rule: calls oc.pod.status[discover,discover] with a
                         delay of 300 and a lifetime of 7. The script's discover mode is
                         expected to return one {#POD_NAME} value per pod in the form
                         namespace=pod. -->
                    <discovery_rule>
                        <name>OC Pods Discover</name>
                        <type>0</type>
                        <snmp_community/>
                        <snmp_oid/>
                        <key>oc.pod.status[discover,discover]</key>
                        <delay>300</delay>
                        <status>0</status>
                        <allowed_hosts/>
                        <snmpv3_contextname/>
                        <snmpv3_securityname/>
                        <snmpv3_securitylevel>0</snmpv3_securitylevel>
                        <snmpv3_authprotocol>0</snmpv3_authprotocol>
                        <snmpv3_authpassphrase/>
                        <snmpv3_privprotocol>0</snmpv3_privprotocol>
                        <snmpv3_privpassphrase/>
                        <delay_flex/>
                        <params/>
                        <ipmi_sensor/>
                        <authtype>0</authtype>
                        <username/>
                        <password/>
                        <publickey/>
                        <privatekey/>
                        <port/>
                        <!-- Empty filter: no conditions are applied to discovered pods. -->
                        <filter>
                            <evaltype>0</evaltype>
                            <formula/>
                            <conditions/>
                        </filter>
                        <lifetime>7</lifetime>
                        <description/>
                        <item_prototypes>
                            <!-- Get Status: key ...,get_status], delay 300, application
                                 RunningStatus. In oc_pod_monitor.sh this mode downloads the
                                 pod status into a per-pod file that the restarts and running
                                 modes read afterwards. -->
                            <item_prototype>
                                <name>Pod {#POD_NAME} Get Status</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.pod.status[{#POD_NAME},get_status]</key>
                                <delay>300</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>4</value_type>
                                <allowed_hosts/>
                                <units/>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>RunningStatus</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <!-- Restarts: key ...,restarts], delay 300, application
                                 restartCount. The script prints a string that begins with
                                 "Warning" or "Normal"; the restart trigger below matches on
                                 "Warning". -->
                            <item_prototype>
                                <name>Pod {#POD_NAME} Restarts</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.pod.status[{#POD_NAME},restarts]</key>
                                <delay>300</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>4</value_type>
                                <allowed_hosts/>
                                <units/>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>restartCount</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <!-- Running: key ...,running], delay 300, application
                                 RunningStatus. The script prints the pod phase followed by
                                 "_true" or "_false" (or one of its special strings such as
                                 "Pod deleted"). -->
                            <item_prototype>
                                <name>Pod {#POD_NAME} Running</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.pod.status[{#POD_NAME},running]</key>
                                <delay>300</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>4</value_type>
                                <allowed_hosts/>
                                <units/>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>RunningStatus</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                        </item_prototypes>
                        <trigger_prototypes>
                            <!-- "Not Running": both str() checks on the running item must
                                 equal 0, i.e. the value contains neither "Running_true" nor
                                 "Pod deleted". The expression text spans three lines joined
                                 by carriage-return references; keep it exactly as exported. -->
                            <trigger_prototype>
                                <expression>{OC Pods:oc.pod.status[{#POD_NAME},running].str(Running_true)}=0&#13;
    and&#13;
    {OC Pods:oc.pod.status[{#POD_NAME},running].str(Pod deleted)}=0</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Pod {#POD_NAME} Not Running</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>0</status>
                                <priority>1</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                            <!-- "restarted Warning": problem when the restarts value
                                 contains "Warning". A separate recovery expression is used
                                 (recovery_mode 1): str(Warning,#3)=0, presumably meaning none
                                 of the last three values contains "Warning"; confirm against
                                 the Zabbix str() documentation. -->
                            <trigger_prototype>
                                <expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning)}=1</expression>
                                <recovery_mode>1</recovery_mode>
                                <recovery_expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning,#3)}=0</recovery_expression>
                                <name>Pod {#POD_NAME} restarted Warning</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>0</status>
                                <priority>1</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                        </trigger_prototypes>
                        <graph_prototypes/>
                        <host_prototypes/>
                    </discovery_rule>
                </discovery_rules>
                <httptests/>
                <macros/>
                <templates/>
                <screens/>
            </template>
        </templates>
    </zabbix_export>

    zabbix客户端配置

    修改zabbix_agentd.conf

    Timeout=30
    UserParameter=oc.pod.status[*],/data/app/zabbix/etc/oc_pod_monitor.sh $1 $2

    oc_pod_monitor.sh内容

    #!/bin/bash
    #
    # oc_pod_monitor.sh - Zabbix UserParameter helper for OpenShift pod monitoring.
    #
    # Usage: oc_pod_monitor.sh <namespace=pod|discover> <discover|get_status|restarts|running>
    #
    #   discover    refresh the cached pod list and print Zabbix low-level
    #               discovery JSON with one {#POD_NAME} entry ("namespace=pod")
    #               per pod whose name does not contain build, deploy or debug
    #   get_status  download the pod status into a per-pod cache file
    #   restarts    print "Warning ..." when the summed restartCount grew since the
    #               previous call (or the pod is seen for the first time),
    #               otherwise "Normal ..."
    #   running     print "<phase>_true" or "<phase>_false" (_false when any
    #               container reports ready=false)
    #
    # The script always exits 0; the result is carried by the printed string,
    # which the Zabbix triggers match with str().
    # Requires curl, jq and awk.
    TOKEN=""      # bearer token of an account allowed to read pods (fill in)
    ENDPOINT=""   # API server host[:port] (fill in)
    POD_KEY="$1"
    # Item keys arrive as "namespace=pod"; keep only what follows the last "=".
    POD_NAME="${POD_KEY##*=}"
    Monitoring_type="$2"
    WORKSPACE="/data/tmp/oc_monitor"
    ALL_PODS="$WORKSPACE/all_pods.json"
    mkdir -p "$WORKSPACE"
    
    # Print the sum of the numbers stored one per line in file $1 (0 when empty).
    sum_counts() {
      awk '{ total += $1 } END { print total + 0 }' "$1"
    }
    
    # Resolve the pod's namespace from the cached pod list. The lookup is an exact
    # match on the pod name (and on the namespace taken from the key when present),
    # so pods whose names merely contain this name cannot be picked up by mistake.
    NAMESPACE=""
    if [ "$POD_NAME" != "discover" ]; then
      KEY_NAMESPACE=""
      case "$POD_KEY" in
        *=*) KEY_NAMESPACE="${POD_KEY%%=*}" ;;
      esac
      if [ -s "$ALL_PODS" ]; then
        NAMESPACE="$(jq -r --arg name "$POD_NAME" --arg ns "$KEY_NAMESPACE" \
          '.items[] | .metadata | select(.name == $name) | select($ns == "" or .namespace == $ns) | .namespace' \
          "$ALL_PODS" 2>/dev/null | head -n 1)"
      fi
      # A pod missing from the cached list is reported as deleted.
      if [ -z "$NAMESPACE" ]; then
        echo "Pod deleted"
        exit 0
      fi
    fi
    
    STATUS_FILE="$WORKSPACE/${NAMESPACE}-${POD_NAME}.status"
    RESTART_FILE="$WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount"
    
    # Modes that talk to the API server.
    case "$Monitoring_type" in
       discover)
         # Download to a temporary file first so that checks running in parallel
         # never read a half-written pod list.
         # NOTE(review): -k disables TLS certificate verification while a bearer
         # token is being sent; prefer --cacert with the cluster CA.
         TMP_PODS="$ALL_PODS.tmp.$$"
         curl -k \
           -H "Authorization: Bearer $TOKEN" \
           -H 'Accept: application/json' \
           "https://$ENDPOINT/api/v1/pods" 2>/dev/null > "$TMP_PODS"
    
         # Replace the cache only when the answer really is a pod list; a failed
         # request must not wipe the previous list.
         if [ "$(jq -r '.items | type' "$TMP_PODS" 2>/dev/null)" = "array" ]; then
           mv -f "$TMP_PODS" "$ALL_PODS"
         else
           rm -f "$TMP_PODS"
         fi
    
         # Build the discovery JSON in a single jq pass; jq takes care of quoting.
         if [ -s "$ALL_PODS" ]; then
           jq '{data: [.items[]
                       | .metadata
                       | select((.name | (contains("build") or contains("deploy") or contains("debug"))) | not)
                       | {"{#POD_NAME}": (.namespace + "=" + .name)}]}' "$ALL_PODS" 2>/dev/null
         else
           printf '{\n\t"data":[\n\t]\n}\n'
         fi
         exit 0
      ;;
    
       get_status)
         # Fetch the pod status once so that the other modes can read it from disk.
         TMP_STATUS="$STATUS_FILE.tmp.$$"
         curl -k \
           -H "Authorization: Bearer $TOKEN" \
           -H 'Accept: application/json' \
           "https://${ENDPOINT}/api/v1/namespaces/$NAMESPACE/pods/$POD_NAME/status" 2>/dev/null > "$TMP_STATUS"
         mv -f "$TMP_STATUS" "$STATUS_FILE"
         if grep -q '"code": 404' "$STATUS_FILE"; then
           echo "Pod_Status=NotFound"
         else
           echo "Success"
         fi
         exit 0
       ;;
    esac
    
    # Load the cached pod status (an empty placeholder is created when missing).
    if [ ! -f "$STATUS_FILE" ]; then
       echo "" > "$STATUS_FILE"
    fi
    Pod_Status="$(cat "$STATUS_FILE")"
    
    # Special cases that apply to every remaining mode.
    if [ -z "$Pod_Status" ]; then
       # No status fetched yet: the output contains "Running_true" on purpose so
       # that the "Not Running" trigger does not fire.
       echo "Running_true Pod_Status=Null"
       exit 0
    elif echo "$Pod_Status" | grep -q '"code": 404'; then
       # The pod is gone but all_pods.json has not been refreshed yet.
       echo "Pod_Status=NotFound"
       exit 0
    elif [ "$(echo "$Pod_Status" | jq -r '.status |.phase' 2>/dev/null)" = "Pending" ]; then
       echo "Pending"
       exit 0
    fi
    
    # Modes that evaluate the cached status.
    case "$Monitoring_type" in
       restarts)
         # First sighting of this pod: report it and start counting from zero.
         if [ ! -f "$RESTART_FILE" ]; then
           echo "Warning New Pod"
           echo "0" > "$RESTART_FILE"
           exit 0
         fi
    
         # Total from the previous call; the file holds one count per container.
         Last_state="$(sum_counts "$RESTART_FILE")"
    
         # Store the current per-container counts and total them. Summing every
         # line covers pods with any number of containers.
         echo "$Pod_Status" | jq -r '.status |.containerStatuses |.[] |.restartCount' 2>/dev/null > "$RESTART_FILE"
         Current_state="$(sum_counts "$RESTART_FILE")"
    
         if [ "$Current_state" -gt "$Last_state" ]; then
           Restart_status="Warning restart_count=$Current_state"
         else
           Restart_status="Normal restart_count=$Current_state"
         fi
         echo "$Restart_status"
      ;;
    
       running)
         # Pod phase plus a readiness suffix.
         running_status="$(echo "$Pod_Status" | jq -r '.status |.phase' 2>/dev/null)"
         if echo "$Pod_Status" | jq -r '.status |.containerStatuses |.[] |.ready' 2>/dev/null | grep -q false; then
            Container_status="_false"
         else
            Container_status="_true"
         fi
         echo "${running_status}${Container_status}"
      ;;
    
       *)
         echo "Error parameters"
         exit 0
      ;;
    
    esac
    exit 0

    这样POD重启或者新建都会报出来

    集群NODE节点监控

    主要监控node节点的不健康状态,还有lvm卷容量监控

    导入zabbix模板关联上oc master主机

    <?xml version="1.0" encoding="UTF-8"?>
    <zabbix_export>
        <version>3.2</version>
        <date>2019-02-27T07:47:32Z</date>
        <groups>
            <group>
                <name>Templates</name>
            </group>
        </groups>
        <templates>
            <template>
                <template>OC Node Status</template>
                <name>OC Node Status</name>
                <description/>
                <groups>
                    <group>
                        <name>Templates</name>
                    </group>
                </groups>
                <applications>
                    <application>
                        <name>oc_node</name>
                    </application>
                </applications>
                <items/>
                <discovery_rules>
                    <discovery_rule>
                        <name>OC Nodes Discover</name>
                        <type>0</type>
                        <snmp_community/>
                        <snmp_oid/>
                        <key>oc.node.status[discover,discover]</key>
                        <delay>60</delay>
                        <status>0</status>
                        <allowed_hosts/>
                        <snmpv3_contextname/>
                        <snmpv3_securityname/>
                        <snmpv3_securitylevel>0</snmpv3_securitylevel>
                        <snmpv3_authprotocol>0</snmpv3_authprotocol>
                        <snmpv3_authpassphrase/>
                        <snmpv3_privprotocol>0</snmpv3_privprotocol>
                        <snmpv3_privpassphrase/>
                        <delay_flex/>
                        <params/>
                        <ipmi_sensor/>
                        <authtype>0</authtype>
                        <username/>
                        <password/>
                        <publickey/>
                        <privatekey/>
                        <port/>
                        <filter>
                            <evaltype>0</evaltype>
                            <formula/>
                            <conditions/>
                        </filter>
                        <lifetime>7</lifetime>
                        <description/>
                        <item_prototypes>
                            <item_prototype>
                                <name>Node {#NODE_NAME}  DiskPressure</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},DiskPressure]</key>
                                <delay>30</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>1</status>
                                <value_type>4</value_type>
                                <allowed_hosts/>
                                <units/>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <item_prototype>
                                <!-- Get Status item for a discovered node: key
                                     oc.node.status[{#NODE_NAME},get_status], delay 30. It is
                                     placed in the oc_node application (declared in this
                                     template's applications list) so that it is grouped with
                                     the template's other node items. -->
                                <name>Node {#NODE_NAME} Get Status</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},get_status]</key>
                                <delay>30</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>4</value_type>
                                <allowed_hosts/>
                                <units/>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <item_prototype>
                                <name>Node {#NODE_NAME}  MemoryPressure</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},MemoryPressure]</key>
                                <delay>30</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>1</status>
                                <value_type>4</value_type>
                                <allowed_hosts/>
                                <units/>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <item_prototype>
                                <name>Node {#NODE_NAME}  Ready</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},node_ready]</key>
                                <delay>30</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>4</value_type>
                                <allowed_hosts/>
                                <units/>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <item_prototype>
                                <name>Node {#NODE_NAME} CPU Limits</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},node_resources,cpu_limits]</key>
                                <delay>120</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>%</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <item_prototype>
                                <!-- Per-node item created for every discovered {#NODE_NAME}: CPU requests
                                     allocated on the node, stored with unit %. The key arguments are passed
                                     to oc_node_monitor.sh by the oc.node.status[*] UserParameter; for
                                     node_resources,cpu_requests the script prints the digits found in
                                     field 2 of the cached NODE.resources line. Polled with delay 120
                                     (presumably seconds; confirm against the Zabbix 3.2 export format). -->
                                <name>Node {#NODE_NAME} CPU Requests</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},node_resources,cpu_requests]</key>
                                <delay>120</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>%</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <item_prototype>
                                <!-- Per-node item created for every discovered {#NODE_NAME}: memory limits
                                     allocated on the node, stored with unit %. For
                                     node_resources,memory_limits oc_node_monitor.sh prints the digits found
                                     in field 8 of the cached NODE.resources line. Polled with delay 120
                                     (presumably seconds; confirm against the Zabbix 3.2 export format). -->
                                <name>Node {#NODE_NAME} Memory Limits</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},node_resources,memory_limits]</key>
                                <delay>120</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>%</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <item_prototype>
                                <!-- Per-node item created for every discovered {#NODE_NAME}: memory requests
                                     allocated on the node, stored with unit %. For
                                     node_resources,memory_requests oc_node_monitor.sh prints the digits
                                     found in field 6 of the cached NODE.resources line. Polled with delay
                                     120 (presumably seconds; confirm against the Zabbix 3.2 export format). -->
                                <name>Node {#NODE_NAME} Memory Requests</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},node_resources,memory_requests]</key>
                                <delay>120</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>%</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <item_prototype>
                                <!-- Per-node item created for every discovered {#NODE_NAME}: the text printed
                                     by oc_node_monitor.sh for the OutOfDisk check, i.e. "OutOfDisk_" followed
                                     by the condition status (OutOfDisk_False when healthy or when no cached
                                     status is available). Polled with delay 30.
                                     NOTE(review): status is 1 here while the resource items use 0, so this
                                     prototype is presumably shipped disabled; confirm before relying on it. -->
                                <name>Node {#NODE_NAME} OutOfDisk</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>oc.node.status[{#NODE_NAME},OutOfDisk]</key>
                                <delay>30</delay>
                                <history>7</history>
                                <trends>0</trends>
                                <status>1</status>
                                <value_type>4</value_type>
                                <allowed_hosts/>
                                <units/>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>oc_node</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                        </item_prototypes>
                        <trigger_prototypes>
                            <trigger_prototype>
                                <!-- Per-node trigger: raises a problem when the latest cpu_limits percentage
                                     reported for {#NODE_NAME} is greater than 150. Priority value 1;
                                     manual_close is set to 1. -->
                                <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_limits].last()}&gt;150</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Node {#NODE_NAME} CPU Limits 150%</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>0</status>
                                <priority>1</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                            <trigger_prototype>
                                <!-- Per-node trigger: raises a problem when the latest cpu_requests
                                     percentage reported for {#NODE_NAME} is greater than 100.
                                     Priority value 2; manual_close is set to 1. -->
                                <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_requests].last()}&gt;100</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Node {#NODE_NAME} CPU Requests 100%</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>0</status>
                                <priority>2</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                            <trigger_prototype>
                                <!-- Per-node trigger: compares str(DiskPressure_False) with 0, i.e. (per the
                                     Zabbix str() function) raises a problem when the latest DiskPressure
                                     value lacks the text DiskPressure_False. oc_node_monitor.sh prints that
                                     text when the condition is False or when no cached status is available.
                                     Priority value 5.
                                     NOTE(review): status is 1 here while the resource triggers use 0, so
                                     this prototype is presumably shipped disabled; confirm. -->
                                <expression>{OC Node Status:oc.node.status[{#NODE_NAME},DiskPressure].str(DiskPressure_False)}=0</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Node {#NODE_NAME} DiskPressure</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>1</status>
                                <priority>5</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                            <trigger_prototype>
                                <!-- Per-node trigger: raises a problem when the latest memory_limits
                                     percentage reported for {#NODE_NAME} is greater than 150.
                                     Priority value 1; manual_close is set to 1. -->
                                <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_limits].last()}&gt;150</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Node {#NODE_NAME} Memory Limits 150%</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>0</status>
                                <priority>1</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                            <trigger_prototype>
                                <!-- Per-node trigger: compares str(MemoryPressure_False) with 0, i.e. (per
                                     the Zabbix str() function) raises a problem when the latest
                                     MemoryPressure value lacks the text MemoryPressure_False.
                                     oc_node_monitor.sh prints that text when the condition is False or when
                                     no cached status is available. Priority value 5.
                                     NOTE(review): status is 1 here while the resource triggers use 0, so
                                     this prototype is presumably shipped disabled; confirm. -->
                                <expression>{OC Node Status:oc.node.status[{#NODE_NAME},MemoryPressure].str(MemoryPressure_False)}=0</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Node {#NODE_NAME} MemoryPressure</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>1</status>
                                <priority>5</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                            <trigger_prototype>
                                <!-- Per-node trigger: raises a problem when the latest memory_requests
                                     percentage reported for {#NODE_NAME} is greater than 95.
                                     Priority value 2; manual_close is set to 1. -->
                                <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_requests].last()}&gt;95</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Node {#NODE_NAME} Memory Requests 95%</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>0</status>
                                <priority>2</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                            <trigger_prototype>
                                <!-- Per-node trigger: compares str(Ready_True) with 0, i.e. (per the Zabbix
                                     str() function) raises a problem when the latest node_ready value lacks
                                     the text Ready_True. oc_node_monitor.sh prints Ready_True when the
                                     condition is True or when no cached status is available, and
                                     "Ready_" plus the reported status otherwise. Priority value 5. -->
                                <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_ready].str(Ready_True)}=0</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Node {#NODE_NAME} Not Ready</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>0</status>
                                <priority>5</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                            <trigger_prototype>
                                <!-- Per-node trigger: compares str(OutOfDisk_False) with 0, i.e. (per the
                                     Zabbix str() function) raises a problem when the latest OutOfDisk value
                                     lacks the text OutOfDisk_False. oc_node_monitor.sh prints that text when
                                     the condition is False or when no cached status is available.
                                     Priority value 5.
                                     NOTE(review): status is 1 here, matching the OutOfDisk item prototype,
                                     so this prototype is presumably shipped disabled; confirm. -->
                                <expression>{OC Node Status:oc.node.status[{#NODE_NAME},OutOfDisk].str(OutOfDisk_False)}=0</expression>
                                <recovery_mode>0</recovery_mode>
                                <recovery_expression/>
                                <name>Node {#NODE_NAME} OutOfDisk</name>
                                <correlation_mode>0</correlation_mode>
                                <correlation_tag/>
                                <url/>
                                <status>1</status>
                                <priority>5</priority>
                                <description/>
                                <type>0</type>
                                <manual_close>1</manual_close>
                                <dependencies/>
                                <tags/>
                            </trigger_prototype>
                        </trigger_prototypes>
                        <graph_prototypes/>
                        <host_prototypes/>
                    </discovery_rule>
                </discovery_rules>
                <httptests/>
                <macros/>
                <templates/>
                <screens/>
            </template>
        </templates>
    </zabbix_export>

    zabbix客户端配置

    修改zabbix_agentd.conf

    # Raise the agent's check timeout to 30 (presumably seconds): oc_node_monitor.sh
    # makes HTTPS calls to the cluster API and may sleep for a second.
    Timeout=30
    # Route every oc.node.status[...] item key to the script. $1, $2 and $3 are the key's
    # bracketed arguments: node name (or "discover"), monitoring type, and resource name.
    UserParameter=oc.node.status[*],/data/app/zabbix/etc/oc_node_monitor.sh $1 $2 $3

    oc_node_monitor.sh的内容

    #!/bin/bash
    # Zabbix UserParameter backend for OpenShift node monitoring.
    #
    # Usage: oc_node_monitor.sh <node_name> <monitoring_type> [resource]
    #   discover        print low-level-discovery JSON with one {#NODE_NAME} entry per node
    #   get_status      download the node object from the API into the cache file that
    #                   the condition checks below read
    #   OutOfDisk | MemoryPressure | DiskPressure | node_ready
    #                   print "<Name>_<status>" for that node condition
    #   node_resources  print one allocation percentage from the NODE.resources file
    #                   written by the cron job; [resource] is cpu_requests,
    #                   cpu_limits, memory_requests or memory_limits
    TOKEN=""        # bearer token sent to the API; must be filled in before use
    ENDPOINT=""     # API host[:port] without scheme; must be filled in before use
    NODE_NAME="$1"
    Monitoring_type="$2"
    WORKSPACE="/data/tmp/oc_monitor"
    mkdir -p "$WORKSPACE"

    # GET the API path given as $1 and write the response body to stdout.
    # NOTE(review): -k disables TLS certificate verification; prefer --cacert
    # with the cluster CA once it is available on the agent host.
    api_get() {
      curl -k \
        -H "Authorization: Bearer $TOKEN" \
        -H 'Accept: application/json' \
        "https://${ENDPOINT}$1" 2>/dev/null
    }

    # Print "<type>_<status>" for the node condition whose type is $1.
    # The condition is looked up by its .type instead of by list position, so the
    # result stays correct when the API adds, removes or reorders conditions.
    # When no status can be read (cache missing or empty, condition absent) the
    # healthy value $2 is reported, so a missing cache alone never raises an alert.
    report_condition() {
      local status
      status="$(jq -r --arg type "$1" \
                  '.status.conditions[] | select(.type == $type) | .status' \
                  "$WORKSPACE/${NODE_NAME}.status" 2>/dev/null | sed -n 1p)"
      echo "${1}_${status:-$2}"
    }

    # Print the digits found in whitespace-separated field $1 of the cached
    # resources line, or nothing when the file or the field is missing.
    read_resource_field() {
      awk -v col="$1" '{print $col}' "$WORKSPACE/${NODE_NAME}.resources" 2>/dev/null \
        | grep -o '[0-9]*' | sed -n 1p
    }

    # Print the allocation percentage stored in field $1, or 0 when it is
    # missing or not a plain number.
    resource_percent() {
      local value
      value="$(read_resource_field "$1")"
      if [ -z "$value" ]; then
        # The cron job may be rewriting the file right now; retry once.
        sleep 1
        value="$(read_resource_field "$1")"
      fi
      case "$value" in
        ''|*[!0-9]*) echo 0 ;;
        *)           echo "$value" ;;
      esac
    }

    case $Monitoring_type in
       discover)  # low-level discovery of nodes
         # Node names contain no whitespace, so word splitting is safe here.
         Node_Name=($(api_get /api/v1/nodes | jq -r '.items[].metadata.name' 2>/dev/null))
         last=$(( ${#Node_Name[@]} - 1 ))

         printf '{\n'
         printf '\t"data":[\n'
         for ((i = 0; i <= last; i++))
         do
            printf '\t\t{\n'
            if [ "$i" -eq "$last" ]; then
               # The last entry must not be followed by a comma.
               printf '\t\t\t"{#NODE_NAME}":"%s"}\n' "${Node_Name[$i]}"
            else
               printf '\t\t\t"{#NODE_NAME}":"%s"},\n' "${Node_Name[$i]}"
            fi
         done
         printf '\t]\n'
         printf '}\n'
         exit 0
       ;;

       get_status)  # refresh the cached node object used by the condition checks
         api_get "/api/v1/nodes/$NODE_NAME" > "$WORKSPACE/${NODE_NAME}.status"
         if grep -q '"code": 404' "$WORKSPACE/${NODE_NAME}.status"; then
           echo "Node_Status=NotFound"
         elif [ -z "$(cat "$WORKSPACE/${NODE_NAME}.status")" ]; then
           echo "Node_Status=null"
         else
           echo "Success"
         fi
         exit 0
       ;;

       OutOfDisk)  # node has run out of disk space
         report_condition OutOfDisk False
       ;;

       MemoryPressure)  # node is under memory pressure
         report_condition MemoryPressure False
       ;;

       DiskPressure)  # node is under disk pressure
         report_condition DiskPressure False
       ;;

       node_ready)  # node is ready to accept pods
         report_condition Ready True
       ;;

       node_resources)  # allocation percentages cached by the cron job
         case "$3" in
           cpu_requests)    resource_percent 2 ;;
           cpu_limits)      resource_percent 4 ;;
           memory_requests) resource_percent 6 ;;
           memory_limits)   resource_percent 8 ;;
         esac
       ;;
    esac

    crontab -e

    # Every 2 minutes, refresh the per-node .resources files that oc_node_monitor.sh
    # reads for its node_resources checks; all output of the job is discarded.
    */2 * * * * /data/scripts/oc_master_crontab.sh >/dev/null 2>&1

    oc_master_crontab.sh内容

    #!/bin/bash
    # Cache, for every node, the line that `oc describe node` prints immediately
    # before its "Events" section (presumably the allocated-resources totals row;
    # confirm against your oc version) in /data/tmp/oc_monitor/NODE.resources,
    # where oc_node_monitor.sh reads it for the node_resources checks.
    umask 022                       # new cache files are readable by the agent at once
    WORKSPACE="/data/tmp/oc_monitor"
    mkdir -p "$WORKSPACE"           # the redirect below fails if the directory is missing

    # Skip the header by exact match on the first column, so a node whose name
    # merely contains "NAME" is not dropped.
    node_name=($(oc get node | awk '$1 != "NAME" {print $1}'))
    for node in "${node_name[@]}"
    do
      # Write to a temporary file and rename it, so the agent never reads a
      # half-written or truncated cache file.
      tmp="$WORKSPACE/.${node}.resources.tmp"
      oc describe node "$node" | grep -B 1 "Events" | grep -v "Events" > "$tmp"
      mv -f "$tmp" "$WORKSPACE/${node}.resources"
    done

    # One recursive chmod after the loop is enough.
    # NOTE(review): world-writable permissions on all of /data/tmp are kept from the
    # original so the agent user can still write its .status files; tighten this to
    # a group shared with the agent user when possible.
    chmod -R 777 /data/tmp/
  • 相关阅读:
    剑指offer:二进制中1的个数
    剑指offer:反转链表
    剑指offer:链表中倒数第k个结点
    剑指offer:调整数组顺序使奇数位于偶数前面
    剑指offer:矩形覆盖
    剑指offer:变态跳台阶
    剑指offer :跳台阶
    剑指offer:旋转数组的最小数字
    剑指offer:用两个栈实现队列
    剑指offer:重建二叉树
  • 原文地址:https://www.cnblogs.com/37yan/p/10444009.html
Copyright © 2020-2023  润新知