• zabbix4.0监控gpu


    zabbix监控gpu

    安装监控工具

    yum install munin --nogpgcheck -y


    1.编写获取 GPU 参数的脚本(保存后执行 chmod +x /usr/local/zabbix_agents/scripts/get_gpus_info.sh 赋予可执行权限)

    # vim /usr/local/zabbix_agents/scripts/get_gpus_info.sh

    #!/bin/bash
    #
    # Zabbix low-level discovery (LLD) helper for NVIDIA GPUs.
    # Prints a single JSON document of the form
    #   {"data":[{"{#GPUINDEX}":"<index>","{#GPUUUID}":"<uuid>"},...]}
    # built from the output of `nvidia-smi -L`.

    # The sed expression expects lines shaped like "GPU <index>: ... (UUID: <uuid>)".
    # \( \) are basic-regex capture groups; \1 and \2 insert the captured index and
    # UUID into the JSON object. With -n and the trailing p flag only lines that
    # matched are printed, so unexpected output never leaks into the JSON.
    # paste joins the per-GPU objects with commas.
    entries=$(/usr/bin/nvidia-smi -L 2>/dev/null \
      | sed -n 's/^GPU \([0-9]*\):.*(UUID: \(.*\))$/{"{#GPUINDEX}":"\1","{#GPUUUID}":"\2"}/p' \
      | paste -sd, -)

    # When no GPU line matched, this still emits valid JSON: {"data":[]}
    printf '{"data":[%s]}\n' "$entries"

    2.添加监控项(将以下 UserParameter 写入 zabbix_agentd.conf,每条定义必须写在同一行,修改后重启 zabbix-agent)

    # Number of GPUs: one line of `nvidia-smi -L` output per GPU.
    UserParameter=gpu.number,/usr/bin/nvidia-smi -L | /usr/bin/wc -l
    # Low-level discovery JSON; the path must match where get_gpus_info.sh was saved in step 1.
    UserParameter=gpu.discovery,/usr/local/zabbix_agents/scripts/get_gpus_info.sh
    # Per-GPU metrics. $1 is the first key parameter (the GPU index passed to -i).
    # Each definition must stay on a single line; tr -d "\n" strips the trailing newline.
    UserParameter=gpu.fanspeed[*],/usr/bin/nvidia-smi --query-gpu=fan.speed --format=csv,noheader,nounits -i $1 | tr -d "\n"
    UserParameter=gpu.power[*],/usr/bin/nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i $1 | tr -d "\n"
    UserParameter=gpu.temp[*],/usr/bin/nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits -i $1 | tr -d "\n"
    UserParameter=gpu.utilization[*],/usr/bin/nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i $1 | tr -d "\n"
    UserParameter=gpu.memfree[*],/usr/bin/nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i $1 | tr -d "\n"
    UserParameter=gpu.memused[*],/usr/bin/nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $1 | tr -d "\n"
    UserParameter=gpu.memtotal[*],/usr/bin/nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i $1 | tr -d "\n"

    3.添加模板(将下面的 XML 保存为以下文件,然后在 Zabbix 前端"配置 → 模板 → 导入"中导入)

    zbx_nvidia-smi-multi-gpu.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <zabbix_export>
        <version>3.0</version>
        <date>2018-06-05T20:56:12Z</date>
        <groups>
            <group>
                <name>Templates</name>
            </group>
        </groups>
        <templates>
            <template>
                <template>Template Nvidia GPUs Performance</template>
                <name>Template Nvidia GPUs Performance</name>
                <description/>
                <groups>
                    <group>
                        <name>Templates</name>
                    </group>
                </groups>
                <applications>
                    <application>
                        <name>Nvidia</name>
                    </application>
                </applications>
                <items>
                    <!-- Host-level item "Number of GPUs", read from the agent key gpu.number.
                         delay 30, history 90, trends 365; value_type 0 is numeric (float)
                         in the Zabbix 3.0 export format. Assigned to the "Nvidia" application. -->
                    <item>
                        <name>Number of GPUs</name>
                        <type>0</type>
                        <snmp_community/>
                        <multiplier>0</multiplier>
                        <snmp_oid/>
                        <key>gpu.number</key>
                        <delay>30</delay>
                        <history>90</history>
                        <trends>365</trends>
                        <status>0</status>
                        <value_type>0</value_type>
                        <allowed_hosts/>
                        <units/>
                        <delta>0</delta>
                        <snmpv3_contextname/>
                        <snmpv3_securityname/>
                        <snmpv3_securitylevel>0</snmpv3_securitylevel>
                        <snmpv3_authprotocol>0</snmpv3_authprotocol>
                        <snmpv3_authpassphrase/>
                        <snmpv3_privprotocol>0</snmpv3_privprotocol>
                        <snmpv3_privpassphrase/>
                        <formula>1</formula>
                        <delay_flex/>
                        <params/>
                        <ipmi_sensor/>
                        <data_type>0</data_type>
                        <authtype>0</authtype>
                        <username/>
                        <password/>
                        <publickey/>
                        <privatekey/>
                        <port/>
                        <description>The number of GPUs present on this system.</description>
                        <inventory_link>0</inventory_link>
                        <applications>
                            <application>
                                <name>Nvidia</name>
                            </application>
                        </applications>
                        <valuemap/>
                        <logtimefmt/>
                    </item>
                </items>
                <discovery_rules>
                    <discovery_rule>
                        <name>GPU discovery</name>
                        <type>0</type>
                        <snmp_community/>
                        <snmp_oid/>
                        <key>gpu.discovery</key>
                        <delay>600</delay>
                        <status>0</status>
                        <allowed_hosts/>
                        <snmpv3_contextname/>
                        <snmpv3_securityname/>
                        <snmpv3_securitylevel>0</snmpv3_securitylevel>
                        <snmpv3_authprotocol>0</snmpv3_authprotocol>
                        <snmpv3_authpassphrase/>
                        <snmpv3_privprotocol>0</snmpv3_privprotocol>
                        <snmpv3_privpassphrase/>
                        <delay_flex/>
                        <params/>
                        <ipmi_sensor/>
                        <authtype>0</authtype>
                        <username/>
                        <password/>
                        <publickey/>
                        <privatekey/>
                        <port/>
                        <filter>
                            <evaltype>0</evaltype>
                            <formula/>
                            <conditions/>
                        </filter>
                        <lifetime>30</lifetime>
                        <description>Discovery of graphics cards.</description>
                        <item_prototypes>
                            <!-- Item prototype: fan speed of one discovered GPU, key gpu.fanspeed[{#GPUINDEX}].
                                 $1 in the name expands to the first key parameter (the GPU index).
                                 delay 60, history 7, trends 365; value_type 3 (numeric unsigned), unit %.
                                 The multiplier is enabled with formula 1, so the value is stored unchanged. -->
                            <item_prototype>
                                <name>GPU $1 Fan Speed</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>1</multiplier>
                                <snmp_oid/>
                                <key>gpu.fanspeed[{#GPUINDEX}]</key>
                                <delay>60</delay>
                                <history>7</history>
                                <trends>365</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>%</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>Nvidia</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <!-- Item prototype: free memory of one discovered GPU, key gpu.memfree[{#GPUINDEX}].
                                 delay 60, history 7, trends 365; value_type 3 (numeric unsigned), unit MB,
                                 no multiplier applied. -->
                            <item_prototype>
                                <name>GPU $1 Memory Free</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>gpu.memfree[{#GPUINDEX}]</key>
                                <delay>60</delay>
                                <history>7</history>
                                <trends>365</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>MB</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>Nvidia</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <!-- Item prototype: total memory of one discovered GPU, key gpu.memtotal[{#GPUINDEX}].
                                 delay 60, history 7, trends 365; value_type 3 (numeric unsigned), unit MB,
                                 no multiplier applied. -->
                            <item_prototype>
                                <name>GPU $1 Memory Total</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>gpu.memtotal[{#GPUINDEX}]</key>
                                <delay>60</delay>
                                <history>7</history>
                                <trends>365</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>MB</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>Nvidia</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <!-- Item prototype: used memory of one discovered GPU, key gpu.memused[{#GPUINDEX}].
                                 delay 60, history 7, trends 365; value_type 3 (numeric unsigned), unit MB,
                                 no multiplier applied. -->
                            <item_prototype>
                                <name>GPU $1 Memory Used</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>gpu.memused[{#GPUINDEX}]</key>
                                <delay>60</delay>
                                <history>7</history>
                                <trends>365</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>MB</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>Nvidia</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <!-- Item prototype: power draw of one discovered GPU, key gpu.power[{#GPUINDEX}].
                                 The multiplier is enabled with formula 0.1, so the collected value is
                                 scaled by one tenth before storage (watts to decawatts, per the item name).
                                 Unit is "daW", the SI symbol for decawatt ("dW" would mean deciwatt).
                                 delay 60, history 7, trends 365; value_type 0 (numeric float). -->
                            <item_prototype>
                                <name>GPU $1 Power in decaWatts</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>1</multiplier>
                                <snmp_oid/>
                                <key>gpu.power[{#GPUINDEX}]</key>
                                <delay>60</delay>
                                <history>7</history>
                                <trends>365</trends>
                                <status>0</status>
                                <value_type>0</value_type>
                                <allowed_hosts/>
                                <units>daW</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>0.1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>Nvidia</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <!-- Item prototype: temperature of one discovered GPU, key gpu.temp[{#GPUINDEX}].
                                 delay 60, history 7, trends 365; value_type 0 (numeric float), unit C,
                                 no multiplier applied. The trigger prototypes of this discovery rule
                                 evaluate last() of this key. -->
                            <item_prototype>
                                <name>GPU $1 Temperature</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>gpu.temp[{#GPUINDEX}]</key>
                                <delay>60</delay>
                                <history>7</history>
                                <trends>365</trends>
                                <status>0</status>
                                <value_type>0</value_type>
                                <allowed_hosts/>
                                <units>C</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>Nvidia</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                            <!-- Item prototype: utilization of one discovered GPU, key gpu.utilization[{#GPUINDEX}].
                                 delay 60, history 7, trends 365; value_type 3 (numeric unsigned), unit %,
                                 no multiplier applied. -->
                            <item_prototype>
                                <name>GPU $1 Utilization</name>
                                <type>0</type>
                                <snmp_community/>
                                <multiplier>0</multiplier>
                                <snmp_oid/>
                                <key>gpu.utilization[{#GPUINDEX}]</key>
                                <delay>60</delay>
                                <history>7</history>
                                <trends>365</trends>
                                <status>0</status>
                                <value_type>3</value_type>
                                <allowed_hosts/>
                                <units>%</units>
                                <delta>0</delta>
                                <snmpv3_contextname/>
                                <snmpv3_securityname/>
                                <snmpv3_securitylevel>0</snmpv3_securitylevel>
                                <snmpv3_authprotocol>0</snmpv3_authprotocol>
                                <snmpv3_authpassphrase/>
                                <snmpv3_privprotocol>0</snmpv3_privprotocol>
                                <snmpv3_privpassphrase/>
                                <formula>1</formula>
                                <delay_flex/>
                                <params/>
                                <ipmi_sensor/>
                                <data_type>0</data_type>
                                <authtype>0</authtype>
                                <username/>
                                <password/>
                                <publickey/>
                                <privatekey/>
                                <port/>
                                <description/>
                                <inventory_link>0</inventory_link>
                                <applications>
                                    <application>
                                        <name>Nvidia</name>
                                    </application>
                                </applications>
                                <valuemap/>
                                <logtimefmt/>
                                <application_prototypes/>
                            </item_prototype>
                        </item_prototypes>
                        <!-- Three temperature trigger prototypes on gpu.temp[{#GPUINDEX}].last():
                               > 80  "extremely high" (priority 5)
                               > 75  "very high"      (priority 4), depends on "extremely high"
                               > 70  "high"           (priority 2), depends on "very high"
                             Each dependency names the next-higher trigger, identified by its name
                             and expression, so the two must stay in sync with that trigger. -->
                        <trigger_prototypes>
                            <trigger_prototype>
                                <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;80</expression>
                                <name>GPU {#GPUINDEX} Temperature is extremely high</name>
                                <url/>
                                <status>0</status>
                                <priority>5</priority>
                                <description>A GPU's temperature is getting extremely high!</description>
                                <type>0</type>
                                <dependencies/>
                            </trigger_prototype>
                            <trigger_prototype>
                                <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;70</expression>
                                <name>GPU {#GPUINDEX} Temperature is high</name>
                                <url/>
                                <status>0</status>
                                <priority>2</priority>
                                <description>A GPU's temperature is getting high!</description>
                                <type>0</type>
                                <dependencies>
                                    <dependency>
                                        <name>GPU {#GPUINDEX} Temperature is very high</name>
                                        <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;75</expression>
                                    </dependency>
                                </dependencies>
                            </trigger_prototype>
                            <trigger_prototype>
                                <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;75</expression>
                                <name>GPU {#GPUINDEX} Temperature is very high</name>
                                <url/>
                                <status>0</status>
                                <priority>4</priority>
                                <description>A GPU's temperature is getting very high!</description>
                                <type>0</type>
                                <dependencies>
                                    <dependency>
                                        <name>GPU {#GPUINDEX} Temperature is extremely high</name>
                                        <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;80</expression>
                                    </dependency>
                                </dependencies>
                            </trigger_prototype>
                        </trigger_prototypes>
                        <graph_prototypes>
                            <!-- Graph prototype (900x200) per discovered GPU, plotting two items of this
                                 template on the same axis side: gpu.memfree (color 00AA00) and
                                 gpu.memused (color 0000DD). -->
                            <graph_prototype>
                                <name>GPU {#GPUINDEX} Memory</name>
                                <width>900</width>
                                <height>200</height>
                                <yaxismin>0.0000</yaxismin>
                                <yaxismax>100.0000</yaxismax>
                                <show_work_period>1</show_work_period>
                                <show_triggers>1</show_triggers>
                                <type>0</type>
                                <show_legend>1</show_legend>
                                <show_3d>0</show_3d>
                                <percent_left>0.0000</percent_left>
                                <percent_right>0.0000</percent_right>
                                <ymin_type_1>0</ymin_type_1>
                                <ymax_type_1>0</ymax_type_1>
                                <ymin_item_1>0</ymin_item_1>
                                <ymax_item_1>0</ymax_item_1>
                                <graph_items>
                                    <graph_item>
                                        <sortorder>0</sortorder>
                                        <drawtype>0</drawtype>
                                        <color>00AA00</color>
                                        <yaxisside>0</yaxisside>
                                        <calc_fnc>2</calc_fnc>
                                        <type>0</type>
                                        <item>
                                            <host>Template Nvidia GPUs Performance</host>
                                            <key>gpu.memfree[{#GPUINDEX}]</key>
                                        </item>
                                    </graph_item>
                                    <graph_item>
                                        <sortorder>1</sortorder>
                                        <drawtype>0</drawtype>
                                        <color>0000DD</color>
                                        <yaxisside>0</yaxisside>
                                        <calc_fnc>2</calc_fnc>
                                        <type>0</type>
                                        <item>
                                            <host>Template Nvidia GPUs Performance</host>
                                            <key>gpu.memused[{#GPUINDEX}]</key>
                                        </item>
                                    </graph_item>
                                </graph_items>
                            </graph_prototype>
                            <!-- Graph prototype (900x200) per discovered GPU, combining three items of this
                                 template on the same axis side: gpu.power (1A7C11), gpu.fanspeed (2774A4)
                                 and gpu.temp (F63100). show_triggers is 1 for this graph. -->
                            <graph_prototype>
                                <name>GPU {#GPUINDEX} Temperature, Fan Speed and Power</name>
                                <width>900</width>
                                <height>200</height>
                                <yaxismin>0.0000</yaxismin>
                                <yaxismax>100.0000</yaxismax>
                                <show_work_period>1</show_work_period>
                                <show_triggers>1</show_triggers>
                                <type>0</type>
                                <show_legend>1</show_legend>
                                <show_3d>0</show_3d>
                                <percent_left>0.0000</percent_left>
                                <percent_right>0.0000</percent_right>
                                <ymin_type_1>0</ymin_type_1>
                                <ymax_type_1>0</ymax_type_1>
                                <ymin_item_1>0</ymin_item_1>
                                <ymax_item_1>0</ymax_item_1>
                                <graph_items>
                                    <graph_item>
                                        <sortorder>0</sortorder>
                                        <drawtype>0</drawtype>
                                        <color>1A7C11</color>
                                        <yaxisside>0</yaxisside>
                                        <calc_fnc>2</calc_fnc>
                                        <type>0</type>
                                        <item>
                                            <host>Template Nvidia GPUs Performance</host>
                                            <key>gpu.power[{#GPUINDEX}]</key>
                                        </item>
                                    </graph_item>
                                    <graph_item>
                                        <sortorder>1</sortorder>
                                        <drawtype>0</drawtype>
                                        <color>2774A4</color>
                                        <yaxisside>0</yaxisside>
                                        <calc_fnc>2</calc_fnc>
                                        <type>0</type>
                                        <item>
                                            <host>Template Nvidia GPUs Performance</host>
                                            <key>gpu.fanspeed[{#GPUINDEX}]</key>
                                        </item>
                                    </graph_item>
                                    <graph_item>
                                        <sortorder>2</sortorder>
                                        <drawtype>0</drawtype>
                                        <color>F63100</color>
                                        <yaxisside>0</yaxisside>
                                        <calc_fnc>2</calc_fnc>
                                        <type>0</type>
                                        <item>
                                            <host>Template Nvidia GPUs Performance</host>
                                            <key>gpu.temp[{#GPUINDEX}]</key>
                                        </item>
                                    </graph_item>
                                </graph_items>
                            </graph_prototype>
                            <!-- Graph prototype (900x200) per discovered GPU with a single series:
                                 gpu.utilization[{#GPUINDEX}] (color 2774A4). -->
                            <graph_prototype>
                                <name>GPU {#GPUINDEX} Utilization</name>
                                <width>900</width>
                                <height>200</height>
                                <yaxismin>0.0000</yaxismin>
                                <yaxismax>100.0000</yaxismax>
                                <show_work_period>1</show_work_period>
                                <show_triggers>1</show_triggers>
                                <type>0</type>
                                <show_legend>1</show_legend>
                                <show_3d>0</show_3d>
                                <percent_left>0.0000</percent_left>
                                <percent_right>0.0000</percent_right>
                                <ymin_type_1>0</ymin_type_1>
                                <ymax_type_1>0</ymax_type_1>
                                <ymin_item_1>0</ymin_item_1>
                                <ymax_item_1>0</ymax_item_1>
                                <graph_items>
                                    <graph_item>
                                        <sortorder>0</sortorder>
                                        <drawtype>0</drawtype>
                                        <color>2774A4</color>
                                        <yaxisside>0</yaxisside>
                                        <calc_fnc>2</calc_fnc>
                                        <type>0</type>
                                        <item>
                                            <host>Template Nvidia GPUs Performance</host>
                                            <key>gpu.utilization[{#GPUINDEX}]</key>
                                        </item>
                                    </graph_item>
                                </graph_items>
                            </graph_prototype>
                        </graph_prototypes>
                        <host_prototypes/>
                    </discovery_rule>
                </discovery_rules>
                <macros/>
                <templates/>
                <screens/>
            </template>
        </templates>
    </zabbix_export>

    参考:https://github.com/zhangyuteng/zabbix-nvidia-smi-multi-gpu

  • 相关阅读:
    Blender基础操作
    反汇编及linux下edb的下载
    混淆矩阵(confusion_matrix)含义
    Python大数据第三次的作业
    Python的DataFrame基础使用
    Python数据标准化
    爬虫之xpath
    luffy项目上线
    爬虫之selenium
    celery
  • 原文地址:https://www.cnblogs.com/reblue520/p/14662988.html
Copyright © 2020-2023  润新知