Zabbix 监控 GPU
安装监控工具
# Install the package non-interactively (-y), skipping GPG signature checks (--nogpgcheck).
# The flags must be on the same line as the command.
yum install munin --nogpgcheck -y
1.编写获取gpu参数的脚本
# vim /usr/local/zabbix_agents/scripts/get_gpus_info.sh
#!/bin/bash
#
# Zabbix low-level discovery helper for NVIDIA GPUs.
#
# Parses the output of `nvidia-smi -L`, whose per-GPU lines look like
#   GPU 0: <product name> (UUID: GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
# and prints a discovery document of the form
#   {
#   "data":[
#   {"{#GPUINDEX}":"0","{#GPUUUID}":"GPU-..."},
#   {"{#GPUINDEX}":"1","{#GPUUUID}":"GPU-..."}
#   ]
#   }
# Takes no arguments. Prints an empty "data" array when no GPU line is found.

# sed uses basic regular expressions here: \( \) are capture groups, while the
# bare ( ) around "UUID: ..." match the literal parentheses in the output.
# -n together with the trailing p prints only lines that matched, so any line
# that is not "GPU <n>: ... (UUID: ...)" never reaches the JSON output.
# stderr is discarded to keep diagnostics out of the JSON.
entries=$(/usr/bin/nvidia-smi -L 2>/dev/null |
    sed -n 's/^GPU \([0-9][0-9]*\):.*(UUID: \(.*\))$/{"{#GPUINDEX}":"\1","{#GPUUUID}":"\2"}/p')

echo "{"
echo "\"data\":["

first=1
while IFS= read -r entry
do
    # An empty here-string still yields one blank line; skip it.
    [ -n "$entry" ] || continue
    if [ "$first" = "1" ]; then
        first=0
    else
        # The separator is written before every entry except the first,
        # so the array never ends with a trailing comma.
        echo ","
    fi
    echo -n "$entry"
done <<< "$entries"

# Terminate the last entry's line only when at least one entry was printed.
[ "$first" = "1" ] || echo
echo "]"
echo "}"
2.添加监控项
# Zabbix agent user parameters for NVIDIA GPU monitoring.
# Each UserParameter directive must be on its own line.
# For the flexible keys (key[*]), $1 is the first key parameter; the template
# passes the GPU index there, e.g. gpu.temp[0].

# Number of GPUs listed by nvidia-smi.
UserParameter=gpu.number,/usr/bin/nvidia-smi -L | /usr/bin/wc -l
# Low-level discovery of GPUs.
# NOTE(review): this path must point at the script saved in step 1; step 1 shows
# /usr/local/zabbix_agents/scripts/get_gpus_info.sh - confirm which directory is correct.
UserParameter=gpu.discovery,/usr/local/zabbix_agents_3.2.0/scripts/get_gpus_info.sh
# Per-GPU metrics, one nvidia-smi query each; tr strips spaces from the value.
UserParameter=gpu.fanspeed[*],/usr/bin/nvidia-smi --query-gpu=fan.speed --format=csv,noheader,nounits -i $1 | tr -d " "
UserParameter=gpu.power[*],/usr/bin/nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i $1 | tr -d " "
UserParameter=gpu.temp[*],/usr/bin/nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits -i $1 | tr -d " "
UserParameter=gpu.utilization[*],/usr/bin/nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i $1 | tr -d " "
UserParameter=gpu.memfree[*],/usr/bin/nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i $1 | tr -d " "
UserParameter=gpu.memused[*],/usr/bin/nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $1 | tr -d " "
UserParameter=gpu.memtotal[*],/usr/bin/nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i $1 | tr -d " "
3.添加模板
zbx_nvidia-smi-multi-gpu.xml
<?xml version="1.0" encoding="UTF-8"?> <!-- Zabbix export (format version 3.0) defining the template "Template Nvidia GPUs Performance": one static item (gpu.number) and one discovery rule (gpu.discovery) carrying per-GPU item, trigger and graph prototypes keyed by {#GPUINDEX}. Every key used here must be supplied by the monitored host, e.g. through agent UserParameter entries with the same key names. --><zabbix_export> <version>3.0</version> <date>2018-06-05T20:56:12Z</date> <groups> <group> <name>Templates</name> </group> </groups> <templates> <template> <template>Template Nvidia GPUs Performance</template> <name>Template Nvidia GPUs Performance</name> <description/> <groups> <group> <name>Templates</name> </group> </groups> <applications> <application> <name>Nvidia</name> </application> </applications> <!-- Static item: gpu.number with delay 30, history 90, trends 365, stored as value_type 0. --><items> <item> <name>Number of GPUs</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>gpu.number</key> <delay>30</delay> <history>90</history> <trends>365</trends> <status>0</status> <value_type>0</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description>The number of GPUs present on this system.</description> <inventory_link>0</inventory_link> <applications> <application> <name>Nvidia</name> </application> </applications> <valuemap/> <logtimefmt/> </item> </items> <!-- Discovery rule gpu.discovery: delay 600, lifetime 30, no filter conditions. --><discovery_rules> <discovery_rule> <name>GPU discovery</name> <type>0</type> <snmp_community/> <snmp_oid/> <key>gpu.discovery</key> <delay>600</delay> <status>0</status> <allowed_hosts/> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <delay_flex/> <params/> <ipmi_sensor/> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <filter> <evaltype>0</evaltype> <formula/> <conditions/> </filter> <lifetime>30</lifetime> 
<description>Discovery of graphics cards.</description> <!-- Item prototypes, instantiated once per discovered {#GPUINDEX}: fanspeed, memfree, memtotal, memused, power, temp, utilization. All use delay 60, history 7, trends 365. --><item_prototypes> <item_prototype> <name>GPU $1 Fan Speed</name> <type>0</type> <snmp_community/> <multiplier>1</multiplier> <snmp_oid/> <key>gpu.fanspeed[{#GPUINDEX}]</key> <delay>60</delay> <history>7</history> <trends>365</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units>%</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>Nvidia</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>GPU $1 Memory Free</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>gpu.memfree[{#GPUINDEX}]</key> <delay>60</delay> <history>7</history> <trends>365</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units>MB</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>Nvidia</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>GPU $1 Memory Total</name> <type>0</type> <snmp_community/> 
<multiplier>0</multiplier> <snmp_oid/> <key>gpu.memtotal[{#GPUINDEX}]</key> <delay>60</delay> <history>7</history> <trends>365</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units>MB</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>Nvidia</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>GPU $1 Memory Used</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>gpu.memused[{#GPUINDEX}]</key> <delay>60</delay> <history>7</history> <trends>365</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units>MB</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>Nvidia</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <!-- Power prototype: multiplier 1 with formula 0.1 and units dW. NOTE(review): presumably this scales the raw reading by 0.1 so that it fits the same graph as temperature and fan speed; confirm. --><item_prototype> <name>GPU $1 Power in decaWatts</name> <type>0</type> <snmp_community/> <multiplier>1</multiplier> <snmp_oid/> <key>gpu.power[{#GPUINDEX}]</key> <delay>60</delay> <history>7</history> <trends>365</trends> <status>0</status> 
<value_type>0</value_type> <allowed_hosts/> <units>dW</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>0.1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>Nvidia</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>GPU $1 Temperature</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>gpu.temp[{#GPUINDEX}]</key> <delay>60</delay> <history>7</history> <trends>365</trends> <status>0</status> <value_type>0</value_type> <allowed_hosts/> <units>C</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>Nvidia</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>GPU $1 Utilization</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>gpu.utilization[{#GPUINDEX}]</key> <delay>60</delay> <history>7</history> <trends>365</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units>%</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> 
<snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>Nvidia</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> </item_prototypes> <!-- Temperature trigger prototypes on gpu.temp last(): above 80 (priority 5), above 70 (priority 2), above 75 (priority 4). The above-70 trigger depends on the above-75 trigger, which in turn depends on the above-80 trigger. --><trigger_prototypes> <trigger_prototype> <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>80</expression> <name>GPU {#GPUINDEX} Temperature is extremely high</name> <url/> <status>0</status> <priority>5</priority> <description>A GPU's temperature is getting extremely high!</description> <type>0</type> <dependencies/> </trigger_prototype> <trigger_prototype> <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>70</expression> <name>GPU {#GPUINDEX} Temperature is high</name> <url/> <status>0</status> <priority>2</priority> <description>A GPU's temperature is getting high!</description> <type>0</type> <dependencies> <dependency> <name>GPU {#GPUINDEX} Temperature is very high</name> <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>75</expression> </dependency> </dependencies> </trigger_prototype> <trigger_prototype> <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>75</expression> <name>GPU {#GPUINDEX} Temperature is very high</name> <url/> <status>0</status> <priority>4</priority> <description>A GPU's temperature is getting very high!</description> <type>0</type> <dependencies> <dependency> <name>GPU {#GPUINDEX} Temperature is extremely high</name> <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>80</expression> </dependency> </dependencies> </trigger_prototype> </trigger_prototypes> <!-- Graph prototypes per GPU: "Memory" (memfree, memused); "Temperature, Fan Speed and Power" (power, fanspeed, temp); "Utilization" (utilization). --><graph_prototypes> <graph_prototype> 
<name>GPU {#GPUINDEX} Memory</name> <width>900</width> <height>200</height> <yaxismin>0.0000</yaxismin> <yaxismax>100.0000</yaxismax> <show_work_period>1</show_work_period> <show_triggers>1</show_triggers> <type>0</type> <show_legend>1</show_legend> <show_3d>0</show_3d> <percent_left>0.0000</percent_left> <percent_right>0.0000</percent_right> <ymin_type_1>0</ymin_type_1> <ymax_type_1>0</ymax_type_1> <ymin_item_1>0</ymin_item_1> <ymax_item_1>0</ymax_item_1> <graph_items> <graph_item> <sortorder>0</sortorder> <drawtype>0</drawtype> <color>00AA00</color> <yaxisside>0</yaxisside> <calc_fnc>2</calc_fnc> <type>0</type> <item> <host>Template Nvidia GPUs Performance</host> <key>gpu.memfree[{#GPUINDEX}]</key> </item> </graph_item> <graph_item> <sortorder>1</sortorder> <drawtype>0</drawtype> <color>0000DD</color> <yaxisside>0</yaxisside> <calc_fnc>2</calc_fnc> <type>0</type> <item> <host>Template Nvidia GPUs Performance</host> <key>gpu.memused[{#GPUINDEX}]</key> </item> </graph_item> </graph_items> </graph_prototype> <graph_prototype> <name>GPU {#GPUINDEX} Temperature, Fan Speed and Power</name> <width>900</width> <height>200</height> <yaxismin>0.0000</yaxismin> <yaxismax>100.0000</yaxismax> <show_work_period>1</show_work_period> <show_triggers>1</show_triggers> <type>0</type> <show_legend>1</show_legend> <show_3d>0</show_3d> <percent_left>0.0000</percent_left> <percent_right>0.0000</percent_right> <ymin_type_1>0</ymin_type_1> <ymax_type_1>0</ymax_type_1> <ymin_item_1>0</ymin_item_1> <ymax_item_1>0</ymax_item_1> <graph_items> <graph_item> <sortorder>0</sortorder> <drawtype>0</drawtype> <color>1A7C11</color> <yaxisside>0</yaxisside> <calc_fnc>2</calc_fnc> <type>0</type> <item> <host>Template Nvidia GPUs Performance</host> <key>gpu.power[{#GPUINDEX}]</key> </item> </graph_item> <graph_item> <sortorder>1</sortorder> <drawtype>0</drawtype> <color>2774A4</color> <yaxisside>0</yaxisside> <calc_fnc>2</calc_fnc> <type>0</type> <item> <host>Template Nvidia GPUs Performance</host> 
<key>gpu.fanspeed[{#GPUINDEX}]</key> </item> </graph_item> <graph_item> <sortorder>2</sortorder> <drawtype>0</drawtype> <color>F63100</color> <yaxisside>0</yaxisside> <calc_fnc>2</calc_fnc> <type>0</type> <item> <host>Template Nvidia GPUs Performance</host> <key>gpu.temp[{#GPUINDEX}]</key> </item> </graph_item> </graph_items> </graph_prototype> <graph_prototype> <name>GPU {#GPUINDEX} Utilization</name> <width>900</width> <height>200</height> <yaxismin>0.0000</yaxismin> <yaxismax>100.0000</yaxismax> <show_work_period>1</show_work_period> <show_triggers>1</show_triggers> <type>0</type> <show_legend>1</show_legend> <show_3d>0</show_3d> <percent_left>0.0000</percent_left> <percent_right>0.0000</percent_right> <ymin_type_1>0</ymin_type_1> <ymax_type_1>0</ymax_type_1> <ymin_item_1>0</ymin_item_1> <ymax_item_1>0</ymax_item_1> <graph_items> <graph_item> <sortorder>0</sortorder> <drawtype>0</drawtype> <color>2774A4</color> <yaxisside>0</yaxisside> <calc_fnc>2</calc_fnc> <type>0</type> <item> <host>Template Nvidia GPUs Performance</host> <key>gpu.utilization[{#GPUINDEX}]</key> </item> </graph_item> </graph_items> </graph_prototype> </graph_prototypes> <host_prototypes/> </discovery_rule> </discovery_rules> <macros/> <templates/> <screens/> </template> </templates> </zabbix_export>
参考:https://github.com/zhangyuteng/zabbix-nvidia-smi-multi-gpu