https://prometheus.io/docs/introduction/overview
https://docs.timescale.com/timescaledb/latest/tutorials/promscale
https://hanlingsha.cn/2021/03/31/%E4%BD%BF%E7%94%A8Prometheus%E6%89%B9%E9%87%8F%E9%83%A8%E7%BD%B2%E7%9B%91%E6%8E%A7%E4%B8%BB%E6%9C%BA/
1. Bridge
docker network create --driver bridge promscale-timescaledb
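Optionally confirm the bridge exists before attaching containers to it:
docker network inspect promscale-timescaledb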
2. TimescaleDB
docker run --name timescaledb -d \
--network promscale-timescaledb \
-p 5432:5432 \
-e POSTGRES_PASSWORD=abcd1234 \
192.168.100.198:5000/timescale/timescaledb-ha:pg13-latest
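As a quick sanity check (assuming the default postgres superuser and the password set above), confirm from psql that the timescaledb extension is available in the container:
docker exec -it timescaledb psql -U postgres -c "SELECT name, default_version FROM pg_available_extensions WHERE name = 'timescaledb';"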
3. Promscale
docker run --name promscale -d \
--network promscale-timescaledb \
-p 9201:9201 \
192.168.100.198:5000/timescale/promscale:0.6.2 \
-db-uri postgres://postgres:abcd1234@timescaledb:5432/postgres?sslmode=allow
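On first start Promscale runs its migrations against TimescaleDB and creates its schemas (e.g. _prom_catalog, prom_data); a rough check is to look at the container log and list the schemas from psql:
docker logs promscale | tail -n 20
docker exec -it timescaledb psql -U postgres -c "\dn"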
4. node_exporter
docker run --name node_exporter -d \
--network promscale-timescaledb \
-p 9100:9100 \
192.168.100.198:5000/prom/node-exporter:v1.3.0
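node_exporter should now answer on port 9100; scrape it once by hand to confirm:
curl -s http://192.168.100.181:9100/metrics | head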
5. Prometheus
mkdir -p /etc/prometheus/{targets,rules}
cat > /etc/prometheus/prometheus.yml << EOF
# Prometheus global configuration
global:
  scrape_interval: 15s     # how often to scrape targets, default 1m
  evaluation_interval: 15s # how often to re-evaluate rule files, default 1m
  scrape_timeout: 15s      # per-scrape timeout, default 10s
  # external_labels:       # extra labels attached to every scraped sample before it is stored
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["192.168.100.181:9093"] # address and port Alertmanager listens on
# rule files, loaded at startup and reloaded every evaluation_interval
rule_files:
  - "/etc/prometheus/rules/*.yml"
# scrape configuration
scrape_configs:
  - job_name: file_sd_configs
    file_sd_configs:
      - files:
          - /etc/prometheus/targets/*.json
remote_write:
  - url: http://192.168.100.181:9201/write
remote_read:
  - url: http://192.168.100.181:9201/read
EOF
cat > /etc/prometheus/targets/vm-181.json << EOF
[{"targets": ["192.168.100.181:9100"], "labels": {"hostname": "vm-181", "service": "node"}}]
EOF
cat > /etc/prometheus/targets/prometheus.json << EOF
[{"targets": ["192.168.100.181:9100"], "labels": {"hostname": "vm-181", "service": "prometheus"}}]
EOF
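Before starting the container it can be worth validating the configuration with promtool, which ships in the Prometheus image (this assumes it sits at /bin/promtool, as in the official image):
docker run --rm -v /etc/prometheus:/etc/prometheus \
  --entrypoint /bin/promtool \
  192.168.100.198:5000/prom/prometheus:v2.31.1 \
  check config /etc/prometheus/prometheus.yml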
docker run --name prometheus -d \
--network promscale-timescaledb \
-p 9090:9090 \
-v /etc/prometheus:/etc/prometheus \
192.168.100.198:5000/prom/prometheus:v2.31.1
http://192.168.100.181:9090/targets
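Besides the targets page, the HTTP API can confirm that samples are being scraped (and, via remote_write, forwarded to Promscale):
curl -s 'http://192.168.100.181:9090/api/v1/query?query=up'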
6. alertmanager
mkdir -p /etc/alertmanager/template
cat > /etc/alertmanager/alertmanager.yml << EOF
# Global configuration
global:
  resolve_timeout: 5m                                     # how long to wait before declaring an alert resolved, default 5m
  smtp_smarthost: 'smtp.sina.com:25'                      # SMTP server used to send mail
  smtp_from: '******@sina.com'                            # sender address
  smtp_auth_username: '******@sina.com'                   # SMTP account name
  smtp_auth_password: '******'                            # SMTP password or authorization code
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'  # WeChat Work API endpoint
# Notification templates
templates:
  - '/etc/alertmanager/template/*.tmpl'
# Routing tree
route:
  group_by: ['alertname'] # labels used to group alerts
  group_wait: 10s         # how long to wait before sending the first notification for a new group
  group_interval: 10s     # how long to wait before notifying about new alerts added to an existing group
  repeat_interval: 1m     # how often to resend a still-firing alert; for email do not set this too low, or the SMTP server may reject mail for being sent too frequently
  receiver: 'email'       # default receiver; must match one of the names under receivers
# Receivers
receivers:
  - name: 'email'
    email_configs:      # email settings
      - to: '******@163.com'                      # recipient address
        html: '{{ template "test.html" . }}'      # email body template
        headers: { Subject: "[WARN] alert mail" } # email subject
    webhook_configs:    # webhook settings
      - url: 'http://127.0.0.1:5001'
        send_resolved: true
    wechat_configs:     # WeChat Work settings
      - send_resolved: true
        to_party: '1'        # id of the party (group) to notify
        agent_id: '1000002'  # WeChat Work -> self-built app -> AgentId
        corp_id: '******'    # My Company -> CorpId (at the bottom of the page)
        api_secret: '******' # WeChat Work -> self-built app -> Secret
        message: '{{ template "test_wechat.html" . }}' # message body template
# An inhibition rule mutes alerts matching the target matchers while an alert matching the source matchers is firing.
# Both alerts must have identical values for the labels listed under "equal".
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
EOF
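The Alertmanager image ships amtool, which can validate the file before the container starts (this assumes it sits at /bin/amtool, as in the official image):
docker run --rm -v /etc/alertmanager:/etc/alertmanager \
  --entrypoint /bin/amtool \
  192.168.100.198:5000/prom/alertmanager:v0.23.0 \
  check-config /etc/alertmanager/alertmanager.yml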
docker run --name alertmanager -d \
--network promscale-timescaledb \
-p 9093:9093 \
-v /etc/alertmanager:/etc/alertmanager \
192.168.100.198:5000/prom/alertmanager:v0.23.0
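Alertmanager exposes a health endpoint; a quick check that the container is serving:
curl -s http://192.168.100.181:9093/-/healthy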
7. pushgateway
docker run --name pushgateway -d \
--network promscale-timescaledb \
-p 9091:9091 \
192.168.100.198:5000/prom/pushgateway:v1.4.2
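Pushgateway accepts metrics over plain HTTP; pushing a throw-away test metric (metric and job names here are just examples) looks like this:
echo "test_metric 3.14" | curl --data-binary @- http://192.168.100.181:9091/metrics/job/test_job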
8. docker-compose
cat > docker-compose.yaml << EOF
version: '3.0'
services:
  db:
    image: timescaledev/timescaledb-ha:pg12-latest
    ports:
      - 5432:5432/tcp
    environment:
      POSTGRES_PASSWORD: password
      POSTGRES_USER: postgres
  prometheus:
    image: prom/prometheus:latest
    ports:
      - 9090:9090/tcp
    volumes:
      - ${PWD}/prometheus.yml:/etc/prometheus/prometheus.yml
  promscale:
    image: timescale/promscale:latest
    ports:
      - 9201:9201/tcp
    restart: on-failure
    depends_on:
      - db
      - prometheus
    environment:
      PROMSCALE_DB_CONNECT_RETRIES: 10
      PROMSCALE_WEB_TELEMETRY_PATH: /metrics-text
      PROMSCALE_DB_URI: postgres://postgres:password@db:5432/postgres?sslmode=allow
  node_exporter:
    image: quay.io/prometheus/node-exporter
    ports:
      - "9100:9100"
EOF
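The compose file mounts ${PWD}/prometheus.yml, which is not shown above; a minimal sketch for this stack, scraping the node_exporter service and writing through the promscale service, could be:
cat > prometheus.yml << EOF
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: node
    static_configs:
      - targets: ["node_exporter:9100"]
remote_write:
  - url: http://promscale:9201/write
remote_read:
  - url: http://promscale:9201/read
EOF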
docker-compose up
9. grafana
mkdir -p /var/lib/grafana && chown -R 472:472 /var/lib/grafana
docker run --name grafana -d \
--network promscale-timescaledb \
-p 3000:3000 \
-v /var/lib/grafana:/var/lib/grafana \
-e "GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource" \
192.168.100.198:5000/grafana/grafana:8.2.5
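After Grafana is up (default credentials admin/admin unless changed), a Prometheus data source can be added in the UI, or via the HTTP API as sketched here:
curl -X POST -H 'Content-Type: application/json' \
  http://admin:admin@192.168.100.181:3000/api/datasources \
  -d '{"name": "Prometheus", "type": "prometheus", "url": "http://prometheus:9090", "access": "proxy"}'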
# node-exporter
https://grafana.com/grafana/dashboards/8919
# kubernetes
https://grafana.com/grafana/dashboards/8685
# ceph
https://grafana.com/grafana/dashboards/2842
https://grafana.com/grafana/dashboards/5342
https://grafana.com/grafana/dashboards/5336
https://grafana.com/grafana/dashboards/9966
10. grafana loki
192.168.100.198:5000/grafana/loki:main-88feda4
11. run args
/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles
/bin/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager
/bin/pushgateway
/bin/node_exporter
12. Alert notifications (email / WeChat Work)
Register a WeChat Work account at https://work.weixin.qq.com, create a self-built application, and obtain the corp ID and the application secret.
cat > /etc/alertmanager/template/email.tmpl << 'EOF'
{{ define "test.html" }}
<table border="1">
<tr>
<td>Alert</td>
<td>Instance</td>
<td>Threshold</td>
<td>Start time</td>
</tr>
{{ range $i, $alert := .Alerts }}
<tr>
<td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ $alert.StartsAt }}</td>
</tr>
{{ end }}
</table>
{{ end }}
EOF
cat > /etc/alertmanager/template/wechat.tmpl << 'EOF'
{{ define "test_wechat.html" }}
<table border="1">
<tr>
<td>Alert</td>
<td>Instance</td>
<td>Threshold</td>
<td>Start time</td>
</tr>
{{ range $i, $alert := .Alerts }}
<tr>
<td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ $alert.StartsAt }}</td>
</tr>
{{ end }}
</table>
{{ end }}
EOF
cat > /etc/prometheus/rules/alertmanager_rule.yml << 'EOF'
groups:
  - name: test-rules
    rules:
      - alert: InstanceDown # alert name
        expr: up == 0       # alert condition, any PromQL expression
        for: 2m             # how long the condition must hold before the alert fires
        labels:             # extra labels attached to the alert
          team: node
        annotations:        # annotations describing the alert in detail
          summary: "{{ $labels.instance }}: has been down"
          description: "{{ $labels.instance }}: job {{ $labels.job }} has been down"
          value: "{{ $value }}"
EOF
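A further rule can be appended under rules: in the same group, for example a high-CPU alert built on node_exporter metrics (the 80% threshold and the labels are just examples):
      - alert: HighCpuUsage
        expr: 100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100 > 80
        for: 5m
        labels:
          team: node
        annotations:
          summary: "{{ $labels.instance }}: CPU usage above 80%"
          value: "{{ $value }}"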