Prometheus Alertmanager 整合釘釘告警
阿新 • • 發佈:2022-05-17
Prometheus Alertmanager 整合釘釘告警
安裝Prometheus、Alertmanager
1.使用docker-compose整合Prometheus和Alertmanager
cat docker-compose.yml
# Compose stack that runs Prometheus and Alertmanager side by side.
# Both services use host networking, so they reach each other via localhost.
version: "3"

services:
  prometheus:
    image: prom/prometheus:v2.35.0
    container_name: prometheus
    hostname: prometheus
    volumes:
      # - "./prometheus.yml:/etc/prometheus/prometheus.yml"
      # The whole config directory is mounted instead of the single file, so
      # the rule files and file_sd target files are visible in the container.
      - "./prometheus:/etc/prometheus"
      - "/etc/localtime:/etc/localtime"
      - "./data:/prometheus"
    restart: on-failure
    network_mode: "host"
    logging:
      driver: "json-file"
      options:
        tag: prometheus
    # NOTE(review): granting ALL capabilities looks broader than a metrics
    # server should need - confirm whether this can be dropped.
    cap_add:
      - ALL
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-admin-api'
      - '--web.enable-lifecycle'

  alertmanager:
    image: prom/alertmanager:v0.24.0
    container_name: alertmanager
    hostname: alertmanager
    restart: on-failure
    network_mode: "host"
    logging:
      driver: "json-file"
      options:
        # Tag log lines with this service's own name (was "prometheus").
        tag: alertmanager
    # NOTE(review): same concern as above about granting ALL capabilities.
    cap_add:
      - ALL
    volumes:
      - ./alertmanager/:/etc/alertmanager/
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
2.準備Prometheus配置檔案
prometheus.yml
cat prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Prometheus and Alertmanager both run with host networking in
            # docker-compose.yml, so Alertmanager is reachable on localhost.
            - localhost:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: Prometheus itself plus three file-discovered jobs.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]

  # Targets for the jobs below are discovered from files matching each glob,
  # re-read every minute.
  - job_name: "actuator_health"
    metrics_path: '/actuator/prometheus'
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./service_endpoint*.yml"

  - job_name: "docker"
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./docker_endpoint*.yml"

  - job_name: "node-exporter"
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./node-exporter*.yml"
service_endpoint_all.yml
cat service_endpoint_all.yml
# file_sd target groups matched by the "./service_endpoint*.yml" glob of the
# "actuator_health" job. The `servicename` label is what the alert rule
# filters on and copies into its `sname` label.
# NOTE(review): `ip` looks like a placeholder for the real host address - confirm.
- targets:
    - ip:20006
  labels:
    servicename: sname01
- targets:
    - ip:20005
  labels:
    servicename: sname01
node-exporter-all.yml
cat node-exporter-all.yml
# file_sd target groups matched by the "./node-exporter*.yml" glob of the
# "node-exporter" job; one group per host, each labelled with its hostname.
# NOTE(review): `ip` looks like a placeholder for each host's address - confirm.
- targets: ['ip:7100']
  labels:
    hostname: "node-01"
- targets: ['ip:7100']
  labels:
    hostname: "node-02"
- targets: ['ip:7100']
  labels:
    hostname: "node-03"
docker_endpoint_all.yml
cat docker_endpoint_all.yml
# file_sd target groups matched by the "./docker_endpoint*.yml" glob of the
# "docker" job; one group per host, each labelled with its hostname.
# NOTE(review): `ip` looks like a placeholder for each host's address - confirm.
- targets: ['ip:7080']
  labels:
    hostname: "env-mid"
- targets: ['ip:7080']
  labels:
    hostname: "env-ap-02"
- targets: ['ip:7080']
  labels:
    hostname: "env-ap-01"
3.準備告警規則
service_alter.yml
cat service_alter.yml # 此規則的labels與annotations將用於下面的告警模板
# Alerting rule: fires when a target of the "actuator_health" job reports
# up == 0 for 10 seconds. The labels and annotations set here are the fields
# the DingTalk message template reads (env, severity, sname, title,
# description).
groups:
  - name: Service_Down
    rules:
      - alert: 服務下線通知
        # expr: up{job="actuator_health"}==0
        # Same check as above, but excluding one service by its servicename label.
        expr: up{job="actuator_health",servicename!="iot-aircraft_192.168.0.22"}==0
        for: 10s
        labels:
          user: prometheus
          severity: warning
          env: "prod"
          # Copy the target's servicename label into `sname`.
          sname: "{{ $labels.servicename }}"
        annotations:
          summary: "{{ $labels.servicename }} 服務下線"
          description: "{{ $labels.servicename }} of job {{ $labels.job }} has been Down."
          title: "{{ $labels.servicename }} 服務狀態告警"
[root@prometheus rules]#
4.準備Alertmanager配置檔案
config.yml
cat config.yml
# Alertmanager configuration: every alert is delivered to the `devops`
# receiver, which posts to the prometheus-webhook-dingtalk service.
global:
  # Applies only to alerts that arrive without an end time: if such an alert
  # is not updated within this period it is declared resolved.
  resolve_timeout: 1m

# Custom notification template
templates:
  - '/etc/alertmanager/dingtalk.tmpl'

route:
  # Default receiver
  receiver: 'devops'
  # Labels used to group alerts into one notification. group_by takes label
  # names; 'Service_Down' is the rule group name, not a label, so the
  # standard `alertname` label is used instead.
  group_by: ['alertname']
  # How long to wait before sending the first notification for a new group,
  # so alerts that belong together go out in one message.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group that
  # has already been notified.
  group_interval: 10s
  # How long to wait before re-sending a notification that is still firing;
  # keeps repeated messages down.
  repeat_interval: 1h
  routes:
    # The rule in service_alter.yml sets no `team` label, so its alerts do not
    # match here and fall through to the default receiver (also `devops`).
    - receiver: devops
      group_wait: 10s
      match:
        team: DevOps

receivers:
  # The `devops` path segment in the URL selects the target of the same name
  # in the prometheus-webhook-dingtalk config.
  - name: 'devops'
    webhook_configs:
      - url: http://192.168.0.28:8060/dingtalk/devops/send
        # Also notify when the alert has been resolved.
        send_resolved: true
啟動容器
docker-compose up -d
安裝dingtalk
1.下載安裝包
# Work in /opt: the systemd unit runs the binary from
# /opt/prometheus-webhook-dingtalk.
cd /opt
# Fetch the v2.1.0 linux-amd64 release tarball.
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
# Unpack it in place.
tar xvf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
# Rename the extracted directory to a version-independent name.
mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 prometheus-webhook-dingtalk
2.配置系統服務託管
## cat /usr/lib/systemd/system/dingtalk.service
# systemd unit that runs prometheus-webhook-dingtalk from /opt.
[Unit]
# Key was misspelled "Descripton", which systemd does not recognise.
Description=dingtalk
Documentation=https://github.com/timonwong/prometheus-webhook-dingtalk/
After=network.target

[Service]
Restart=on-failure
WorkingDirectory=/opt/prometheus-webhook-dingtalk
# --web.enable-ui turns on the web UI used to debug message templates.
ExecStart=/opt/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/opt/prometheus-webhook-dingtalk/config.yml --web.enable-ui

[Install]
WantedBy=multi-user.target
# Start the service from the command line
# Make systemd pick up the new unit file.
systemctl daemon-reload
# Start the service automatically at boot.
systemctl enable dingtalk.service
systemctl start dingtalk.service
systemctl status dingtalk.service
# Check for a listening TCP socket on 8060, the port used in the
# Alertmanager webhook URL.
ss -tnl | grep 8060
3.準備配置檔案
配置模板路徑:
/opt/prometheus-webhook-dingtalk/config.example.yml
複製模板:
cp /opt/prometheus-webhook-dingtalk/config.example.yml /opt/prometheus-webhook-dingtalk/config.yml
修改配置模板: cat config.yml
## Request timeout
# timeout: 5s

## Uncomment following line in order to write template from scratch (be careful!)
# no_builtin_template: true

## Customizable templates path
# templates:
#   - contrib/templates/legacy/template.tmpl
# Custom DingTalk message template
templates:
  - /opt/prometheus/alertmanager/dingtalk.tmpl

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
# Alert target groups; more than one can be defined.
targets:
  # Selected by the `/dingtalk/devops/send` path in the Alertmanager webhook URL.
  devops:
    # Fill in your own robot's access token and signing secret. These are
    # credentials: do not publish or commit the real values.
    url: 'https://oapi.dingtalk.com/robot/send?access_token=YOUR_ACCESS_TOKEN'
    secret: 'YOUR_SIGNING_SECRET'
    message:
      # Templates defined in the custom template file above.
      title: '{{ template "ops.title" . }}'
      text: '{{ template "ops.content" . }}'
4.準備自定義訊息模板
注意:此模板變數與告警規則相關聯
{{/* Subject line: "[STATUS]" in upper case; while the status is "firing" the number of firing alerts is appended after a colon. */}}
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}
{{/* One markdown entry per alert in the list passed in. Reads the "title" and "description" annotations and the env, severity, instance and sname labels. StartsAt is shifted by 28800e9 ns (8 hours) before formatting - presumably to display UTC+8 local time; confirm the incoming timestamps are UTC. */}}
{{ define "__alert_list" }}{{ range . }}
---
**告警名稱**: {{ index .Annotations "title" }}
**告警環境**: {{ .Labels.env }}
**告警級別**: {{ .Labels.severity }}
**告警主機**: {{ .Labels.instance }}
**告警服務**: {{ .Labels.sname }}
**告警資訊**: {{ index .Annotations "description" }}
**告警時間**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{/* Same fields as "__alert_list", plus the recovery time taken from EndsAt with the same 8-hour shift. */}}
{{ define "__resolved_list" }}{{ range . }}
---
**告警名稱**: {{ index .Annotations "title" }}
**告警環境**: {{ .Labels.env }}
**告警級別**: {{ .Labels.severity }}
**告警主機**: {{ .Labels.instance }}
**告警服務**: {{ .Labels.sname }}
**告警資訊**: {{ index .Annotations "description" }}
**告警時間**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**恢復時間**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{/* Message title referenced by the "devops" target in the webhook config; delegates to "__subject". */}}
{{ define "ops.title" }}
{{ template "__subject" . }}
{{ end }}
{{/* Message body referenced by the "devops" target: a firing section when at least one alert is firing, then a resolved section when at least one alert is resolved. */}}
{{ define "ops.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**====偵測到{{ .Alerts.Firing | len }}個故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
**====恢復{{ .Alerts.Resolved | len }}個故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{/* Link-message variants that reuse the title and content templates unchanged. */}}
{{ define "ops.link.title" }}{{ template "ops.title" . }}{{ end }}
{{ define "ops.link.content" }}{{ template "ops.content" . }}{{ end }}
{{/* NOTE(review): the two invocations below sit outside any define block; the named templates above do not appear to depend on them - confirm whether they are needed. */}}
{{ template "ops.title" . }}
{{ template "ops.content" . }}
模板可以透過 dingtalk 外掛的 UI 介面(http://altermanager:8060/ui)除錯;啟用方式是在啟動引數中加上 --web.enable-ui。
5.配置完成後重啟服務並檢查服務狀態
# Restart so the service runs with the edited config and template.
systemctl restart dingtalk.service
# Confirm the service came back up.
systemctl status dingtalk.service