1. 程式人生 > 其它 >Prometheus Alertmanager 整合釘釘告警

Prometheus Alertmanager 整合釘釘告警

Prometheus Alertmanager 整合釘釘告警

安裝 Prometheus、Alertmanager

1.使用 docker-compose 整合 Prometheus 和 Alertmanager

cat docker-compose.yml

# Compose stack that runs Prometheus and Alertmanager side by side.
# Both services use host networking, so each one listens directly on the
# Docker host's interfaces.
version: "3"
services:
  prometheus:
    image: prom/prometheus:v2.35.0
    container_name: prometheus
    hostname: prometheus
    volumes:
      # - "./prometheus.yml:/etc/prometheus/prometheus.yml"
      # The whole config directory is mounted instead of the single file above.
      - "./prometheus:/etc/prometheus"
      # Share the host's local time zone definition with the container.
      - "/etc/localtime:/etc/localtime"
      # Presumably where Prometheus keeps its on-disk data -- TODO confirm.
      - "./data:/prometheus"
    restart: on-failure
    network_mode: "host"
    logging:
      driver: "json-file"
      options:
        tag: prometheus
    # NOTE(review): this grants every Linux capability to the container;
    # confirm it is really required and drop it otherwise.
    cap_add:
      - ALL
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-admin-api'
      - '--web.enable-lifecycle'
  alertmanager:
    image: prom/alertmanager:v0.24.0
    container_name: alertmanager
    hostname: alertmanager
    restart: on-failure
    network_mode: "host"
    logging:
      driver: "json-file"
      options:
        # Tag log lines with this service's own name so they can be told
        # apart from the prometheus container's logs.
        tag: alertmanager
    # NOTE(review): this grants every Linux capability to the container;
    # confirm it is really required and drop it otherwise.
    cap_add:
      - ALL
    volumes:
      - ./alertmanager/:/etc/alertmanager/
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      # NOTE(review): no volume is mounted at /alertmanager, so this state
      # presumably does not survive container re-creation -- confirm.
      - '--storage.path=/alertmanager'

2.準備Prometheus配置檔案

prometheus.yml

cat prometheus.yml

# Main Prometheus configuration: global timing, the Alertmanager endpoint,
# rule files, and scrape jobs (one static job plus three file-discovered jobs).
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Alertmanager runs with host networking next to Prometheus
            # (network_mode: host in docker-compose.yml), so it is addressed
            # through the local host on Alertmanager's port 9093.
            - "localhost:9093"

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  # This first job scrapes Prometheus itself.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]

  # The jobs below discover their targets from files (file-based service
  # discovery); each file list is re-read every minute.
  - job_name: "actuator_health"
    metrics_path: '/actuator/prometheus'
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./service_endpoint*.yml"

  - job_name: "docker"
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./docker_endpoint*.yml"

  - job_name: "node-exporter"
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./node-exporter*.yml"

service_endpoint_all.yml

cat service_endpoint_all.yml

# File-discovery targets for the service endpoints.
# Each entry is one target group: a list of host:port targets plus the labels
# attached to every series scraped from them.
- targets: ['ip:20006']
  labels:
    servicename: 'sname01'
- targets: ['ip:20005']
  labels:
    servicename: 'sname01'
node-exporter-all.yml

cat node-exporter-all.yml

# File-discovery targets for the node-exporter instances.
# One target group per node; the hostname label identifies the node.
- targets:
    - 'ip:7100'
  labels:
    hostname: 'node-01'
- targets:
    - 'ip:7100'
  labels:
    hostname: 'node-02'
- targets:
    - 'ip:7100'
  labels:
    hostname: 'node-03'
docker_endpoint_all.yml

cat docker_endpoint_all.yml

# File-discovery targets for the Docker metrics endpoints.
# One target group per host; the hostname label identifies the host.
- targets:
    - 'ip:7080'
  labels:
    hostname: 'env-mid'
- targets:
    - 'ip:7080'
  labels:
    hostname: 'env-ap-02'
- targets:
    - 'ip:7080'
  labels:
    hostname: 'env-ap-01'

3.準備告警規則

service_alter.yml

cat service_alter.yml # 此規則的 labels 與 annotations 將用於下面的告警模板

# Alerting rules: one group with a single rule that fires when a target of
# the "actuator_health" job reports up == 0 for 10 seconds.
groups:
  - name: Service_Down
    rules:
      - alert: 服務下線通知
        # Variant without the servicename exclusion:
        #   up{job="actuator_health"}==0
        expr: 'up{job="actuator_health",servicename!="iot-aircraft_192.168.0.22"}==0'
        for: 10s
        # Extra labels attached to the alert; sname copies the target's
        # servicename label.
        labels:
          user: 'prometheus'
          severity: 'warning'
          env: 'prod'
          sname: '{{ $labels.servicename }}'
        # Human-readable texts carried along with the alert.
        annotations:
          summary: '{{ $labels.servicename }} 服務下線'
          description: '{{ $labels.servicename }} of job {{ $labels.job }} has been Down.'
          title: '{{ $labels.servicename }} 服務狀態告警'
[root@prometheus rules]# 

4.準備 alertmanager 配置檔案

config.yml

cat config.yml

# Alertmanager configuration: every alert is routed to the "devops" receiver,
# which forwards notifications to a webhook endpoint.
global:
  # How long Alertmanager waits before declaring an alert resolved when it has
  # not been updated and carries no end time.
  resolve_timeout: 1m

# Custom notification templates
templates:
  - '/etc/alertmanager/dingtalk.tmpl'
route:
  # Default receiver
  receiver: 'devops'
  # Label names used to group alerts into one notification. group_by expects
  # label names, so the alert name is used here rather than the rule-group
  # name ("Service_Down"), which is not a label on the alerts.
  group_by: ['alertname']
  # How long to wait before sending the first notification for a new group,
  # so that alerts of the same group arriving together are sent together.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group that
  # has already been notified.
  group_interval: 10s
  # How long to wait before re-sending a notification that is still firing.
  repeat_interval: 1h
  # Child routes
  routes:
    - receiver: devops
      group_wait: 10s
      match:
        team: DevOps
receivers:
  # NOTE(review): the last path segment of the URL ("devops") presumably has to
  # match a target name in the prometheus-webhook-dingtalk config -- confirm.
  - name: 'devops'
    webhook_configs:
      - url: http://192.168.0.28:8060/dingtalk/devops/send
        # Also notify when an alert has been resolved.
        send_resolved: true

啟動容器

docker-compose up -d

安裝dingtalk

1.下載安裝包

# Download the prometheus-webhook-dingtalk v2.1.0 release into /opt, unpack
# it, and rename the extracted directory to a version-independent name.
cd /opt
wget 'https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz'
tar -xvf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 prometheus-webhook-dingtalk

2.配置系統服務託管

## cat /usr/lib/systemd/system/dingtalk.service

# systemd unit that runs prometheus-webhook-dingtalk from /opt as a service.
[Unit]
Description=dingtalk
Documentation=https://github.com/timonwong/prometheus-webhook-dingtalk/
After=network.target

[Service]
# Restart the process automatically when it exits with a failure.
Restart=on-failure
WorkingDirectory=/opt/prometheus-webhook-dingtalk
# --web.enable-ui turns on the built-in web UI.
ExecStart=/opt/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/opt/prometheus-webhook-dingtalk/config.yml --web.enable-ui

[Install]
WantedBy=multi-user.target

# Start the service from the command line:
# reload unit files, enable the unit, start it, then show its status.
systemctl daemon-reload
systemctl enable dingtalk.service
systemctl start dingtalk.service
systemctl status dingtalk.service
# List listening TCP sockets and filter for 8060 (the port used in the
# Alertmanager webhook URL).
ss -tnl | grep 8060

3.準備配置檔案

配置模板路徑:

/opt/prometheus-webhook-dingtalk/config.example.yml

複製模板:

cp /opt/prometheus-webhook-dingtalk/config.example.yml /opt/prometheus-webhook-dingtalk/config.yml

修改配置模板: cat config.yml

## Request timeout
# timeout: 5s

## Uncomment following line in order to write template from scratch (be careful!)
# no_builtin_template: true

## Customizable templates path
# templates:
#   - contrib/templates/legacy/template.tmpl
# Custom DingTalk message template
templates:
  - /opt/prometheus/alertmanager/dingtalk.tmpl

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
# Alert target groups; more than one may be defined.
targets:
  devops:
    # Robot credentials are secrets: substitute your own values here and never
    # publish or commit the real access token or signing secret.
    url: 'https://oapi.dingtalk.com/robot/send?access_token=<YOUR_ACCESS_TOKEN>'
    secret: '<YOUR_ROBOT_SECRET>'
    # Message title and body are rendered from the custom templates.
    message:
      title: '{{ template "ops.title" . }}'
      text: '{{ template "ops.content" . }}'

4.準備自定義訊息模板

注意:此模板變數與告警規則相關聯

{{/* __subject: prints the upper-cased status in square brackets; when the status is "firing" it appends ":" and the number of firing alerts. */}}
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}


{{/* __alert_list: for each alert passed in, prints the "title" annotation, the env, severity, instance and sname labels, the "description" annotation, and StartsAt shifted by 28800e9 (presumably nanoseconds, i.e. +8 hours -- confirm). */}}
{{ define "__alert_list" }}{{ range . }}
---
**告警名稱**: {{ index .Annotations "title" }}

**告警環境**: {{ .Labels.env }} 

**告警級別**: {{ .Labels.severity }} 

**告警主機**: {{ .Labels.instance }} 

**告警服務**: {{ .Labels.sname }} 

**告警資訊**: {{ index .Annotations "description" }}

**告警時間**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}

{{/* __resolved_list: same fields as __alert_list, plus EndsAt shifted by the same offset. */}}
{{ define "__resolved_list" }}{{ range . }}
---
**告警名稱**: {{ index .Annotations "title" }}

**告警環境**: {{ .Labels.env }} 

**告警級別**: {{ .Labels.severity }} 

**告警主機**: {{ .Labels.instance }} 

**告警服務**: {{ .Labels.sname }} 

**告警資訊**: {{ index .Annotations "description" }}

**告警時間**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}

**恢復時間**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}


{{/* ops.title: the message title; delegates to __subject. */}}
{{ define "ops.title" }}
{{ template "__subject" . }}
{{ end }}

{{/* ops.content: the message body; prints a firing section when there are firing alerts and a resolved section when there are resolved alerts, each with a count header. */}}
{{ define "ops.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**====偵測到{{ .Alerts.Firing | len  }}個故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}

{{ if gt (len .Alerts.Resolved) 0 }}
**====恢復{{ .Alerts.Resolved | len  }}個故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}

{{/* ops.link.title / ops.link.content: aliases that render ops.title and ops.content. */}}
{{ define "ops.link.title" }}{{ template "ops.title" . }}{{ end }}
{{ define "ops.link.content" }}{{ template "ops.content" . }}{{ end }}
{{/* NOTE(review): the two invocations below sit outside any define block; confirm whether they are needed. */}}
{{ template "ops.title" . }}
{{ template "ops.content" . }}

模板可以使用 dingtalk 外掛的 UI 介面( http://alertmanager:8060/ui )除錯,開啟方法是在啟動引數新增 --web.enable-ui

5.配置完成後重啟服務並檢查服務狀態

# Restart the dingtalk service, then show its status.
systemctl restart dingtalk.service
systemctl status dingtalk.service