1. 程式人生 > 其它 > Prometheus+Grafana+Alertmanager監控告警(四)

Prometheus+Grafana+Alertmanager監控告警(四)

rule監控規則配置

語法檢查規則

promtool check rules /path/to/example.rules.yml

nodes.rules:

# Node-level alerting rules. Every rule uses a 1m `for` window; the
# annotation texts quote the same threshold/duration as the rule itself.
groups:
- name: nodes.rules
  rules:
  # Used space on ext4/xfs filesystems above 80%.
  - alert: NodeFilesystemUsage
    expr: >-
      100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"}
      / node_filesystem_size_bytes{fstype=~"ext4|xfs"}) * 100 > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分割槽使用率過高'
      description: '節點: {{ $labels.instance }}: {{ $labels.mountpoint }} 分割槽使用大於80% (當前值: {{ $value }})'

  # Memory in use (total minus free, cached and buffers) above 90%.
  - alert: NodeMemoryUsage
    expr: >-
      100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)
      / node_memory_MemTotal_bytes * 100 > 90
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Instance {{ $labels.instance }} 記憶體使用率過高'
      # Text states 90% to agree with the `> 90` comparison in expr.
      description: '節點: {{ $labels.instance }}記憶體使用大於90% (當前值: {{ $value }})'

  # Non-idle CPU share, averaged per instance/cluster, above 80%.
  - alert: NodeCPUUsage
    expr: >-
      100 - (avg by(instance, cluster)
      (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Instance {{ $labels.instance }} CPU使用率過高'
      # Text states 80% to agree with the `> 80` comparison in expr.
      description: '節點: {{ $labels.instance }}CPU使用大於80% (當前值: {{ $value }})'

  # Ready condition with status="true" reports 0 for the node.
  - alert: KubeNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
    for: 1m
    labels:
      severity: error
    annotations:
      # Text states one minute to agree with `for: 1m`.
      summary: '節點: {{ $labels.node }} 已經有1分鐘以上沒有準備好了.'
View Code

pods.rules:

# Rule group `pods.rules`; its alerting rules are the list items under `rules`.
groups:
- name: pods.rules
  # Each list entry below is one alerting rule (alert / expr / for / labels / annotations).
  rules:
  # Any pod counted in phase "Failed" for 30 seconds.
  - alert: PodFailed
    expr: sum by(pod, namespace) (kube_pod_status_phase{phase="Failed"}) > 0
    for: 30s
    labels:
      severity: error
    annotations:
      # Folded scalar: the two lines are joined with a single space.
      summary: >-
        名稱空間: {{ $labels.namespace }} | Pod名稱: {{ $labels.pod }}
        Pod狀態Failed (當前值: {{ $value }})
        
  # Fires when `up == 0` has held for one minute.
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: error
    annotations:
      # The stated duration must agree with the `for` window (1m) above.
      description: '{{ $labels.instance }} job {{ $labels.job }} 已經停止1分鐘以上.'
      # Quoted so the templated value is always read as a plain string.
      summary: 'Instance {{ $labels.instance }} 停止工作'
      
  # Per-container CPU rate (10m window, x100) above 90 for five minutes;
  # series whose cluster label matches test|job are excluded.
  - alert: PodCPUUsage
    expr: >-
      sum by(pod, namespace, cluster, container)
      (rate(container_cpu_usage_seconds_total{cluster!~"(test|job)",image!=""}[10m])
      * 100) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      # Folded scalar: the two lines are joined with a single space.
      summary: >-
        名稱空間: {{ $labels.namespace }} | Pod名稱: {{ $labels.pod }}
        容器:{{ $labels.container }} CPU使用大於90% (當前值: {{ $value }})
        
  # RSS as a percentage of the container memory limit, above 80 for five
  # minutes; the `!= +Inf` comparison filters out infinite ratios.
  - alert: PodMemoryUsage
    expr: >-
      sum by(pod, namespace, container) (container_memory_rss{image!=""})
      / sum by(pod, namespace, container) (container_spec_memory_limit_bytes{image!=""})
      * 100 != +Inf > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      # Folded scalar: the two lines are joined with a single space.
      summary: >-
        名稱空間: {{ $labels.namespace }} | Pod名稱: {{ $labels.pod }}
        容器:{{ $labels.container }} 記憶體使用大於80% (當前值: {{ $value }})
        
  # Receive byte rate per pod (5m window), divided by 1000, above 30000
  # for five minutes.
  - alert: PodNetworkReceive
    expr: >-
      sum by(pod, namespace)
      (rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m])
      / 1000) > 30000
    for: 5m
    labels:
      severity: warning
    annotations:
      # Folded scalar: the two lines are joined with a single space.
      summary: >-
        名稱空間: {{ $labels.namespace }} | Pod名稱: {{ $labels.pod }}
        入口流量大於30MB/s (當前值: {{ $value }}K/s)
        
  # Transmit byte rate per pod (5m window), divided by 1000, above 30000
  # for five minutes.
  - alert: PodNetworkTransmit
    expr: >-
      sum by(pod, namespace)
      (rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m])
      / 1000) > 30000
    for: 5m
    labels:
      severity: warning
    annotations:
      # Unit suffix is `K/s` (no leading slash), same as PodNetworkReceive.
      summary: >-
        名稱空間: {{ $labels.namespace }} | Pod名稱: {{ $labels.pod }}
        出口流量大於30MB/s (當前值: {{ $value }}K/s)
        
  # Any pod counted in phase "Pending" for one minute.
  - alert: PodPending
    expr: sum by(pod, namespace) (kube_pod_status_phase{phase="Pending"}) > 0
    for: 1m
    labels:
      severity: error
    annotations:
      # Folded scalar: the two lines are joined with a single space.
      summary: >-
        名稱空間: {{ $labels.namespace }} | Pod名稱: {{ $labels.pod }}
        Pod狀態Pending (當前值: {{ $value }})
        
  # Restart counter changed within the last minute, summed per pod.
  # NOTE(review): the 1m range equals the `for` duration; confirm the
  # condition can stay true long enough for the alert to fire.
  - alert: PodRestart
    expr: >-
      sum by(pod, namespace)
      (changes(kube_pod_container_status_restarts_total{}[1m])) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      # Folded scalar: the two lines are joined with a single space.
      summary: >-
        名稱空間: {{ $labels.namespace }} | Pod名稱: {{ $labels.pod }}
        Pod重啟 (當前值: {{ $value }})
View Code