cAdvisor容器監控規則
阿新 • • 發佈:2020-09-01
其他說明參考host主機監控規則:https://www.cnblogs.com/sanduzxcvbnm/p/13589848.html
在 Prometheus 主程式目錄下的 rules 目錄中新建 docker.yml 檔案,加入如下內容,然後重啟 Prometheus。
# Prometheus alerting rules: one rule group covering cAdvisor container
# metrics plus PgBouncer, Sidekiq and Consul exporters.
# Every alert must hold for 5 minutes (`for: 5m`) before it fires and carries
# a `severity` label of either `warning` or `critical`.
groups:
  - name: Docker containers monitoring
    rules:
      # Container has not been seen for more than 60 seconds.
      # NOTE(review): if `container_last_seen` stops being exported shortly
      # after the container disappears, `for: 5m` may keep this alert from
      # ever firing - confirm against the cAdvisor version in use.
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container killed (instance {{ $labels.instance }})"
          description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Per (instance, name) CPU usage rate over 3 minutes, scaled by 100,
      # above 80.
      - alert: ContainerCpuUsage
        expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container CPU usage (instance {{ $labels.instance }})"
          description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Memory usage as a percentage of the configured memory limit, per
      # (instance, name).
      # NOTE(review): presumably containers without a limit report a limit of
      # 0, which would make this ratio +Inf - verify and filter if needed.
      - alert: ContainerMemoryUsage
        expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Memory usage (instance {{ $labels.instance }})"
          description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Percentage of used inodes per instance: (1 - free/total) * 100.
      # The subtraction is parenthesised before scaling by 100; without those
      # parentheses `*` binds tighter than `-`, the expression becomes
      # 1 - (ratio * 100), and the result can never exceed 80.
      - alert: ContainerVolumeUsage
        expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Volume usage (instance {{ $labels.instance }})"
          description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Sum of `container_fs_io_current` per (instance, name), scaled by 100,
      # above 80.
      # NOTE(review): the description calls this a percentage; confirm the
      # unit of `container_fs_io_current` before relying on the threshold.
      - alert: ContainerVolumeIoUsage
        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Volume IO usage (instance {{ $labels.instance }})"
          description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Per-second rate of CFS throttled seconds over 3 minutes is above 1.
      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container high throttle rate (instance {{ $labels.instance }})"
          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # More than 200 active server connections in a PgBouncer pool.
      # The alert name keeps its original spelling ("Connectinos") so that
      # existing routes and silences matching on `alertname` keep working.
      - alert: PgbouncerActiveConnectinos
        expr: pgbouncer_pools_server_active_connections > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PGBouncer active connectinos (instance {{ $labels.instance }})"
          description: "PGBouncer pools are filling up\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # More than 10 PgBouncer errors in 5 minutes, excluding the
      # "server conn crashed?" message.
      - alert: PgbouncerErrors
        expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PGBouncer errors (instance {{ $labels.instance }})"
          description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Any occurrence of the max_client_conn error over the last minute.
      - alert: PgbouncerMaxConnections
        expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PGBouncer max connections (instance {{ $labels.instance }})"
          description: "The number of PGBouncer client connections has reached max_client_conn.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # A Sidekiq queue holds more than 100 jobs.
      - alert: SidekiqQueueSize
        expr: sidekiq_queue_size{} > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Sidekiq queue size (instance {{ $labels.instance }})"
          description: "Sidekiq queue {{ $labels.name }} is growing\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Highest Sidekiq queue latency across all series exceeds 120 seconds.
      # NOTE(review): `max()` without a `by` clause drops every label, so
      # `{{ $labels.instance }}` in the summary will render empty.
      - alert: SidekiqSchedulingLatencyTooHigh
        expr: max(sidekiq_queue_latency) > 120
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Sidekiq scheduling latency too high (instance {{ $labels.instance }})"
          description: "Sidekiq jobs are taking more than 2 minutes to be picked up. Users may be seeing delays in background processing.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # A Consul catalog service node reports healthy == 0.
      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Fewer than 3 Consul raft peers; the threshold assumes a 3-server
      # cluster, as stated in the description.
      - alert: ConsulMissingMasterNode
        expr: consul_raft_peers < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul missing master node (instance {{ $labels.instance }})"
          description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # A Consul node health status series with status="critical" equals 1.
      - alert: ConsulAgentUnhealthy
        expr: consul_health_node_status{status="critical"} == 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul agent unhealthy (instance {{ $labels.instance }})"
          description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"