Monitoring Kubernetes containers with Prometheus
阿新 · Published 2021-07-22
prometheus.yaml
# Prometheus self-monitoring
groups:
  - name: prometheus-alerts            # rule group name
    rules:                             # rule definitions
      # 1.1.1. Prometheus job missing
      - alert: PrometheusJobMissing    # alert name
        expr: absent(up{job="prometheus"})   # PromQL expression
        for: 0m                        # how long the condition must hold before the alert fires
        labels:
          severity: warning            # severity level
        annotations:                   # annotations attached to the notification
          summary: Prometheus job missing (instance {{ $labels.instance }})
          description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.2. Prometheus target missing
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.3. Prometheus all targets missing
      - alert: PrometheusAllTargetsMissing
        expr: count by (job) (up) == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus all targets missing (instance {{ $labels.instance }})
          description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.4. Prometheus configuration reload failure
      - alert: PrometheusConfigurationReloadFailure
        expr: prometheus_config_last_reload_successful != 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
          description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.5. Prometheus too many restarts
      - alert: PrometheusTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus too many restarts (instance {{ $labels.instance }})
          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.6. Prometheus AlertManager configuration reload failure
      - alert: PrometheusAlertmanagerConfigurationReloadFailure
        expr: alertmanager_config_last_reload_successful != 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
          description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.7. Prometheus AlertManager config not synced
      - alert: PrometheusAlertmanagerConfigNotSynced
        expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
          description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.8. Prometheus AlertManager E2E dead man switch
      - alert: PrometheusAlertmanagerE2eDeadManSwitch
        expr: vector(1)
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
          description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.9. Prometheus not connected to alertmanager
      - alert: PrometheusNotConnectedToAlertmanager
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
          description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.10. Prometheus rule evaluation failures
      - alert: PrometheusRuleEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.11. Prometheus template text expansion failures
      - alert: PrometheusTemplateTextExpansionFailures
        expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.12. Prometheus rule evaluation slow
      - alert: PrometheusRuleEvaluationSlow
        expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
          description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.13. Prometheus notifications backlog
      - alert: PrometheusNotificationsBacklog
        expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.14. Prometheus AlertManager notification failing
      - alert: PrometheusAlertmanagerNotificationFailing
        expr: rate(alertmanager_notifications_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.15. Prometheus target empty
      - alert: PrometheusTargetEmpty
        expr: prometheus_sd_discovered_targets == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target empty (instance {{ $labels.instance }})
          description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.16. Prometheus target scraping slow
      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.17. Prometheus large scrape
      - alert: PrometheusLargeScrape
        expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus large scrape (instance {{ $labels.instance }})
          description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.18. Prometheus target scrape duplicate
      - alert: PrometheusTargetScrapeDuplicate
        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.19. Prometheus TSDB checkpoint creation failures
      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.20. Prometheus TSDB checkpoint deletion failures
      - alert: PrometheusTsdbCheckpointDeletionFailures
        expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.21. Prometheus TSDB compactions failed
      - alert: PrometheusTsdbCompactionsFailed
        expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.22. Prometheus TSDB head truncations failed
      - alert: PrometheusTsdbHeadTruncationsFailed
        expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.23. Prometheus TSDB reload failures
      - alert: PrometheusTsdbReloadFailures
        expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.24. Prometheus TSDB WAL corruptions
      - alert: PrometheusTsdbWalCorruptions
        expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.1.25. Prometheus TSDB WAL truncations failed
      - alert: PrometheusTsdbWalTruncationsFailed
        expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
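These groups only define what to alert on; the main prometheus.yml still has to load the rule files and know where Alertmanager lives. The following is a minimal sketch (not from the original post), assuming the rule files sit next to prometheus.yml and Alertmanager is reachable as alertmanager:9093 — adjust the paths and address to your deployment:

# prometheus.yml (fragment, illustrative only)
global:
  scrape_interval: 30s
  evaluation_interval: 30s        # how often the rule groups above are evaluated
rule_files:
  - "prometheus.yaml"
  - "windows.yaml"
  - "node-exporter.yaml"
  - "docker.yaml"
  - "blackbox.yaml"
  - "kube-state-metrics.yaml"
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]   # assumed Alertmanager address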
windows.yaml
# 1.5. Windows Server : prometheus-community/windows_exporter (5 rules)
groups:
  - name: windows-server-alerts        # rule group name
    rules:
      # 1.5.1. Windows Server collector Error
      - alert: WindowsServerCollectorError
        expr: windows_exporter_collector_success == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Windows Server collector Error (instance {{ $labels.instance }})
          description: "Collector {{ $labels.collector }} was not successful\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.5.2. Windows Server service Status
      - alert: WindowsServerServiceStatus
        expr: windows_service_status{status="ok"} != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Windows Server service Status (instance {{ $labels.instance }})
          description: "Windows Service state is not OK\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.5.3. Windows Server CPU Usage
      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Windows Server CPU Usage (instance {{ $labels.instance }})
          description: "CPU Usage is more than 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.5.4. Windows Server memory Usage
      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Windows Server memory Usage (instance {{ $labels.instance }})
          description: "Memory usage is more than 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.5.5. Windows Server disk Space Usage
      - alert: WindowsServerDiskSpaceUsage
        expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Windows Server disk Space Usage (instance {{ $labels.instance }})
          description: "Disk usage is more than 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
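These Windows rules only produce data if Prometheus actually scrapes windows_exporter. A minimal static scrape job as a sketch, assuming the exporter runs on its default port 9182; the target addresses below are placeholders:

# prometheus.yml scrape job (sketch)
scrape_configs:
  - job_name: "windows"
    static_configs:
      - targets:
          - "10.0.0.11:9182"   # hypothetical Windows host
          - "10.0.0.12:9182"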
node-exporter.yaml
# Host and hardware : node-exporter
groups:
  - name: host-node-alerts             # rule group name
    rules:                             # rule definitions
      # 1.2.1. Host out of memory
      - alert: HostOutOfMemory         # alert name
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10   # PromQL expression
        for: 10m                       # how long the condition must hold before the alert fires
        labels:
          severity: warning            # severity level
        annotations:                   # annotations attached to the notification
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.2. Host memory under memory pressure
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.3. Host unusual network throughput in
      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.4. Host unusual network throughput out
      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.5. Host unusual disk read rate
      - alert: HostUnusualDiskReadRate
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.6. Host unusual disk write rate
      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.7. Host out of disk space
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.8. Host disk will fill in 24 hours
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostDiskWillFillIn24Hours
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.9. Host out of inodes
      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 20 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.10. Host inodes will fill in 24 hours
      - alert: HostInodesWillFillIn24Hours
        expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.11. Host unusual disk read latency
      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.12. Host unusual disk write latency
      - alert: HostUnusualDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.13. Host high CPU load
      # mode="idle": cumulative time the CPU has spent idle (excluding IO wait) since boot
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.14. Host CPU steal noisy neighbor
      # mode="steal": in a virtualized environment, time the hypervisor scheduled the CPU to another guest,
      # so the current OS could not use it.
      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.15. Host context switching
      # The threshold is an arbitrary number; alert threshold depends on nature of application.
      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
      - alert: HostContextSwitching
        expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 15000
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching (instance {{ $labels.instance }})
          description: "Context switching is growing on node (> 15000 / s per CPU)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.16. Host swap is filling up
      - alert: HostSwapIsFillingUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up (instance {{ $labels.instance }})
          description: "Swap is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.17. Host systemd service crashed
      - alert: HostSystemdServiceCrashed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host systemd service crashed (instance {{ $labels.instance }})
          description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.22. Host kernel version deviations
      - alert: HostKernelVersionDeviations
        expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.23. Host OOM kill detected
      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.26. Host Network Receive Errors
      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.27. Host Network Transmit Errors
      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.28. Host Network Interface Saturated
      - alert: HostNetworkInterfaceSaturated
        expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host Network Interface Saturated (instance {{ $labels.instance }})
          description: "The network interface \"{{ $labels.interface }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.29. Host conntrack limit
      - alert: HostConntrackLimit
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: "The number of conntrack entries is approaching the limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.30. Host clock skew
      - alert: HostClockSkew
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: "Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.2.31. Host clock not synchronising
      - alert: HostClockNotSynchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: "Clock not synchronising.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
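The host rules above expect node_exporter metrics. In a Kubernetes cluster the exporter is usually deployed as a DaemonSet; one possible scrape job, sketched under the assumption that it listens on host port 9100 and that node discovery returns the kubelet port 10250:

# prometheus.yml scrape job (sketch)
scrape_configs:
  - job_name: "node-exporter"
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      # rewrite the discovered kubelet address to the node-exporter host port
      - source_labels: [__address__]
        regex: "(.*):10250"
        replacement: "${1}:9100"
        target_label: __address__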
docker.yaml
# Docker containers : google/cAdvisor
groups:
  - name: docker-container-alerts      # rule group name
    rules:
      # 1.3.1. Container killed
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Container killed (instance {{ $labels.instance }})
          description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.3.2. Container CPU usage
      # cAdvisor can sometimes consume a lot of CPU, so this alert will fire constantly.
      # If you want to exclude it from this alert, exclude the serie having an empty name: container_cpu_usage_seconds_total{name!=""}
      - alert: ContainerCpuUsage
        expr: sum(rate(container_cpu_system_seconds_total{name=~".+"}[1m])) by (name,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container CPU usage (instance {{ $labels.instance }})
          description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.3.3. Container Memory usage
      - alert: ContainerMemoryUsage
        expr: (container_memory_working_set_bytes / container_spec_memory_limit_bytes) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container Memory usage (instance {{ $labels.instance }})
          description: "Container memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.3.5. Container Volume IO usage
      - alert: ContainerVolumeIoUsage
        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container Volume IO usage (instance {{ $labels.instance }})
          description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.3.6. Container high throttle rate
      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container high throttle rate (instance {{ $labels.instance }})
          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
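The container rules rely on cAdvisor metrics, which the kubelet already exposes under /metrics/cadvisor. A sketch of one way to scrape them with the in-cluster service account; the TLS settings and token path are assumptions and may differ on your distribution:

# prometheus.yml scrape job (sketch)
scrape_configs:
  - job_name: "cadvisor"
    scheme: https
    metrics_path: /metrics/cadvisor
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    tls_config:
      insecure_skip_verify: true     # assumed; use the cluster CA in production
    kubernetes_sd_configs:
      - role: node                   # scrape every kubelet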
blackbox.yaml
# 1.4. Blackbox : prometheus/blackbox_exporter (8 rules)
groups:
  - name: blackbox-alerts              # rule group name
    rules:
      # 1.4.1. Blackbox probe failed
      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe failed (instance {{ $labels.instance }})
          description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.4.2. Blackbox slow probe
      - alert: BlackboxSlowProbe
        expr: avg_over_time(probe_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox slow probe (instance {{ $labels.instance }})
          description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.4.3. Blackbox probe HTTP failure
      - alert: BlackboxProbeHttpFailure
        expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
          description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.4.4. Blackbox SSL certificate will expire soon
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          description: "SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.4.5. Blackbox SSL certificate will expire soon
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.4.6. Blackbox SSL certificate expired
      - alert: BlackboxSslCertificateExpired
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
          description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.4.7. Blackbox probe slow HTTP
      - alert: BlackboxProbeSlowHttp
        expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
          description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 1.4.8. Blackbox probe slow ping
      - alert: BlackboxProbeSlowPing
        expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow ping (instance {{ $labels.instance }})
          description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
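The blackbox rules assume a probe-style scrape job with the usual relabelling. A sketch, assuming blackbox_exporter is reachable as blackbox-exporter:9115 and has an http_2xx module defined in its own config; the probed URL is a placeholder:

# prometheus.yml scrape job (sketch)
scrape_configs:
  - job_name: "blackbox-http"
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - "https://example.com"                 # hypothetical endpoint to probe
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target              # pass the target as ?target=
      - source_labels: [__param_target]
        target_label: instance                    # keep the probed URL as the instance label
      - target_label: __address__
        replacement: "blackbox-exporter:9115"     # assumed exporter address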
kube-state-metrics.yaml
# 5.1. Kubernetes : kube-state-metrics
groups:
  - name: kubernetes-alerts            # rule group name
    rules:
      # 5.1.1. Kubernetes Node ready
      - alert: KubernetesNodeReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Node ready (instance {{ $labels.instance }})
          description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.2. Kubernetes memory pressure
      - alert: KubernetesMemoryPressure
        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
          description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.3. Kubernetes disk pressure
      - alert: KubernetesDiskPressure
        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
          description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.4. Kubernetes out of disk
      - alert: KubernetesOutOfDisk
        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes out of disk (instance {{ $labels.instance }})
          description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.5. Kubernetes out of capacity
      - alert: KubernetesOutOfCapacity
        expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(pod, namespace) group_left(node) (0 * kube_pod_info)) / sum(kube_node_status_allocatable_pods) by (node) * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes out of capacity (instance {{ $labels.instance }})
          description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.6. Kubernetes container oom killer
      - alert: KubernetesContainerOomKiller
        expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes container oom killer (instance {{ $labels.instance }})
          description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.7. Kubernetes Job failed
      - alert: KubernetesJobFailed
        expr: kube_job_status_failed > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes Job failed (instance {{ $labels.instance }})
          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.9. Kubernetes PersistentVolumeClaim pending
      - alert: KubernetesPersistentvolumeclaimPending
        expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.12. Kubernetes PersistentVolume error
      - alert: KubernetesPersistentvolumeError
        expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
          description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.13. Kubernetes StatefulSet down
      - alert: KubernetesStatefulsetDown
        expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
          description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.17. Kubernetes Pod not healthy
      - alert: KubernetesPodNotHealthy
        expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
          description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.18. Kubernetes pod crash looping
      - alert: KubernetesPodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
          description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.21. Kubernetes StatefulSet replicas mismatch
      - alert: KubernetesStatefulsetReplicasMismatch
        expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
          description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.23. Kubernetes StatefulSet generation mismatch
      - alert: KubernetesStatefulsetGenerationMismatch
        expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
          description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.28. Kubernetes job slow completion
      - alert: KubernetesJobSlowCompletion
        expr: kube_job_spec_completions - kube_job_status_succeeded > 0
        for: 12h
        labels:
          severity: critical
        annotations:
          summary: Kubernetes job slow completion (instance {{ $labels.instance }})
          description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.30. Kubernetes API client errors
      - alert: KubernetesApiClientErrors
        expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes API client errors (instance {{ $labels.instance }})
          description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.31. Kubernetes client certificate expires next week (disabled)
      # A client certificate used to authenticate to the apiserver is expiring next week.
      # - alert: KubernetesClientCertificateExpiresNextWeek
      #   expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
      #   for: 0m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
      #     description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.32. Kubernetes client certificate expires soon (disabled)
      # A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
      # - alert: KubernetesClientCertificateExpiresSoon
      #   expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
      #   for: 0m
      #   labels:
      #     severity: critical
      #   annotations:
      #     summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
      #     description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 5.1.33. Kubernetes API server latency (disabled)
      # Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
      # - alert: KubernetesApiServerLatency
      #   expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1
      #   for: 2m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Kubernetes API server latency (instance {{ $labels.instance }})
      #     description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
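The kube_* metrics above come from the kube-state-metrics Deployment, not from the kubelet. A static scrape job against its Service is usually enough; a sketch assuming it runs in kube-system on the default port 8080:

# prometheus.yml scrape job (sketch)
scrape_configs:
  - job_name: "kube-state-metrics"
    static_configs:
      - targets:
          - "kube-state-metrics.kube-system.svc:8080"   # assumed Service DNS name and port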
Note: these rules have not been tested in this environment; review the thresholds and use them with caution.
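If you do use them, the files can at least be syntax-checked with promtool check rules before loading. In a cluster they are typically mounted into the Prometheus pod from a ConfigMap; a sketch, assuming a hypothetical monitoring namespace and that the Prometheus Deployment mounts this ConfigMap under /etc/prometheus/rules:

# ConfigMap carrying the rule files (sketch)
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-alert-rules       # hypothetical name
  namespace: monitoring              # hypothetical namespace
data:
  node-exporter.yaml: |
    groups:
      - name: host-node-alerts
        rules: []                    # paste the rule group from above here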