prometheus node-exporter cadvisor grafana alertmanager 安裝及服務發現
prometheus node-exporter cadvisor grafana alertmanager 安裝及服務發現
本次搭建基於docker環境
前期準備
拉取映象
docker pull google/cadvisor docker pull prom/prometheus docker pull grafana/grafana docker pull prom/alertmanager
建立持久化目錄
mkdir -p /home/prometheus/config vim /home/prometheus/prometheus.yml mkdir -p /home/grafana-storage
啟動node-exporter硬體系統監控
docker run -d -p 9100:9100 \ -v /proc:/host/proc:ro \ -v /sys:/host/sys:ro \ -v /:/rootfs:ro \ --name=node-exporter \ prom/node-exporter
啟動cadvisor容器監控
docker run \ -v /:/rootfs:ro \ -v /var/run:/var/run:rw \ -v /sys:/sys:ro \ -v /var/lib/docker/:/var/lib/docker:ro \ -p 9080:8080 \ --detach=true \ --name=cadvisor \ google/cadvisor #--detach=true #分離容器
啟動grafana
docker run -d -p 3000:3000 \ --user=root \ --name=grafana \ -v /home/grafana-storage:/var/lib/grafana \ grafana/grafana #--user=root #以root使用者執行
啟動prometheus
docker run -d -p 9090:9090 \ --name prometheus \ -v /home/prometheus:/etc/prometheus \ -v /home/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \ prom/prometheus \ --web.enable-lifecycle \ --config.file="/etc/prometheus/prometheus.yml" #--web.enable-lifecycle #熱載入引數,需要配合配置檔案--config.file使用,否則會報錯 #curl -X POST http://localhost:9090/-/reload #熱載入prometheus配置檔案 #--config.file #配置檔案路徑 #--storage.tsdb.path="/etc/prometheus/data" #資料儲存路徑
Prometheus配置grafana
[root@localhost prometheus]# vim prometheus.yml global: # 全域性設定,可以被覆蓋 scrape_interval: 15s # 抓取取樣資料的時間間隔,每15秒去被監控機上取樣,即資料採集頻率 evaluation_interval: 15s # 監控資料規則的評估頻率,比如設定檔案系統使用率>75%發出告警則每15秒執行一次該規則,進行檔案系統檢查 #告警管理 alerting: alertmanagers: - static_configs: #告警靜態目標配置 # - targets: ['192.168.31.131:9093'] #告警ui地址 #告警規則 rule_files: #- /etc/prometheus/rules/*.rules #告警規則檔案路徑 scrape_configs: # 抓取配置 #靜態發現 - job_name: 'grafana' #任務名 全域性唯一 scrape_interval: 5s # 抓取取樣資料的時間間隔 static_configs: #靜態目標配置 - targets: ['192.168.31.131:3000'] #抓取地址,預設為/metrics labels: #標籤 instance: grafana
curl -X POST http://localhost:9090/-/reload #熱載入prometheus配置檔案
啟動pushgateway
docker run -d \ --name pushgateway \ -p 9091:9091 \ prom/pushgateway
推送exporter到pushgateway
curl http://localhost:9090/metrics | curl --data-binary @- http://192.168.31.158:9091/metrics/job/prometheus/instance/131-普羅米修斯 curl http://192.168.31.158:9104/metrics | curl --data-binary @- http://192.168.31.158:9091/metrics/job/mysql/instance/158-MYSQL
注:推送到pushgateway的指標不會顯示在prometheus的網頁介面上,只能通過PromQL查詢
curl -X DELETE http://192.168.31.158:9091/metrics/job/mysql/instance/158-MYSQL #刪除pushgateway中該分組(job=mysql, instance=158-MYSQL)的指標,分組路徑需與推送時一致
Prometheus配置pushgateway
[root@localhost prometheus]# vim prometheus.yml scrape_configs: # 抓取配置 #靜態發現 - job_name: 'grafana' #任務名 全域性唯一 scrape_interval: 5s # 抓取取樣資料的時間間隔 static_configs: #靜態目標配置 - targets: ['192.168.31.131:3000'] #抓取地址,預設為/metrics labels: #標籤 instance: grafana #pushgateway中轉 - job_name: pushgateway static_configs: - targets: ['192.168.31.158:9091'] labels: instance: pushgateway
curl -X POST http://localhost:9090/-/reload #熱載入prometheus配置檔案
啟動告警管理alertmanager
docker run --name alertmanager -d -p 9093:9093 prom/alertmanager mkdir -p /home/alertmanager docker cp alertmanager:/etc/alertmanager/alertmanager.yml /home/alertmanager/ docker rm -f alertmanager docker run -d --name alertmanager -p 9093:9093 \ -v /home/alertmanager:/etc/alertmanager \ prom/alertmanager #--storage.path 資料儲存路徑 #--config.file 配置檔案路徑
Prometheus配置alertmanager連線
[root@localhost prometheus]# cat rules/hoststats-alert.rules groups: - name: hostStatsAlert rules: - alert: hostCpuUsageAlert expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!="idle"}[5m]))) by (instance) > 0.85 for: 1m labels: severity: page annotations: summary: "Instance {{ $labels.instance }} CPU usage high" description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})" - alert: hostMemUsageAlert expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.85 for: 1m labels: severity: page annotations: summary: "Instance {{ $labels.instance }} MEM usage high" description: "{{ $labels.instance }} MEM usage above 85% (current value: {{ $value }})"
[root@localhost prometheus]# vim prometheus.yml global: # 全域性設定,可以被覆蓋 scrape_interval: 15s # 抓取取樣資料的時間間隔,每15秒去被監控機上取樣,即資料採集頻率 evaluation_interval: 15s # 監控資料規則的評估頻率,比如設定檔案系統使用率>75%發出告警則每15秒執行一次該規則,進行檔案系統檢查 #告警管理 alerting: alertmanagers: - static_configs: #告警靜態目標配置 - targets: ['192.168.31.131:9093'] #告警ui地址 #告警規則 rule_files: - /etc/prometheus/rules/*.rules #告警規則檔案路徑
curl -X POST http://localhost:9090/-/reload #熱載入prometheus配置檔案
配置基於檔案發現
[root@localhost config]# cat /home/prometheus/config/target.yml - targets: ['192.168.31.131:9090'] #prometheus的地址埠,監控Prometheus資訊 labels: app: 'app1' env: 'game1' region: 'reg1' - targets: ['192.168.31.158:9100'] #另外伺服器的node-exporter的地址埠,監控伺服器資訊 labels: app: 'app2' env: 'game2' region: 'reg2'
[root@localhost prometheus]# cat /home/prometheus/prometheus.yml global: # 全域性設定,可以被覆蓋 scrape_interval: 15s # 抓取取樣資料的時間間隔,每15秒去被監控機上取樣,即資料採集頻率 evaluation_interval: 15s # 監控資料規則的評估頻率,比如設定檔案系統使用率>75%發出告警則每15秒執行一次該規則,進行檔案系統檢查 #告警管理 alerting: alertmanagers: - static_configs: #靜態目標配置 - targets: ['192.168.31.131:9093'] #告警ui地址 #告警規則 rule_files: - /etc/prometheus/rules/*.rules #告警規則檔案路徑 scrape_configs: # 抓取配置 #靜態發現 - job_name: 'grafana' #任務名 全域性唯一 scrape_interval: 5s # 抓取取樣資料的時間間隔 static_configs: #靜態目標配置 - targets: ['192.168.31.131:3000'] #抓取地址,預設為/metrics labels: #標籤 instance: grafana #pushgateway中轉 - job_name: pushgateway static_configs: - targets: ['192.168.31.158:9091'] labels: instance: pushgateway #檔案發現 - job_name: 'file_ds' #任務名 全域性唯一 file_sd_configs: #基於檔案發現配置 - files: ['/etc/prometheus/config/*.yml'] #配置檔案路徑,匹配config目錄下所有yml檔案 refresh_interval: 5s #每五秒掃描重新整理配置檔案
curl -X POST http://localhost:9090/-/reload #熱載入prometheus配置檔案
配置基於服務發現
安裝consul
wget https://releases.hashicorp.com/consul/1.6.1/consul_1.6.1_linux_amd64.zip unzip consul_1.6.1_linux_amd64.zip ./consul agent -dev 或者 docker run --name consul -d -p 8500:8500 consul
登出服務
curl -X PUT http://192.168.31.131:8500/v1/agent/service/deregister/node-exporter
#URL末尾的node-exporter即註冊服務時JSON中的"ID": "node-exporter"
註冊服務
#vim /home/prometheus/config/consul-1.json
{
"ID": "node-exporter",
"Name": "node-exporter-192.168.31.131",
"Tags": [
"node-exporter"
],
"Address": "192.168.31.131",
"Port": 9100,
"Meta": {
"app": "spring-boot",
"team": "appgroup",
"project": "bigdata"
},
"EnableTagOverride": false,
"Check": {
"HTTP": "http://192.168.31.131:9100/metrics",
"Interval": "10s"
},
"Weights": {
"Passing": 10,
"Warning": 1
}
}
# 更新註冊服務
#curl --request PUT --data @/home/prometheus/config/consul-1.json http://192.168.31.131:8500/v1/agent/service/register?replace-existing-checks=1
$ vim /home/prometheus/config/consul-2.json
{
"ID": "cadvisor-exporter",
"Name": "cadvisor-exporter-192.168.31.131",
"Tags": [
"cadvisor-exporter"
],
"Address": "192.168.31.131",
"Port": 9080,
"Meta": {
"app": "docker",
"team": "cloudgroup",
"project": "docker-service"
},
"EnableTagOverride": false,
"Check": {
"HTTP": "http://192.168.31.131:9080/metrics",
"Interval": "10s"
},
"Weights": {
"Passing": 10,
"Warning": 1
}
}
# 註冊服務
# curl --request PUT --data @/home/prometheus/config/consul-2.json http://192.168.31.131:8500/v1/agent/service/register?replace-existing-checks=1
更新Prometheus.yml
[root@localhost prometheus]# vim /home/prometheus/prometheus.yml #檔案發現 - job_name: 'file_ds' #任務名 全域性唯一 file_sd_configs: #基於檔案發現配置 - files: ['/etc/prometheus/config/*.yml'] #配置檔案路徑 refresh_interval: 5s #每五秒掃描重新整理配置檔案 #服務發現 - job_name: 'consul-node-exporter' consul_sd_configs: #基於服務發現型別 - server: '192.168.31.131:8500' #服務地址 services: [] relabel_configs: - source_labels: [__meta_consul_tags] #注意兩個橫槓"__" regex: .*node-exporter.* #匹配__meta_consul_tags中值包含node-exporter的 action: keep #keep丟棄未匹配到regex中內容的資料 - regex: __meta_consul_service_metadata_(.+) #獲取__meta_consul_service_metadata_的值(標籤) action: labelmap #將獲取的值作為新的標籤 - job_name: 'consul-cadvisor-exporter' consul_sd_configs: - server: '192.168.31.131:8500' services: [] relabel_configs: - source_labels: [__meta_consul_tags] regex: .*cadvisor-exporter.* action: keep - regex: __meta_consul_service_metadata_(.+) action: labelmap
curl -X POST http://localhost:9090/-/reload #熱載入prometheus配置檔案