|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
[S] 为部署Prometheus的服务端 [C]为向Prometheus服务端发送数据的客户端,为测试方便,此实验所有主机关闭防火墙
2. 准备目录以及文件
# Create the configuration directories for Prometheus, Grafana and Alertmanager
mkdir -p /etc/{prometheus,grafana,alertmanager}
# Create the directory used for the docker-compose project
mkdir -p /app/prometheus
准备Prometheus配置文件
vim /etc/prometheus/prometheus.yml
# Global settings
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # scrape_timeout is set to the global default (10s).

# Alerting: the Alertmanager instances that receive fired alerts
alerting:
  alertmanagers:
    - static_configs:
        # Same host as the Prometheus server (172.16.250.104); port 9093 is
        # the Alertmanager port used throughout this setup.
        - targets: ['172.16.250.104:9093']

# Rule files are loaded once and evaluated periodically according to the
# global evaluation_interval.
rule_files:
  - "/etc/prometheus/rules.yml"
  - "/etc/prometheus/record-rules.yml"

# Scrape targets.
scrape_configs:
  # The job name is added as the label job=<job_name> to every series
  # scraped by that job.
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['172.16.250.104:9090']

  # Container metrics on port 8080.
  - job_name: 'docker-metrics'
    static_configs:
      - targets:
          - '172.16.250.104:8080'
    relabel_configs:
      # Copy the host part of __address__ (everything before the port)
      # into a separate "addr" label.
      - source_labels: [__address__]
        target_label: addr
        regex: '(.*):(.*)'
        replacement: '$1'

  # Host metrics from node_exporter on each client.
  - job_name: 'nodes-metrics'
    static_configs:
      - targets:
          # Client IPs running node_exporter.
          # NOTE(review): 172.16.25.104 differs from the 172.16.250.x
          # addresses used everywhere else; confirm it is not a typo for
          # 172.16.250.104.
          - '172.16.25.104:9100'
          - '172.16.250.100:9100'
          - '172.16.250.101:9100'
          - '172.16.250.102:9100'
        labels:
          server: ops
    relabel_configs:
      # Copy the host part of __address__ into the "addr" label.
      - source_labels: [__address__]
        target_label: addr
        regex: '(.*):(.*)'
        replacement: '$1'
vim /etc/prometheus/rules.yml
groups:
  - name: example
    rules:
      # Alert for any instance that is unreachable for more than 1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: page
        annotations:
          # {{ ... }} expressions are expanded by Prometheus when the alert fires.
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
# Advanced alerts.
# This is a complete rule file on its own: a rule file has exactly one
# top-level `groups` key, so do not paste it after another `groups:` document
# in the same file.
# Every expression below reads a recorded series (prometheus:...), so the
# matching recording rules must be loaded for the alert to be able to fire.
groups:
  - name: prometheus-alert
    rules:
      - alert: prometheus-down
        expr: prometheus:up == 0
        for: 1m
        labels:
          severity: 'critical'
        annotations:
          summary: "instance: {{ $labels.instance }} 宕机了"
          description: "instance: {{ $labels.instance }} \n- job: {{ $labels.job }} 关机了, 时间已经1分钟了。"
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-cpu-high
        expr: prometheus:cpu:total:percent > 80
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} cpu 使用率高于 {{ $value }}"
          description: "instance: {{ $labels.instance }} \n- job: {{ $labels.job }} CPU使用率已经持续三分钟高过80% 。"
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-cpu-iowait-high
        expr: prometheus:cpu:iowait:percent >= 12
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} cpu iowait 使用率高于 {{ $value }}"
          description: "instance: {{ $labels.instance }} \n- job: {{ $labels.job }} cpu iowait使用率已经持续三分钟高过12%"
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      # Fires when the 1-minute load exceeds 1.2x the CPU count.
      # The two recorded series carry different static labels (e.g. desc), so
      # they are matched on `instance` only; the result keeps just that label.
      # Needs a recording rule prometheus:cpu:count with one series per instance.
      - alert: prometheus-load-load1-high
        expr: prometheus:load:load1 > on (instance) (prometheus:cpu:count * 1.2)
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} load1 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-memory-high
        expr: prometheus:memory:used:percent > 85
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} memory 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-disk-high
        expr: prometheus:disk:used:percent > 80
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} disk 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-disk-read-count-high
        expr: prometheus:disk:read:count:rate > 2000
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} iops read 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-disk-write-count-high
        expr: prometheus:disk:write:count:rate > 2000
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} iops write 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-disk-read-mb-high
        expr: prometheus:disk:read:mb:rate > 60
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 读取字节数 高于 {{ $value }}"
          description: ""
          instance: "{{ $labels.instance }}"
          value: "{{ $value }}"

      - alert: prometheus-disk-write-mb-high
        expr: prometheus:disk:write:mb:rate > 60
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 写入字节数 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-filefd-allocated-percent-high
        expr: prometheus:filefd_allocated:percent > 80
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 打开文件描述符 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-network-netin-error-rate-high
        expr: prometheus:network:netin:error:rate > 4
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 包进入的错误速率 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-network-netin-packet-rate-high
        expr: prometheus:network:netin:packet:rate > 35000
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 包进入速率 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-network-netout-packet-rate-high
        expr: prometheus:network:netout:packet:rate > 35000
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 包流出速率 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      - alert: prometheus-network-tcp-total-count-high
        expr: prometheus:network:tcp:total:count > 40000
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} tcp连接数量 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      # Needs a recording rule named prometheus:process:zoom:total:count.
      - alert: prometheus-process-zoom-total-count-high
        expr: prometheus:process:zoom:total:count > 10
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 僵死进程数量 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"

      # Needs a recording rule named prometheus:time:offset; the summary also
      # reads its `desc` and `unit` labels.
      - alert: prometheus-time-offset-high
        expr: prometheus:time:offset > 0.03
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} {{ $labels.desc }} {{ $value }} {{ $labels.unit }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
编辑规则文件
vim /etc/prometheus/record-rules.yml
# Recording rules. Each rule precomputes an expression and stores it under the
# `record` name with three static labels: desc (description), unit, and
# job="prometheus". Source selectors use job!="prometheus" so the rules read
# raw scraped series rather than already-recorded ones.
groups:
  - name: prometheus-record
    rules:
      - expr: up{job!="prometheus"}
        record: prometheus:up
        labels:
          desc: "节点是否在线, 在线1,不在线0"
          unit: " "
          job: "prometheus"
      - expr: time() - node_boot_time_seconds{}
        record: prometheus:node_uptime
        labels:
          desc: "节点的运行时间"
          unit: "s"
          job: "prometheus"

      ########################################################################
      # cpu #
      - expr: (1 - avg by (environment,instance) (irate(node_cpu_seconds_total{job!="prometheus",mode="idle"}[5m]))) * 100
        record: prometheus:cpu:total:percent
        labels:
          desc: "节点的cpu总消耗百分比"
          unit: "%"
          job: "prometheus"
      - expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job!="prometheus",mode="idle"}[5m]))) * 100
        record: prometheus:cpu:idle:percent
        labels:
          desc: "节点的cpu idle百分比"
          unit: "%"
          job: "prometheus"
      - expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job!="prometheus",mode="iowait"}[5m]))) * 100
        record: prometheus:cpu:iowait:percent
        labels:
          desc: "节点的cpu iowait百分比"
          unit: "%"
          job: "prometheus"
      - expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job!="prometheus",mode="system"}[5m]))) * 100
        record: prometheus:cpu:system:percent
        labels:
          desc: "节点的cpu system百分比"
          unit: "%"
          job: "prometheus"
      - expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job!="prometheus",mode="user"}[5m]))) * 100
        record: prometheus:cpu:user:percent
        labels:
          desc: "节点的cpu user百分比"
          unit: "%"
          job: "prometheus"
      - expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job!="prometheus",mode=~"softirq|nice|irq|steal"}[5m]))) * 100
        record: prometheus:cpu:other:percent
        labels:
          desc: "节点的cpu 其他的百分比"
          unit: "%"
          job: "prometheus"
      # Number of CPUs per instance: node_cpu_seconds_total has one "idle"
      # series per CPU, so counting them yields the CPU count.
      - expr: count by (instance) (node_cpu_seconds_total{job!="prometheus",mode="idle"})
        record: prometheus:cpu:count
        labels:
          desc: "节点的cpu核数"
          unit: "个"
          job: "prometheus"

      ########################################################################
      # memory #
      - expr: node_memory_MemTotal_bytes{job!="prometheus"}
        record: prometheus:memory:total
        labels:
          desc: "节点的内存总量"
          unit: byte
          job: "prometheus"
      - expr: node_memory_MemFree_bytes{job!="prometheus"}
        record: prometheus:memory:free
        labels:
          desc: "节点的剩余内存量"
          unit: byte
          job: "prometheus"
      - expr: node_memory_MemTotal_bytes{job!="prometheus"} - node_memory_MemFree_bytes{job!="prometheus"}
        record: prometheus:memory:used
        labels:
          desc: "节点的已使用内存量"
          unit: byte
          job: "prometheus"
      - expr: node_memory_MemTotal_bytes{job!="prometheus"} - node_memory_MemAvailable_bytes{job!="prometheus"}
        record: prometheus:memory:actualused
        labels:
          desc: "节点用户实际使用的内存量"
          unit: byte
          job: "prometheus"
      - expr: (1-(node_memory_MemAvailable_bytes{job!="prometheus"} / (node_memory_MemTotal_bytes{job!="prometheus"})))* 100
        record: prometheus:memory:used:percent
        labels:
          desc: "节点的内存使用百分比"
          unit: "%"
          job: "prometheus"
      - expr: ((node_memory_MemAvailable_bytes{job!="prometheus"} / (node_memory_MemTotal_bytes{job!="prometheus"})))* 100
        record: prometheus:memory:free:percent
        labels:
          desc: "节点的内存剩余百分比"
          unit: "%"
          job: "prometheus"

      ########################################################################
      # load #
      - expr: sum by (instance) (node_load1{job!="prometheus"})
        record: prometheus:load:load1
        labels:
          desc: "系统1分钟负载"
          unit: " "
          job: "prometheus"
      - expr: sum by (instance) (node_load5{job!="prometheus"})
        record: prometheus:load:load5
        labels:
          desc: "系统5分钟负载"
          unit: " "
          job: "prometheus"
      - expr: sum by (instance) (node_load15{job!="prometheus"})
        record: prometheus:load:load15
        labels:
          desc: "系统15分钟负载"
          unit: " "
          job: "prometheus"

      ########################################################################
      # disk #
      - expr: node_filesystem_size_bytes{job!="prometheus" ,fstype=~"ext4|xfs"}
        record: prometheus:disk:usage:total
        labels:
          desc: "节点的磁盘总量"
          unit: byte
          job: "prometheus"
      - expr: node_filesystem_avail_bytes{job!="prometheus",fstype=~"ext4|xfs"}
        record: prometheus:disk:usage:free
        labels:
          desc: "节点的磁盘剩余空间"
          unit: byte
          job: "prometheus"
      - expr: node_filesystem_size_bytes{job!="prometheus",fstype=~"ext4|xfs"} - node_filesystem_avail_bytes{job!="prometheus",fstype=~"ext4|xfs"}
        record: prometheus:disk:usage:used
        labels:
          desc: "节点的磁盘使用的空间"
          unit: byte
          job: "prometheus"
      - expr: (1 - node_filesystem_avail_bytes{job!="prometheus",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{job!="prometheus",fstype=~"ext4|xfs"}) * 100
        record: prometheus:disk:used:percent
        labels:
          desc: "节点的磁盘的使用百分比"
          unit: "%"
          job: "prometheus"
      - expr: irate(node_disk_reads_completed_total{job!="prometheus"}[1m])
        record: prometheus:disk:read:count:rate
        labels:
          desc: "节点的磁盘读取速率"
          unit: "次/秒"
          job: "prometheus"
      - expr: irate(node_disk_writes_completed_total{job!="prometheus"}[1m])
        record: prometheus:disk:write:count:rate
        labels:
          desc: "节点的磁盘写入速率"
          unit: "次/秒"
          job: "prometheus"
      # Read throughput comes from the *read* byte counter, divided twice by
      # 1024 to convert bytes/s to MB/s.
      - expr: (irate(node_disk_read_bytes_total{job!="prometheus"}[1m]))/1024/1024
        record: prometheus:disk:read:mb:rate
        labels:
          desc: "节点的设备读取MB速率"
          unit: "MB/s"
          job: "prometheus"
      # Write throughput comes from the *written* byte counter.
      - expr: (irate(node_disk_written_bytes_total{job!="prometheus"}[1m]))/1024/1024
        record: prometheus:disk:write:mb:rate
        labels:
          desc: "节点的设备写入MB速率"
          unit: "MB/s"
          job: "prometheus"

      ########################################################################
      # filesystem #
      # (1 - free/total) is the share of inodes in use.
      - expr: (1 -node_filesystem_files_free{job!="prometheus",fstype=~"ext4|xfs"} / node_filesystem_files{job!="prometheus",fstype=~"ext4|xfs"}) * 100
        record: prometheus:filesystem:used:percent
        labels:
          desc: "节点的inode的使用百分比"
          unit: "%"
          job: "prometheus"

      ########################################################################
      # filefd #
      - expr: node_filefd_allocated{job!="prometheus"}
        record: prometheus:filefd_allocated:count
        labels:
          desc: "节点的文件描述符打开个数"
          unit: "个"
          job: "prometheus"
      - expr: node_filefd_allocated{job!="prometheus"}/node_filefd_maximum{job!="prometheus"} * 100
        record: prometheus:filefd_allocated:percent
        labels:
          desc: "节点的文件描述符打开百分比"
          unit: "%"
          job: "prometheus"

      ########################################################################
      # network #
      # The source counters are in bytes; multiply by 8 so the recorded value
      # matches the "bit" name and the bit/s unit label.
      - expr: avg by (environment,instance,device) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m])) * 8
        record: prometheus:network:netin:bit:rate
        labels:
          desc: "节点网卡eth0每秒接收的比特数"
          unit: "bit/s"
          job: "prometheus"
      - expr: avg by (environment,instance,device) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m])) * 8
        record: prometheus:network:netout:bit:rate
        labels:
          desc: "节点网卡eth0每秒发送的比特数"
          unit: "bit/s"
          job: "prometheus"
      - expr: avg by (environment,instance,device) (irate(node_network_receive_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m]))
        record: prometheus:network:netin:packet:rate
        labels:
          desc: "节点网卡每秒接收的数据包个数"
          unit: "个/秒"
          job: "prometheus"
      - expr: avg by (environment,instance,device) (irate(node_network_transmit_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m]))
        record: prometheus:network:netout:packet:rate
        labels:
          desc: "节点网卡发送的数据包个数"
          unit: "个/秒"
          job: "prometheus"
      - expr: avg by (environment,instance,device) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m]))
        record: prometheus:network:netin:error:rate
        labels:
          desc: "节点设备驱动器检测到的接收错误包的数量"
          unit: "个/秒"
          job: "prometheus"
      - expr: avg by (environment,instance,device) (irate(node_network_transmit_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m]))
        record: prometheus:network:netout:error:rate
        labels:
          desc: "节点设备驱动器检测到的发送错误包的数量"
          unit: "个/秒"
          job: "prometheus"
      - expr: node_tcp_connection_states{job!="prometheus", state="established"}
        record: prometheus:network:tcp:established:count
        labels:
          desc: "节点当前established的个数"
          unit: "个"
          job: "prometheus"
      - expr: node_tcp_connection_states{job!="prometheus", state="time_wait"}
        record: prometheus:network:tcp:timewait:count
        labels:
          desc: "节点timewait的连接数"
          unit: "个"
          job: "prometheus"
      - expr: sum by (environment,instance) (node_tcp_connection_states{job!="prometheus"})
        record: prometheus:network:tcp:total:count
        labels:
          desc: "节点tcp连接总数"
          unit: "个"
          job: "prometheus"

      ########################################################################
      # process #
      # Zombie process count. node_processes_state comes from node_exporter's
      # "processes" collector, which has to be enabled explicitly
      # (--collector.processes).
      - expr: node_processes_state{job!="prometheus",state="Z"}
        record: prometheus:process:zoom:total:count
        labels:
          desc: "节点僵死进程数量"
          unit: "个"
          job: "prometheus"

      ########################################################################
      # time #
      # Absolute clock offset, so a threshold applies in both directions.
      - expr: abs(node_timex_offset_seconds{job!="prometheus"})
        record: prometheus:time:offset
        labels:
          desc: "节点的时间偏差"
          unit: "s"
          job: "prometheus"
vim /etc/alertmanager/alertmanager.yml
# Alertmanager configuration for WeChat Work (企业微信) notifications
global:
  resolve_timeout: 5m

templates:
  # WeChat message template, stored next to this file
  - '/etc/alertmanager/wechat.tmpl'

route:
  group_by: ['alertname']
  # Initial wait before the first notification for a new group is sent
  group_wait: 5s
  # Wait before notifying about new alerts added to an already-notified group
  group_interval: 1m
  # Wait before re-sending a notification that was already sent
  repeat_interval: 5m
  receiver: 'wechat'

receivers:
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'xxxxxxxxxxxxx'  # WeChat Work corp ID
        agent_id: 'xxxxxxx'  # application (agent) ID
        api_secret: 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'  # application secret
        # to_party: 'xxxxxx'  # department ID: every member of the department receives the message
        # to_user targets individual accounts; separate several users with commas
        to_user: 'xxxxxx'
        send_resolved: true
vim /etc/alertmanager/wechat.tmpl
{{/*
  WeChat notification body.
  Emits one "异常告警" section per firing alert, followed by one "异常恢复"
  section per resolved alert. Timestamps are shifted by 28800e9 ns (8 hours)
  before formatting. The instance / namespace / node / pod lines are printed
  only when the corresponding label is non-empty.
*/}}
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $alert := .Alerts.Firing }}
==========异常告警==========
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
告警详情: {{ $alert.Annotations.message }} {{ $alert.Annotations.description }};{{ $alert.Annotations.summary }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
{{- if gt (len $alert.Labels.namespace) 0 }}
命名空间: {{ $alert.Labels.namespace }}
{{- end }}
{{- if gt (len $alert.Labels.node) 0 }}
节点信息: {{ $alert.Labels.node }}
{{- end }}
{{- if gt (len $alert.Labels.pod) 0 }}
实例名称: {{ $alert.Labels.pod }}
{{- end }}
============END============
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $alert := .Alerts.Resolved }}
==========异常恢复==========
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
告警详情: {{ $alert.Annotations.message }} {{ $alert.Annotations.description }};{{ $alert.Annotations.summary }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
{{- if gt (len $alert.Labels.namespace) 0 }}
命名空间: {{ $alert.Labels.namespace }}
{{- end }}
{{- if gt (len $alert.Labels.node) 0 }}
节点信息: {{ $alert.Labels.node }}
{{- end }}
{{- if gt (len $alert.Labels.pod) 0 }}
实例名称: {{ $alert.Labels.pod }}
{{- end }}
============END============
{{- end }}
{{- end }}
{{- end }}
vim /app/prometheus/docker-compose.yml
# Compose project running Prometheus, cAdvisor, Alertmanager and Grafana.
# Host directories under /etc/<service>/ supply the configuration; named
# volumes hold each service's data. /etc/localtime is mounted into every
# container.
version: '3'

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    volumes:
      - /etc/prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
      - /etc/localtime:/etc/localtime
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.external-url=http://172.16.250.104:9090/'
      - '--web.enable-lifecycle'
      # --storage.tsdb.retention is the deprecated spelling and is rejected by
      # current Prometheus releases; .time is the supported flag.
      - '--storage.tsdb.retention.time=15d'
    ports:
      - '9090:9090'
    links:
      - alertmanager:alertmanager
      - cadvisor:cadvisor
    restart: always

  cadvisor:
    # NOTE(review): the google/cadvisor image on Docker Hub is no longer
    # updated; upstream now publishes gcr.io/cadvisor/cadvisor. Consider
    # switching and pinning a version.
    image: google/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /etc/localtime:/etc/localtime
    ports:
      - '8080:8080'

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    ports:
      - '9093:9093'
    volumes:
      - /etc/alertmanager/:/etc/alertmanager/
      - alertmanager_data:/alertmanager
      - /etc/localtime:/etc/localtime
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    restart: unless-stopped

  grafana:
    image: grafana/grafana
    container_name: grafana
    ports:
      - '3000:3000'
    volumes:
      - /etc/grafana/:/etc/grafana/provisioning/
      - grafana_data:/var/lib/grafana
      - /etc/localtime:/etc/localtime
    environment:
      # Plugin installed by the Grafana image at container start
      - GF_INSTALL_PLUGINS=camptocamp-prometheus-alertmanager-datasource
    links:
      - prometheus:prometheus
      - alertmanager:alertmanager
      - cadvisor:cadvisor
    restart: unless-stopped

volumes:
  prometheus_data: {}
  grafana_data: {}
  alertmanager_data: {}
# Run from the /app/prometheus directory
docker-compose up -d
查看启动状态
[root@ansible-ctl prometheus]# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
e636a8bcaa96 grafana/grafana:9.2.8 "/run.sh" 4 hours ago Up 4 hours 0.0.0.0:3000->3000/tcp, :::3000->3000/tcp grafana
032babdc6206 prom/prometheus "/bin/prometheus --c…" 4 hours ago Up 4 hours 0.0.0.0:9090->9090/tcp, :::9090->9090/tcp prometheus
f371002e4418 google/cadvisor:latest "/usr/bin/cadvisor -…" 4 hours ago Up 4 hours 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp cadvisor
ebf9c9d93bf6 prom/alertmanager "/bin/alertmanager -…" 4 hours ago Up 3 hours 0.0.0.0:9093->9093/tcp, :::9093->9093/tcp alertmanager
# Download the ibotty/prometheus-exporters COPR repo definition (EPEL 7) into yum's repo directory
curl -Lo /etc/yum.repos.d/_copr_ibotty-prometheus-exporters.repo https://copr.fedorainfracloud.org/coprs/ibotty/prometheus-exporters/repo/epel-7/ibotty-prometheus-exporters-epel-7.repo
# Install the node_exporter package
yum -y install node_exporter
# Start the service, then enable it at boot (enable only runs if the start succeeded)
systemctl start node_exporter && systemctl enable node_exporter
# 客户端安装完成后,在服务端/etc/prometheus/prometheus.yml
# 中添加采集客户端信息,也就是最开始的prometheus配置文件,
# 之前已经新增过了,此处不再新增,如有新增的,在
# job_name: 'nodes-metrics'
# static_configs:
# - targets:
# - '172.16.25.104:9100' # 此处为需要客户端IP,即为node_exporter
# - '172.16.250.100:9100'
# - '172.16.250.101:9100'
# - '172.16.250.102:9100'
# - '需要新增的客户端IP:9100'
# Restart the node_exporter service
systemctl restart node_exporter
docker-compose restart # or restart only the prometheus container
docker restart prometheus
http://172.16.250.104:8080/ # cAdvisor docker容器指标监控查看
6. 测试微信告警功能
此处手动停止某个主机的node_exporter服务,测试微信能否收到告警信息
systemctl stop node_exporter
可以在Grafana中看到,已经有主机宕机
大概一分钟左右,收到微信告警信息
ok,打完收工,以上即为使用docker-compose简单的搭建Prometheus监控平台,更多高阶使用,后续再慢慢摸索,此处仅仅为探索安装部署。
# 文中告警规则以及模板来源于魔改网上找到的资料
引用来源:
1. https://www.cnblogs.com/namedgx/p/14919857.html
2. https://blog.csdn.net/weixin_42665545/article/details/125893800