1、文件结构:
server端:
docker-compose-prometheus/
├── alertmanager
│ └── config.yml
├── docker-compose.yml
├── grafana
│ ├── config.monitoring
│ └── provisioning
└── prometheus
├── alert.yml
└── prometheus.yml
node端:
docker-compose-prometheus/
└── node-docker-compose.yml
2、Server端文件:
2.1、vim docker-compose.yml
```
# docker-compose.yml — server-side Prometheus monitoring stack:
# Prometheus + Alertmanager + cAdvisor + node_exporter + Grafana.
# NOTE(review): indentation reconstructed — the pasted source had lost all
# leading whitespace, which is invalid YAML for Compose.
version: '3.3'

# Named volumes persist Prometheus TSDB and Grafana state across re-creation.
volumes:
  prometheus_data: {}
  grafana_data: {}

networks:
  monitoring:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus:v2.51.1
    container_name: prometheus
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro      # use the host's local time
      - $PWD/prometheus/:/etc/prometheus/     # prometheus.yml + alert.yml
      - prometheus_data:/prometheus           # TSDB storage
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    networks:
      - monitoring
    # `links` is legacy Compose syntax; services on the same user-defined
    # network already resolve each other by service name. Kept as-is.
    links:
      - alertmanager
      - cadvisor
      - node_exporter
    expose:
      - '9090'
    ports:
      - "9090:9090"   # quoted: port mappings should be strings in YAML
    depends_on:
      - cadvisor

  alertmanager:
    image: prom/alertmanager:v0.25.1
    container_name: alertmanager
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - $PWD/alertmanager/:/etc/alertmanager/   # config.yml (mail routing)
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring
    expose:
      - '9093'
    ports:
      - "9093:9093"

  cadvisor:
    # NOTE(review): google/cadvisor is long unmaintained; the node side of this
    # doc pulls the same tag from a private registry — confirm image provenance.
    image: google/cadvisor:v0.33.0
    container_name: cadvisor
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - monitoring
    expose:
      - '8080'

  node_exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # flag and its value passed as two argv entries; $$ escapes $ for Compose
      - --collector.filesystem.ignored-mount-points
      - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
    networks:
      - monitoring
    expose:
      - '9100'

  grafana:
    image: grafana/grafana:7.1.5
    user: "104"   # run as the grafana uid so the data volume is writable
    container_name: grafana
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - grafana_data:/var/lib/grafana
      - $PWD/grafana/provisioning/:/etc/grafana/provisioning/
    env_file:
      - $PWD/grafana/config.monitoring   # admin password + SMTP settings
    networks:
      - monitoring
    links:
      - prometheus
    ports:
      - "3000:3000"
    depends_on:
      - prometheus
```
2.2、prometheus:
# prometheus/prometheus.yml — main Prometheus configuration.
# NOTE(review): indentation reconstructed — the pasted source had lost all
# leading whitespace, which is invalid YAML.
global:
  scrape_interval: 15s      # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration — resolved via the Compose service name.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Load rules once and periodically evaluate them according to the
# global 'evaluation_interval'.
rule_files:
  - "alert.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries
  # scraped from this config.
  - job_name: 'prometheus'
    # Override the global default and scrape this job every 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']

  # Micro-mall hosts (external; addresses redacted)
  - job_name: '112mysql'
    static_configs:
      - targets: ['xxx.xxx.xxx.xxx:xxxxx']
  - job_name: '115linux'
    static_configs:
      - targets: ['xxx.xxx.xxx.xxx:xxxxx']

  # Internal Linux hosts (node_exporter on 9100)
  - job_name: 'xxxlinux'
    static_configs:
      - targets: ['192.168.188.xxx:9100']
  - job_name: 'xxxcadvisor'
    scrape_interval: 5s
    static_configs:
      - targets: ['192.168.188.xxx:8080']

  # Internal Windows hosts (windows_exporter on 9182)
  - job_name: 'xxxwindows'
    static_configs:
      - targets: ['192.168.188.xxx:9182']

  # Local (server-side) exporters — currently disabled.
  # - job_name: 'cadvisor'
  #   scrape_interval: 5s
  #   static_configs:
  #     - targets: ['cadvisor:8080']
  # - job_name: 'node-exporter'
  #   # metrics_path defaults to '/metrics'; scheme defaults to 'http'.
  #   scrape_interval: 5s
  #   static_configs:
  #     - targets: ['node_exporter:9100']
# prometheus/alert.yml — alerting rules loaded via rule_files in prometheus.yml.
# NOTE(review): indentation reconstructed — the pasted source had lost all
# leading whitespace, which is invalid YAML.
groups:
  - name: 服务器状态
    rules:
      # Alert for any instance that is unreachable for >2 minutes.
      - alert: 服务器已掉线
        expr: up == 0
        for: 2m
        labels:
          severity: 警告
        annotations:
          summary: "服务器 {{ $labels.instance }} 已掉线"
          description: "{{ $labels.instance }} 服务器 {{ $labels.job }}掉线超过2分钟 "

      # "Back online" notification.
      # NOTE(review): `up == 1` for 2m is true for every healthy target, so this
      # fires continuously (throttled only by repeat_interval) — confirm intent.
      - alert: 服务器已重新上线
        expr: up == 1
        for: 2m
        labels:
          severity: 信息
        annotations:
          summary: "服务器 {{ $labels.instance }} 已重新上线"
          description: "{{ $labels.instance }} 服务器 {{ $labels.job }} 重新上线"

      # Windows server CPU usage
      - alert: Windows服务器CPU使用率-90
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 90
        for: 2m
        labels:
          severity: 严重
        annotations:
          summary: Windows 服务器 CPU 使用率 (instance {{ $labels.instance }})
          description: "CPU 使用率已超过 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Windows server memory usage
      - alert: Windows服务器内存使用率-90
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
        for: 2m
        labels:
          severity: 严重
        annotations:
          summary: Windows 服务器内存使用率为 (instance {{ $labels.instance }})
          description: "内存使用率超过 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Windows server disk usage (warning tier)
      - alert: Windows服务器磁盘使用率-90
        expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 90
        for: 2m
        labels:
          severity: 严重
        annotations:
          summary: Windows 服务器磁盘使用率为 (instance {{ $labels.instance }})
          description: "磁盘使用率超过 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Windows server disk usage (critical tier)
      - alert: Windows服务器磁盘使用率-95
        expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 95
        for: 2m
        labels:
          severity: 严重
        annotations:
          summary: Windows 服务器磁盘使用率为 (instance {{ $labels.instance }})
          # fixed: description said 90% but the threshold above is 95%
          description: "磁盘使用率超过 95%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - name: Linux 服务器状态
    rules:
      # CPU usage alerts
      - alert: cpu使用率告警-80
        expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
        for: 2m
        labels:
          severity: 警告
        annotations:
          description: "服务器: CPU使用超过80%!(当前值: {{ $value }}%)"
      - alert: cpu使用率告警-90
        expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
        for: 2m
        labels:
          severity: 严重
        annotations:
          description: "服务器: CPU使用超过90%!(当前值: {{ $value }}%)"

      # Memory usage alerts (used = total - free - buffers - cached)
      - alert: 内存使用率告警-80
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
        for: 2m
        labels:
          severity: 警告
        annotations:
          description: "服务器: 内存使用超过80%!(当前值: {{ $value }}%)"
      - alert: 内存使用率告警-90
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 90
        for: 2m
        labels:
          severity: 严重
        annotations:
          description: "服务器: 内存使用超过90%!(当前值: {{ $value }}%)"

      # Disk usage alerts
      - alert: 磁盘告警-80
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
        for: 2m
        labels:
          severity: 警告
        annotations:
          description: "服务器: 磁盘设备: 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
      - alert: 磁盘告警-90
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 90
        for: 2m
        labels:
          severity: 严重
        annotations:
          description: "服务器: 磁盘设备: 使用超过90%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"

  # MySQL monitoring rules (requires mysqld_exporter)
  - name: MySQL数据库监控规则
    rules:
      # MySQL availability check
      - alert: MySQL Status
        expr: mysql_up == 0
        for: 10s
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} Mysql服务 !!!"
          description: "{{ $labels.instance }} Mysql服务不可用 请检查!"

      # Too many connections
      - alert: Mysql_Too_Many_Connections
        expr: rate(mysql_global_status_threads_connected[5m]) > 200
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: 连接数过多"
          description: "{{$labels.instance}}: 连接数过多,请处理 ,(current value is: {{ $value }})!"

      # Too many slow queries
      - alert: Mysql_Too_Many_slow_queries
        expr: rate(mysql_global_status_slow_queries[5m]) > 3
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: 慢查询有点多,请检查处理!"
          description: "{{$labels.instance}}: Mysql slow_queries is more than 3 per second ,(current value is: {{ $value }})"
2.3、alertmanager:
2.3.1、config.yml
# alertmanager/config.yml — e-mail notification routing.
# NOTE(review): indentation reconstructed — the pasted source had lost all
# leading whitespace, which is invalid YAML.
global:
  # 163 mail server (QQ-mail variant kept commented for reference)
  #smtp_smarthost: 'smtp.qq.com:465'
  smtp_smarthost: 'smtp.163.com:25'
  # Sender address
  #smtp_from: '[email protected]'
  smtp_from: '[email protected]'
  # SMTP login — this is the full mailbox address.
  # fixed: placeholder was 'xxxxxx163.com', missing the '@' (contradicts the
  # example above and the "username is your mailbox" note).
  #smtp_auth_username: '[email protected]'
  smtp_auth_username: 'xxxxxx@163.com'
  # Mailbox password — for 163/QQ this is the third-party authorization code,
  # not the login password.
  #smtp_auth_password: 'your-password'
  smtp_auth_password: 'xxxxxx'
  # TLS verification (port 25 is plain SMTP, so TLS is disabled here)
  #smtp_require_tls: true
  smtp_require_tls: false

route:
  group_by: ['alertname','instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'default'
  routes:
    # External micro-mall hosts go to a dedicated receiver.
    - match_re:
        instance: "xxx.xxx.xxx.xxx:xxxxx|xxx.xxx.xxx.xxx:xxxxx"
      receiver: '哈哈哈'
    # Everything else.
    - receiver: 'other'

# NOTE(review): the `to:` placeholders below lack an '@' — they must be full
# e-mail addresses before deployment.
receivers:
  - name: '哈哈哈'
    email_configs:
      - to: 'xxxxx.com'
  - name: 'other'
    email_configs:
      - to: 'xxxx.com'
  - name: 'default'
    email_configs:
      - to: 'xxxx.com'

# Earlier inhibition attempt — reported as never taking effect; kept for history.
# inhibit_rules:
#   - source_match:
#       alertname: 高负载警告
#       severity: critical
#     target_match:
#       severity: normal
#     equal:
#       - instance

# Inhibition rule.
# NOTE(review): source and target match the same alertname/severity and `equal`
# covers the grouping labels, so this is effectively self-inhibition —
# Alertmanager does not let an alert inhibit itself, and duplicate suppression
# is already handled by grouping/repeat_interval. Confirm this rule is needed.
inhibit_rules:
  - source_match:
      alertname: 高负载警告
      severity: '警告'
    target_match:
      severity: '警告'
    equal: ['alertname', 'instance']
2.4、grafana:
2.4.1、config.monitoring:
# grafana/config.monitoring — env_file consumed by the grafana service.
# Password for the built-in `admin` user.
GF_SECURITY_ADMIN_PASSWORD=qwe123!@#
# Disable self-service account sign-up.
GF_USERS_ALLOW_SIGN_UP=false
# SMTP settings so Grafana can send its own alert e-mails.
GF_SMTP_ENABLED=true
GF_SMTP_HOST=smtp.qq.com:465
[email protected]
# NOTE(review): for QQ mail this is the third-party authorization code, not
# the mailbox login password — confirm before deploying.
GF_SMTP_PASSWORD=your-password
[email protected]
3、node端文件:
3.1、node-docker-compose.yml:
# node-docker-compose.yml — monitors the node host plus the containers on it
# (node_exporter + cAdvisor), scraped by the server-side Prometheus.
# NOTE(review): indentation reconstructed — the pasted source had lost all
# leading whitespace, which is invalid YAML for Compose.
version: '3.2'

services:
  nodeexporter:
    image: 192.168.188.251:8081/node_exporter/node_exporter:v1.8.2  # private Harbor registry
    container_name: nodeexporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    environment:
      TZ: Asia/Shanghai
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # $$ escapes $ for Compose variable interpolation
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
    labels:
      org.label-schema.group: "monitoring"
    restart: always
    # Host networking: node_exporter listens on host port 9100 directly,
    # so no ports/expose mapping is needed (or honoured) here.
    network_mode: host

  cadvisor:
    image: 192.168.188.251:8081/cadvisor/cadvisor:v0.33.0
    container_name: cadvisor
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
    ports:
      - "8080:8080"   # quoted: port mappings should be strings in YAML
    environment:
      TZ: Asia/Shanghai
    labels:
      org.label-schema.group: "monitoring"
    restart: always