1. File structure:

Server side:
docker-compose-prometheus/
├── alertmanager
│   └── config.yml
├── docker-compose.yml
├── grafana
│   ├── config.monitoring
│   └── provisioning
└── prometheus
    ├── alert.yml
    └── prometheus.yml



Node side:
docker-compose-prometheus/
└── node-docker-compose.yml

2. Server-side files:

2.1. docker-compose.yml

vim docker-compose.yml
```
version: '3.3'

volumes:
  prometheus_data: {}
  grafana_data: {}

networks:
  monitoring:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus:v2.51.1
    container_name: prometheus
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - $PWD/prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    networks:
      - monitoring
    links:
      - alertmanager
      - cadvisor
      - node_exporter
    expose:
      - '9090'
    ports:
      - 9090:9090
    depends_on:
      - cadvisor

  alertmanager:
    image: prom/alertmanager:v0.25.1
    container_name: alertmanager
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - $PWD/alertmanager/:/etc/alertmanager/
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring
    expose:
      - '9093'
    ports:
      - 9093:9093

  cadvisor:
    image: google/cadvisor:v0.33.0
    container_name: cadvisor
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - monitoring
    expose:
      - '8080'

  node_exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # mount-points-exclude replaces the deprecated ignored-mount-points flag in node-exporter 1.x
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
    networks:
      - monitoring
    expose:
      - '9100'

  grafana:
    image: grafana/grafana:7.1.5
    user: "104"
    container_name: grafana
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - grafana_data:/var/lib/grafana
      - $PWD/grafana/provisioning/:/etc/grafana/provisioning/
    env_file:
      - $PWD/grafana/config.monitoring
    networks:
      - monitoring
    links:
      - prometheus
    ports:
      - 3000:3000
    depends_on:
      - prometheus
```
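
With these files in place, the whole server-side stack can be brought up from the project root. A minimal sketch, assuming the Docker Compose v2 plugin (substitute docker-compose for docker compose if you still run the standalone v1 binary):

```
cd docker-compose-prometheus

# Start all five services in the background
docker compose up -d

# Every container should report "Up"
docker compose ps

# Smoke-test the exposed endpoints
curl -s http://localhost:9090/-/healthy   # Prometheus
curl -s http://localhost:9093/-/healthy   # Alertmanager
curl -s http://localhost:3000/api/health  # Grafana
```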

2.2. prometheus:

2.2.1. prometheus.yml

```
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
      - targets: ['alertmanager:9093']
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alert.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']

# Micro-mall hosts
  - job_name: '112mysql'
    static_configs:
      - targets: ['xxx.xxx.xxx.xxx:xxxxx']

  - job_name: '115linux'
    static_configs:
      - targets: ['xxx.xxx.xxx.xxx:xxxxx']

# Internal Linux hosts

  - job_name: 'xxxlinux'
    static_configs:
      - targets: ['192.168.188.xxx:9100']

  - job_name: 'xxxcadvisor'
    scrape_interval: 5s
    static_configs:
      - targets: ['192.168.188.xxx:8080']


# Internal Windows hosts
  - job_name: 'xxxwindows'
    static_configs:
      - targets: ['192.168.188.xxx:9182']


# Local self-monitoring (kept commented out)
#  - job_name: 'cadvisor'
    # Override the global default and scrape targets from this job every 5 seconds.
#    scrape_interval: 5s
#    static_configs:
#      - targets: ['cadvisor:8080']

#  - job_name: 'node-exporter'
    # Override the global default and scrape targets from this job every 5 seconds.
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
#    scrape_interval: 5s
#    static_configs:
#    - targets: ['node_exporter:9100']
```
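
Before (re)starting the stack, the configuration can be validated with promtool, which ships inside the prom/prometheus image. A sketch assuming the directory layout from section 1:

```
# Check prometheus.yml plus every rule file it references (here alert.yml)
docker run --rm \
  -v $PWD/prometheus:/etc/prometheus \
  --entrypoint promtool \
  prom/prometheus:v2.51.1 \
  check config /etc/prometheus/prometheus.yml
```

Note that the compose file above does not set --web.enable-lifecycle, so after changing scrape targets run docker compose restart prometheus rather than POSTing to /-/reload.
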
2.2.2. alert.yml

```
groups:
- name: 服务器状态
  rules:

  # Alert for any instance that is unreachable for >2 minutes.
# Server-down alert
  - alert: 服务器已掉线
    expr: up == 0
    for: 2m
    labels:
      severity: 警告
    annotations:
      summary: "服务器 {{ $labels.instance }} 已掉线"
      description: "{{ $labels.instance }} 服务器 {{ $labels.job }}掉线超过2分钟 "

# Server back-online notification (note: up == 1 matches every healthy instance, so this fires continuously and re-notifies every repeat_interval)
  - alert: 服务器已重新上线
    expr: up == 1
    for: 2m
    labels:
      severity: 信息
    annotations:
      summary: "服务器 {{ $labels.instance }} 已重新上线"
      description: "{{ $labels.instance }} 服务器 {{ $labels.job }} 重新上线"


# Windows server CPU usage
  - alert: Windows服务器CPU使用率-90
    expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 90
    for: 2m
    labels:
      severity: 严重
    annotations:
      summary: Windows 服务器 CPU 使用率 (instance {{ $labels.instance }})
      description: "CPU 使用率已超过 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"



# Windows server memory usage
  - alert: Windows服务器内存使用率-90
    expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
    for: 2m
    labels:
      severity: 严重
    annotations:
      summary: Windows 服务器内存使用率为 (instance {{ $labels.instance }})
      description: "内存使用率超过 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

  
  
# Windows server disk usage (90%)
  - alert: Windows服务器磁盘使用率-90
    expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 90
    for: 2m
    labels:
      severity: 严重
    annotations:
      summary: Windows 服务器磁盘使用率为 (instance {{ $labels.instance }})
      description: "磁盘使用率超过 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

# Windows server disk usage (95%)
  - alert: Windows服务器磁盘使用率-95
    expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 95
    for: 2m
    labels:
      severity: 严重
    annotations:
      summary: Windows 服务器磁盘使用率为 (instance {{ $labels.instance }})
      description: "磁盘使用率超过 95%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

- name: Linux 服务器状态
  rules:    
# CPU usage alert (80%)
   - alert: cpu使用率告警-80
     expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
     for: 2m
     labels:
       severity: 警告
     annotations:
       description: "服务器: CPU使用超过80%!(当前值: {{ $value }}%)" 

# CPU usage alert (90%)
   - alert: cpu使用率告警-90
     expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
     for: 2m
     labels:
       severity: 严重
     annotations:
       description: "服务器: CPU使用超过90%!(当前值: {{ $value }}%)"


# Memory usage alert (80%)
   - alert: 内存使用率告警-80
     expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
     for: 2m
     labels:
       severity: 警告
     annotations:
       description: "服务器: 内存使用超过80%!(当前值: {{ $value }}%)"

# Memory usage alert (90%)
   - alert: 内存使用率告警-90
     expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 90
     for: 2m
     labels:
       severity: 严重
     annotations:
       description: "服务器: 内存使用超过90%!(当前值: {{ $value }}%)"


# Disk usage alert (80%)
   - alert: 磁盘告警-80
     expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
     for: 2m
     labels:
       severity: 警告
     annotations:
       description: "服务器: 磁盘设备: 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"


# Disk usage alert (90%)
   - alert: 磁盘告警-90
     expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 90
     for: 2m
     labels:
       severity: 严重
     annotations:
       description: "服务器: 磁盘设备: 使用超过90%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"


# MySQL monitoring rules
- name: MySQL数据库监控规则
  rules:
# MySQL availability check
  - alert: MySQL Status
    expr: mysql_up == 0
    for: 10s
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }} Mysql服务不可用!!!"
      description: "{{ $labels.instance }} Mysql服务不可用  请检查!"
      
      
# MySQL connection-count alert
  - alert: Mysql_Too_Many_Connections
    # threads_connected is a gauge, so compare it directly; rate() would measure its change, not its level
    expr: mysql_global_status_threads_connected > 200
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{$labels.instance}}: 连接数过多"
      description: "{{$labels.instance}}: 连接数过多,请处理 ,(current value is: {{ $value }})!"  
 
# MySQL slow-query alert
  - alert: Mysql_Too_Many_slow_queries
    expr: rate(mysql_global_status_slow_queries[5m]) > 3
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{$labels.instance}}: 慢查询有点多,请检查处理!"
      description: "{{$labels.instance}}: Mysql slow_queries is more than 3 per second (current value: {{ $value }})"
```
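
The rule file can also be checked on its own; promtool reports the number of rules found per group, or the exact parse error:

```
# Validate only the alerting rules
docker run --rm \
  -v $PWD/prometheus:/etc/prometheus \
  --entrypoint promtool \
  prom/prometheus:v2.51.1 \
  check rules /etc/prometheus/alert.yml
```
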
2.3. alertmanager:

2.3.1. config.yml

```
global:
  # SMTP server (163 Mail; the QQ Mail alternative is kept commented out)
  #smtp_smarthost: 'smtp.qq.com:465'
  smtp_smarthost: 'smtp.163.com:25'

  # Sender address
  #smtp_from: 'your-name@qq.com'
  smtp_from: 'xxxxxx@163.com'

  # SMTP username, i.e. your mailbox address
  #smtp_auth_username: 'your-name@qq.com'
  smtp_auth_username: 'xxxxxx@163.com'

  # SMTP password; for 163/QQ Mail this is the third-party authorization code, not the login password
  #smtp_auth_password: 'your-password'
  smtp_auth_password: 'xxxxxx'

  # Whether to require TLS
  #smtp_require_tls: true
  smtp_require_tls: false



route:
  group_by: ['alertname','instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'default'

  routes:
  - match_re:
      instance: "xxx.xxx.xxx.xxx:xxxxx|xxx.xxx.xxx.xxx:xxxxx"
    receiver: '哈哈哈'
  - receiver: 'other'

receivers:
- name: '哈哈哈'
  email_configs:
  - to: 'xxxxx@xxx.com'

- name: 'other'
  email_configs:
  - to: 'xxxx@xxx.com'

- name: 'default'
  email_configs:
  - to: 'xxxx@xxx.com'

# Inhibition rules: this first attempt never took effect, kept commented out for reference
# inhibit_rules:
#  - source_match:
#      alertname: 高负载警告
#      severity: critical
#    target_match:
#      severity: normal
#    equal:
#      - instance

# Inhibition rules
# Reduce duplicate notifications: if the same machine triggers the same warning-level 高负载警告 alert
# several times in a short window, only one notification is sent and later identical alerts are
# suppressed, which helps avoid alert storms. (Note: Alertmanager never lets an alert inhibit itself,
# so deduplication of a single firing alert actually comes from group_interval/repeat_interval above.)
inhibit_rules:
  - source_match:
      alertname: 高负载警告
      severity: '警告'
    target_match:
      severity: '警告'
    equal: ['alertname', 'instance']
```
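
The prom/alertmanager image bundles amtool, which can validate this file and also dry-run the routing tree. A sketch; the label values in the second command are purely illustrative:

```
# Validate config.yml (global, route, receivers, inhibit_rules)
docker run --rm \
  -v $PWD/alertmanager:/etc/alertmanager \
  --entrypoint amtool \
  prom/alertmanager:v0.25.1 \
  check-config /etc/alertmanager/config.yml

# Which receiver would an alert with these labels be routed to?
docker run --rm \
  -v $PWD/alertmanager:/etc/alertmanager \
  --entrypoint amtool \
  prom/alertmanager:v0.25.1 \
  config routes test --config.file=/etc/alertmanager/config.yml \
  alertname=服务器已掉线 instance=192.168.188.1:9100
```
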
2.4. grafana:

2.4.1. config.monitoring:

```
GF_SECURITY_ADMIN_PASSWORD=qwe123!@#
GF_USERS_ALLOW_SIGN_UP=false
GF_SMTP_ENABLED=true
GF_SMTP_HOST=smtp.qq.com:465
GF_SMTP_USER=your-name@qq.com
GF_SMTP_PASSWORD=your-password
GF_SMTP_FROM_ADDRESS=your-name@qq.com
```
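
docker-compose.yml passes this file to the Grafana container via env_file, so every line becomes an environment variable. Once the stack is running, the settings can be spot-checked like this:

```
# Grafana health endpoint; returns database state and version
curl -s http://localhost:3000/api/health

# Confirm the SMTP variables actually reached the container
docker exec grafana env | grep GF_SMTP
```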

3. Node-side files:

3.1. node-docker-compose.yml:

```
# Monitors the node itself plus the containers running on it

version: '3.2'
services:
  nodeexporter:
    image: 192.168.188.251:8081/node_exporter/node_exporter:v1.8.2    # private Harbor registry address
    container_name: nodeexporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    environment:
      TZ: Asia/Shanghai
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # mount-points-exclude replaces the deprecated ignored-mount-points flag
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    labels:
      org.label-schema.group: "monitoring"
    restart: always
    network_mode: host

  cadvisor:
    image: 192.168.188.251:8081/cadvisor/cadvisor:v0.33.0
    container_name: cadvisor
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
    ports:
      - 8080:8080
    environment:
      TZ: Asia/Shanghai
    labels:
      org.label-schema.group: "monitoring"
    restart: always
```
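
Because the file is not named docker-compose.yml, each node starts it with an explicit -f flag. A minimal sketch, again assuming Compose v2:

```
# Start node_exporter and cAdvisor on the node
docker compose -f node-docker-compose.yml up -d

# node_exporter runs with network_mode: host, so it answers on the host's port 9100
curl -s http://localhost:9100/metrics | head

# cAdvisor publishes container metrics on the mapped port 8080
curl -s http://localhost:8080/metrics | head
```

Afterwards, add the node's address to the matching job_name blocks in the server's prometheus.yml (the xxxlinux and xxxcadvisor jobs above) and restart Prometheus.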