一、说明
Prometheus负责收集数据,Grafana负责展示数据。其中采用Prometheus 中的 Exporter含:
1)Node Exporter,负责收集 host 硬件和操作系统数据。它将以容器方式运行在所有 host 上。
2)cAdvisor,负责收集容器数据。它将以容器方式运行在所有 host 上。
3)Alertmanager,负责告警。它将以容器方式运行在所有 host 上。
完整Exporter列表请参考:https://prometheus.io/docs/instrumenting/exporters/

二、添加配置文件

1、alertmanager.yaml

global:
  smtp_smarthost: 'smtp.sina.com:25'  #163服务器
  smtp_from: 'dogotsn@sina.com'        #发邮件的邮箱
  smtp_auth_username: 'dogotsn'  #发邮件的邮箱用户名,也就是你的邮箱
  smtp_auth_password: '35ea02c*****'        #发邮件的邮箱密码
  smtp_require_tls: false        #不进行tls验证

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  receiver: live-monitoring

receivers:
- name: 'live-monitoring'
  email_configs:
  - to: 'zhangc***o15@163.com' 

2、node_down.yml

groups:
- name: node_down
  rules:
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      user: test
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."

3、prometheus.yml

# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['193.168.1.39:9093']
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "node_down.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
    - targets: ['193.168.1.39:9090']

  - job_name: 'cadvisor'
    static_configs:
    - targets: ['193.168.1.39:8080']

  - job_name: 'node'
    scrape_interval: 8s
    static_configs:
      - targets: ['193.168.1.39:9100']

三、编写docker-compose

version: '2'

networks:
    mynet:
        driver: bridge

services:
    prometheus:
        image: prom/prometheus
        container_name: prometheus
        hostname: prometheus
        restart: always
        volumes:
            - ./prometheus.yml:/etc/prometheus/prometheus.yml
            - ./node_down.yml:/etc/prometheus/node_down.yml
        ports:
            - "9090:9090"
        networks:
            - mynet

    alertmanager:
        image: prom/alertmanager
        container_name: alertmanager
        hostname: alertmanager
        restart: always
        volumes:
            - ./alertmanager.yaml:/etc/alertmanager/alertmanager.yaml
        ports:
            - "9093:9093"
        networks:
            - mynet

    grafana:
        image: grafana/grafana
        container_name: grafana
        hostname: grafana
        restart: always
        ports:
            - "3000:3000"
        networks:
            - mynet

    node-exporter:
        image: prom/node-exporter
        #image: quay.io/prometheus/node-exporter
        container_name: node-exporter
        hostname: node-exporter
        restart: always
        ports:
            - "9100:9100"
        networks:
            - mynet
    cadvisor:
        image: google/cadvisor:latest
        container_name: cadvisor
        hostname: cadvisor
        restart: always
        volumes:
            - /:/rootfs:ro
            - /var/run:/var/run:rw
            - /sys:/sys:ro
            - /var/lib/docker/:/var/lib/docker:ro
        ports:
            - "8080:8080"
        networks:
            - mynet
                                  
View Code

相关文章:

  • 2021-08-04
  • 2021-08-13
  • 2021-05-21
  • 2022-01-17
  • 2022-03-11
  • 2021-06-04
  • 2022-12-23
猜你喜欢
  • 2021-07-20
  • 2022-03-14
  • 2021-10-26
  • 2021-06-17
  • 2021-08-09
相关资源
相似解决方案