1.监测网站状态
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
# Monitor website status: HTTP probe routed through the prober at 127.0.0.1:9115.
- job_name: "web_status"
  metrics_path: /probe
  params:
    module:
      - http_2xx
  static_configs:
    - targets:
        - "http://www.yoyoask.com"
      labels:
        # Static labels applied to every target listed in this static_config.
        instance: "web_status"
        group: "web"
  relabel_configs:
    # Pass the configured target URL to the prober as the ?target= parameter.
    - source_labels:
        - __address__
      target_label: __param_target
    # Send the actual scrape request to the prober instead of the target.
    - target_label: __address__
      replacement: "127.0.0.1:9115"
对应监控规则
1 2 3 4 5 6 7 8 9 10 11 |
groups:
  - name: 网站状态告警
    rules:
      # Fires immediately (for: 0m) when the probed HTTP status code is
      # outside the 200-399 range.
      - alert: 网站状态异常
        expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'HTTP失败 (instance {{ $labels.instance }})'
          # Double quotes are required here so that \n is a real newline.
          description: "HTTP状态不是 200-399\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
1 2 3 4 5 6 7 8 9 10 11 |
groups:
  - name: HTTP请求缓慢探测
    rules:
      # Fires when the 1-minute average of probe_http_duration_seconds stays
      # above 1 second for 1 minute.
      - alert: HTTP请求缓慢探测
        expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "黑盒探测缓慢的HTTP (instance {{ $labels.instance }})"
          # Must be double-quoted: an unquoted value containing "LABELS: " is
          # a YAML syntax error, and \n is only a newline inside double quotes.
          description: "HTTP请求花费了超过1s \n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
1 2 3 4 5 6 7 8 9 10 11 12 |
groups:
  - name: httpd url check
    rules:
      # Fires when the probe of the web_status job has reported failure
      # (probe_success == 0) for 5 seconds.
      - alert: http_url_check failed
        for: 5s
        expr: probe_success{job="web_status"} == 0
        labels:
          # Label key is "severity", matching the other rule groups.
          severity: critical
        annotations:
          # NOTE(review): the web_status job shown in this document only sets
          # the `instance` and `group` labels; `app` renders empty unless it is
          # attached elsewhere - confirm.
          summary: "{{ $labels.group }}组的应用 {{ $labels.app }} url接口检测不通"
          description: "{{ $labels.group }}的{{ $labels.app }} url检测失败,当前probe_success的值为{{ $value }}"
2.监测主机 IP 存活状态
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
# Monitor host liveness by IP: ICMP probe routed through the prober at 127.0.0.1:9115.
- job_name: "node_status"
  metrics_path: /probe
  params:
    module:
      - icmp  # ping
  static_configs:
    - targets:
        - "192.168.66.177"
      labels:
        # Static labels applied to every target listed in this static_config.
        instance: "node_status"
        group: "node"
  relabel_configs:
    # Pass the configured host to the prober as the ?target= parameter.
    - source_labels:
        - __address__
      target_label: __param_target
    # Send the actual scrape request to the prober instead of the host.
    - target_label: __address__
      replacement: "127.0.0.1:9115"
icmp[ping]对应监控规则
1 2 3 4 5 6 7 8 9 10 11 |
groups:
  - name: ping 检测失败
    rules:
      # Fires when the probe of the node_status job has reported failure
      # (probe_success == 0) for 2 minutes.
      - alert: ping检测失败
        expr: probe_success{job="node_status"} == 0
        for: 2m
        labels:
          # Label key is "severity", matching the other rule groups.
          severity: critical
        annotations:
          # NOTE(review): the node_status job shown in this document only sets
          # the `instance` and `group` labels; `hostname` renders empty unless
          # it is attached elsewhere - confirm.
          summary: "{{ $labels.group }}组的服务器 {{ $labels.hostname }} 服务器检测不通"
          description: "{{ $labels.group }}的{{ $labels.hostname }} icmp检测失败,当前probe_success的值为{{ $value }}"
3.监控主机端口存活状态
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
# Monitor host port liveness: TCP connect probe routed through the prober at 127.0.0.1:9115.
- job_name: "port_status"
  metrics_path: /probe
  params:
    module:
      - tcp_connect
  static_configs:
    - targets:
        - "192.168.66.177:80"
      labels:
        # Static labels applied to every target listed in this static_config.
        instance: "port_status"
        group: "tcp"
  relabel_configs:
    # Pass the configured host:port to the prober as the ?target= parameter.
    - source_labels:
        - __address__
      target_label: __param_target
    # Send the actual scrape request to the prober instead of the host.
    - target_label: __address__
      replacement: "127.0.0.1:9115"
对应监控规则
1 2 3 4 5 6 7 8 9 10 11 12 |
groups:
  - name: TCP端口检测
    rules:
      # Fires when the probe of the port_status job has reported failure
      # (probe_success == 0) for 5 seconds.
      - alert: TCP端口检测失败
        expr: probe_success{job="port_status"} == 0
        for: 5s
        labels:
          # Label key is "severity", matching the other rule groups.
          severity: critical
        annotations:
          # NOTE(review): the port_status job shown in this document only sets
          # the `instance` and `group` labels; `app` renders empty unless it is
          # attached elsewhere - confirm.
          summary: "{{ $labels.group }}组的应用 {{ $labels.app }} 端口检测不通"
          description: "{{ $labels.group }}的{{ $labels.app }} tcp检测失败,当前probe_success的值为{{ $value }}"
Linux 基本资源指标监控,比如 CPU、内存、网卡、磁盘等,也可以通过 PromQL 自己设定其他规则
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# Basic Linux resource alerts: load, target availability, CPU, memory,
# eth0 traffic and disk usage.
# Every rule uses the label key "severity", matching the HTTP rule groups.
# NOTE(review): the annotations reference an `app` label that none of the
# scrape jobs shown in this document define - confirm it is attached elsewhere.
groups:
  - name: linux_alert
    rules:
      # 5-minute load average above 5.
      - alert: "linux load5 over 5"
        for: 5s
        expr: node_load5 > 5
        labels:
          severity: critical
        annotations:
          description: "{{ $labels.app }} over 5,当前值:{{ $value }}"
          summary: "linux load5 over 5"

      # Any scrape target whose `up` series is 0.
      - alert: "node explorter have down"
        for: 5s
        expr: up==0
        labels:
          severity: critical
        annotations:
          description: "{{ $labels.app }} -- {{ $labels.instance }} ,当前值:{{ $value }}"
          summary: "node explorter value equle 0"

      # CPU usage percentage (100 minus the idle share) per instance, above 80.
      - alert: "cpu used percent over 80% per 1 min"
        for: 5s
        expr: 100 * (1 - avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m]))) * on(instance) group_left(hostname) node_uname_info > 80
        labels:
          severity: critical
        annotations:
          description: "{{ $labels.app }} -- {{ $labels.instance }} ,025值:{{ $value }}"
          summary: "cpu used percent over 80% per 1 min"

      # Memory used (total - free - buffers - cached) as a percentage, above 85.
      - alert: "memory used percent over 85%"
        for: 5m
        expr: ((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / (node_memory_MemTotal_bytes{instance!~"172..*"})) * 100 > 85
        labels:
          severity: critical
        annotations:
          description: "{{ $labels.app }} -- {{ $labels.instance }} ,当前值:{{ $value }}"
          summary: "memory used percent over 85%"

      # eth0 receive rate; bytes/s divided by 128 and 1024 gives Mbit/s.
      - alert: "eth0 input traffic network over 10M"
        for: 3m
        expr: sum by(instance) (irate(node_network_receive_bytes_total{device="eth0",instance!~"172.1.*|172..*"}[1m]) / 128/1024) * on(instance) group_left(hostname) node_uname_info > 10
        labels:
          severity: critical
        annotations:
          description: "{{ $labels.app }} -- {{ $labels.instance }} ,当前值:{{ $value }}"
          summary: "eth0 input traffic network over 10M"

      # eth0 transmit rate; same unit conversion as the receive rule.
      # NOTE(review): the instance exclusion regex differs from the receive
      # rule ("175.*" vs "172..*") - confirm this is intentional.
      - alert: "eth0 output traffic network over 10M"
        for: 3m
        expr: sum by(instance) (irate(node_network_transmit_bytes_total{device="eth0",instance!~"172.1.*|175.*"}[1m]) / 128/1024) * on(instance) group_left(hostname) node_uname_info > 10
        labels:
          severity: critical
        annotations:
          description: "{{ $labels.app }} -- {{ $labels.instance }} ,当前值:{{ $value }}"
          summary: "eth0 output traffic network over 10M"

      # Used share of each /dev/* filesystem above 80 percent.
      - alert: "disk usage over 80%"
        for: 10m
        expr: (node_filesystem_size_bytes{device=~"/dev/.+"} - node_filesystem_free_bytes{device=~"/dev/.+"} )/ node_filesystem_size_bytes{device=~"/dev/.+"} * 100 > 80
        labels:
          severity: critical
        annotations:
          description: "{{ $labels.mountpoint }} 分区 over 80%,当前值:{{ $value }}"
          summary: "disk usage over 80%"
SSL证书过期时间监测
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# Write the Prometheus configuration: one probe job plus the rule file below.
# The quoted 'EOF' delimiter keeps the heredoc body literal (no shell expansion).
cat > prometheus.yml << 'EOF'
rule_files:
  - ssl_expiry.rules

scrape_configs:
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    static_configs:
      - targets:
        - example.com  # Target to probe
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115  # Blackbox exporter.
EOF

# Write the alerting rule: fires when the earliest certificate expiry is less
# than 30 days (86400 * 30 seconds) away, sustained for 10 minutes.
cat > ssl_expiry.rules << 'EOF'
groups:
  - name: ssl_expiry.rules
    rules:
      - alert: SSLCertExpiringSoon
        expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 30
        for: 10m
EOF
1 2 3 4 5 |
参考过以下文章:
http://www.21yunwei.com/archives/7327
https://blog.csdn.net/qq_25934401/article/details/84325356
https://www.cnblogs.com/cp-miao/p/9071939.html
- 本文固定链接: https://www.yoyoask.com/?p=4571
- 转载请注明: shooter 于 SHOOTER 发表