feat(monitoring): add Alertmanager with alert rules
- docker-compose.monitoring.yml: added alertmanager service (port 9093) - prometheus.yml: alerting config + rule_files entry - alerts.yml: 5 alert rules (PostgreSQLDown, RedisDown, HighDiskUsage, HighMemoryUsage, NodeDown) - alertmanager.yml: SMTP + webhook receiver, inhibit rules
This commit is contained in:
34
docker/alertmanager/alertmanager.yml
Normal file
34
docker/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# Alertmanager configuration: every alert is routed to the single 'default'
# receiver, which sends an e-mail and POSTs to a notification webhook.
global:
  # NOTE(review): if Alertmanager runs in a container, 'localhost' is the
  # container itself -- confirm an SMTP relay is reachable at this address.
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@nexus.local'
  smtp_require_tls: false

route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'

receivers:
  - name: 'default'
    email_configs:
      - to: 'admin@nexus.local'
        # email_config has no `subject`/`body` keys; the subject is set via
        # `headers` and the plain-text body via `text`. Unknown keys make
        # Alertmanager reject the whole configuration file.
        headers:
          Subject: 'Nexus Alert: {{ .GroupLabels.alertname }}'
        # The default HTML part is still sent alongside this text part
        # unless `html` is overridden as well.
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Time: {{ .StartsAt }}
          {{ end }}
    webhook_configs:
      # NOTE(review): same 'localhost' caveat as the SMTP host above --
      # verify this URL resolves to the notification service from wherever
      # Alertmanager runs.
      - url: 'http://localhost:5001/pos/api/notifications/webhook'
        send_resolved: true

inhibit_rules:
  # Mute 'warning' alerts while a 'critical' alert with the same alertname
  # is firing. Uses `*_matchers`; `source_match`/`target_match` are
  # deprecated.
  # NOTE(review): this only has an effect if one alertname is emitted with
  # both severities -- confirm that `equal` lists the intended labels.
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname']
|
||||||
@@ -55,6 +55,20 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "9121:9121"
|
- "9121:9121"
|
||||||
|
|
||||||
|
alertmanager:
  # Alertmanager service; its port 9093 is published on the host as 9093.
  image: "prom/alertmanager:v0.27.0"
  container_name: "nexus-alertmanager"
  restart: "unless-stopped"
  command:
    # Read the config from the bind-mounted directory and keep runtime
    # state on the named volume (both mounted below).
    - "--config.file=/etc/alertmanager/alertmanager.yml"
    - "--storage.path=/alertmanager"
  ports:
    - "9093:9093"
  volumes:
    # Host directory ./alertmanager (relative to this compose file).
    - "./alertmanager:/etc/alertmanager"
    # Named volume `alertmanager-data`.
    - "alertmanager-data:/alertmanager"
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
prometheus-data:
|
prometheus-data:
|
||||||
grafana-data:
|
grafana-data:
|
||||||
|
alertmanager-data:
|
||||||
|
|||||||
47
docker/prometheus/alerts.yml
Normal file
47
docker/prometheus/alerts.yml
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# Prometheus alerting rules for the Nexus stack.
groups:
  - name: 'nexus-alerts'
    rules:
      # Fires when the `pg_up` series has been 0 for one minute.
      - alert: 'PostgreSQLDown'
        expr: 'pg_up == 0'
        for: '1m'
        labels:
          severity: 'critical'
        annotations:
          summary: 'PostgreSQL is down'
          description: 'PostgreSQL has been down for more than 1 minute.'

      # Fires when the `redis_up` series has been 0 for one minute.
      - alert: 'RedisDown'
        expr: 'redis_up == 0'
        for: '1m'
        labels:
          severity: 'critical'
        annotations:
          summary: 'Redis is down'
          description: 'Redis has been down for more than 1 minute.'

      # Less than 10% of a filesystem available for five minutes.
      # NOTE(review): the selector has no fstype/mountpoint filter, so every
      # filesystem series present is evaluated -- confirm that is intended.
      - alert: 'HighDiskUsage'
        expr: '(node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10'
        for: '5m'
        labels:
          severity: 'warning'
        annotations:
          summary: 'Disk usage is high'
          description: 'Disk usage is above 90% on {{ $labels.device }}.'

      # More than 85% of total memory in use (total minus available) for
      # five minutes.
      - alert: 'HighMemoryUsage'
        expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85'
        for: '5m'
        labels:
          severity: 'warning'
        annotations:
          summary: 'Memory usage is high'
          description: 'Memory usage is above 85%.'

      # Scrape target of job "node" unreachable for two minutes.
      # NOTE(review): verify a scrape job is actually named "node".
      - alert: 'NodeDown'
        expr: 'up{job="node"} == 0'
        for: '2m'
        labels:
          severity: 'critical'
        annotations:
          summary: 'Node exporter is down'
          description: 'Node exporter has been down for more than 2 minutes.'
|
||||||
@@ -2,6 +2,16 @@ global:
|
|||||||
scrape_interval: 15s
|
scrape_interval: 15s
|
||||||
evaluation_interval: 15s
|
evaluation_interval: 15s
|
||||||
|
|
||||||
|
# Alertmanager instance(s) that Prometheus sends firing alerts to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# Rule files to load; their rules are evaluated periodically.
# NOTE(review): confirm alerts.yml is available at this relative path
# wherever Prometheus reads this configuration.
rule_files:
  - "alerts.yml"
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: 'prometheus'
|
- job_name: 'prometheus'
|
||||||
static_configs:
|
static_configs:
|
||||||
|
|||||||
Reference in New Issue
Block a user