feat(monitoring): add Alertmanager with alert rules
- docker-compose.monitoring.yml: added alertmanager service (port 9093)
- prometheus.yml: alerting config + rule_files entry
- alerts.yml: 5 alert rules (PostgreSQLDown, RedisDown, HighDiskUsage, HighMemoryUsage, NodeDown)
- alertmanager.yml: SMTP + webhook receiver, inhibit rules
This commit is contained in:
47
docker/prometheus/alerts.yml
Normal file
47
docker/prometheus/alerts.yml
Normal file
@@ -0,0 +1,47 @@
|
||||
# Prometheus alerting rules for the "nexus-alerts" group.
# Each rule fires once its `expr` has matched continuously for the `for`
# duration; `severity` is attached as a label on the resulting alert.
groups:
  - name: nexus-alerts
    rules:
      # PostgreSQL reported as down: pg_up equal to 0 for a full minute.
      # NOTE(review): pg_up is presumably exported by a PostgreSQL exporter.
      # If that exporter is itself unreachable the series is absent and this
      # rule stays silent - confirm whether that case needs its own alert.
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL has been down for more than 1 minute."

      # Redis reported as down: redis_up equal to 0 for a full minute.
      # NOTE(review): same caveat as above if the series goes missing
      # instead of dropping to 0 - confirm.
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis has been down for more than 1 minute."

      # Less than 10% of a filesystem's bytes are available (i.e. usage
      # above 90%), sustained for 5 minutes.
      # NOTE(review): the selector has no fstype/mountpoint matcher, so every
      # filesystem series is evaluated - presumably including pseudo
      # filesystems such as tmpfs; confirm that is intended.
      - alert: HighDiskUsage
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage is high"
          description: "Disk usage is above 90% on {{ $labels.device }}."

      # Used memory, computed as (MemTotal - MemAvailable) / MemTotal,
      # stays above 85% for 5 minutes.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage is high"
          description: "Memory usage is above 85%."

      # A scrape target of job "node" has had up == 0 for 2 minutes.
      # NOTE(review): assumes the node exporter scrape job is named "node";
      # the scrape config is not visible here - verify.
      - alert: NodeDown
        expr: up{job="node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node exporter is down"
          description: "Node exporter has been down for more than 2 minutes."
|
||||
@@ -2,6 +2,16 @@ global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# Alertmanager configuration: fired alerts are sent to the single,
# statically configured Alertmanager instance at alertmanager:9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
|
||||
|
||||
# Rule files are loaded once and their rules are then evaluated
# periodically (every `evaluation_interval`).
# NOTE(review): 'alerts.yml' is a relative path - presumably resolved
# against this config file's directory; confirm both files are deployed
# side by side.
rule_files:
  - 'alerts.yml'
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
|
||||
Reference in New Issue
Block a user