feat(monitoring): add Alertmanager with alert rules

- docker-compose.monitoring.yml: added alertmanager service (port 9093)
- prometheus.yml: alerting config + rule_files entry
- alerts.yml: 5 alert rules (PostgreSQLDown, RedisDown, HighDiskUsage,
  HighMemoryUsage, NodeDown)
- alertmanager.yml: SMTP + webhook receiver, inhibit rules
This commit is contained in:
2026-04-29 07:10:22 +00:00
parent 5a913dcac1
commit 3792e4053c
4 changed files with 105 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
# SMTP defaults inherited by every email receiver in this file.
# NOTE(review): 'localhost' is resolved by the Alertmanager process itself. If
# Alertmanager runs inside a container, that is the container, not the host;
# confirm an SMTP relay is really reachable at this address.
global:
  smtp_smarthost: "localhost:587"
  smtp_from: "alerts@nexus.local"
  # Do not insist on STARTTLS when talking to the smarthost.
  smtp_require_tls: false
# Root route: there are no child routes, so every alert goes to 'default'.
route:
  receiver: "default"
  # Alerts sharing both labels are batched into one notification.
  group_by:
    - alertname
    - severity
  # Initial wait before the first notification for a new group.
  group_wait: 10s
  # Wait before notifying about alerts added to an already-notified group.
  group_interval: 10s
  # Wait before re-sending a notification for alerts that are still firing.
  repeat_interval: 1h
# Notification targets. 'default' fans out to one mailbox and one webhook.
receivers:
  - name: 'default'
    email_configs:
      - to: 'admin@nexus.local'
        # email_config has no `subject` key (unknown keys make Alertmanager
        # reject the whole file); the subject is set through `headers`.
        headers:
          Subject: 'Nexus Alert: {{ .GroupLabels.alertname }}'
        # Likewise there is no `body` key; `text` is the plain-text body.
        # The default HTML body is still attached unless `html` is overridden.
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Time: {{ .StartsAt }}
          {{ end }}
    webhook_configs:
      # NOTE(review): 'localhost' is resolved from inside the Alertmanager
      # process/container; confirm the webhook endpoint is reachable there.
      - url: 'http://localhost:5001/pos/api/notifications/webhook'
        # Also POST when the alert group resolves.
        send_resolved: true
# While a critical alert is firing, mute warning alerts that carry the same
# alertname. Uses `*_matchers`; `source_match`/`target_match` are deprecated.
# NOTE(review): this only has an effect if some rule emits the same alertname
# at both severities — confirm, or widen `equal` (e.g. to 'instance').
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal:
      - alertname

View File

@@ -55,6 +55,20 @@ services:
ports:
- "9121:9121"
alertmanager:
  # Alertmanager service; the image tag is pinned to v0.27.0.
  image: prom/alertmanager:v0.27.0
  container_name: nexus-alertmanager
  restart: unless-stopped
  ports:
    # Quoted so the mapping is always read as a string.
    - "9093:9093"
  volumes:
    # Configuration is only read via --config.file, so mount it read-only.
    - ./alertmanager:/etc/alertmanager:ro
    # Named volume backing --storage.path.
    - alertmanager-data:/alertmanager
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
# Named volumes, each declared with default settings (empty mapping).
volumes:
  prometheus-data: {}
  grafana-data: {}
  alertmanager-data: {}

View File

@@ -0,0 +1,47 @@
# Alerting rules for the Nexus stack. Each rule carries a `severity` label and
# `summary`/`description` annotations.
groups:
  - name: nexus-alerts
    rules:
      # NOTE(review): `pg_up == 0` only matches while the pg_up series exists;
      # if the exporter itself is unreachable the series may be absent and this
      # rule stays silent — consider also alerting on the exporter's `up`.
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL has been down for more than 1 minute."

      # NOTE(review): same caveat as above for an absent redis_up series.
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis has been down for more than 1 minute."

      # Fires when less than 10% of a filesystem is available. Pseudo and
      # always-full filesystem types are excluded so they cannot trigger it.
      - alert: HighDiskUsage
        expr: >-
          (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|ramfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|ramfs"})
          * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage is high"
          description: "Disk usage is above 90% on {{ $labels.device }} ({{ $labels.mountpoint }})."

      # Used memory = total - available, as a percentage of total.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage is high"
          description: "Memory usage is above 85%."

      # NOTE(review): assumes the node exporter scrape job is named "node" —
      # confirm against scrape_configs.
      - alert: NodeDown
        expr: up{job="node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node exporter is down"
          description: "Node exporter has been down for more than 2 minutes."

View File

@@ -2,6 +2,16 @@ global:
scrape_interval: 15s
evaluation_interval: 15s
# Where Prometheus sends fired alerts.
# NOTE(review): assumes the hostname 'alertmanager' resolves from the
# Prometheus process (e.g. a shared container network) — confirm.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"
# Rule files loaded at startup; their rules are then evaluated periodically.
# NOTE(review): relative path — confirm alerts.yml sits next to this config
# file wherever Prometheus actually runs.
rule_files:
  - "alerts.yml"
scrape_configs:
- job_name: 'prometheus'
static_configs: