From 3792e4053cba8156a517382c33f4a1e7e63ff7e7 Mon Sep 17 00:00:00 2001
From: consultoria-as
Date: Wed, 29 Apr 2026 07:10:22 +0000
Subject: [PATCH] feat(monitoring): add Alertmanager with alert rules

- docker-compose.monitoring.yml: added alertmanager service (port 9093)
- prometheus.yml: alerting config + rule_files entry
- alerts.yml: 5 alert rules (PostgreSQLDown, RedisDown, HighDiskUsage, HighMemoryUsage, NodeDown)
- alertmanager.yml: SMTP + webhook receiver, inhibit rules
---
 docker/alertmanager/alertmanager.yml | 47 ++++++++++++++++++++++++++++
 docker/docker-compose.monitoring.yml | 14 +++++++++
 docker/prometheus/alerts.yml         | 47 ++++++++++++++++++++++++++++
 docker/prometheus/prometheus.yml     | 10 ++++++
 4 files changed, 118 insertions(+)
 create mode 100644 docker/alertmanager/alertmanager.yml
 create mode 100644 docker/prometheus/alerts.yml

diff --git a/docker/alertmanager/alertmanager.yml b/docker/alertmanager/alertmanager.yml
new file mode 100644
index 0000000..2128e6e
--- /dev/null
+++ b/docker/alertmanager/alertmanager.yml
@@ -0,0 +1,47 @@
+# Alertmanager configuration: all alerts are routed to the single 'default'
+# receiver, which sends an email and posts to a webhook.
+# NOTE(review): Alertmanager runs in its own container (see
+# docker-compose.monitoring.yml), so the 'localhost' addresses below point
+# at that container itself -- confirm both endpoints are reachable.
+global:
+  smtp_smarthost: 'localhost:587'
+  smtp_from: 'alerts@nexus.local'
+  smtp_require_tls: false
+
+route:
+  group_by: ['alertname', 'severity']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'default'
+
+receivers:
+  - name: 'default'
+    email_configs:
+      - to: 'admin@nexus.local'
+        # email_configs has no `subject` or `body` keys, and unknown keys
+        # make Alertmanager reject the file. The subject is set through
+        # `headers` and the plain-text body through `text`.
+        headers:
+          Subject: 'Nexus Alert: {{ .GroupLabels.alertname }}'
+        # Drop the default HTML part so the text body below is what
+        # mail clients display.
+        html: ''
+        text: |
+          {{ range .Alerts }}
+          Alert: {{ .Annotations.summary }}
+          Description: {{ .Annotations.description }}
+          Severity: {{ .Labels.severity }}
+          Time: {{ .StartsAt }}
+          {{ end }}
+    webhook_configs:
+      - url: 'http://localhost:5001/pos/api/notifications/webhook'
+        send_resolved: true
+
+# A firing critical alert mutes warnings that share its alertname.
+inhibit_rules:
+  - source_matchers:
+      - severity="critical"
+    target_matchers:
+      - severity="warning"
+    equal: ['alertname']
diff --git a/docker/docker-compose.monitoring.yml b/docker/docker-compose.monitoring.yml
index 8c0b07a..ad9d412 100644
--- a/docker/docker-compose.monitoring.yml
+++ b/docker/docker-compose.monitoring.yml
@@ -55,6 +55,20 @@ services:
     ports:
       - "9121:9121"
 
+  alertmanager:
+    image: prom/alertmanager:v0.27.0
+    container_name: nexus-alertmanager
+    restart: unless-stopped
+    ports:
+      - "9093:9093"
+    volumes:
+      - ./alertmanager:/etc/alertmanager
+      - alertmanager-data:/alertmanager
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+
 volumes:
   prometheus-data:
   grafana-data:
+  alertmanager-data:
diff --git a/docker/prometheus/alerts.yml b/docker/prometheus/alerts.yml
new file mode 100644
index 0000000..874e5d1
--- /dev/null
+++ b/docker/prometheus/alerts.yml
@@ -0,0 +1,47 @@
+groups:
+  - name: nexus-alerts
+    rules:
+      - alert: PostgreSQLDown
+        expr: pg_up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "PostgreSQL is down"
+          description: "PostgreSQL has been down for more than 1 minute."
+
+      - alert: RedisDown
+        expr: redis_up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Redis is down"
+          description: "Redis has been down for more than 1 minute."
+
+      - alert: HighDiskUsage
+        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk usage is high"
+          description: "Disk usage is above 90% on {{ $labels.device }}."
+
+      - alert: HighMemoryUsage
+        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Memory usage is high"
+          description: "Memory usage is above 85%."
+
+      - alert: NodeDown
+        expr: up{job="node"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Node exporter is down"
+          description: "Node exporter has been down for more than 2 minutes."
diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml
index 2dbacc6..e3503eb 100644
--- a/docker/prometheus/prometheus.yml
+++ b/docker/prometheus/prometheus.yml
@@ -2,6 +2,16 @@ global:
   scrape_interval: 15s
   evaluation_interval: 15s
 
+# Where Prometheus sends the alerts it fires
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ["alertmanager:9093"]
+
+# Alerting rule files, evaluated every evaluation_interval
+rule_files:
+  - "alerts.yml"
+
 scrape_configs:
   - job_name: 'prometheus'
     static_configs: