- config.py: add NEXUS_SERVER_HOST env var for cross-VM deployment - health_service.py: graceful Redis failure when only localhost-bound - systemd service: document remote VM configuration - README: add dedicated 'VM separada' installation section - .env.example: new file with remote connection examples
176 lines
6.1 KiB
Python
176 lines
6.1 KiB
Python
"""Health monitoring service for Nexus infrastructure."""
|
|
import subprocess
|
|
import shutil
|
|
import socket
|
|
import urllib.request
|
|
import urllib.error
|
|
import psycopg2
|
|
import redis
|
|
from config import (
|
|
MASTER_DB_URL, REDIS_URL, POS_URL, DASHBOARD_URL, QUART_URL,
|
|
TENANT_DB_URL_TEMPLATE, NEXUS_SERVER_HOST
|
|
)
|
|
|
|
|
|
def check_postgresql():
    """Check PostgreSQL connectivity against the master database.

    Connects with ``MASTER_DB_URL`` (5-second connect timeout) and asks the
    server for its version string and the on-disk size of the
    ``nexus_autoparts`` database.

    Returns:
        dict: ``{"status": "ok", "version": ..., "master_size_mb": ...}`` on
        success, or ``{"status": "error", "error": <message>}`` on any
        failure. The function never raises.
    """
    try:
        conn = psycopg2.connect(MASTER_DB_URL, connect_timeout=5)
        try:
            cur = conn.cursor()
            try:
                cur.execute("SELECT version(), pg_database_size('nexus_autoparts')")
                version, size = cur.fetchone()
            finally:
                cur.close()
        finally:
            # Close the connection on the failure path too, instead of
            # leaving an open session behind until garbage collection.
            conn.close()

        # version() looks like "PostgreSQL 14.5 on x86_64-..."; the second
        # token is the version number. Fall back to "unknown" rather than
        # reporting a reachable server as an error over an odd string.
        tokens = version.split() if version else []
        return {
            "status": "ok",
            "version": tokens[1] if len(tokens) > 1 else "unknown",
            "master_size_mb": round(size / (1024 * 1024), 2),
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
|
|
|
|
|
def check_redis():
    """Check Redis connectivity.

    Redis may be unreachable when the manager runs on a separate VM and Redis
    only binds to localhost; that situation is reported as a ``"warning"``
    with a configuration hint instead of a hard error.

    Returns:
        dict: ``{"status": "ok", "version": ..., "used_memory_human": ...,
        "connected_clients": ...}`` on success;
        ``{"status": "warning", "error": <hint>}`` when the server cannot be
        reached or does not answer in time;
        ``{"status": "error", "error": <message>}`` for anything else.
        The function never raises.
    """
    try:
        # socket_timeout bounds the INFO round-trip as well as the connect,
        # so a server that accepts the connection but never answers cannot
        # hang the health check.
        r = redis.from_url(REDIS_URL, socket_connect_timeout=3, socket_timeout=3)
        try:
            info = r.info()
        finally:
            # Release the sockets held by this one-shot client's pool.
            r.connection_pool.disconnect()
        return {
            "status": "ok",
            "version": info.get("redis_version", "unknown"),
            "used_memory_human": info.get("used_memory_human", "?"),
            "connected_clients": info.get("connected_clients", 0),
        }
    except (redis.ConnectionError, redis.TimeoutError):
        # redis-py reports a timed-out connect as TimeoutError, which is not
        # a ConnectionError subclass; both mean "cannot reach Redis" here.
        return {
            "status": "warning",
            "error": "Redis unreachable. If manager runs on a separate VM, ensure Redis binds to 0.0.0.0 or a VPN interface, or that a tunnel is active."
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
|
|
|
|
|
def check_http_service(name, url, timeout=5):
    """Generic HTTP health check.

    Issues a GET request to ``url`` with a ``Nexus-Manager/1.0`` User-Agent
    and reports the outcome.

    Args:
        name: Label of the service being probed. Currently unused by the
            check itself; kept for the callers' signature.
        url: URL to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        dict: ``{"status": "ok", "http_status": ..., "latency_ms": ...}`` when
        the request succeeds, where ``latency_ms`` is the time until the
        response was opened, in milliseconds;
        ``{"status": "warning", "http_status": ..., "error": ...}`` when the
        server answers with an HTTP error status;
        ``{"status": "error", "error": ...}`` for any other failure.
        The function never raises.
    """
    # Function-scope import so this block does not depend on the module's
    # top-level import list.
    import time

    try:
        req = urllib.request.Request(url, method="GET")
        req.add_header("User-Agent", "Nexus-Manager/1.0")
        started = time.perf_counter()
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            elapsed_ms = round((time.perf_counter() - started) * 1000, 1)
            return {
                "status": "ok",
                "http_status": resp.status,
                "latency_ms": elapsed_ms,
            }
    except urllib.error.HTTPError as e:
        return {"status": "warning", "http_status": e.code, "error": str(e)}
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
|
|
|
|
|
def check_disk_space(path="/"):
    """Check disk usage of the filesystem containing ``path``.

    Args:
        path: Any path on the filesystem to inspect. Defaults to ``"/"``.

    Returns:
        dict: ``{"status": "ok", "total_gb": ..., "used_gb": ...,
        "free_gb": ..., "percent_used": ...}`` on success, or
        ``{"status": "error", "error": <message>}`` when the usage cannot be
        read (for example, the path does not exist). The function never
        raises.
    """
    try:
        total, used, free = shutil.disk_usage(path)
        bytes_per_gb = 1024 ** 3
        return {
            "status": "ok",
            "total_gb": round(total / bytes_per_gb, 2),
            "used_gb": round(used / bytes_per_gb, 2),
            "free_gb": round(free / bytes_per_gb, 2),
            # Some filesystems report a total of 0 bytes; treat that as 0%
            # used rather than failing on a division by zero.
            "percent_used": round((used / total) * 100, 1) if total else 0,
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
|
|
|
|
|
def check_memory():
    """Report system memory usage parsed from ``/proc/meminfo``.

    Every ``Key: value`` line is read, keeping the first numeric token of
    each value (the file reports sizes in kB). Available memory is taken from
    ``MemAvailable``, falling back to ``MemFree`` when that key is absent.

    Returns:
        dict: ``{"status": "ok", "total_gb": ..., "used_gb": ...,
        "available_gb": ..., "percent_used": ...}`` on success, or
        ``{"status": "error", "error": <message>}`` if the file cannot be
        read or parsed. The function never raises.
    """
    try:
        with open("/proc/meminfo") as handle:
            raw_lines = handle.read().splitlines()

        # Map each field name to its leading integer (kB for size fields).
        fields_kb = {
            label.strip(): int(rest.strip().split()[0])
            for label, sep, rest in (entry.partition(":") for entry in raw_lines)
            if sep
        }

        total_gb = fields_kb.get("MemTotal", 0) / 1024 / 1024  # kB -> GB
        free_fallback = fields_kb.get("MemFree", 0)
        available_gb = fields_kb.get("MemAvailable", free_fallback) / 1024 / 1024
        used_gb = total_gb - available_gb

        if total_gb:
            percent = round((used_gb / total_gb) * 100, 1)
        else:
            percent = 0

        return {
            "status": "ok",
            "total_gb": round(total_gb, 2),
            "used_gb": round(used_gb, 2),
            "available_gb": round(available_gb, 2),
            "percent_used": percent,
        }
    except Exception as exc:
        return {"status": "error", "error": str(exc)}
|
|
|
|
|
|
def check_systemd_service(service_name):
    """Report whether a systemd unit is active on this host.

    Runs ``systemctl is-active <service_name>`` with a 5-second timeout and
    inspects its standard output.

    Args:
        service_name: Unit name passed to ``systemctl``.

    Returns:
        dict: ``{"status": "ok" | "warning", "active": bool, "state": str}``
        where ``state`` is the stripped command output and ``status`` is
        ``"ok"`` only when that output is exactly ``"active"``; or
        ``{"status": "error", "error": <message>}`` if the command could not
        be run. The function never raises.
    """
    command = ["systemctl", "is-active", service_name]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception as exc:
        return {"status": "error", "error": str(exc)}

    reported_state = completed.stdout.strip()
    is_active = reported_state == "active"
    if is_active:
        verdict = "ok"
    else:
        verdict = "warning"
    return {
        "status": verdict,
        "active": is_active,
        "state": reported_state,
    }
|
|
|
|
|
|
def get_full_health_report():
    """Build the combined health report for every monitored component.

    Runs, in order, the PostgreSQL, Redis, HTTP (pos, dashboard, quart),
    disk, memory and systemd checks, and collects their result dicts under
    fixed keys together with a ``_meta`` entry describing the target host.

    Returns:
        dict: One entry per check, plus ``"_meta"`` and a nested
        ``"services"`` dict keyed by unit short name.
    """
    report = {
        "_meta": {
            "nexus_server_host": NEXUS_SERVER_HOST,
            "note": "disk/memory are local to this manager VM. PostgreSQL/HTTP checks target the remote Nexus server."
        },
    }

    report["postgresql"] = check_postgresql()
    report["redis"] = check_redis()

    # HTTP endpoints, probed in the same order they appear in the report.
    http_targets = (
        ("pos", POS_URL),
        ("dashboard", DASHBOARD_URL),
        ("quart", QUART_URL),
    )
    for label, target_url in http_targets:
        report[label] = check_http_service(label, target_url)

    report["disk"] = check_disk_space()
    report["memory"] = check_memory()

    # Unit checks are run through systemctl on the host executing this code.
    unit_names = ("nexus", "nexus-pos", "nexus-quart", "nexus-celery")
    report["services"] = {
        unit: check_systemd_service(f"{unit}.service") for unit in unit_names
    }
    return report
|
|
|
|
|
|
def get_tenant_health(db_name, timeout=5):
    """Check connectivity to a specific tenant database and gather basic stats.

    Args:
        db_name: Tenant database name, substituted verbatim into
            ``TENANT_DB_URL_TEMPLATE``.
            NOTE(review): confirm callers only pass trusted names, since the
            value becomes part of the connection string.
        timeout: Connect timeout in seconds.

    Returns:
        dict: ``{"status": "ok", "employees": ..., "inventory": ...,
        "customers": ..., "sales_30d": ..., "db_size_mb": ...}`` on success,
        or ``{"status": "error", "error": <message>}`` if connecting or
        querying fails.
    """
    dsn = TENANT_DB_URL_TEMPLATE.format(db_name=db_name)
    try:
        conn = psycopg2.connect(dsn, connect_timeout=timeout)
        try:
            cur = conn.cursor()
            try:
                cur.execute("""
                    SELECT
                        (SELECT COUNT(*) FROM employees WHERE is_active = true) as employees,
                        (SELECT COUNT(*) FROM inventory WHERE is_active = true) as inventory,
                        (SELECT COUNT(*) FROM customers WHERE is_active = true) as customers,
                        (SELECT COUNT(*) FROM sales WHERE created_at > NOW() - INTERVAL '30 days') as sales_30d,
                        pg_database_size(current_database()) as db_size
                """)
                row = cur.fetchone()
            finally:
                cur.close()
        finally:
            # Close the connection even when the query fails (for example a
            # missing table), instead of leaving the session open until
            # garbage collection.
            conn.close()

        employees, inventory, customers, sales_30d, db_size = row
        return {
            "status": "ok",
            "employees": employees,
            "inventory": inventory,
            "customers": customers,
            "sales_30d": sales_30d,
            "db_size_mb": round(db_size / (1024 * 1024), 2),
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
|