Files
Autoparts-DB/manager/services/health_service.py
consultoria-as be4bb8d9ad feat(manager): add Nexus Instance Manager for demo orchestration
- Complete Flask-based control panel for multi-tenant POS instances
- Dashboard with global stats, system health, and recent demos
- Demo provisioning in 1 click with auto-expiration tracking
- Tenant management: activate/deactivate, reset data, delete
- Health monitoring: PostgreSQL, Redis, disk, memory, systemd services
- Migration orchestration UI for running schema updates across all tenants
- JWT authentication with manager_users table
- Dark theme SPA frontend with real-time search and actions
- systemd service file included
2026-05-17 21:01:01 +00:00

167 lines
5.6 KiB
Python

"""Health monitoring service for Nexus infrastructure."""
import subprocess
import shutil
import socket
import urllib.request
import urllib.error
import psycopg2
import redis
from config import (
MASTER_DB_URL, REDIS_URL, POS_URL, DASHBOARD_URL, QUART_URL,
TENANT_DB_URL_TEMPLATE
)
def check_postgresql():
"""Check PostgreSQL connectivity."""
try:
conn = psycopg2.connect(MASTER_DB_URL, connect_timeout=5)
cur = conn.cursor()
cur.execute("SELECT version(), pg_database_size('nexus_autoparts')")
version, size = cur.fetchone()
cur.close()
conn.close()
return {
"status": "ok",
"version": version.split()[1] if version else "unknown",
"master_size_mb": round(size / (1024 * 1024), 2)
}
except Exception as e:
return {"status": "error", "error": str(e)}
def check_redis():
"""Check Redis connectivity."""
try:
r = redis.from_url(REDIS_URL, socket_connect_timeout=3)
info = r.info()
return {
"status": "ok",
"version": info.get("redis_version", "unknown"),
"used_memory_human": info.get("used_memory_human", "?"),
"connected_clients": info.get("connected_clients", 0)
}
except Exception as e:
return {"status": "error", "error": str(e)}
def check_http_service(name, url, timeout=5):
"""Generic HTTP health check."""
try:
req = urllib.request.Request(url, method="GET")
req.add_header("User-Agent", "Nexus-Manager/1.0")
with urllib.request.urlopen(req, timeout=timeout) as resp:
return {
"status": "ok",
"http_status": resp.status,
"latency_ms": None # Could add timing later
}
except urllib.error.HTTPError as e:
return {"status": "warning", "http_status": e.code, "error": str(e)}
except Exception as e:
return {"status": "error", "error": str(e)}
def check_disk_space(path="/"):
"""Check disk usage."""
try:
total, used, free = shutil.disk_usage(path)
return {
"status": "ok",
"total_gb": round(total / (1024**3), 2),
"used_gb": round(used / (1024**3), 2),
"free_gb": round(free / (1024**3), 2),
"percent_used": round((used / total) * 100, 1)
}
except Exception as e:
return {"status": "error", "error": str(e)}
def check_memory():
"""Check system memory via /proc/meminfo."""
try:
with open("/proc/meminfo") as f:
meminfo = f.read()
data = {}
for line in meminfo.splitlines():
if ":" in line:
key, value = line.split(":", 1)
data[key.strip()] = int(value.strip().split()[0]) # kB
total = data.get("MemTotal", 0) / 1024 / 1024 # GB
available = data.get("MemAvailable", data.get("MemFree", 0)) / 1024 / 1024
used = total - available
return {
"status": "ok",
"total_gb": round(total, 2),
"used_gb": round(used, 2),
"available_gb": round(available, 2),
"percent_used": round((used / total) * 100, 1) if total else 0
}
except Exception as e:
return {"status": "error", "error": str(e)}
def check_systemd_service(service_name):
"""Check systemd service status."""
try:
result = subprocess.run(
["systemctl", "is-active", service_name],
capture_output=True, text=True, timeout=5
)
active = result.stdout.strip() == "active"
return {
"status": "ok" if active else "warning",
"active": active,
"state": result.stdout.strip()
}
except Exception as e:
return {"status": "error", "error": str(e)}
def get_full_health_report():
"""Aggregate health report for all services."""
return {
"postgresql": check_postgresql(),
"redis": check_redis(),
"pos": check_http_service("pos", POS_URL),
"dashboard": check_http_service("dashboard", DASHBOARD_URL),
"quart": check_http_service("quart", QUART_URL),
"disk": check_disk_space(),
"memory": check_memory(),
"services": {
"nexus": check_systemd_service("nexus.service"),
"nexus-pos": check_systemd_service("nexus-pos.service"),
"nexus-quart": check_systemd_service("nexus-quart.service"),
"nexus-celery": check_systemd_service("nexus-celery.service"),
}
}
def get_tenant_health(db_name, timeout=5):
"""Check connectivity to a specific tenant database."""
dsn = TENANT_DB_URL_TEMPLATE.format(db_name=db_name)
try:
conn = psycopg2.connect(dsn, connect_timeout=timeout)
cur = conn.cursor()
cur.execute("""
SELECT
(SELECT COUNT(*) FROM employees WHERE is_active = true) as employees,
(SELECT COUNT(*) FROM inventory WHERE is_active = true) as inventory,
(SELECT COUNT(*) FROM customers WHERE is_active = true) as customers,
(SELECT COUNT(*) FROM sales WHERE created_at > NOW() - INTERVAL '30 days') as sales_30d,
pg_database_size(current_database()) as db_size
""")
row = cur.fetchone()
cur.close()
conn.close()
return {
"status": "ok",
"employees": row[0],
"inventory": row[1],
"customers": row[2],
"sales_30d": row[3],
"db_size_mb": round(row[4] / (1024 * 1024), 2)
}
except Exception as e:
return {"status": "error", "error": str(e)}