feat(manager): add Nexus Instance Manager for demo orchestration
- Complete Flask-based control panel for multi-tenant POS instances
- Dashboard with global stats, system health, and recent demos
- One-click demo provisioning with auto-expiration tracking
- Tenant management: activate/deactivate, reset data, delete
- Health monitoring: PostgreSQL, Redis, disk, memory, systemd services
- Migration orchestration UI for running schema updates across all tenants
- JWT authentication with manager_users table
- Dark theme SPA frontend with real-time search and actions
- systemd service file included
This commit is contained in:
166
manager/services/health_service.py
Normal file
166
manager/services/health_service.py
Normal file
@@ -0,0 +1,166 @@
|
||||
"""Health monitoring service for Nexus infrastructure."""
|
||||
import subprocess
|
||||
import shutil
|
||||
import socket
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import psycopg2
|
||||
import redis
|
||||
from config import (
|
||||
MASTER_DB_URL, REDIS_URL, POS_URL, DASHBOARD_URL, QUART_URL,
|
||||
TENANT_DB_URL_TEMPLATE
|
||||
)
|
||||
|
||||
|
||||
def check_postgresql():
    """Check PostgreSQL connectivity through the master database.

    Connects with ``MASTER_DB_URL`` (5 second connect timeout) and asks the
    server for its version string and the size of the ``nexus_autoparts``
    database.

    Returns:
        dict: ``{"status": "ok", "version": str, "master_size_mb": float}``
        on success, otherwise ``{"status": "error", "error": str}``.
        Connection and query failures are reported in the dict, not raised.
    """
    conn = None
    try:
        conn = psycopg2.connect(MASTER_DB_URL, connect_timeout=5)
        with conn.cursor() as cur:
            cur.execute("SELECT version(), pg_database_size('nexus_autoparts')")
            version, size = cur.fetchone()

        # The second whitespace-separated token of version() is reported as
        # the version number; fall back to "unknown" if it is missing.
        tokens = version.split() if version else []
        return {
            "status": "ok",
            "version": tokens[1] if len(tokens) > 1 else "unknown",
            "master_size_mb": round(size / (1024 * 1024), 2),
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
    finally:
        # Close on every path: a failed query must not leave the connection
        # open on the server.
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
def check_redis():
    """Check Redis connectivity and report basic server statistics.

    Builds a client from ``REDIS_URL`` and issues ``INFO``.

    Returns:
        dict: ``{"status": "ok", "version": ..., "used_memory_human": ...,
        "connected_clients": ...}`` on success, otherwise
        ``{"status": "error", "error": str}``.
    """
    client = None
    try:
        # socket_timeout bounds the INFO round-trip as well as the connect;
        # without it a server that accepts the connection but never replies
        # would block the health check indefinitely.
        client = redis.from_url(
            REDIS_URL,
            socket_connect_timeout=3,
            socket_timeout=3,
        )
        info = client.info()
        return {
            "status": "ok",
            "version": info.get("redis_version", "unknown"),
            "used_memory_human": info.get("used_memory_human", "?"),
            "connected_clients": info.get("connected_clients", 0),
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
    finally:
        # Each call creates its own connection pool; release its sockets
        # instead of waiting for garbage collection.
        if client is not None:
            client.connection_pool.disconnect()
|
||||
|
||||
|
||||
def check_http_service(name, url, timeout=5):
    """Generic HTTP health check.

    Sends a single GET request to ``url`` and reports the outcome.

    Args:
        name: Label of the service being probed. It is not used in the
            request or in the result.
        url: Address to request.
        timeout: Seconds passed to ``urlopen`` as its timeout.

    Returns:
        dict: One of
        ``{"status": "ok", "http_status": ..., "latency_ms": float}`` when the
        request succeeds (latency is the time until ``urlopen`` returned),
        ``{"status": "warning", "http_status": int, "error": str}`` when the
        server answered with an HTTP error status, or
        ``{"status": "error", "error": str}`` for any other failure
        (malformed URL, refused connection, timeout, ...).
    """
    import time  # local import: only this check needs a clock

    try:
        req = urllib.request.Request(url, method="GET")
        req.add_header("User-Agent", "Nexus-Manager/1.0")
        started = time.perf_counter()
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            elapsed_ms = round((time.perf_counter() - started) * 1000, 1)
            return {
                "status": "ok",
                "http_status": resp.status,
                "latency_ms": elapsed_ms,
            }
    except urllib.error.HTTPError as e:
        result = {"status": "warning", "http_status": e.code, "error": str(e)}
        # HTTPError wraps the server's response; close it so the underlying
        # socket is released promptly.
        e.close()
        return result
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
||||
|
||||
|
||||
def check_disk_space(path="/"):
    """Check disk usage of the filesystem containing ``path``.

    Args:
        path: Any path on the filesystem to inspect. Defaults to ``"/"``.

    Returns:
        dict: ``{"status": "ok", "total_gb": float, "used_gb": float,
        "free_gb": float, "percent_used": number}`` on success, otherwise
        ``{"status": "error", "error": str}`` (for example when ``path``
        does not exist).
    """
    bytes_per_gb = 1024 ** 3
    try:
        total, used, free = shutil.disk_usage(path)
        # A filesystem can report a total of 0 bytes; report 0% used instead
        # of failing with a misleading "division by zero" error.
        percent_used = round((used / total) * 100, 1) if total else 0
        return {
            "status": "ok",
            "total_gb": round(total / bytes_per_gb, 2),
            "used_gb": round(used / bytes_per_gb, 2),
            "free_gb": round(free / bytes_per_gb, 2),
            "percent_used": percent_used,
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
||||
|
||||
|
||||
def check_memory():
    """Report system memory usage parsed from ``/proc/meminfo``.

    Every ``key: value`` line of the file is read as an integer number of
    kB. "Available" memory is ``MemAvailable`` when present, else
    ``MemFree``; "used" is total minus available.

    Returns:
        dict: ``{"status": "ok", "total_gb": ..., "used_gb": ...,
        "available_gb": ..., "percent_used": ...}`` on success, otherwise
        ``{"status": "error", "error": str}`` (for example when the file
        cannot be read or a line cannot be parsed).
    """
    try:
        with open("/proc/meminfo") as handle:
            raw = handle.read()

        fields = {}
        for entry in raw.splitlines():
            label, sep, rest = entry.partition(":")
            if not sep:
                # No colon on this line, so it is not a key/value pair.
                continue
            # The first token after the colon is the numeric amount (kB).
            fields[label.strip()] = int(rest.strip().split()[0])

        fallback_free = fields.get("MemFree", 0)
        total_gb = fields.get("MemTotal", 0) / 1024 / 1024  # kB -> GB
        available_gb = fields.get("MemAvailable", fallback_free) / 1024 / 1024
        used_gb = total_gb - available_gb

        if total_gb:
            percent = round((used_gb / total_gb) * 100, 1)
        else:
            percent = 0

        return {
            "status": "ok",
            "total_gb": round(total_gb, 2),
            "used_gb": round(used_gb, 2),
            "available_gb": round(available_gb, 2),
            "percent_used": percent,
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
||||
|
||||
|
||||
def check_systemd_service(service_name):
    """Query ``systemctl is-active`` for a single unit.

    Args:
        service_name: Unit name handed to ``systemctl`` as-is.

    Returns:
        dict: ``{"status": "ok" | "warning", "active": bool, "state": str}``
        where ``state`` is the stripped stdout of the command and ``status``
        is ``"ok"`` only when that text equals ``"active"``. If the command
        cannot be run or exceeds the 5 second timeout the result is
        ``{"status": "error", "error": str}``.
    """
    command = ["systemctl", "is-active", service_name]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5,
        )
        state = completed.stdout.strip()
        is_active = state == "active"

        if is_active:
            status = "ok"
        else:
            status = "warning"

        return {"status": status, "active": is_active, "state": state}
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
||||
|
||||
|
||||
def get_full_health_report():
    """Run every infrastructure check and collect the results in one dict.

    The checks run sequentially, in the order of the keys below.

    Returns:
        dict: Keys ``postgresql``, ``redis``, ``pos``, ``dashboard``,
        ``quart``, ``disk`` and ``memory`` each hold the dict returned by
        the matching check; ``services`` maps ``nexus``, ``nexus-pos``,
        ``nexus-quart`` and ``nexus-celery`` to their systemd check results.
    """
    report = {}

    # Data stores.
    report["postgresql"] = check_postgresql()
    report["redis"] = check_redis()

    # HTTP front ends.
    report["pos"] = check_http_service("pos", POS_URL)
    report["dashboard"] = check_http_service("dashboard", DASHBOARD_URL)
    report["quart"] = check_http_service("quart", QUART_URL)

    # Host resources.
    report["disk"] = check_disk_space()
    report["memory"] = check_memory()

    # systemd units, keyed by a short label.
    units = (
        ("nexus", "nexus.service"),
        ("nexus-pos", "nexus-pos.service"),
        ("nexus-quart", "nexus-quart.service"),
        ("nexus-celery", "nexus-celery.service"),
    )
    report["services"] = {
        label: check_systemd_service(unit) for label, unit in units
    }

    return report
|
||||
|
||||
|
||||
def get_tenant_health(db_name, timeout=5):
    """Check connectivity to a specific tenant database and gather stats.

    Args:
        db_name: Database name substituted into ``TENANT_DB_URL_TEMPLATE``.
        timeout: Connect timeout in seconds passed to ``psycopg2.connect``.

    Returns:
        dict: On success ``{"status": "ok", "employees": int,
        "inventory": int, "customers": int, "sales_30d": int,
        "db_size_mb": float}``: counts of active rows in ``employees``,
        ``inventory`` and ``customers``, sales created in the last 30 days,
        and the database size. On any connection or query failure
        ``{"status": "error", "error": str}``.
    """
    dsn = TENANT_DB_URL_TEMPLATE.format(db_name=db_name)
    conn = None
    try:
        conn = psycopg2.connect(dsn, connect_timeout=timeout)
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    (SELECT COUNT(*) FROM employees WHERE is_active = true) as employees,
                    (SELECT COUNT(*) FROM inventory WHERE is_active = true) as inventory,
                    (SELECT COUNT(*) FROM customers WHERE is_active = true) as customers,
                    (SELECT COUNT(*) FROM sales WHERE created_at > NOW() - INTERVAL '30 days') as sales_30d,
                    pg_database_size(current_database()) as db_size
            """)
            row = cur.fetchone()

        employees, inventory, customers, sales_30d, db_size = row
        return {
            "status": "ok",
            "employees": employees,
            "inventory": inventory,
            "customers": customers,
            "sales_30d": sales_30d,
            "db_size_mb": round(db_size / (1024 * 1024), 2),
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
    finally:
        # Close on every path: a failed query (e.g. a missing table) must not
        # leave one connection open per tenant.
        if conn is not None:
            conn.close()
|
||||
Reference in New Issue
Block a user