"""Health monitoring service for Nexus infrastructure."""

import subprocess
import shutil
import socket
import time
import urllib.request
import urllib.error

import psycopg2
import redis

from config import (
    MASTER_DB_URL,
    REDIS_URL,
    POS_URL,
    DASHBOARD_URL,
    QUART_URL,
    TENANT_DB_URL_TEMPLATE,
)

_BYTES_PER_MB = 1024 * 1024
_BYTES_PER_GB = 1024 ** 3


def _error_result(exc):
    """Build the uniform ``{"status": "error", ...}`` payload for *exc*."""
    return {"status": "error", "error": str(exc)}


def _fetch_one_row(dsn, query, timeout):
    """Run *query* against *dsn* and return its first row.

    The cursor and the connection are closed on every path, including when
    the query itself fails, so a failing health check never leaks a
    database connection.

    Args:
        dsn: Connection string passed to ``psycopg2.connect``.
        query: SQL text to execute (no parameters).
        timeout: Connect timeout in seconds.

    Returns:
        The row returned by ``cursor.fetchone()``.

    Raises:
        Exception: Whatever ``psycopg2`` raises; callers convert it into
            an error payload.
    """
    conn = psycopg2.connect(dsn, connect_timeout=timeout)
    try:
        cur = conn.cursor()
        try:
            cur.execute(query)
            return cur.fetchone()
        finally:
            cur.close()
    finally:
        # NOTE: psycopg2's ``with conn:`` only scopes a transaction and does
        # not close the connection, hence the explicit close here.
        conn.close()


def check_postgresql():
    """Check PostgreSQL connectivity.

    Returns:
        On success a dict with ``status`` ("ok"), ``version`` (second token
        of ``SELECT version()``, or "unknown") and ``master_size_mb``.
        On failure ``{"status": "error", "error": <message>}``.
    """
    try:
        version, size = _fetch_one_row(
            MASTER_DB_URL,
            "SELECT version(), pg_database_size('nexus_autoparts')",
            5,
        )
        # Guard against a version string with fewer than two tokens instead
        # of failing the whole check with an IndexError.
        tokens = version.split() if version else []
        return {
            "status": "ok",
            "version": tokens[1] if len(tokens) > 1 else "unknown",
            "master_size_mb": round(size / _BYTES_PER_MB, 2),
        }
    except Exception as e:  # health checks report failures, never raise
        return _error_result(e)


def check_redis():
    """Check Redis connectivity.

    Returns:
        On success a dict with ``status`` ("ok"), ``version``,
        ``used_memory_human`` and ``connected_clients`` taken from
        ``INFO``. On failure ``{"status": "error", "error": <message>}``.
    """
    try:
        # socket_timeout bounds the INFO round-trip as well; with only a
        # connect timeout an unresponsive server could block indefinitely.
        client = redis.from_url(
            REDIS_URL, socket_connect_timeout=3, socket_timeout=3
        )
        try:
            info = client.info()
        finally:
            # Release the sockets held by the client's connection pool.
            client.connection_pool.disconnect()
        return {
            "status": "ok",
            "version": info.get("redis_version", "unknown"),
            "used_memory_human": info.get("used_memory_human", "?"),
            "connected_clients": info.get("connected_clients", 0),
        }
    except Exception as e:  # health checks report failures, never raise
        return _error_result(e)


def check_http_service(name, url, timeout=5):
    """Generic HTTP health check.

    Args:
        name: Label of the service. Not used by the check itself; kept so
            existing callers keep working.
        url: URL to request with GET.
        timeout: Request timeout in seconds.

    Returns:
        ``{"status": "ok", "http_status": ..., "latency_ms": ...}`` when the
        request succeeds, where ``latency_ms`` is the time until the response
        headers were received. ``{"status": "warning", "http_status": ...,
        "error": ...}`` when the server answers with an HTTP error status.
        ``{"status": "error", "error": ...}`` for any other failure.
    """
    try:
        req = urllib.request.Request(url, method="GET")
        req.add_header("User-Agent", "Nexus-Manager/1.0")
        started = time.monotonic()
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            elapsed_ms = (time.monotonic() - started) * 1000
            return {
                "status": "ok",
                "http_status": resp.status,
                "latency_ms": round(elapsed_ms, 1),
            }
    except urllib.error.HTTPError as e:
        # The service is reachable but unhappy: report it as a warning.
        return {"status": "warning", "http_status": e.code, "error": str(e)}
    except Exception as e:  # health checks report failures, never raise
        return _error_result(e)


def check_disk_space(path="/"):
    """Check disk usage.

    Args:
        path: Filesystem path whose volume is inspected.

    Returns:
        On success a dict with ``status`` ("ok"), ``total_gb``, ``used_gb``,
        ``free_gb`` and ``percent_used``. On failure
        ``{"status": "error", "error": <message>}``.
    """
    try:
        total, used, free = shutil.disk_usage(path)
        return {
            "status": "ok",
            "total_gb": round(total / _BYTES_PER_GB, 2),
            "used_gb": round(used / _BYTES_PER_GB, 2),
            "free_gb": round(free / _BYTES_PER_GB, 2),
            # A zero-sized filesystem reports 0% rather than failing with a
            # ZeroDivisionError (same convention as check_memory).
            "percent_used": round((used / total) * 100, 1) if total else 0,
        }
    except Exception as e:  # health checks report failures, never raise
        return _error_result(e)


def check_memory():
    """Check system memory via /proc/meminfo.

    Returns:
        On success a dict with ``status`` ("ok"), ``total_gb``, ``used_gb``,
        ``available_gb`` and ``percent_used``. ``MemFree`` is used when
        ``MemAvailable`` is absent. On failure
        ``{"status": "error", "error": <message>}``.
    """
    try:
        with open("/proc/meminfo") as f:
            meminfo = f.read()

        data = {}
        for line in meminfo.splitlines():
            key, sep, value = line.partition(":")
            if not sep:
                continue
            fields = value.split()
            if not fields:
                continue
            try:
                data[key.strip()] = int(fields[0])  # kB
            except ValueError:
                # Skip a malformed entry instead of failing the whole check.
                continue

        total = data.get("MemTotal", 0) / 1024 / 1024  # GB
        available = (
            data.get("MemAvailable", data.get("MemFree", 0)) / 1024 / 1024
        )
        used = total - available
        return {
            "status": "ok",
            "total_gb": round(total, 2),
            "used_gb": round(used, 2),
            "available_gb": round(available, 2),
            "percent_used": round((used / total) * 100, 1) if total else 0,
        }
    except Exception as e:  # health checks report failures, never raise
        return _error_result(e)


def check_systemd_service(service_name):
    """Check systemd service status.

    Args:
        service_name: Unit name passed to ``systemctl is-active``.

    Returns:
        A dict with ``status`` ("ok" when the unit is active, otherwise
        "warning"), ``active`` (bool) and ``state`` (the stripped command
        output). ``{"status": "error", "error": <message>}`` when the
        command cannot be run or times out.
    """
    try:
        result = subprocess.run(
            ["systemctl", "is-active", service_name],
            capture_output=True,
            text=True,
            timeout=5,
        )
        state = result.stdout.strip()
        active = state == "active"
        return {
            "status": "ok" if active else "warning",
            "active": active,
            "state": state,
        }
    except Exception as e:  # health checks report failures, never raise
        return _error_result(e)


def get_full_health_report():
    """Aggregate health report for all services.

    Returns:
        A dict keyed by component (``postgresql``, ``redis``, ``pos``,
        ``dashboard``, ``quart``, ``disk``, ``memory``) plus a ``services``
        dict with one entry per systemd unit. Each value is the payload
        returned by the corresponding ``check_*`` function.
    """
    return {
        "postgresql": check_postgresql(),
        "redis": check_redis(),
        "pos": check_http_service("pos", POS_URL),
        "dashboard": check_http_service("dashboard", DASHBOARD_URL),
        "quart": check_http_service("quart", QUART_URL),
        "disk": check_disk_space(),
        "memory": check_memory(),
        "services": {
            "nexus": check_systemd_service("nexus.service"),
            "nexus-pos": check_systemd_service("nexus-pos.service"),
            "nexus-quart": check_systemd_service("nexus-quart.service"),
            "nexus-celery": check_systemd_service("nexus-celery.service"),
        },
    }


def get_tenant_health(db_name, timeout=5):
    """Check connectivity to a specific tenant database.

    Args:
        db_name: Database name substituted into ``TENANT_DB_URL_TEMPLATE``.
        timeout: Connect timeout in seconds.

    Returns:
        On success a dict with ``status`` ("ok"), the row counts
        ``employees``, ``inventory``, ``customers``, ``sales_30d`` and
        ``db_size_mb``. On failure
        ``{"status": "error", "error": <message>}``.
    """
    dsn = TENANT_DB_URL_TEMPLATE.format(db_name=db_name)
    try:
        row = _fetch_one_row(
            dsn,
            """
            SELECT
                (SELECT COUNT(*) FROM employees WHERE is_active = true) as employees,
                (SELECT COUNT(*) FROM inventory WHERE is_active = true) as inventory,
                (SELECT COUNT(*) FROM customers WHERE is_active = true) as customers,
                (SELECT COUNT(*) FROM sales WHERE created_at > NOW() - INTERVAL '30 days') as sales_30d,
                pg_database_size(current_database()) as db_size
            """,
            timeout,
        )
        return {
            "status": "ok",
            "employees": row[0],
            "inventory": row[1],
            "customers": row[2],
            "sales_30d": row[3],
            "db_size_mb": round(row[4] / _BYTES_PER_MB, 2),
        }
    except Exception as e:  # health checks report failures, never raise
        return _error_result(e)