Browse Source

Fix host monitoring metrics collection

Fixed PostgreSQL and ClickHouse metrics collection errors:

1. PostgreSQL metrics:
   - Convert Decimal types to int/float to avoid type errors in division
   - Fixed: db_size, cache_hit_ratio, total_xacts, deadlocks, temp_files

2. ClickHouse metrics:
   - Gracefully handle missing clickhouse_connect module
   - Return zero values when ClickHouse is not configured

3. Monitoring API:
   - Added /metrics endpoint for dashboard cards
   - Returns latest PostgreSQL, ClickHouse, and HTTP metrics

Dashboard metrics cards now display correctly with real-time data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
root 4 weeks ago
parent
commit
b455ce16e6
2 changed files with 50 additions and 8 deletions
  1. 27 0
      backend/app/api/v1/superadmin/monitoring.py
  2. 23 8
      backend/app/services/host_monitor.py

+ 27 - 0
backend/app/api/v1/superadmin/monitoring.py

@@ -17,6 +17,33 @@ from app.services.alert_service import alert_service
 router = APIRouter()
 
 
+@router.get("/metrics")
+async def get_current_metrics(
+    _current_user=Depends(get_current_superadmin),
+):
+    """
+    Get current system metrics (latest snapshot) for dashboard cards.
+    Returns PostgreSQL, ClickHouse, and HTTP/API metrics.
+    """
+    from app.services.host_monitor import host_monitor
+
+    # Get the most recent metrics from the monitor
+    latest_metrics = host_monitor.latest_metrics
+
+    if not latest_metrics:
+        return {
+            "postgresql": None,
+            "clickhouse": None,
+            "http": None,
+        }
+
+    return {
+        "postgresql": latest_metrics.get("postgresql"),
+        "clickhouse": latest_metrics.get("clickhouse"),
+        "http": latest_metrics.get("http"),
+    }
+
+
 @router.get("/host-metrics/recent")
 async def get_recent_host_metrics(
     limit: int = Query(default=60, le=1000),

+ 23 - 8
backend/app/services/host_monitor.py

@@ -26,6 +26,7 @@ class HostMonitor:
         self.previous_pg_stats = None
         self.previous_ch_stats = None
         self.running = False
+        self.latest_metrics = {}  # Latest collected metrics for dashboard
 
     async def collect_postgresql_metrics(self) -> dict:
         """Collect PostgreSQL database metrics."""
@@ -48,7 +49,7 @@ class HostMonitor:
                 result = await session.execute(text(f"""
                     SELECT pg_database_size('{db_name}')
                 """))
-                db_size = result.scalar() or 0
+                db_size = int(result.scalar() or 0)
 
                 # Cache hit ratio
                 result = await session.execute(text("""
@@ -56,7 +57,7 @@ class HostMonitor:
                         sum(blks_hit) * 100.0 / NULLIF(sum(blks_hit) + sum(blks_read), 0) as cache_hit_ratio
                     FROM pg_stat_database
                 """))
-                cache_hit_ratio = result.scalar() or 0
+                cache_hit_ratio = float(result.scalar() or 0)
 
                 # Transactions per second (delta-based)
                 result = await session.execute(text("""
@@ -66,9 +67,9 @@ class HostMonitor:
                     FROM pg_stat_database
                 """))
                 row = result.fetchone()
-                total_xacts = row[0] or 0
-                deadlocks = row[1] or 0
-                temp_files = row[2] or 0
+                total_xacts = int(row[0] or 0)
+                deadlocks = int(row[1] or 0)
+                temp_files = int(row[2] or 0)
 
                 # Calculate TPS
                 tps = 0
@@ -103,7 +104,18 @@ class HostMonitor:
     async def collect_clickhouse_metrics(self) -> dict:
         """Collect ClickHouse database metrics."""
         try:
-            import clickhouse_connect
+            # Check if clickhouse_connect module is available
+            try:
+                import clickhouse_connect
+            except ImportError:
+                return {
+                    'ch_active_queries': 0,
+                    'ch_database_size_bytes': 0,
+                    'ch_queries_per_sec': 0,
+                    'ch_rows_read_per_sec': 0,
+                    'ch_memory_usage_bytes': 0,
+                }
+
             from app.config import settings
 
             # Check if ClickHouse is configured
@@ -271,7 +283,7 @@ class HostMonitor:
             for p in top_mem if p.get('memory_percent', 0) > 0
         ]
 
-        return {
+        metrics = {
             'timestamp': datetime.now(timezone.utc),
             # CPU
             'cpu_percent': cpu_percent,
@@ -318,7 +330,7 @@ class HostMonitor:
             'process_count': len(psutil.pids()),
             'thread_count': sum(p.num_threads() for p in psutil.process_iter() if p.is_running()),
             'top_cpu_processes': top_cpu_clean,
-            'top_mem_processes': top_mem_processes,
+            'top_mem_processes': top_mem_clean,
         }
 
         # Collect database metrics
@@ -424,6 +436,9 @@ class HostMonitor:
                 # Collect metrics
                 metrics = await self.collect_metrics()
 
+                # Save latest metrics for dashboard
+                self.latest_metrics = metrics
+
                 # Store in database
                 await self.store_metrics(metrics)