Browse Source

Expand host monitoring with enterprise-grade metrics

Added comprehensive monitoring capabilities:
- IOPS calculation (read/write operations per second)
- Throughput rates (MB/s for disk and network)
- Per-core CPU usage tracking
- CPU steal time for VM environments
- Context switches and interrupts per second
- Detailed memory breakdown (buffers, cached, available, swap)
- Network packet rates, error counts, drop counts
- Process monitoring (top 5 by CPU and memory)
- Thread count tracking

Database changes:
- Expanded host_metrics table from 14 to 40+ fields
- Migration adds 25+ new columns with server defaults

Service improvements:
- Delta-based rate calculations using time intervals
- Previous state tracking for IOPS/throughput
- Graceful exception handling for process iteration

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
root 4 weeks ago
parent
commit
8f8b82e722

+ 88 - 0
backend/alembic/versions/20251229_0202_a68acea9f536_expand_host_metrics_advanced.py

@@ -0,0 +1,88 @@
+"""expand_host_metrics_advanced
+
+Revision ID: a68acea9f536
+Revises: 7ff254c24bb5
+Create Date: 2025-12-29 02:02:00.000000+00:00
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = 'a68acea9f536'
+down_revision: Union[str, None] = '7ff254c24bb5'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Add new CPU columns
+    op.add_column('host_metrics', sa.Column('cpu_per_core', postgresql.JSON(astext_type=sa.Text()), nullable=True))
+    op.add_column('host_metrics', sa.Column('cpu_steal', sa.Float(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('context_switches_per_sec', sa.Integer(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('interrupts_per_sec', sa.Integer(), nullable=False, server_default='0'))
+
+    # Add new Memory columns
+    op.add_column('host_metrics', sa.Column('memory_available', sa.BigInteger(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('memory_buffers', sa.BigInteger(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('memory_cached', sa.BigInteger(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('swap_total', sa.BigInteger(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('swap_used', sa.BigInteger(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('swap_percent', sa.Float(), nullable=False, server_default='0'))
+
+    # Add new Disk I/O columns
+    op.add_column('host_metrics', sa.Column('disk_read_iops', sa.Integer(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('disk_write_iops', sa.Integer(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('disk_read_mbps', sa.Float(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('disk_write_mbps', sa.Float(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('disk_io_time_ms', sa.BigInteger(), nullable=False, server_default='0'))
+
+    # Add new Network columns
+    op.add_column('host_metrics', sa.Column('net_in_mbps', sa.Float(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('net_out_mbps', sa.Float(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('net_packets_in_per_sec', sa.Integer(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('net_packets_out_per_sec', sa.Integer(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('net_errors_in', sa.BigInteger(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('net_errors_out', sa.BigInteger(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('net_drops_in', sa.BigInteger(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('net_drops_out', sa.BigInteger(), nullable=False, server_default='0'))
+
+    # Add Process columns
+    op.add_column('host_metrics', sa.Column('process_count', sa.Integer(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('thread_count', sa.Integer(), nullable=False, server_default='0'))
+    op.add_column('host_metrics', sa.Column('top_cpu_processes', postgresql.JSON(astext_type=sa.Text()), nullable=True))
+    op.add_column('host_metrics', sa.Column('top_mem_processes', postgresql.JSON(astext_type=sa.Text()), nullable=True))
+
+
+def downgrade() -> None:
+    # Remove added columns
+    op.drop_column('host_metrics', 'top_mem_processes')
+    op.drop_column('host_metrics', 'top_cpu_processes')
+    op.drop_column('host_metrics', 'thread_count')
+    op.drop_column('host_metrics', 'process_count')
+    op.drop_column('host_metrics', 'net_drops_out')
+    op.drop_column('host_metrics', 'net_drops_in')
+    op.drop_column('host_metrics', 'net_errors_out')
+    op.drop_column('host_metrics', 'net_errors_in')
+    op.drop_column('host_metrics', 'net_packets_out_per_sec')
+    op.drop_column('host_metrics', 'net_packets_in_per_sec')
+    op.drop_column('host_metrics', 'net_out_mbps')
+    op.drop_column('host_metrics', 'net_in_mbps')
+    op.drop_column('host_metrics', 'disk_io_time_ms')
+    op.drop_column('host_metrics', 'disk_write_mbps')
+    op.drop_column('host_metrics', 'disk_read_mbps')
+    op.drop_column('host_metrics', 'disk_write_iops')
+    op.drop_column('host_metrics', 'disk_read_iops')
+    op.drop_column('host_metrics', 'swap_percent')
+    op.drop_column('host_metrics', 'swap_used')
+    op.drop_column('host_metrics', 'swap_total')
+    op.drop_column('host_metrics', 'memory_cached')
+    op.drop_column('host_metrics', 'memory_buffers')
+    op.drop_column('host_metrics', 'memory_available')
+    op.drop_column('host_metrics', 'interrupts_per_sec')
+    op.drop_column('host_metrics', 'context_switches_per_sec')
+    op.drop_column('host_metrics', 'cpu_steal')
+    op.drop_column('host_metrics', 'cpu_per_core')

+ 34 - 5
backend/app/models/host_metrics.py

@@ -4,14 +4,14 @@ Host metrics model for storing system monitoring data.
 
 
 from datetime import datetime
 from datetime import datetime
 
 
-from sqlalchemy import BigInteger, DateTime, Float, Integer, String
+from sqlalchemy import BigInteger, DateTime, Float, Integer, JSON
 from sqlalchemy.orm import Mapped, mapped_column
 from sqlalchemy.orm import Mapped, mapped_column
 
 
 from app.core.database import Base
 from app.core.database import Base
 
 
 
 
 class HostMetrics(Base):
 class HostMetrics(Base):
-    """Host system metrics (CPU, RAM, Load, Disk, Network)."""
+    """Host system metrics - comprehensive monitoring."""
 
 
     __tablename__ = "host_metrics"
     __tablename__ = "host_metrics"
 
 
@@ -21,22 +21,51 @@ class HostMetrics(Base):
     # CPU
     # CPU
     cpu_percent: Mapped[float] = mapped_column(Float, nullable=False)
     cpu_percent: Mapped[float] = mapped_column(Float, nullable=False)
     cpu_count: Mapped[int] = mapped_column(Integer, nullable=False)
     cpu_count: Mapped[int] = mapped_column(Integer, nullable=False)
+    cpu_per_core: Mapped[list | None] = mapped_column(JSON)  # Per-core percentages
+    cpu_steal: Mapped[float] = mapped_column(Float, nullable=False, default=0)  # VM steal time
+    context_switches_per_sec: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    interrupts_per_sec: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
 
 
     # Memory
     # Memory
-    memory_total: Mapped[int] = mapped_column(BigInteger, nullable=False)  # bytes
+    memory_total: Mapped[int] = mapped_column(BigInteger, nullable=False)
     memory_used: Mapped[int] = mapped_column(BigInteger, nullable=False)
     memory_used: Mapped[int] = mapped_column(BigInteger, nullable=False)
     memory_percent: Mapped[float] = mapped_column(Float, nullable=False)
     memory_percent: Mapped[float] = mapped_column(Float, nullable=False)
+    memory_available: Mapped[int] = mapped_column(BigInteger, nullable=False)
+    memory_buffers: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
+    memory_cached: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
+    swap_total: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
+    swap_used: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
+    swap_percent: Mapped[float] = mapped_column(Float, nullable=False, default=0)
 
 
     # Load Average
     # Load Average
     load_1: Mapped[float] = mapped_column(Float, nullable=False)
     load_1: Mapped[float] = mapped_column(Float, nullable=False)
     load_5: Mapped[float] = mapped_column(Float, nullable=False)
     load_5: Mapped[float] = mapped_column(Float, nullable=False)
     load_15: Mapped[float] = mapped_column(Float, nullable=False)
     load_15: Mapped[float] = mapped_column(Float, nullable=False)
 
 
-    # Disk I/O
+    # Disk I/O - cumulative + rates
     disk_read_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
     disk_read_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
     disk_write_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
     disk_write_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
+    disk_read_iops: Mapped[int] = mapped_column(Integer, nullable=False, default=0)  # Ops/sec
+    disk_write_iops: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    disk_read_mbps: Mapped[float] = mapped_column(Float, nullable=False, default=0)  # MB/s
+    disk_write_mbps: Mapped[float] = mapped_column(Float, nullable=False, default=0)
+    disk_io_time_ms: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)  # Latency
     disk_usage_percent: Mapped[float] = mapped_column(Float, nullable=False)
     disk_usage_percent: Mapped[float] = mapped_column(Float, nullable=False)
 
 
-    # Network
+    # Network - cumulative + rates
     net_sent_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
     net_sent_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
     net_recv_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
     net_recv_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
+    net_in_mbps: Mapped[float] = mapped_column(Float, nullable=False, default=0)  # MB/s
+    net_out_mbps: Mapped[float] = mapped_column(Float, nullable=False, default=0)
+    net_packets_in_per_sec: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    net_packets_out_per_sec: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    net_errors_in: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
+    net_errors_out: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
+    net_drops_in: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
+    net_drops_out: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
+
+    # Processes
+    process_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    thread_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    top_cpu_processes: Mapped[list | None] = mapped_column(JSON)  # Top 5 by CPU
+    top_mem_processes: Mapped[list | None] = mapped_column(JSON)  # Top 5 by memory

+ 123 - 23
backend/app/services/host_monitor.py

@@ -3,6 +3,7 @@ Host monitoring service for collecting system metrics.
 """
 """
 
 
 import asyncio
 import asyncio
+import time
 from datetime import datetime, timedelta, timezone
 from datetime import datetime, timedelta, timezone
 
 
 import psutil
 import psutil
@@ -20,53 +21,152 @@ class HostMonitor:
     def __init__(self):
     def __init__(self):
         self.previous_disk_io = None
         self.previous_disk_io = None
         self.previous_net_io = None
         self.previous_net_io = None
+        self.previous_cpu_stats = None
+        self.previous_timestamp = None
         self.running = False
         self.running = False
 
 
     async def collect_metrics(self) -> dict:
     async def collect_metrics(self) -> dict:
-        """Collect current system metrics."""
-        # CPU
+        """Collect comprehensive system metrics."""
+        current_timestamp = time.time()
+
+        # CPU - detailed
         cpu_percent = psutil.cpu_percent(interval=1)
         cpu_percent = psutil.cpu_percent(interval=1)
         cpu_count = psutil.cpu_count()
         cpu_count = psutil.cpu_count()
+        cpu_per_core = psutil.cpu_percent(interval=0, percpu=True)
+        cpu_times = psutil.cpu_times()
+        cpu_stats = psutil.cpu_stats()
+
+        # Context switches and interrupts (delta)
+        context_switches = cpu_stats.ctx_switches
+        interrupts = cpu_stats.interrupts
+        ctx_switches_per_sec = 0
+        interrupts_per_sec = 0
+
+        if self.previous_cpu_stats:
+            time_delta = current_timestamp - self.previous_timestamp
+            if time_delta > 0:
+                ctx_switches_per_sec = (context_switches - self.previous_cpu_stats.ctx_switches) / time_delta
+                interrupts_per_sec = (interrupts - self.previous_cpu_stats.interrupts) / time_delta
 
 
-        # Memory
+        self.previous_cpu_stats = cpu_stats
+
+        # Memory - detailed
         mem = psutil.virtual_memory()
         mem = psutil.virtual_memory()
-        memory_total = mem.total
-        memory_used = mem.used
-        memory_percent = mem.percent
+        swap = psutil.swap_memory()
 
 
         # Load Average
         # Load Average
         load_avg = psutil.getloadavg()
         load_avg = psutil.getloadavg()
         load_1, load_5, load_15 = load_avg
         load_1, load_5, load_15 = load_avg
 
 
-        # Disk I/O
+        # Disk I/O - with IOPS and throughput
         disk_io = psutil.disk_io_counters()
         disk_io = psutil.disk_io_counters()
-        disk_read_bytes = disk_io.read_bytes
-        disk_write_bytes = disk_io.write_bytes
-
-        # Disk Usage
         disk_usage = psutil.disk_usage('/')
         disk_usage = psutil.disk_usage('/')
-        disk_usage_percent = disk_usage.percent
 
 
-        # Network
+        # Calculate disk deltas (IOPS, throughput)
+        disk_read_iops = 0
+        disk_write_iops = 0
+        disk_read_mbps = 0
+        disk_write_mbps = 0
+
+        if self.previous_disk_io and self.previous_timestamp:
+            time_delta = current_timestamp - self.previous_timestamp
+            if time_delta > 0:
+                disk_read_iops = (disk_io.read_count - self.previous_disk_io.read_count) / time_delta
+                disk_write_iops = (disk_io.write_count - self.previous_disk_io.write_count) / time_delta
+                disk_read_mbps = ((disk_io.read_bytes - self.previous_disk_io.read_bytes) / time_delta) / (1024 * 1024)
+                disk_write_mbps = ((disk_io.write_bytes - self.previous_disk_io.write_bytes) / time_delta) / (1024 * 1024)
+
+        self.previous_disk_io = disk_io
+
+        # Network - with packets and throughput
         net_io = psutil.net_io_counters()
         net_io = psutil.net_io_counters()
-        net_sent_bytes = net_io.bytes_sent
-        net_recv_bytes = net_io.bytes_recv
+
+        # Calculate network deltas
+        net_in_mbps = 0
+        net_out_mbps = 0
+        net_packets_in_per_sec = 0
+        net_packets_out_per_sec = 0
+
+        if self.previous_net_io and self.previous_timestamp:
+            time_delta = current_timestamp - self.previous_timestamp
+            if time_delta > 0:
+                net_in_mbps = ((net_io.bytes_recv - self.previous_net_io.bytes_recv) / time_delta) / (1024 * 1024)
+                net_out_mbps = ((net_io.bytes_sent - self.previous_net_io.bytes_sent) / time_delta) / (1024 * 1024)
+                net_packets_in_per_sec = (net_io.packets_recv - self.previous_net_io.packets_recv) / time_delta
+                net_packets_out_per_sec = (net_io.packets_sent - self.previous_net_io.packets_sent) / time_delta
+
+        self.previous_net_io = net_io
+        self.previous_timestamp = current_timestamp
+
+        # Processes - top CPU and memory consumers
+        processes = []
+        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
+            try:
+                processes.append(proc.info)
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                pass
+
+        top_cpu = sorted(processes, key=lambda p: p.get('cpu_percent', 0), reverse=True)[:5]
+        top_mem = sorted(processes, key=lambda p: p.get('memory_percent', 0), reverse=True)[:5]
+
+        # Clean up process info
+        top_cpu_clean = [
+            {'pid': p['pid'], 'name': p['name'], 'cpu': round(p.get('cpu_percent', 0), 1)}
+            for p in top_cpu if p.get('cpu_percent', 0) > 0
+        ]
+        top_mem_clean = [
+            {'pid': p['pid'], 'name': p['name'], 'mem': round(p.get('memory_percent', 0), 1)}
+            for p in top_mem if p.get('memory_percent', 0) > 0
+        ]
 
 
         return {
         return {
             'timestamp': datetime.now(timezone.utc),
             'timestamp': datetime.now(timezone.utc),
+            # CPU
             'cpu_percent': cpu_percent,
             'cpu_percent': cpu_percent,
             'cpu_count': cpu_count,
             'cpu_count': cpu_count,
-            'memory_total': memory_total,
-            'memory_used': memory_used,
-            'memory_percent': memory_percent,
+            'cpu_per_core': cpu_per_core,
+            'cpu_steal': getattr(cpu_times, 'steal', 0),  # VM steal time
+            'context_switches_per_sec': int(ctx_switches_per_sec),
+            'interrupts_per_sec': int(interrupts_per_sec),
+            # Memory
+            'memory_total': mem.total,
+            'memory_used': mem.used,
+            'memory_percent': mem.percent,
+            'memory_available': mem.available,
+            'memory_buffers': getattr(mem, 'buffers', 0),
+            'memory_cached': getattr(mem, 'cached', 0),
+            'swap_total': swap.total,
+            'swap_used': swap.used,
+            'swap_percent': swap.percent,
+            # Load
             'load_1': load_1,
             'load_1': load_1,
             'load_5': load_5,
             'load_5': load_5,
             'load_15': load_15,
             'load_15': load_15,
-            'disk_read_bytes': disk_read_bytes,
-            'disk_write_bytes': disk_write_bytes,
-            'disk_usage_percent': disk_usage_percent,
-            'net_sent_bytes': net_sent_bytes,
-            'net_recv_bytes': net_recv_bytes,
+            # Disk I/O
+            'disk_read_bytes': disk_io.read_bytes,
+            'disk_write_bytes': disk_io.write_bytes,
+            'disk_read_iops': int(disk_read_iops),
+            'disk_write_iops': int(disk_write_iops),
+            'disk_read_mbps': round(disk_read_mbps, 2),
+            'disk_write_mbps': round(disk_write_mbps, 2),
+            'disk_io_time_ms': getattr(disk_io, 'read_time', 0) + getattr(disk_io, 'write_time', 0),
+            'disk_usage_percent': disk_usage.percent,
+            # Network
+            'net_sent_bytes': net_io.bytes_sent,
+            'net_recv_bytes': net_io.bytes_recv,
+            'net_in_mbps': round(net_in_mbps, 2),
+            'net_out_mbps': round(net_out_mbps, 2),
+            'net_packets_in_per_sec': int(net_packets_in_per_sec),
+            'net_packets_out_per_sec': int(net_packets_out_per_sec),
+            'net_errors_in': net_io.errin,
+            'net_errors_out': net_io.errout,
+            'net_drops_in': net_io.dropin,
+            'net_drops_out': net_io.dropout,
+            # Processes
+            'process_count': len(psutil.pids()),
+            'thread_count': sum(p.num_threads() for p in psutil.process_iter() if p.is_running()),
+            'top_cpu_processes': top_cpu_clean,
+            'top_mem_processes': top_mem_clean,
         }
         }
 
 
     async def store_metrics(self, metrics: dict):
     async def store_metrics(self, metrics: dict):