"""
Host monitoring service for collecting system metrics.
"""

import asyncio
import time
from datetime import datetime, timedelta, timezone

import psutil
from sqlalchemy import delete, select
from sqlalchemy.ext.asyncio import AsyncSession

from app.core.database import async_session_maker
from app.models.host_metrics import HostMetrics
from app.services.alert_service import alert_service


class HostMonitor:
    """Collect and store host system metrics.

    The monitor samples CPU, memory, load average, disk and network
    counters through ``psutil``, persists each sample as a ``HostMetrics``
    row, raises alerts through ``alert_service`` when configured thresholds
    are exceeded, and periodically deletes old samples.
    """

    # Seconds to wait between two collection cycles.
    COLLECTION_INTERVAL_SECONDS = 60
    # Minimum number of seconds between two cleanup runs.
    CLEANUP_INTERVAL_SECONDS = 3600

    def __init__(self):
        self.previous_disk_io = None
        self.previous_net_io = None
        self.running = False
        # time.monotonic() value of the last successful cleanup, or None
        # when no cleanup has run yet in this process.
        self._last_cleanup = None

    @staticmethod
    def _read_metrics() -> dict:
        """Synchronously sample all metrics from psutil.

        This call blocks for about one second because
        ``psutil.cpu_percent(interval=1)`` sleeps while measuring, so it
        must not be called directly on the event loop thread.
        """
        # CPU
        cpu_percent = psutil.cpu_percent(interval=1)
        cpu_count = psutil.cpu_count()

        # Memory
        mem = psutil.virtual_memory()

        # Load Average
        load_1, load_5, load_15 = psutil.getloadavg()

        # Disk I/O -- psutil documents that the counters may be None when
        # no disks are detected; fall back to 0 instead of crashing.
        disk_io = psutil.disk_io_counters()
        disk_read_bytes = disk_io.read_bytes if disk_io is not None else 0
        disk_write_bytes = disk_io.write_bytes if disk_io is not None else 0

        # Disk Usage
        disk_usage = psutil.disk_usage('/')

        # Network -- same None handling as for the disk counters.
        net_io = psutil.net_io_counters()
        net_sent_bytes = net_io.bytes_sent if net_io is not None else 0
        net_recv_bytes = net_io.bytes_recv if net_io is not None else 0

        return {
            'timestamp': datetime.now(timezone.utc),
            'cpu_percent': cpu_percent,
            'cpu_count': cpu_count,
            'memory_total': mem.total,
            'memory_used': mem.used,
            'memory_percent': mem.percent,
            'load_1': load_1,
            'load_5': load_5,
            'load_15': load_15,
            'disk_read_bytes': disk_read_bytes,
            'disk_write_bytes': disk_write_bytes,
            'disk_usage_percent': disk_usage.percent,
            'net_sent_bytes': net_sent_bytes,
            'net_recv_bytes': net_recv_bytes,
        }

    async def collect_metrics(self) -> dict:
        """Collect current system metrics.

        The blocking psutil sampling runs in the default executor so the
        event loop stays responsive during the one-second CPU measurement.

        Returns:
            A dict with the keys ``timestamp`` (timezone-aware UTC),
            ``cpu_percent``, ``cpu_count``, ``memory_total``,
            ``memory_used``, ``memory_percent``, ``load_1``, ``load_5``,
            ``load_15``, ``disk_read_bytes``, ``disk_write_bytes``,
            ``disk_usage_percent``, ``net_sent_bytes`` and
            ``net_recv_bytes``.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._read_metrics)

    async def store_metrics(self, metrics: dict):
        """Store metrics in database.

        Args:
            metrics: Sample as returned by ``collect_metrics``; its keys are
                passed as keyword arguments to ``HostMetrics``.
        """
        async with async_session_maker() as session:
            session.add(HostMetrics(**metrics))
            await session.commit()

    async def _alert_on_percent(
        self,
        metrics: dict,
        thresholds: dict,
        metric_key: str,
        threshold_key: str,
        label: str,
    ):
        """Create an alert when a percentage metric exceeds its threshold.

        The default threshold is 90; values of 95 and above are reported as
        ``critical``, lower ones as ``warning``.
        """
        value = metrics[metric_key]
        threshold = thresholds.get(threshold_key, 90)
        if value <= threshold:
            return

        await alert_service.create_alert(
            alert_type='host_metrics',
            severity='warning' if value < 95 else 'critical',
            title=f'High {label} Usage: {value:.1f}%',
            message=f'{label} usage is at {value:.1f}%, threshold is {threshold}%',
            alert_metadata={'metric': metric_key, 'value': value},
        )

    async def check_thresholds(self, metrics: dict):
        """Check if metrics exceed configured thresholds and create alerts.

        Thresholds are read from the ``Settings`` row whose key is
        ``"host_monitoring"``. Nothing happens when that row does not
        exist; when the row exists but its value is empty, the built-in
        defaults are used.

        Args:
            metrics: Sample as returned by ``collect_metrics``.
        """
        # Imported here rather than at module level, as in the original
        # code (presumably to avoid an import cycle -- confirm).
        from app.models.settings import Settings

        # Get thresholds from settings
        async with async_session_maker() as session:
            result = await session.execute(
                select(Settings).where(Settings.key == "host_monitoring")
            )
            settings = result.scalar_one_or_none()
            if settings is None:
                return
            # Read the value while the session is still open.
            thresholds = settings.value or {}

        # Check CPU
        await self._alert_on_percent(
            metrics, thresholds, 'cpu_percent', 'cpu_threshold', 'CPU'
        )

        # Check Memory
        await self._alert_on_percent(
            metrics, thresholds, 'memory_percent', 'memory_threshold', 'Memory'
        )

        # Check Load Average (relative to CPU count). psutil.cpu_count()
        # may return None when the count is undetermined; treat as one CPU.
        cpu_count = metrics['cpu_count'] or 1
        load_threshold = thresholds.get('load_threshold', 2.0) * cpu_count
        if metrics['load_1'] > load_threshold:
            await alert_service.create_alert(
                alert_type='host_metrics',
                severity='warning',
                title=f'High Load Average: {metrics["load_1"]:.2f}',
                message=f'1-minute load average is {metrics["load_1"]:.2f}, threshold is {load_threshold:.2f}',
                alert_metadata={'metric': 'load_1', 'value': metrics['load_1']},
            )

        # Check Disk Usage
        await self._alert_on_percent(
            metrics, thresholds, 'disk_usage_percent', 'disk_threshold', 'Disk'
        )

    async def cleanup_old_metrics(self, days: int = 30):
        """Delete metrics older than specified days.

        Args:
            days: Rows whose ``timestamp`` is more than this many days
                before the current UTC time are deleted.
        """
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        async with async_session_maker() as session:
            await session.execute(
                delete(HostMetrics).where(HostMetrics.timestamp < cutoff)
            )
            await session.commit()

    async def _cleanup_if_due(self):
        """Run ``cleanup_old_metrics`` at most once per cleanup interval.

        A monotonic clock is used instead of checking for wall-clock minute
        zero: each cycle lasts longer than 60 seconds, so a minute-based
        check can skip the top of the hour. The first call after startup
        always performs a cleanup.
        """
        now = time.monotonic()
        if (
            self._last_cleanup is not None
            and now - self._last_cleanup < self.CLEANUP_INTERVAL_SECONDS
        ):
            return

        await self.cleanup_old_metrics()
        # Only recorded after success so a failed cleanup is retried on
        # the next cycle.
        self._last_cleanup = now

    async def run_monitoring_loop(self):
        """Main monitoring loop - runs in background.

        Each cycle collects, stores and checks one sample, then runs the
        cleanup when it is due. Errors are printed and the loop continues
        with the next cycle; the loop ends once ``stop`` has cleared
        ``self.running`` and the current wait has finished.
        """
        print("[HostMonitor] Starting host monitoring loop")
        self.running = True

        while self.running:
            try:
                # Collect metrics
                metrics = await self.collect_metrics()

                # Store in database
                await self.store_metrics(metrics)

                # Check thresholds
                await self.check_thresholds(metrics)

                # Cleanup old data once per hour
                await self._cleanup_if_due()
            except Exception as e:
                # Top-level boundary of the background task: report and
                # keep the monitor alive.
                print(f"[HostMonitor] Error in monitoring loop: {e}")

            # Wait before next collection (after success and failure alike)
            await asyncio.sleep(self.COLLECTION_INTERVAL_SECONDS)

    async def stop(self):
        """Stop monitoring loop."""
        print("[HostMonitor] Stopping host monitoring loop")
        self.running = False


# Global instance
host_monitor = HostMonitor()