"""
Host monitoring service for collecting system metrics.

Samples CPU, memory, load average, disk and network counters with psutil,
stores each sample in the database, and raises alerts when configured
thresholds are exceeded.
"""
- import asyncio
- from datetime import datetime, timedelta, timezone
- import psutil
- from sqlalchemy import delete, select
- from sqlalchemy.ext.asyncio import AsyncSession
- from app.core.database import async_session_maker
- from app.models.host_metrics import HostMetrics
- from app.services.alert_service import alert_service
class HostMonitor:
    """Collect host system metrics, store them, and alert on thresholds.

    The monitor samples the host with psutil, persists each sample as a
    ``HostMetrics`` row, compares the sample against the thresholds stored
    under the ``host_monitoring`` settings key, and periodically deletes
    old rows.
    """

    # Seconds to wait between two iterations of the monitoring loop.
    COLLECT_INTERVAL_SECONDS = 60
    # Minimum number of seconds between two purges of old metric rows.
    CLEANUP_INTERVAL_SECONDS = 3600

    def __init__(self):
        # Not read anywhere in this class; kept because they are public
        # attributes that other code may rely on.
        self.previous_disk_io = None
        self.previous_net_io = None
        # Loop flag: set by run_monitoring_loop(), cleared by stop().
        self.running = False
        # Event-loop time of the last completed cleanup (None = never ran).
        self._last_cleanup = None

    @staticmethod
    def _sample_metrics() -> dict:
        """Take one blocking psutil snapshot and return it as a dict.

        ``psutil.cpu_percent(interval=1)`` blocks for one second, so this
        helper is meant to be executed in a worker thread.
        """
        # CPU
        cpu_percent = psutil.cpu_percent(interval=1)
        cpu_count = psutil.cpu_count()

        # Memory
        mem = psutil.virtual_memory()

        # Load average
        load_1, load_5, load_15 = psutil.getloadavg()

        # Disk I/O and network counters: psutil may return None when the
        # host exposes no disks / interfaces, so fall back to 0.
        disk_io = psutil.disk_io_counters()
        net_io = psutil.net_io_counters()

        # Disk usage of the root filesystem
        disk_usage = psutil.disk_usage('/')

        return {
            'timestamp': datetime.now(timezone.utc),
            'cpu_percent': cpu_percent,
            'cpu_count': cpu_count,
            'memory_total': mem.total,
            'memory_used': mem.used,
            'memory_percent': mem.percent,
            'load_1': load_1,
            'load_5': load_5,
            'load_15': load_15,
            'disk_read_bytes': disk_io.read_bytes if disk_io is not None else 0,
            'disk_write_bytes': disk_io.write_bytes if disk_io is not None else 0,
            'disk_usage_percent': disk_usage.percent,
            'net_sent_bytes': net_io.bytes_sent if net_io is not None else 0,
            'net_recv_bytes': net_io.bytes_recv if net_io is not None else 0,
        }

    async def collect_metrics(self) -> dict:
        """Collect current system metrics.

        Returns:
            A dict with the keys ``timestamp``, ``cpu_percent``,
            ``cpu_count``, ``memory_total``, ``memory_used``,
            ``memory_percent``, ``load_1``, ``load_5``, ``load_15``,
            ``disk_read_bytes``, ``disk_write_bytes``,
            ``disk_usage_percent``, ``net_sent_bytes`` and
            ``net_recv_bytes``.
        """
        # The psutil sampling blocks (one-second CPU interval); run it in
        # the default executor so the event loop stays responsive.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._sample_metrics)

    async def store_metrics(self, metrics: dict):
        """Store metrics in database.

        Args:
            metrics: Sample as returned by ``collect_metrics``; it is
                passed as keyword arguments to ``HostMetrics``.
        """
        async with async_session_maker() as session:
            session.add(HostMetrics(**metrics))
            await session.commit()

    async def _alert_if_percent_exceeds(
        self,
        metrics: dict,
        thresholds: dict,
        metric_key: str,
        threshold_key: str,
        label: str,
    ):
        """Create an alert when a percentage metric is above its threshold.

        The threshold defaults to 90 when ``threshold_key`` is not
        configured. Severity is ``warning`` below 95 and ``critical`` from
        95 upwards.
        """
        value = metrics[metric_key]
        threshold = thresholds.get(threshold_key, 90)
        if value > threshold:
            await alert_service.create_alert(
                alert_type='host_metrics',
                severity='warning' if value < 95 else 'critical',
                title=f'High {label} Usage: {value:.1f}%',
                message=f'{label} usage is at {value:.1f}%, threshold is {threshold}%',
                alert_metadata={'metric': metric_key, 'value': value},
            )

    async def check_thresholds(self, metrics: dict):
        """Check if metrics exceed configured thresholds and create alerts.

        Thresholds are read from the ``Settings`` row whose key is
        ``host_monitoring``. When that row does not exist nothing is
        checked; when it exists but holds no value, the defaults apply.

        Args:
            metrics: Sample as returned by ``collect_metrics``.
        """
        # Function-scope import kept from the original code.
        # NOTE(review): presumably avoids a circular import - confirm.
        from app.models.settings import Settings

        async with async_session_maker() as session:
            result = await session.execute(
                select(Settings).where(Settings.key == "host_monitoring")
            )
            settings = result.scalar_one_or_none()
            if not settings:
                return
            # An empty/None value means "use the default thresholds".
            thresholds = settings.value or {}

        # Check CPU
        await self._alert_if_percent_exceeds(
            metrics, thresholds, 'cpu_percent', 'cpu_threshold', 'CPU'
        )

        # Check Memory
        await self._alert_if_percent_exceeds(
            metrics, thresholds, 'memory_percent', 'memory_threshold', 'Memory'
        )

        # Check Load Average (threshold is per CPU, scaled by the CPU count)
        load_threshold = thresholds.get('load_threshold', 2.0) * metrics['cpu_count']
        if metrics['load_1'] > load_threshold:
            await alert_service.create_alert(
                alert_type='host_metrics',
                severity='warning',
                title=f'High Load Average: {metrics["load_1"]:.2f}',
                message=f'1-minute load average is {metrics["load_1"]:.2f}, threshold is {load_threshold:.2f}',
                alert_metadata={'metric': 'load_1', 'value': metrics['load_1']},
            )

        # Check Disk Usage
        await self._alert_if_percent_exceeds(
            metrics, thresholds, 'disk_usage_percent', 'disk_threshold', 'Disk'
        )

    async def cleanup_old_metrics(self, days: int = 30):
        """Delete metrics older than specified days.

        Args:
            days: Rows whose timestamp is more than this many days before
                the current UTC time are deleted.
        """
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        async with async_session_maker() as session:
            await session.execute(
                delete(HostMetrics).where(HostMetrics.timestamp < cutoff)
            )
            await session.commit()

    def _cleanup_due(self, now: float) -> bool:
        """Return True when no cleanup ran yet or the interval has elapsed.

        Args:
            now: Current time in seconds, on the same clock that is used
                to set ``_last_cleanup``.
        """
        last = self._last_cleanup
        return last is None or now - last >= self.CLEANUP_INTERVAL_SECONDS

    async def run_monitoring_loop(self):
        """Main monitoring loop - runs in background.

        Each iteration collects a sample, stores it, checks thresholds and,
        at most once per ``CLEANUP_INTERVAL_SECONDS``, purges old rows.
        Errors are printed and the loop continues with the next iteration.
        The loop ends once ``stop()`` has cleared ``running`` and the
        current wait has finished.
        """
        print("[HostMonitor] Starting host monitoring loop")
        self.running = True
        while self.running:
            try:
                metrics = await self.collect_metrics()
                await self.store_metrics(metrics)
                await self.check_thresholds(metrics)

                # Elapsed-time check instead of "minute == 0": an iteration
                # takes longer than 60 s, so a wall-clock minute can be
                # skipped and the cleanup would never fire in that hour.
                now = asyncio.get_running_loop().time()
                if self._cleanup_due(now):
                    await self.cleanup_old_metrics()
                    self._last_cleanup = now
            except Exception as e:
                # Top-level boundary of a background task: report and keep going.
                print(f"[HostMonitor] Error in monitoring loop: {e}")

            # Wait before the next collection (also after an error).
            await asyncio.sleep(self.COLLECT_INTERVAL_SECONDS)

    async def stop(self):
        """Stop monitoring loop."""
        print("[HostMonitor] Stopping host monitoring loop")
        self.running = False
# Global monitor instance, created once when this module is imported.
host_monitor: HostMonitor = HostMonitor()
|