# host_monitor.py
"""
Host monitoring service for collecting system metrics.
"""
  4. import asyncio
  5. from datetime import datetime, timedelta, timezone
  6. import psutil
  7. from sqlalchemy import delete, select
  8. from sqlalchemy.ext.asyncio import AsyncSession
  9. from app.core.database import async_session_maker
  10. from app.models.host_metrics import HostMetrics
  11. from app.services.alert_service import alert_service
  12. class HostMonitor:
  13. """Collect and store host system metrics."""
  14. def __init__(self):
  15. self.previous_disk_io = None
  16. self.previous_net_io = None
  17. self.running = False
  18. async def collect_metrics(self) -> dict:
  19. """Collect current system metrics."""
  20. # CPU
  21. cpu_percent = psutil.cpu_percent(interval=1)
  22. cpu_count = psutil.cpu_count()
  23. # Memory
  24. mem = psutil.virtual_memory()
  25. memory_total = mem.total
  26. memory_used = mem.used
  27. memory_percent = mem.percent
  28. # Load Average
  29. load_avg = psutil.getloadavg()
  30. load_1, load_5, load_15 = load_avg
  31. # Disk I/O
  32. disk_io = psutil.disk_io_counters()
  33. disk_read_bytes = disk_io.read_bytes
  34. disk_write_bytes = disk_io.write_bytes
  35. # Disk Usage
  36. disk_usage = psutil.disk_usage('/')
  37. disk_usage_percent = disk_usage.percent
  38. # Network
  39. net_io = psutil.net_io_counters()
  40. net_sent_bytes = net_io.bytes_sent
  41. net_recv_bytes = net_io.bytes_recv
  42. return {
  43. 'timestamp': datetime.now(timezone.utc),
  44. 'cpu_percent': cpu_percent,
  45. 'cpu_count': cpu_count,
  46. 'memory_total': memory_total,
  47. 'memory_used': memory_used,
  48. 'memory_percent': memory_percent,
  49. 'load_1': load_1,
  50. 'load_5': load_5,
  51. 'load_15': load_15,
  52. 'disk_read_bytes': disk_read_bytes,
  53. 'disk_write_bytes': disk_write_bytes,
  54. 'disk_usage_percent': disk_usage_percent,
  55. 'net_sent_bytes': net_sent_bytes,
  56. 'net_recv_bytes': net_recv_bytes,
  57. }
  58. async def store_metrics(self, metrics: dict):
  59. """Store metrics in database."""
  60. async with async_session_maker() as session:
  61. metric = HostMetrics(**metrics)
  62. session.add(metric)
  63. await session.commit()
  64. async def check_thresholds(self, metrics: dict):
  65. """Check if metrics exceed configured thresholds and create alerts."""
  66. # Get thresholds from settings
  67. async with async_session_maker() as session:
  68. from app.models.settings import Settings
  69. result = await session.execute(
  70. select(Settings).where(Settings.key == "host_monitoring")
  71. )
  72. settings = result.scalar_one_or_none()
  73. if not settings:
  74. return
  75. thresholds = settings.value
  76. # Check CPU
  77. if metrics['cpu_percent'] > thresholds.get('cpu_threshold', 90):
  78. await alert_service.create_alert(
  79. alert_type='host_metrics',
  80. severity='warning' if metrics['cpu_percent'] < 95 else 'critical',
  81. title=f'High CPU Usage: {metrics["cpu_percent"]:.1f}%',
  82. message=f'CPU usage is at {metrics["cpu_percent"]:.1f}%, threshold is {thresholds.get("cpu_threshold", 90)}%',
  83. alert_metadata={'metric': 'cpu_percent', 'value': metrics['cpu_percent']},
  84. )
  85. # Check Memory
  86. if metrics['memory_percent'] > thresholds.get('memory_threshold', 90):
  87. await alert_service.create_alert(
  88. alert_type='host_metrics',
  89. severity='warning' if metrics['memory_percent'] < 95 else 'critical',
  90. title=f'High Memory Usage: {metrics["memory_percent"]:.1f}%',
  91. message=f'Memory usage is at {metrics["memory_percent"]:.1f}%, threshold is {thresholds.get("memory_threshold", 90)}%',
  92. alert_metadata={'metric': 'memory_percent', 'value': metrics['memory_percent']},
  93. )
  94. # Check Load Average (relative to CPU count)
  95. load_threshold = thresholds.get('load_threshold', 2.0) * metrics['cpu_count']
  96. if metrics['load_1'] > load_threshold:
  97. await alert_service.create_alert(
  98. alert_type='host_metrics',
  99. severity='warning',
  100. title=f'High Load Average: {metrics["load_1"]:.2f}',
  101. message=f'1-minute load average is {metrics["load_1"]:.2f}, threshold is {load_threshold:.2f}',
  102. alert_metadata={'metric': 'load_1', 'value': metrics['load_1']},
  103. )
  104. # Check Disk Usage
  105. if metrics['disk_usage_percent'] > thresholds.get('disk_threshold', 90):
  106. await alert_service.create_alert(
  107. alert_type='host_metrics',
  108. severity='warning' if metrics['disk_usage_percent'] < 95 else 'critical',
  109. title=f'High Disk Usage: {metrics["disk_usage_percent"]:.1f}%',
  110. message=f'Disk usage is at {metrics["disk_usage_percent"]:.1f}%, threshold is {thresholds.get("disk_threshold", 90)}%',
  111. alert_metadata={'metric': 'disk_usage_percent', 'value': metrics['disk_usage_percent']},
  112. )
  113. async def cleanup_old_metrics(self, days: int = 30):
  114. """Delete metrics older than specified days."""
  115. cutoff = datetime.now(timezone.utc) - timedelta(days=days)
  116. async with async_session_maker() as session:
  117. await session.execute(
  118. delete(HostMetrics).where(HostMetrics.timestamp < cutoff)
  119. )
  120. await session.commit()
  121. async def run_monitoring_loop(self):
  122. """Main monitoring loop - runs in background."""
  123. print("[HostMonitor] Starting host monitoring loop")
  124. self.running = True
  125. while self.running:
  126. try:
  127. # Collect metrics
  128. metrics = await self.collect_metrics()
  129. # Store in database
  130. await self.store_metrics(metrics)
  131. # Check thresholds
  132. await self.check_thresholds(metrics)
  133. # Cleanup old data once per hour
  134. if datetime.now().minute == 0:
  135. await self.cleanup_old_metrics()
  136. # Wait 60 seconds before next collection
  137. await asyncio.sleep(60)
  138. except Exception as e:
  139. print(f"[HostMonitor] Error in monitoring loop: {e}")
  140. await asyncio.sleep(60)
  141. async def stop(self):
  142. """Stop monitoring loop."""
  143. print("[HostMonitor] Stopping host monitoring loop")
  144. self.running = False
# Global instance: a single shared HostMonitor created when this module is imported.
host_monitor = HostMonitor()