# root / mybeacon-backend
"""
Host monitoring service for collecting system metrics.
"""

import asyncio
from datetime import datetime, timedelta, timezone

import psutil
from sqlalchemy import delete, select
from sqlalchemy.ext.asyncio import AsyncSession

from app.core.database import async_session_maker
from app.models.host_metrics import HostMetrics
from app.services.alert_service import alert_service


class HostMonitor:
    """Collect host system metrics, persist them, and raise threshold alerts.

    One monitoring cycle (see ``run_monitoring_loop``) samples CPU, memory,
    load average, disk and network counters via ``psutil``, stores the sample
    as a ``HostMetrics`` row, compares it against the thresholds stored under
    the ``"host_monitoring"`` settings key, and periodically deletes old rows.
    """

    # Seconds to wait between two monitoring cycles.
    COLLECT_INTERVAL_SECONDS = 60
    # Minimum time between two successful cleanups of old metric rows.
    CLEANUP_INTERVAL = timedelta(hours=1)

    def __init__(self):
        # Not read anywhere in this class; kept because they are public
        # attributes that other code may rely on.
        self.previous_disk_io = None
        self.previous_net_io = None
        # Loop flag: set by run_monitoring_loop(), cleared by stop().
        self.running = False
        # UTC time of the last successful cleanup; None until the first one.
        self._last_cleanup = None

    @staticmethod
    def _counter(counters, field: str) -> int:
        """Return ``counters.<field>``, or 0 when *counters* is ``None``.

        psutil documents that ``disk_io_counters()`` / ``net_io_counters()``
        return ``None`` on machines without disks / network interfaces.
        """
        if counters is None:
            return 0
        return getattr(counters, field)

    def _cleanup_due(self, now: datetime) -> bool:
        """Return True when no cleanup has run yet or the interval has elapsed."""
        if self._last_cleanup is None:
            return True
        return now - self._last_cleanup >= self.CLEANUP_INTERVAL

    async def collect_metrics(self) -> dict:
        """Collect current system metrics.

        Returns:
            A dict with the keys ``timestamp`` (timezone-aware UTC datetime),
            ``cpu_percent``, ``cpu_count``, ``memory_total``, ``memory_used``,
            ``memory_percent``, ``load_1``, ``load_5``, ``load_15``,
            ``disk_read_bytes``, ``disk_write_bytes``, ``disk_usage_percent``
            (for the ``/`` mount), ``net_sent_bytes`` and ``net_recv_bytes``.
            ``store_metrics`` passes these keys straight to ``HostMetrics``.
        """
        # cpu_percent(interval=1) sleeps for the whole interval, so run it in
        # the default executor instead of stalling the event loop.
        loop = asyncio.get_running_loop()
        cpu_percent = await loop.run_in_executor(
            None, lambda: psutil.cpu_percent(interval=1)
        )

        mem = psutil.virtual_memory()
        load_1, load_5, load_15 = psutil.getloadavg()

        # Either call may return None; record 0 rather than crash the cycle.
        disk_io = psutil.disk_io_counters()
        net_io = psutil.net_io_counters()

        return {
            'timestamp': datetime.now(timezone.utc),
            'cpu_percent': cpu_percent,
            'cpu_count': psutil.cpu_count(),
            'memory_total': mem.total,
            'memory_used': mem.used,
            'memory_percent': mem.percent,
            'load_1': load_1,
            'load_5': load_5,
            'load_15': load_15,
            'disk_read_bytes': self._counter(disk_io, 'read_bytes'),
            'disk_write_bytes': self._counter(disk_io, 'write_bytes'),
            'disk_usage_percent': psutil.disk_usage('/').percent,
            'net_sent_bytes': self._counter(net_io, 'bytes_sent'),
            'net_recv_bytes': self._counter(net_io, 'bytes_recv'),
        }

    async def store_metrics(self, metrics: dict) -> None:
        """Store metrics in database.

        Args:
            metrics: Sample as returned by ``collect_metrics``; its items are
                used as keyword arguments for ``HostMetrics``.
        """
        async with async_session_maker() as session:
            session.add(HostMetrics(**metrics))
            await session.commit()

    async def _alert_if_percent_exceeded(
        self,
        metrics: dict,
        thresholds: dict,
        metric_key: str,
        threshold_key: str,
        label: str,
    ) -> None:
        """Create an alert when a percentage metric is above its threshold.

        The threshold defaults to 90 when *threshold_key* is not configured.
        Severity is ``'warning'`` below 95 and ``'critical'`` from 95 upwards.
        """
        value = metrics[metric_key]
        limit = thresholds.get(threshold_key, 90)
        if value > limit:
            await alert_service.create_alert(
                alert_type='host_metrics',
                severity='warning' if value < 95 else 'critical',
                title=f'High {label} Usage: {value:.1f}%',
                message=f'{label} usage is at {value:.1f}%, threshold is {limit}%',
                alert_metadata={'metric': metric_key, 'value': value},
            )

    async def check_thresholds(self, metrics: dict) -> None:
        """Check if metrics exceed configured thresholds and create alerts.

        Thresholds are read from the ``Settings`` row whose key is
        ``"host_monitoring"``; when that row does not exist nothing is checked.

        Args:
            metrics: Sample as returned by ``collect_metrics``.
        """
        async with async_session_maker() as session:
            # Imported here rather than at module level, as in the original
            # code (presumably to avoid a circular import; verify before moving).
            from app.models.settings import Settings

            result = await session.execute(
                select(Settings).where(Settings.key == "host_monitoring")
            )
            settings = result.scalar_one_or_none()

            if not settings:
                return

            # NOTE(review): assumes settings.value is a dict-like mapping of
            # threshold names to numbers; confirm against the Settings model.
            thresholds = settings.value

        await self._alert_if_percent_exceeded(
            metrics, thresholds, 'cpu_percent', 'cpu_threshold', 'CPU'
        )
        await self._alert_if_percent_exceeded(
            metrics, thresholds, 'memory_percent', 'memory_threshold', 'Memory'
        )

        # Load average threshold is per CPU, so scale it by the CPU count.
        load_threshold = thresholds.get('load_threshold', 2.0) * metrics['cpu_count']
        if metrics['load_1'] > load_threshold:
            await alert_service.create_alert(
                alert_type='host_metrics',
                severity='warning',
                title=f'High Load Average: {metrics["load_1"]:.2f}',
                message=f'1-minute load average is {metrics["load_1"]:.2f}, threshold is {load_threshold:.2f}',
                alert_metadata={'metric': 'load_1', 'value': metrics['load_1']},
            )

        await self._alert_if_percent_exceeded(
            metrics, thresholds, 'disk_usage_percent', 'disk_threshold', 'Disk'
        )

    async def cleanup_old_metrics(self, days: int = 30) -> None:
        """Delete metrics older than specified days.

        Args:
            days: Rows whose ``timestamp`` is more than this many days before
                the current UTC time are deleted.
        """
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)

        async with async_session_maker() as session:
            await session.execute(
                delete(HostMetrics).where(HostMetrics.timestamp < cutoff)
            )
            await session.commit()

    async def run_monitoring_loop(self) -> None:
        """Main monitoring loop - runs in background.

        Repeats collect -> store -> check thresholds until ``stop()`` clears
        ``self.running``. Old rows are cleaned up on the first cycle and then
        whenever ``CLEANUP_INTERVAL`` has elapsed since the last successful
        cleanup. Any exception in a cycle is printed and the loop continues.
        """
        print("[HostMonitor] Starting host monitoring loop")
        self.running = True

        while self.running:
            try:
                metrics = await self.collect_metrics()
                await self.store_metrics(metrics)
                await self.check_thresholds(metrics)

                # Elapsed-time check: a wall-clock "minute == 0" test can be
                # stepped over because one cycle lasts longer than 60 seconds.
                now = datetime.now(timezone.utc)
                if self._cleanup_due(now):
                    await self.cleanup_old_metrics()
                    # Only recorded on success, so a failed cleanup is retried
                    # on the next cycle.
                    self._last_cleanup = now

            except Exception as e:
                # Top-level boundary: keep the background loop alive.
                print(f"[HostMonitor] Error in monitoring loop: {e}")

            # Same pause after a successful and after a failed cycle.
            await asyncio.sleep(self.COLLECT_INTERVAL_SECONDS)

    async def stop(self) -> None:
        """Stop monitoring loop.

        Only clears the flag; the loop exits the next time it re-checks
        ``self.running``, i.e. after the current cycle and sleep finish.
        """
        print("[HostMonitor] Stopping host monitoring loop")
        self.running = False


# Global instance
# Created once at import time, so every importer of this module gets the same
# HostMonitor object (presumably started and stopped by the application's
# startup/shutdown code; verify against callers).
host_monitor = HostMonitor()