@@ -7,7 +7,7 @@ import time
 from datetime import datetime, timedelta, timezone
 
 import psutil
-from sqlalchemy import delete, select
+from sqlalchemy import delete, select, text
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.core.database import async_session_maker
@@ -23,8 +23,165 @@ class HostMonitor:
         self.previous_net_io = None
         self.previous_cpu_stats = None
         self.previous_timestamp = None
+        self.previous_pg_stats = None
         self.running = False
 
+    async def collect_postgresql_metrics(self) -> dict:
+        """Collect PostgreSQL database metrics."""
+        try:
+            async with async_session_maker() as session:
+                # Active connections
+                result = await session.execute(text("""
+                    SELECT count(*) as active,
+                           (SELECT setting::int FROM pg_settings WHERE name = 'max_connections') as max
+                    FROM pg_stat_activity
+                    WHERE state = 'active'
+                """))
+                row = result.fetchone()
+                active_connections = row[0] if row else 0
+                total_connections = row[1] if row else 100
+
+                # Database size; current_database() avoids interpolating the
+                # database name into the SQL string
+                result = await session.execute(text("""
+                    SELECT pg_database_size(current_database())
+                """))
+                db_size = result.scalar() or 0
+
+                # Cache hit ratio
+                result = await session.execute(text("""
+                    SELECT
+                        sum(blks_hit) * 100.0 / NULLIF(sum(blks_hit) + sum(blks_read), 0) as cache_hit_ratio
+                    FROM pg_stat_database
+                """))
+                cache_hit_ratio = float(result.scalar() or 0)
+
+                # Transactions per second (delta-based)
+                result = await session.execute(text("""
+                    SELECT sum(xact_commit + xact_rollback) as total_xacts,
+                           sum(deadlocks) as deadlocks,
+                           sum(temp_files) as temp_files
+                    FROM pg_stat_database
+                """))
+                row = result.fetchone()
+                total_xacts = row[0] or 0
+                deadlocks = row[1] or 0
+                temp_files = row[2] or 0
+
+                # Calculate TPS
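+                # pg_stat_database counters are cumulative since the last
+                # stats reset, so the rate is the delta between samples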
+                tps = 0
+                if self.previous_pg_stats and self.previous_timestamp:
+                    time_delta = time.time() - self.previous_timestamp
+                    if time_delta > 0:
+                        tps = int((total_xacts - self.previous_pg_stats['xacts']) / time_delta)
+
+                self.previous_pg_stats = {'xacts': total_xacts}
+
+                return {
+                    'pg_active_connections': active_connections,
+                    'pg_total_connections': total_connections,
+                    'pg_database_size_bytes': db_size,
+                    'pg_cache_hit_ratio': round(cache_hit_ratio, 2),
+                    'pg_transactions_per_sec': max(0, tps),
+                    'pg_deadlocks': deadlocks,
+                    'pg_temp_files': temp_files,
+                }
+        except Exception as e:
+            print(f"[HostMonitor] Error collecting PostgreSQL metrics: {e}")
+            return {
+                'pg_active_connections': 0,
+                'pg_total_connections': 0,
+                'pg_database_size_bytes': 0,
+                'pg_cache_hit_ratio': 0,
+                'pg_transactions_per_sec': 0,
+                'pg_deadlocks': 0,
+                'pg_temp_files': 0,
+            }
+
+    async def collect_clickhouse_metrics(self) -> dict:
+        """Collect ClickHouse database metrics."""
+        try:
+            import clickhouse_connect
+            from app.config import settings
+
+            # Check if ClickHouse is configured
+            if not hasattr(settings, 'CLICKHOUSE_HOST'):
+                return {
+                    'ch_active_queries': 0,
+                    'ch_database_size_bytes': 0,
+                    'ch_queries_per_sec': 0,
+                    'ch_rows_read_per_sec': 0,
+                    'ch_memory_usage_bytes': 0,
+                }
+
+            # Connect to ClickHouse
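+            # NOTE: clickhouse_connect's client is synchronous; these calls
+            # briefly block the event loop during collection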
+            client = clickhouse_connect.get_client(
+                host=settings.CLICKHOUSE_HOST,
+                port=settings.CLICKHOUSE_PORT,
+                username=settings.CLICKHOUSE_USER,
+                password=settings.CLICKHOUSE_PASSWORD,
+            )
+
+            # Active queries
+            result = client.query("SELECT count() FROM system.processes")
+            active_queries = result.result_rows[0][0] if result.result_rows else 0
+
+            # Database size
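+            # Only parts marked active count toward on-disk size; inactive
+            # parts are leftovers awaiting cleanup after merges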
+            result = client.query("""
+                SELECT sum(bytes) FROM system.parts
+                WHERE active AND database NOT IN ('system', 'information_schema')
+            """)
+            db_size = result.result_rows[0][0] if result.result_rows else 0
+
+            # Query stats over the last 60-second window; count only
+            # QueryFinish events so each query is counted once, with its
+            # final read_rows and memory_usage populated
+            result = client.query("""
+                SELECT
+                    count() as queries,
+                    sum(read_rows) as rows_read,
+                    sum(memory_usage) as memory
+                FROM system.query_log
+                WHERE event_time > now() - INTERVAL 60 SECOND
+                  AND type = 'QueryFinish'
+            """)
+            row = result.result_rows[0] if result.result_rows else (0, 0, 0)
+            queries = row[0] or 0
+            rows_read = row[1] or 0
+            memory_usage = row[2] or 0
+
+            # Calculate QPS: the query above samples a fixed 60-second
+            # window rather than a cumulative counter, so average over the
+            # window instead of diffing against a previous reading
+            qps = int(queries / 60)
+            rows_per_sec = int(rows_read / 60)
+
+            # Close the per-cycle client so connections do not accumulate
+            client.close()
+
+            return {
+                'ch_active_queries': active_queries,
+                'ch_database_size_bytes': db_size or 0,
+                'ch_queries_per_sec': max(0, qps),
+                'ch_rows_read_per_sec': max(0, rows_per_sec),
+                'ch_memory_usage_bytes': memory_usage or 0,
+            }
+        except Exception as e:
+            print(f"[HostMonitor] Error collecting ClickHouse metrics: {e}")
+            return {
+                'ch_active_queries': 0,
+                'ch_database_size_bytes': 0,
+                'ch_queries_per_sec': 0,
+                'ch_rows_read_per_sec': 0,
+                'ch_memory_usage_bytes': 0,
+            }
+
     async def collect_metrics(self) -> dict:
         """Collect comprehensive system metrics."""
         current_timestamp = time.time()
@@ -166,7 +323,27 @@
             'process_count': len(psutil.pids()),
             'thread_count': sum(p.num_threads() for p in psutil.process_iter() if p.is_running()),
             'top_cpu_processes': top_cpu_clean,
             'top_mem_processes': top_mem_clean,
+        }
+
+        # Collect database metrics
+        pg_metrics = await self.collect_postgresql_metrics()
+        ch_metrics = await self.collect_clickhouse_metrics()
+
+        # Collect HTTP metrics
+        from app.core.http_metrics import http_metrics_collector
+        http_metrics = http_metrics_collector.get_metrics()
+
+        # Merge all metrics
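+        # Keys are namespaced (pg_, ch_, http_), so the merge cannot collide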
+        return {
+            **metrics,
+            **pg_metrics,
+            **ch_metrics,
+            'http_requests_per_sec': http_metrics['requests_per_sec'],
+            'http_avg_response_time_ms': round(http_metrics['avg_response_time_ms'], 2),
+            'http_error_rate': round(http_metrics['error_rate'], 2),
+            'http_active_requests': http_metrics['active_requests'],
         }
 
     async def store_metrics(self, metrics: dict):