monitoring.py 9.9 KB


  1. """
  2. Superadmin monitoring endpoints for host metrics and alerts.
  3. """
from datetime import datetime, timedelta, timezone

from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.api.deps import get_current_superadmin, get_db
from app.models.alert import Alert
from app.models.host_metrics import HostMetrics
from app.models.security_event import SecurityEvent
from app.services.alert_service import alert_service
# Router on which every endpoint defined in this module is registered.
router = APIRouter()
  14. @router.get("/metrics")
  15. async def get_current_metrics(
  16. _current_user=Depends(get_current_superadmin),
  17. ):
  18. """
  19. Get current system metrics (latest snapshot) for dashboard cards.
  20. Returns PostgreSQL, ClickHouse, and HTTP/API metrics.
  21. """
  22. from app.services.host_monitor import host_monitor
  23. # Get the most recent metrics from the monitor
  24. latest_metrics = host_monitor.latest_metrics
  25. if not latest_metrics:
  26. return {
  27. "postgresql": None,
  28. "clickhouse": None,
  29. "http": None,
  30. }
  31. return {
  32. "postgresql": latest_metrics.get("postgresql"),
  33. "clickhouse": latest_metrics.get("clickhouse"),
  34. "http": latest_metrics.get("http"),
  35. }
  36. @router.get("/host-metrics/recent")
  37. async def get_recent_host_metrics(
  38. limit: int = Query(default=60, le=1000),
  39. db: AsyncSession = Depends(get_db),
  40. _current_user=Depends(get_current_superadmin),
  41. ):
  42. """Get recent host metrics for dashboard charts (default: last 60 data points)."""
  43. result = await db.execute(
  44. select(HostMetrics)
  45. .order_by(HostMetrics.timestamp.desc())
  46. .limit(limit)
  47. )
  48. metrics = list(result.scalars().all())
  49. # Return in chronological order
  50. return {
  51. "metrics": [
  52. {
  53. "timestamp": m.timestamp.isoformat(),
  54. "cpu_percent": m.cpu_percent,
  55. "cpu_count": m.cpu_count,
  56. "cpu_per_core": m.cpu_per_core,
  57. "cpu_steal": m.cpu_steal,
  58. "context_switches_per_sec": m.context_switches_per_sec,
  59. "interrupts_per_sec": m.interrupts_per_sec,
  60. "memory_total": m.memory_total,
  61. "memory_used": m.memory_used,
  62. "memory_percent": m.memory_percent,
  63. "memory_available": m.memory_available,
  64. "memory_buffers": m.memory_buffers,
  65. "memory_cached": m.memory_cached,
  66. "swap_total": m.swap_total,
  67. "swap_used": m.swap_used,
  68. "swap_percent": m.swap_percent,
  69. "load_1": m.load_1,
  70. "load_5": m.load_5,
  71. "load_15": m.load_15,
  72. "disk_read_bytes": m.disk_read_bytes,
  73. "disk_write_bytes": m.disk_write_bytes,
  74. "disk_read_iops": m.disk_read_iops,
  75. "disk_write_iops": m.disk_write_iops,
  76. "disk_read_mbps": m.disk_read_mbps,
  77. "disk_write_mbps": m.disk_write_mbps,
  78. "disk_io_time_ms": m.disk_io_time_ms,
  79. "disk_usage_percent": m.disk_usage_percent,
  80. "net_sent_bytes": m.net_sent_bytes,
  81. "net_recv_bytes": m.net_recv_bytes,
  82. "net_in_mbps": m.net_in_mbps,
  83. "net_out_mbps": m.net_out_mbps,
  84. "net_packets_in_per_sec": m.net_packets_in_per_sec,
  85. "net_packets_out_per_sec": m.net_packets_out_per_sec,
  86. "net_errors_in": m.net_errors_in,
  87. "net_errors_out": m.net_errors_out,
  88. "net_drops_in": m.net_drops_in,
  89. "net_drops_out": m.net_drops_out,
  90. "process_count": m.process_count,
  91. "thread_count": m.thread_count,
  92. "top_cpu_processes": m.top_cpu_processes,
  93. "top_mem_processes": m.top_mem_processes,
  94. # PostgreSQL
  95. "pg_active_connections": m.pg_active_connections,
  96. "pg_total_connections": m.pg_total_connections,
  97. "pg_database_size_bytes": m.pg_database_size_bytes,
  98. "pg_cache_hit_ratio": m.pg_cache_hit_ratio,
  99. "pg_transactions_per_sec": m.pg_transactions_per_sec,
  100. "pg_deadlocks": m.pg_deadlocks,
  101. "pg_temp_files": m.pg_temp_files,
  102. # ClickHouse
  103. "ch_active_queries": m.ch_active_queries,
  104. "ch_database_size_bytes": m.ch_database_size_bytes,
  105. "ch_queries_per_sec": m.ch_queries_per_sec,
  106. "ch_rows_read_per_sec": m.ch_rows_read_per_sec,
  107. "ch_memory_usage_bytes": m.ch_memory_usage_bytes,
  108. # HTTP/API
  109. "http_requests_per_sec": m.http_requests_per_sec,
  110. "http_avg_response_time_ms": m.http_avg_response_time_ms,
  111. "http_error_rate": m.http_error_rate,
  112. "http_active_requests": m.http_active_requests,
  113. }
  114. for m in reversed(metrics)
  115. ]
  116. }
  117. @router.get("/host-metrics/history")
  118. async def get_host_metrics_history(
  119. start_date: datetime = Query(...),
  120. end_date: datetime = Query(...),
  121. db: AsyncSession = Depends(get_db),
  122. _current_user=Depends(get_current_superadmin),
  123. ):
  124. """Get historical host metrics for specified date range."""
  125. result = await db.execute(
  126. select(HostMetrics)
  127. .where(HostMetrics.timestamp >= start_date)
  128. .where(HostMetrics.timestamp <= end_date)
  129. .order_by(HostMetrics.timestamp.asc())
  130. )
  131. metrics = list(result.scalars().all())
  132. return [
  133. {
  134. "timestamp": m.timestamp.isoformat(),
  135. "cpu_percent": m.cpu_percent,
  136. "cpu_count": m.cpu_count,
  137. "memory_total": m.memory_total,
  138. "memory_used": m.memory_used,
  139. "memory_percent": m.memory_percent,
  140. "load_1": m.load_1,
  141. "load_5": m.load_5,
  142. "load_15": m.load_15,
  143. "disk_read_bytes": m.disk_read_bytes,
  144. "disk_write_bytes": m.disk_write_bytes,
  145. "disk_usage_percent": m.disk_usage_percent,
  146. "net_sent_bytes": m.net_sent_bytes,
  147. "net_recv_bytes": m.net_recv_bytes,
  148. }
  149. for m in metrics
  150. ]
  151. @router.get("/alerts")
  152. async def get_alerts(
  153. dismissed: bool = Query(default=False),
  154. limit: int = Query(default=100, le=1000),
  155. db: AsyncSession = Depends(get_db),
  156. _current_user=Depends(get_current_superadmin),
  157. ):
  158. """Get alerts (by default only active/non-dismissed alerts)."""
  159. query = select(Alert).order_by(Alert.timestamp.desc()).limit(limit)
  160. if not dismissed:
  161. query = query.where(Alert.dismissed == False)
  162. result = await db.execute(query)
  163. alerts = list(result.scalars().all())
  164. return {
  165. "alerts": [
  166. {
  167. "id": a.id,
  168. "timestamp": a.timestamp.isoformat(),
  169. "created_at": a.timestamp.isoformat(), # For frontend compatibility
  170. "alert_type": a.alert_type,
  171. "severity": a.severity,
  172. "title": a.title,
  173. "message": a.message,
  174. "alert_metadata": a.alert_metadata,
  175. "acknowledged": a.acknowledged,
  176. "acknowledged_at": a.acknowledged_at.isoformat() if a.acknowledged_at else None,
  177. "acknowledged_by": a.acknowledged_by,
  178. "dismissed": a.dismissed,
  179. "dismissed_at": a.dismissed_at.isoformat() if a.dismissed_at else None,
  180. "sent_dashboard": a.sent_dashboard,
  181. "sent_telegram": a.sent_telegram,
  182. "sent_email": a.sent_email,
  183. }
  184. for a in alerts
  185. ]
  186. }
  187. @router.post("/alerts/{alert_id}/acknowledge")
  188. async def acknowledge_alert(
  189. alert_id: int,
  190. db: AsyncSession = Depends(get_db),
  191. current_user=Depends(get_current_superadmin),
  192. ):
  193. """Mark alert as acknowledged."""
  194. await alert_service.acknowledge_alert(db, alert_id, current_user.id)
  195. return {"status": "ok"}
  196. @router.post("/alerts/{alert_id}/dismiss")
  197. async def dismiss_alert(
  198. alert_id: int,
  199. db: AsyncSession = Depends(get_db),
  200. _current_user=Depends(get_current_superadmin),
  201. ):
  202. """Mark alert as dismissed (hide from dashboard)."""
  203. await alert_service.dismiss_alert(db, alert_id)
  204. return {"status": "ok"}
  205. @router.get("/security-events")
  206. async def get_security_events(
  207. resolved: bool = Query(default=False),
  208. limit: int = Query(default=100, le=1000),
  209. db: AsyncSession = Depends(get_db),
  210. _current_user=Depends(get_current_superadmin),
  211. ):
  212. """Get security events (by default only unresolved events)."""
  213. query = select(SecurityEvent).order_by(SecurityEvent.timestamp.desc()).limit(limit)
  214. if not resolved:
  215. query = query.where(SecurityEvent.resolved == False)
  216. result = await db.execute(query)
  217. events = list(result.scalars().all())
  218. return [
  219. {
  220. "id": e.id,
  221. "timestamp": e.timestamp.isoformat(),
  222. "event_type": e.event_type,
  223. "severity": e.severity,
  224. "ip_address": e.ip_address,
  225. "user_agent": e.user_agent,
  226. "endpoint": e.endpoint,
  227. "description": e.description,
  228. "event_metadata": e.event_metadata,
  229. "resolved": e.resolved,
  230. "resolved_at": e.resolved_at.isoformat() if e.resolved_at else None,
  231. "resolved_by": e.resolved_by,
  232. }
  233. for e in events
  234. ]
  235. @router.post("/security-events/{event_id}/resolve")
  236. async def resolve_security_event(
  237. event_id: int,
  238. db: AsyncSession = Depends(get_db),
  239. current_user=Depends(get_current_superadmin),
  240. ):
  241. """Mark security event as resolved."""
  242. result = await db.execute(
  243. select(SecurityEvent).where(SecurityEvent.id == event_id)
  244. )
  245. event = result.scalar_one_or_none()
  246. if event:
  247. event.resolved = True
  248. event.resolved_at = datetime.now(timezone.utc)
  249. event.resolved_by = current_user.id
  250. await db.commit()
  251. return {"status": "ok"}