109 lines
4.1 KiB
Python
109 lines
4.1 KiB
Python
from __future__ import annotations
|
|
|
|
import time
|
|
from collections import deque
|
|
from threading import Lock
|
|
|
|
import httpx
|
|
|
|
from app.core.config import Settings
|
|
from app.core.logging import logger
|
|
|
|
|
|
class TrafficGuard:
    """In-memory request gate: per-minute rate cap plus error-rate circuit breaker.

    The rate cap is a fixed window keyed on the wall-clock minute and is
    shared by every path and caller of this instance. The circuit breaker
    looks at the responses recorded during the last
    ``settings.app_circuit_breaker_window_seconds`` seconds and, once enough
    of them are 5xx, rejects non-exempt paths for
    ``settings.app_circuit_breaker_cooldown_seconds`` seconds.

    All mutable state lives on the instance and is guarded by ``self._lock``.
    """

    # Paths that stay reachable while the circuit is open.
    _EXEMPT_PATHS = frozenset(
        {
            "/health",
            "/docs",
            "/openapi.json",
            "/poc/ops/system/metrics",
            "/poc/ops/ai/metrics",
        }
    )

    # Minimum spacing, in seconds, between two alerts.
    _ALERT_MIN_INTERVAL_SECONDS = 30

    def __init__(self, settings: Settings):
        """Create a guard with empty counters.

        Args:
            settings: Source of the ``app_rate_limit_per_minute``,
                ``app_circuit_breaker_*`` and ``alert_webhook_url`` values
                read by the other methods.
        """
        self.settings = settings
        self._lock = Lock()
        # Fixed-window rate limiter: current minute bucket and its hit count.
        self._minute = 0
        self._minute_count = 0
        # Wall-clock time until which the circuit is considered open.
        self._open_until = 0.0
        # Sliding window of (timestamp, status_code), oldest first.
        self._events: deque[tuple[float, int]] = deque()
        # Lifetime counters exposed through snapshot().
        self._requests = 0
        self._rate_limited = 0
        self._circuit_blocked = 0
        self._avg_latency_ms = 0.0
        # Wall-clock time of the last alert that passed the throttle.
        self._alert_last_sent = 0.0

    def allow(self, path: str) -> tuple[bool, str]:
        """Decide whether a request for *path* may proceed.

        The rate limit is checked first and applies to every path, exempt or
        not; the circuit breaker is checked second and skips exempt paths.

        Returns:
            ``(True, "ok")`` when admitted, otherwise ``(False, reason)``
            with ``reason`` being ``"rate_limited"`` or ``"circuit_open"``.
        """
        now = time.time()
        with self._lock:
            minute = int(now // 60)
            if self._minute != minute:
                # A new wall-clock minute started: reset the fixed window.
                self._minute = minute
                self._minute_count = 0
            if self._minute_count >= self.settings.app_rate_limit_per_minute:
                self._rate_limited += 1
                return False, "rate_limited"
            if self._open_until > now and not self._is_exempt(path):
                self._circuit_blocked += 1
                return False, "circuit_open"
            self._minute_count += 1
            self._requests += 1
            return True, "ok"

    def record(self, status_code: int, latency_ms: float) -> None:
        """Record one finished request and open the circuit if warranted.

        The circuit opens when the window holds at least
        ``app_circuit_breaker_min_requests`` events and the share of 5xx
        events reaches ``app_circuit_breaker_error_rate``. Every call that
        finds the window over the threshold pushes the open-until time
        forward, including calls made while the circuit is already open.

        Args:
            status_code: HTTP status of the response; ``>= 500`` is an error.
            latency_ms: Request latency, folded into the moving average.
        """
        now = time.time()
        alert: dict | None = None
        with self._lock:
            self._events.append((now, status_code))
            self._avg_latency_ms = self._ema(self._avg_latency_ms, latency_ms)
            self._trim(now)
            total = len(self._events)
            # total == 0 is only possible with a negative window setting, but
            # guarding it keeps the division below safe.
            if total == 0 or total < self.settings.app_circuit_breaker_min_requests:
                return
            error_rate = self._count_errors() / total
            if error_rate >= self.settings.app_circuit_breaker_error_rate:
                cooldown = self.settings.app_circuit_breaker_cooldown_seconds
                self._open_until = now + cooldown
                alert = {
                    "error_rate": round(error_rate, 4),
                    "window_requests": total,
                    "cooldown_seconds": cooldown,
                }
        # Send after releasing the lock: the webhook call can block for up to
        # its timeout and must not stall concurrent allow()/record() callers.
        if alert is not None:
            self._send_alert("app circuit opened", alert)

    def snapshot(self) -> dict[str, float | int]:
        """Return the current counters and window statistics as a flat dict.

        Expired events are trimmed first, so the ``window_*`` values describe
        only the active window. ``circuit_open`` is ``1`` or ``0``.
        """
        now = time.time()
        with self._lock:
            self._trim(now)
            total = len(self._events)
            errors = self._count_errors()
            return {
                "requests_total": self._requests,
                "rate_limited_total": self._rate_limited,
                "circuit_blocked_total": self._circuit_blocked,
                "window_requests": total,
                "window_errors": errors,
                "window_error_rate": round((errors / total), 4) if total else 0.0,
                "avg_latency_ms": round(self._avg_latency_ms, 2),
                "circuit_open": 1 if self._open_until > now else 0,
            }

    def _count_errors(self) -> int:
        """Count the 5xx events in the window; the caller must hold the lock."""
        return sum(1 for _, code in self._events if code >= 500)

    def _trim(self, now: float) -> None:
        """Drop events older than the window; the caller must hold the lock."""
        lower = now - self.settings.app_circuit_breaker_window_seconds
        while self._events and self._events[0][0] < lower:
            self._events.popleft()

    def _ema(self, prev: float, value: float, alpha: float = 0.2) -> float:
        """Return the exponential moving average of *prev* updated with *value*.

        A non-positive *prev* is treated as "no sample yet", so *value* is
        returned unchanged in that case.
        """
        if prev <= 0:
            return value
        return alpha * value + (1 - alpha) * prev

    def _is_exempt(self, path: str) -> bool:
        """Return True when *path* bypasses the circuit breaker (exact match)."""
        return path in self._EXEMPT_PATHS

    def _send_alert(self, message: str, extra: dict) -> None:
        """Log an alert and, if a webhook URL is configured, POST it as JSON.

        Alerts are throttled to one per ``_ALERT_MIN_INTERVAL_SECONDS``.
        Delivery is best effort: any failure is logged and never raised.
        Must be called WITHOUT ``self._lock`` held, because the lock is taken
        briefly here for the throttle check and is not reentrant.
        """
        now = time.time()
        with self._lock:
            if now - self._alert_last_sent < self._ALERT_MIN_INTERVAL_SECONDS:
                return
            self._alert_last_sent = now
        logger.warning("%s extra=%s", message, extra)
        if not self.settings.alert_webhook_url:
            return
        try:
            with httpx.Client(timeout=2.0) as client:
                response = client.post(
                    self.settings.alert_webhook_url,
                    json={"message": message, "extra": extra},
                )
                # Surface 4xx/5xx from the webhook instead of ignoring them.
                response.raise_for_status()
        except Exception:
            logger.exception("alert webhook send failed")
|