Files
ss-tools/backend/src/services/health_service.py
2026-03-27 21:27:31 +03:00

385 lines
16 KiB
Python

# [DEF:health_service:Module]
# @COMPLEXITY: 3
# @SEMANTICS: health, aggregation, dashboards
# @PURPOSE: Business logic for aggregating dashboard health status from validation records.
# @LAYER: Domain/Service
# @RELATION: [DEPENDS_ON] ->[ValidationRecord]
# @RELATION: [DEPENDS_ON] ->[SupersetClient]
# @RELATION: [DEPENDS_ON] ->[TaskCleanupService]
# @RELATION: [DEPENDS_ON] ->[TaskManager]
from typing import List, Dict, Any, Optional, Tuple, cast
import time
from sqlalchemy.orm import Session
from sqlalchemy import func
import os
from ..models.llm import ValidationRecord
from ..schemas.health import DashboardHealthItem, HealthSummaryResponse
from ..core.logger import logger
from ..core.superset_client import SupersetClient
from ..core.task_manager.cleanup import TaskCleanupService
from ..core.task_manager import TaskManager
def _empty_dashboard_meta() -> Dict[str, Optional[str]]:
return cast(Dict[str, Optional[str]], {"slug": None, "title": None})
# [DEF:HealthService:Class]
# @COMPLEXITY: 4
# @PURPOSE: Aggregate latest dashboard validation state and manage persisted health report lifecycle.
# @PRE: Service is constructed with a live SQLAlchemy session and optional config manager.
# @POST: Exposes health summary aggregation and validation report deletion operations.
# @SIDE_EFFECT: Maintains in-memory dashboard metadata caches and may coordinate cleanup through collaborators.
# @DATA_CONTRACT: Input[Session, Optional[Any]] -> Output[HealthSummaryResponse|bool]
# @RELATION: [DEPENDS_ON] ->[ValidationRecord]
# @RELATION: [DEPENDS_ON] ->[DashboardHealthItem]
# @RELATION: [DEPENDS_ON] ->[HealthSummaryResponse]
# @RELATION: [DEPENDS_ON] ->[SupersetClient]
# @RELATION: [DEPENDS_ON] ->[TaskCleanupService]
# @RELATION: [DEPENDS_ON] ->[TaskManager]
class HealthService:
    """
    @PURPOSE: Service for managing and querying dashboard health data.

    Reads the latest ``ValidationRecord`` per dashboard and environment,
    enriches rows with slug/title metadata fetched through ``SupersetClient``,
    and deletes persisted reports together with their screenshots and linked
    task/log artifacts.
    """

    # Cache shared by every instance of the class:
    # environment id -> (time.monotonic() at fetch, {dashboard id -> {"slug", "title"}}).
    _dashboard_summary_cache: Dict[
        str, Tuple[float, Dict[str, Dict[str, Optional[str]]]]
    ] = {}
    # Shared cache entries older than this many seconds are refetched.
    _dashboard_summary_cache_ttl_seconds = 60.0

    # [DEF:HealthService_init:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Initialize health service with DB session and optional config access for dashboard metadata resolution.
    # @PRE: db is a valid SQLAlchemy session.
    # @POST: Service is ready to aggregate summaries and delete health reports.
    # @SIDE_EFFECT: Initializes per-instance dashboard metadata cache.
    # @DATA_CONTRACT: Input[db: Session, config_manager: Optional[Any]] -> Output[HealthService]
    # @RELATION: [BINDS_TO] ->[HealthService]
    def __init__(self, db: Session, config_manager=None):
        """Bind the service to a DB session and an optional config manager.

        Args:
            db: SQLAlchemy session used for all queries and deletions.
            config_manager: Optional object exposing ``get_environments()``.
                Without it, dashboard metadata is not resolved and linked
                task cleanup is skipped.
        """
        self.db = db
        self.config_manager = config_manager
        # (environment id, dashboard id) -> {"slug": ..., "title": ...}
        self._dashboard_meta_cache: Dict[
            Tuple[str, str], Dict[str, Optional[str]]
        ] = {}

    # [/DEF:HealthService_init:Function]

    # [DEF:_load_dashboard_meta_map:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Return the dashboard id -> slug/title map of one environment through the shared TTL cache.
    # @PRE: env is the environment object registered under environment_id.
    # @POST: Class-level cache holds an entry for environment_id that is younger than the TTL.
    # @SIDE_EFFECT: May call Superset dashboard list API and overwrite the class-level cache entry.
    # @DATA_CONTRACT: Input[environment_id: str, env: Any] -> Output[Dict[str, Dict[str, Optional[str]]]]
    # @RELATION: [DEPENDS_ON] ->[SupersetClient]
    def _load_dashboard_meta_map(
        self, environment_id: str, env: Any
    ) -> Dict[str, Dict[str, Optional[str]]]:
        """Return slug/title metadata keyed by dashboard id for one environment.

        A cached map is reused while it is younger than
        ``_dashboard_summary_cache_ttl_seconds``; otherwise the dashboard list
        is fetched again and the cache entry is replaced. Exceptions raised by
        the Superset client propagate to the caller.
        """
        shared_cache = self.__class__._dashboard_summary_cache
        ttl_seconds = self.__class__._dashboard_summary_cache_ttl_seconds

        cached_entry = shared_cache.get(environment_id)
        if cached_entry is not None:
            fetched_at, cached_map = cached_entry
            if time.monotonic() - fetched_at < ttl_seconds:
                return cached_map

        dashboards = SupersetClient(env).get_dashboards_summary()
        meta_map: Dict[str, Dict[str, Optional[str]]] = {
            str(item.get("id")): {
                "slug": item.get("slug"),
                "title": item.get("title"),
            }
            for item in dashboards
            if str(item.get("id") or "").strip()
        }
        shared_cache[environment_id] = (time.monotonic(), meta_map)
        return meta_map

    # [/DEF:_load_dashboard_meta_map:Function]

    # [DEF:_prime_dashboard_meta_cache:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Warm dashboard slug/title cache with one Superset list fetch per environment.
    # @PRE: records may contain mixed numeric and slug dashboard identifiers.
    # @POST: Numeric dashboard ids for known environments are cached when discoverable.
    # @SIDE_EFFECT: May call Superset dashboard list API once per referenced environment.
    # @DATA_CONTRACT: Input[records: List[ValidationRecord]] -> Output[None]
    # @RELATION: [DEPENDS_ON] ->[ValidationRecord]
    # @RELATION: [DEPENDS_ON] ->[ConfigManager]
    # @RELATION: [DEPENDS_ON] ->[SupersetClient]
    # @RELATION: [CALLS] ->[_load_dashboard_meta_map]
    def _prime_dashboard_meta_cache(self, records: List[ValidationRecord]) -> None:
        """Populate the per-instance metadata cache for the given records.

        Only records with a non-empty environment id and an all-digit
        dashboard id are considered. Ids that cannot be resolved (unknown
        environment, missing dashboard, or a failed fetch) are cached with an
        empty placeholder so they are not looked up again by this instance.
        """
        if not self.config_manager or not records:
            return

        # Collect the numeric ids that still need metadata, grouped by environment.
        numeric_ids_by_env: Dict[str, set[str]] = {}
        for record in records:
            environment_id = str(record.environment_id or "").strip()
            dashboard_id = str(record.dashboard_id or "").strip()
            if not environment_id or not dashboard_id or not dashboard_id.isdigit():
                continue
            if (environment_id, dashboard_id) in self._dashboard_meta_cache:
                continue
            numeric_ids_by_env.setdefault(environment_id, set()).add(dashboard_id)
        if not numeric_ids_by_env:
            return

        environments = {
            str(getattr(env, "id", "")).strip(): env
            for env in self.config_manager.get_environments()
            if str(getattr(env, "id", "")).strip()
        }

        for environment_id, dashboard_ids in numeric_ids_by_env.items():
            env = environments.get(environment_id)
            if not env:
                # Environment is not configured: nothing can be resolved for it.
                for dashboard_id in dashboard_ids:
                    self._dashboard_meta_cache[(environment_id, dashboard_id)] = (
                        _empty_dashboard_meta()
                    )
                continue
            try:
                dashboard_meta_map = self._load_dashboard_meta_map(
                    environment_id, env
                )
                for dashboard_id in dashboard_ids:
                    self._dashboard_meta_cache[(environment_id, dashboard_id)] = (
                        dashboard_meta_map.get(
                            dashboard_id,
                            _empty_dashboard_meta(),
                        )
                    )
            except Exception as exc:
                # Metadata is best-effort enrichment: log and fall back to placeholders.
                logger.warning(
                    "[HealthService][_prime_dashboard_meta_cache] Failed to preload dashboard metadata for env=%s: %s",
                    environment_id,
                    exc,
                )
                for dashboard_id in dashboard_ids:
                    self._dashboard_meta_cache[(environment_id, dashboard_id)] = (
                        _empty_dashboard_meta()
                    )

    # [/DEF:_prime_dashboard_meta_cache:Function]

    # [DEF:_resolve_dashboard_meta:Function]
    # @COMPLEXITY: 1
    # @PURPOSE: Resolve slug/title for a dashboard referenced by persisted validation record.
    # @PRE: dashboard_id may be numeric or slug-like; environment_id may be empty.
    # @POST: Returns dict with `slug` and `title` keys, using cache when possible.
    # @SIDE_EFFECT: Writes default cache entries for unresolved numeric dashboard ids.
    def _resolve_dashboard_meta(
        self, dashboard_id: str, environment_id: Optional[str]
    ) -> Dict[str, Optional[str]]:
        """Return ``{"slug", "title"}`` for a dashboard reference.

        A non-numeric id is returned as the slug itself with no title. A
        numeric id is answered from the per-instance cache; a cache miss
        stores and returns an empty placeholder. This method never calls
        Superset on its own.
        """
        normalized_dashboard_id = str(dashboard_id or "").strip()
        normalized_environment_id = str(environment_id or "").strip()
        if not normalized_dashboard_id:
            return _empty_dashboard_meta()
        if not normalized_dashboard_id.isdigit():
            return {"slug": normalized_dashboard_id, "title": None}
        if not self.config_manager or not normalized_environment_id:
            return _empty_dashboard_meta()
        cache_key = (normalized_environment_id, normalized_dashboard_id)
        return self._dashboard_meta_cache.setdefault(
            cache_key, _empty_dashboard_meta()
        )

    # [/DEF:_resolve_dashboard_meta:Function]

    # [DEF:get_health_summary:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Aggregate latest validation status per dashboard and enrich rows with dashboard slug/title.
    # @PRE: environment_id may be omitted to aggregate across all environments.
    # @POST: Returns HealthSummaryResponse with counts and latest record row per (environment, dashboard) pair.
    # @SIDE_EFFECT: May call Superset API to resolve dashboard metadata.
    # @DATA_CONTRACT: Input[environment_id: Optional[str]] -> Output[HealthSummaryResponse]
    # @RELATION: [CALLS] ->[_prime_dashboard_meta_cache]
    # @RELATION: [CALLS] ->[_resolve_dashboard_meta]
    async def get_health_summary(
        self, environment_id: str = ""
    ) -> HealthSummaryResponse:
        """
        @PURPOSE: Aggregates the latest validation status for all dashboards.
        @PRE: environment_id (optional) to filter by environment.
        @POST: Returns a HealthSummaryResponse with aggregated status counts and items.

        Statuses other than PASS, WARN and FAIL (compared case-insensitively)
        are reported and counted as UNKNOWN. Records without an environment
        are reported with ``environment_id="unknown"``.
        """
        # NOTE(review): declared async, but the session query and the Superset
        # fetch below are called without await; confirm this cannot stall the
        # event loop for the callers.

        # [REASON] A dashboard is identified by (dashboard_id, environment_id):
        # the same id may exist in several environments. The subquery therefore
        # finds the max timestamp per pair, not per dashboard_id alone.
        latest = self.db.query(
            ValidationRecord.dashboard_id,
            ValidationRecord.environment_id,
            func.max(ValidationRecord.timestamp).label("max_ts"),
        )
        if environment_id:
            latest = latest.filter(ValidationRecord.environment_id == environment_id)
        latest = latest.group_by(
            ValidationRecord.dashboard_id,
            ValidationRecord.environment_id,
        ).subquery()

        # NULL never compares equal in SQL, so records without an environment
        # need an explicit IS NULL match on both sides.
        same_environment = (
            ValidationRecord.environment_id == latest.c.environment_id
        ) | (
            ValidationRecord.environment_id.is_(None)
            & latest.c.environment_id.is_(None)
        )
        query = self.db.query(ValidationRecord).join(
            latest,
            (ValidationRecord.dashboard_id == latest.c.dashboard_id)
            & same_environment
            & (ValidationRecord.timestamp == latest.c.max_ts),
        )
        records = query.all()

        # One Superset list fetch per environment instead of one call per row.
        self._prime_dashboard_meta_cache(records)

        items = []
        counts = {"PASS": 0, "WARN": 0, "FAIL": 0, "UNKNOWN": 0}
        for rec in records:
            record = cast(Any, rec)
            status = str(record.status or "").upper()
            if status not in counts:
                status = "UNKNOWN"
            counts[status] += 1

            dashboard_id = str(record.dashboard_id or "")
            resolved_environment_id = (
                str(record.environment_id)
                if record.environment_id is not None
                else None
            )
            meta = self._resolve_dashboard_meta(dashboard_id, resolved_environment_id)
            items.append(
                DashboardHealthItem(
                    record_id=str(record.id or ""),
                    dashboard_id=dashboard_id,
                    dashboard_slug=meta.get("slug"),
                    dashboard_title=meta.get("title"),
                    environment_id=(
                        resolved_environment_id
                        if resolved_environment_id is not None
                        else "unknown"
                    ),
                    status=status,
                    last_check=cast(Any, record.timestamp),
                    task_id=(
                        str(record.task_id) if record.task_id is not None else None
                    ),
                    summary=(
                        str(record.summary) if record.summary is not None else None
                    ),
                )
            )

        logger.info(
            "[HealthService][get_health_summary] Aggregated %s dashboard health records.",
            len(items),
        )
        return HealthSummaryResponse(
            items=items,
            pass_count=counts["PASS"],
            warn_count=counts["WARN"],
            fail_count=counts["FAIL"],
            unknown_count=counts["UNKNOWN"],
        )

    # [/DEF:get_health_summary:Function]

    # [DEF:delete_validation_report:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Delete one persisted health report and optionally clean linked task/log artifacts.
    # @PRE: record_id is a validation record identifier.
    # @POST: Returns True only when a matching record was deleted.
    # @SIDE_EFFECT: Deletes DB rows, optional screenshot file, and optional task/log persistence.
    # @DATA_CONTRACT: Input[record_id: str, task_manager: Optional[TaskManager]] -> Output[bool]
    # @RELATION: [DEPENDS_ON] ->[ValidationRecord]
    # @RELATION: [DEPENDS_ON] ->[TaskManager]
    # @RELATION: [DEPENDS_ON] ->[TaskCleanupService]
    def delete_validation_report(
        self, record_id: str, task_manager: Optional[TaskManager] = None
    ) -> bool:
        """Delete a health report and every peer record of the same dashboard.

        All validation records sharing the dashboard id and environment id of
        the referenced record are removed in one commit. Screenshot files and
        linked task/log data are then cleaned up on a best-effort basis:
        failures there are logged as warnings and do not change the result.

        Args:
            record_id: Identifier of the validation record that triggers the
                deletion.
            task_manager: Optional task manager; linked tasks are cleaned up
                only when both it and the config manager are available.

        Returns:
            False when no record matches ``record_id``, True otherwise.

        Raises:
            Exception: Whatever the session raises while deleting or
                committing; the session is rolled back before re-raising.
        """
        record = (
            self.db.query(ValidationRecord)
            .filter(ValidationRecord.id == record_id)
            .first()
        )
        if not record:
            return False

        # Snapshot the identifiers so later log calls do not read attributes
        # of ORM instances that have already been deleted and committed.
        dashboard_id = record.dashboard_id
        environment_id = record.environment_id

        peer_query = self.db.query(ValidationRecord).filter(
            ValidationRecord.dashboard_id == dashboard_id
        )
        if environment_id is None:
            peer_query = peer_query.filter(ValidationRecord.environment_id.is_(None))
        else:
            peer_query = peer_query.filter(
                ValidationRecord.environment_id == environment_id
            )
        records_to_delete = peer_query.all()

        # Gather artifact references before the rows disappear.
        # dict.fromkeys de-duplicates while keeping the original order.
        screenshot_paths = list(
            dict.fromkeys(
                path
                for path in (
                    str(item.screenshot_path or "").strip()
                    for item in records_to_delete
                )
                if path
            )
        )
        task_ids = {
            task_id
            for task_id in (
                str(item.task_id or "").strip() for item in records_to_delete
            )
            if task_id
        }

        logger.info(
            "[HealthService][delete_validation_report] Removing %s validation record(s) for dashboard=%s environment=%s triggered_by_record=%s",
            len(records_to_delete),
            dashboard_id,
            environment_id,
            record_id,
        )
        try:
            for item in records_to_delete:
                self.db.delete(item)
            self.db.commit()
        except Exception:
            # Keep the session usable for the caller, then surface the failure.
            self.db.rollback()
            raise

        for screenshot_path in screenshot_paths:
            try:
                os.remove(screenshot_path)
            except FileNotFoundError:
                # Already gone: nothing left to clean up for this path.
                continue
            except OSError as exc:
                logger.warning(
                    "[HealthService][delete_validation_report] Failed to remove screenshot %s: %s",
                    screenshot_path,
                    exc,
                )

        if task_ids and task_manager and self.config_manager:
            try:
                cleanup_service = TaskCleanupService(
                    task_manager.persistence_service,
                    task_manager.log_persistence_service,
                    self.config_manager,
                )
            except Exception as exc:
                logger.warning(
                    "[HealthService][delete_validation_report] Failed to cleanup linked task/logs for dashboard=%s environment=%s: %s",
                    dashboard_id,
                    environment_id,
                    exc,
                )
                return True
            for task_id in task_ids:
                # Isolate failures so one broken task does not block the rest.
                try:
                    task_manager.tasks.pop(task_id, None)
                    cleanup_service.delete_task_with_logs(task_id)
                except Exception as exc:
                    logger.warning(
                        "[HealthService][delete_validation_report] Failed to cleanup linked task/logs for task=%s dashboard=%s environment=%s: %s",
                        task_id,
                        dashboard_id,
                        environment_id,
                        exc,
                    )
        return True

    # [/DEF:delete_validation_report:Function]
# [/DEF:HealthService:Class]
# [/DEF:health_service:Module]