385 lines
16 KiB
Python
385 lines
16 KiB
Python
# [DEF:health_service:Module]
# @COMPLEXITY: 3
# @SEMANTICS: health, aggregation, dashboards
# @PURPOSE: Business logic for aggregating dashboard health status from validation records and deleting persisted validation reports.
# @LAYER: Domain/Service
# @RELATION: [DEPENDS_ON] ->[ValidationRecord]
# @RELATION: [DEPENDS_ON] ->[SupersetClient]
# @RELATION: [DEPENDS_ON] ->[TaskCleanupService]
# @RELATION: [DEPENDS_ON] ->[TaskManager]
|
|
|
|
from typing import List, Dict, Any, Optional, Tuple, cast
|
|
import time
|
|
from sqlalchemy.orm import Session
|
|
from sqlalchemy import func
|
|
import os
|
|
from ..models.llm import ValidationRecord
|
|
from ..schemas.health import DashboardHealthItem, HealthSummaryResponse
|
|
from ..core.logger import logger
|
|
from ..core.superset_client import SupersetClient
|
|
from ..core.task_manager.cleanup import TaskCleanupService
|
|
from ..core.task_manager import TaskManager
|
|
|
|
|
|
def _empty_dashboard_meta() -> Dict[str, Optional[str]]:
|
|
return cast(Dict[str, Optional[str]], {"slug": None, "title": None})
|
|
|
|
|
|
# [DEF:HealthService:Class]
|
|
# @COMPLEXITY: 4
|
|
# @PURPOSE: Aggregate latest dashboard validation state and manage persisted health report lifecycle.
|
|
# @PRE: Service is constructed with a live SQLAlchemy session and optional config manager.
|
|
# @POST: Exposes health summary aggregation and validation report deletion operations.
|
|
# @SIDE_EFFECT: Maintains in-memory dashboard metadata caches and may coordinate cleanup through collaborators.
|
|
# @DATA_CONTRACT: Input[Session, Optional[Any]] -> Output[HealthSummaryResponse|bool]
|
|
# @RELATION: [DEPENDS_ON] ->[ValidationRecord]
|
|
# @RELATION: [DEPENDS_ON] ->[DashboardHealthItem]
|
|
# @RELATION: [DEPENDS_ON] ->[HealthSummaryResponse]
|
|
# @RELATION: [DEPENDS_ON] ->[SupersetClient]
|
|
# @RELATION: [DEPENDS_ON] ->[TaskCleanupService]
|
|
# @RELATION: [DEPENDS_ON] ->[TaskManager]
|
|
class HealthService:
    """
    @PURPOSE: Service for managing and querying dashboard health data.
    @PRE: Constructed with a SQLAlchemy session and an optional config manager.
    @POST: Exposes health summary aggregation and validation report deletion.
    """

    # Cache of Superset dashboard listings shared by all instances (class
    # attribute), keyed by environment id. Each value is
    # (time.monotonic() at fetch, {dashboard id as str -> {"slug", "title"}}).
    _dashboard_summary_cache: Dict[
        str, Tuple[float, Dict[str, Dict[str, Optional[str]]]]
    ] = {}
    # Entries older than this many seconds are refetched from Superset.
    _dashboard_summary_cache_ttl_seconds = 60.0

    # [DEF:HealthService_init:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Initialize health service with DB session and optional config access for dashboard metadata resolution.
    # @PRE: db is a valid SQLAlchemy session.
    # @POST: Service is ready to aggregate summaries and delete health reports.
    # @SIDE_EFFECT: Initializes per-instance dashboard metadata cache.
    # @DATA_CONTRACT: Input[db: Session, config_manager: Optional[Any]] -> Output[HealthService]
    # @RELATION: [BINDS_TO] ->[HealthService]
    def __init__(self, db: Session, config_manager=None):
        """Store collaborators and start with an empty per-instance metadata cache."""
        self.db = db
        self.config_manager = config_manager
        # Keyed by (environment_id, dashboard_id); values hold "slug"/"title".
        self._dashboard_meta_cache: Dict[Tuple[str, str], Dict[str, Optional[str]]] = {}

    # [/DEF:HealthService_init:Function]

    # [DEF:_prime_dashboard_meta_cache:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Warm dashboard slug/title cache with one Superset list fetch per environment.
    # @PRE: records may contain mixed numeric and slug dashboard identifiers.
    # @POST: Every numeric (environment, dashboard) pair found in records has a cache entry (real or empty).
    # @SIDE_EFFECT: May call Superset dashboard list API once per referenced environment.
    # @DATA_CONTRACT: Input[records: List[ValidationRecord]] -> Output[None]
    # @RELATION: [DEPENDS_ON] ->[ValidationRecord]
    # @RELATION: [DEPENDS_ON] ->[ConfigManager]
    # @RELATION: [DEPENDS_ON] ->[SupersetClient]
    def _prime_dashboard_meta_cache(self, records: List[ValidationRecord]) -> None:
        """Preload slug/title metadata for the numeric dashboard ids in *records*.

        Does nothing without a config manager or without records. Failures
        while talking to Superset are logged and replaced by empty metadata,
        so this method never raises because of a Superset error.
        """
        if not self.config_manager or not records:
            return

        # Collect the not-yet-cached numeric ids, grouped by environment.
        numeric_ids_by_env: Dict[str, set[str]] = {}
        for record in records:
            environment_id = str(record.environment_id or "").strip()
            dashboard_id = str(record.dashboard_id or "").strip()
            if not environment_id or not dashboard_id or not dashboard_id.isdigit():
                continue
            if (environment_id, dashboard_id) in self._dashboard_meta_cache:
                continue
            numeric_ids_by_env.setdefault(environment_id, set()).add(dashboard_id)

        if not numeric_ids_by_env:
            return

        environments = {
            str(getattr(env, "id", "")).strip(): env
            for env in self.config_manager.get_environments()
            if str(getattr(env, "id", "")).strip()
        }

        for environment_id, dashboard_ids in numeric_ids_by_env.items():
            env = environments.get(environment_id)
            if not env:
                # Unknown environment: remember "nothing known" so it is not retried.
                for dashboard_id in dashboard_ids:
                    self._dashboard_meta_cache[(environment_id, dashboard_id)] = (
                        _empty_dashboard_meta()
                    )
                continue

            try:
                cached_meta = self.__class__._dashboard_summary_cache.get(
                    environment_id
                )
                dashboard_meta_map: Dict[str, Dict[str, Optional[str]]]
                if (
                    cached_meta is not None
                    and (time.monotonic() - cached_meta[0])
                    < self.__class__._dashboard_summary_cache_ttl_seconds
                ):
                    # Shared listing is still fresh; reuse it.
                    dashboard_meta_map = cached_meta[1]
                else:
                    dashboards = SupersetClient(env).get_dashboards_summary()
                    dashboard_meta_map = {
                        str(item.get("id")): {
                            "slug": item.get("slug"),
                            "title": item.get("title"),
                        }
                        for item in dashboards
                        if str(item.get("id") or "").strip()
                    }
                    self.__class__._dashboard_summary_cache[environment_id] = (
                        time.monotonic(),
                        dashboard_meta_map,
                    )
                for dashboard_id in dashboard_ids:
                    self._dashboard_meta_cache[(environment_id, dashboard_id)] = (
                        dashboard_meta_map.get(
                            dashboard_id,
                            _empty_dashboard_meta(),
                        )
                    )
            except Exception as exc:
                # Best effort: metadata enrichment must not break the summary.
                logger.warning(
                    "[HealthService][_prime_dashboard_meta_cache] Failed to preload dashboard metadata for env=%s: %s",
                    environment_id,
                    exc,
                )
                for dashboard_id in dashboard_ids:
                    self._dashboard_meta_cache[(environment_id, dashboard_id)] = (
                        _empty_dashboard_meta()
                    )

    # [/DEF:_prime_dashboard_meta_cache:Function]

    # [DEF:_resolve_dashboard_meta:Function]
    # @COMPLEXITY: 1
    # @PURPOSE: Resolve slug/title for a dashboard referenced by persisted validation record.
    # @PRE: dashboard_id may be numeric or slug-like; environment_id may be empty.
    # @POST: Returns dict with `slug` and `title` keys, using cache when possible.
    # @SIDE_EFFECT: Writes default cache entries for unresolved numeric dashboard ids.
    def _resolve_dashboard_meta(
        self, dashboard_id: str, environment_id: Optional[str]
    ) -> Dict[str, Optional[str]]:
        """Return ``{"slug", "title"}`` for a dashboard without calling Superset.

        A non-numeric id is treated as the slug itself. A numeric id is looked
        up in the per-instance cache only; a miss is stored and returned as
        empty metadata.
        """
        normalized_dashboard_id = str(dashboard_id or "").strip()
        normalized_environment_id = str(environment_id or "").strip()
        if not normalized_dashboard_id:
            return _empty_dashboard_meta()

        if not normalized_dashboard_id.isdigit():
            return {"slug": normalized_dashboard_id, "title": None}

        if not self.config_manager or not normalized_environment_id:
            return _empty_dashboard_meta()

        cache_key = (normalized_environment_id, normalized_dashboard_id)
        cached = self._dashboard_meta_cache.get(cache_key)
        if cached is not None:
            return cached

        meta = _empty_dashboard_meta()
        self._dashboard_meta_cache[cache_key] = meta
        return meta

    # [/DEF:_resolve_dashboard_meta:Function]

    # [DEF:get_health_summary:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Aggregate latest validation status per dashboard and enrich rows with dashboard slug/title.
    # @PRE: environment_id may be omitted to aggregate across all environments.
    # @POST: Returns HealthSummaryResponse with counts and latest record row per (dashboard, environment).
    # @SIDE_EFFECT: May call Superset API to resolve dashboard metadata.
    # @DATA_CONTRACT: Input[environment_id: Optional[str]] -> Output[HealthSummaryResponse]
    # @RELATION: [CALLS] ->[_prime_dashboard_meta_cache]
    # @RELATION: [CALLS] ->[_resolve_dashboard_meta]
    async def get_health_summary(
        self, environment_id: str = ""
    ) -> HealthSummaryResponse:
        """
        @PURPOSE: Aggregates the latest validation status for all dashboards.
        @PRE: environment_id (optional) to filter by environment.
        @POST: Returns a HealthSummaryResponse with aggregated status counts and items.
        """
        # [REASON] We need the latest ValidationRecord for each dashboard. The
        # same dashboard_id can appear in several environments, so "latest" is
        # computed per (dashboard_id, environment_id), the same key that
        # delete_validation_report uses to select peer records.
        latest = self.db.query(
            ValidationRecord.dashboard_id.label("dashboard_id"),
            ValidationRecord.environment_id.label("environment_id"),
            func.max(ValidationRecord.timestamp).label("max_ts"),
        )
        if environment_id:
            latest = latest.filter(ValidationRecord.environment_id == environment_id)
        latest = latest.group_by(
            ValidationRecord.dashboard_id, ValidationRecord.environment_id
        ).subquery()

        # NULL-safe environment match: records without an environment must
        # still pair with their own (NULL-environment) group.
        same_environment = (
            ValidationRecord.environment_id == latest.c.environment_id
        ) | (
            ValidationRecord.environment_id.is_(None)
            & latest.c.environment_id.is_(None)
        )

        # NOTE: records tied on the max timestamp within one group are all returned.
        records = (
            self.db.query(ValidationRecord)
            .join(
                latest,
                (ValidationRecord.dashboard_id == latest.c.dashboard_id)
                & (ValidationRecord.timestamp == latest.c.max_ts)
                & same_environment,
            )
            .all()
        )

        self._prime_dashboard_meta_cache(records)

        items = []
        counts = {"PASS": 0, "WARN": 0, "FAIL": 0, "UNKNOWN": 0}

        for rec in records:
            record = cast(Any, rec)
            status = str(record.status or "").upper()
            if status not in ("PASS", "WARN", "FAIL"):
                # Anything unrecognised (including empty) is reported as UNKNOWN.
                status = "UNKNOWN"
            counts[status] += 1

            dashboard_id = str(record.dashboard_id or "")
            resolved_environment_id = (
                str(record.environment_id)
                if record.environment_id is not None
                else None
            )
            meta = self._resolve_dashboard_meta(dashboard_id, resolved_environment_id)

            items.append(
                DashboardHealthItem(
                    record_id=str(record.id or ""),
                    dashboard_id=dashboard_id,
                    dashboard_slug=meta.get("slug"),
                    dashboard_title=meta.get("title"),
                    # The response reports a missing environment as "unknown".
                    environment_id=(
                        resolved_environment_id
                        if resolved_environment_id is not None
                        else "unknown"
                    ),
                    status=status,
                    last_check=cast(Any, record.timestamp),
                    task_id=str(record.task_id) if record.task_id is not None else None,
                    summary=str(record.summary) if record.summary is not None else None,
                )
            )

        logger.info(
            "[HealthService][get_health_summary] Aggregated %s dashboard health records.",
            len(items),
        )

        return HealthSummaryResponse(
            items=items,
            pass_count=counts["PASS"],
            warn_count=counts["WARN"],
            fail_count=counts["FAIL"],
            unknown_count=counts["UNKNOWN"],
        )

    # [/DEF:get_health_summary:Function]

    # [DEF:delete_validation_report:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Delete one persisted health report and optionally clean linked task/log artifacts.
    # @PRE: record_id is a validation record identifier.
    # @POST: Returns True only when a matching record was deleted.
    # @SIDE_EFFECT: Deletes DB rows, optional screenshot file, and optional task/log persistence.
    # @DATA_CONTRACT: Input[record_id: str, task_manager: Optional[TaskManager]] -> Output[bool]
    # @RELATION: [DEPENDS_ON] ->[ValidationRecord]
    # @RELATION: [DEPENDS_ON] ->[TaskManager]
    # @RELATION: [DEPENDS_ON] ->[TaskCleanupService]
    def delete_validation_report(
        self, record_id: str, task_manager: Optional[TaskManager] = None
    ) -> bool:
        """Delete the report *record_id* and all records for the same dashboard/environment.

        Returns ``False`` when no record has that id, ``True`` otherwise. A
        failure while deleting or committing rolls the session back and is
        re-raised. Screenshot and task/log cleanup run after the commit and
        only log warnings when they fail.
        """
        record = (
            self.db.query(ValidationRecord)
            .filter(ValidationRecord.id == record_id)
            .first()
        )
        if not record:
            return False

        # Captured up front so later logging does not touch a deleted ORM instance.
        dashboard_id = record.dashboard_id
        environment_id = record.environment_id

        peer_query = self.db.query(ValidationRecord).filter(
            ValidationRecord.dashboard_id == dashboard_id
        )
        if environment_id is None:
            peer_query = peer_query.filter(ValidationRecord.environment_id.is_(None))
        else:
            peer_query = peer_query.filter(
                ValidationRecord.environment_id == environment_id
            )

        records_to_delete = peer_query.all()
        screenshot_paths = [
            str(item.screenshot_path or "").strip()
            for item in records_to_delete
            if str(item.screenshot_path or "").strip()
        ]
        task_ids = {
            str(item.task_id or "").strip()
            for item in records_to_delete
            if str(item.task_id or "").strip()
        }

        logger.info(
            "[HealthService][delete_validation_report] Removing %s validation record(s) for dashboard=%s environment=%s triggered_by_record=%s",
            len(records_to_delete),
            dashboard_id,
            environment_id,
            record_id,
        )

        try:
            for item in records_to_delete:
                self.db.delete(item)
            self.db.commit()
        except Exception:
            # Leave the caller-owned session usable after a failed delete/commit.
            self.db.rollback()
            raise

        for screenshot_path in screenshot_paths:
            try:
                os.remove(screenshot_path)
            except FileNotFoundError:
                # Already gone; nothing to clean up.
                continue
            except OSError as exc:
                logger.warning(
                    "[HealthService][delete_validation_report] Failed to remove screenshot %s: %s",
                    screenshot_path,
                    exc,
                )

        if task_ids and task_manager and self.config_manager:
            cleanup_failed_message = "[HealthService][delete_validation_report] Failed to cleanup linked task/logs for dashboard=%s environment=%s: %s"
            try:
                cleanup_service = TaskCleanupService(
                    task_manager.persistence_service,
                    task_manager.log_persistence_service,
                    self.config_manager,
                )
            except Exception as exc:
                logger.warning(
                    cleanup_failed_message, dashboard_id, environment_id, exc
                )
            else:
                for task_id in task_ids:
                    # Isolate failures so one bad task does not block the others.
                    try:
                        task_manager.tasks.pop(task_id, None)
                        cleanup_service.delete_task_with_logs(task_id)
                    except Exception as exc:
                        logger.warning(
                            cleanup_failed_message,
                            dashboard_id,
                            environment_id,
                            exc,
                        )

        return True

    # [/DEF:delete_validation_report:Function]
|
|
|
|
|
|
# [/DEF:HealthService:Class]
|
|
|
|
# [/DEF:health_service:Module]
|