feat(us1): add dataset review orchestration automatic review slice
This commit is contained in:
386
backend/src/services/dataset_review/orchestrator.py
Normal file
386
backend/src/services/dataset_review/orchestrator.py
Normal file
@@ -0,0 +1,386 @@
|
||||
# [DEF:DatasetReviewOrchestrator:Module]
|
||||
# @COMPLEXITY: 5
|
||||
# @SEMANTICS: dataset_review, orchestration, session_lifecycle, intake, recovery
|
||||
# @PURPOSE: Coordinate dataset review session startup and lifecycle-safe intake recovery for one authenticated user.
|
||||
# @LAYER: Domain
|
||||
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticSourceResolver]
|
||||
# @RELATION: [DEPENDS_ON] ->[ClarificationEngine]
|
||||
# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
|
||||
# @RELATION: [DEPENDS_ON] ->[SupersetCompilationAdapter]
|
||||
# @RELATION: [DEPENDS_ON] ->[TaskManager]
|
||||
# @PRE: session mutations must execute inside a persisted session boundary scoped to one authenticated user.
|
||||
# @POST: state transitions are persisted atomically and emit observable progress for long-running steps.
|
||||
# @SIDE_EFFECT: creates task records, updates session aggregates, triggers upstream Superset calls, persists audit artifacts.
|
||||
# @DATA_CONTRACT: Input[SessionCommand] -> Output[DatasetReviewSession | CompiledPreview | DatasetRunContext]
|
||||
# @INVARIANT: Launch is blocked unless a current session has no open blocking findings, all launch-sensitive mappings are approved, and a non-stale Superset-generated compiled preview matches the current input fingerprint.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# [DEF:DatasetReviewOrchestrator.imports:Block]
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from src.core.config_manager import ConfigManager
|
||||
from src.core.logger import belief_scope, logger
|
||||
from src.core.task_manager import TaskManager
|
||||
from src.core.utils.superset_context_extractor import (
|
||||
SupersetContextExtractor,
|
||||
SupersetParsedContext,
|
||||
)
|
||||
from src.models.auth import User
|
||||
from src.models.dataset_review import (
|
||||
BusinessSummarySource,
|
||||
ConfidenceState,
|
||||
DatasetProfile,
|
||||
DatasetReviewSession,
|
||||
FindingArea,
|
||||
FindingSeverity,
|
||||
RecommendedAction,
|
||||
ReadinessState,
|
||||
ResolutionState,
|
||||
SessionPhase,
|
||||
SessionStatus,
|
||||
ValidationFinding,
|
||||
)
|
||||
from src.services.dataset_review.repositories.session_repository import (
|
||||
DatasetReviewSessionRepository,
|
||||
)
|
||||
from src.services.dataset_review.semantic_resolver import SemanticSourceResolver
|
||||
# [/DEF:DatasetReviewOrchestrator.imports:Block]
|
||||
|
||||
|
||||
# [DEF:StartSessionCommand:Class]
# @COMPLEXITY: 2
# @PURPOSE: Typed input contract for starting a dataset review session.
@dataclass
class StartSessionCommand:
    """Input contract for ``DatasetReviewOrchestrator.start_session``.

    Fields are normalized (stripped) by the orchestrator before use:
    ``source_kind`` must normalize to ``"superset_link"`` or
    ``"dataset_selection"``, ``source_input`` must be non-empty, and
    ``environment_id`` must resolve via the config manager, otherwise
    ``start_session`` raises ``ValueError``.
    """

    # Authenticated user; ``user.id`` becomes the session owner.
    user: User
    # Identifier of the target environment (resolved via ConfigManager).
    environment_id: str
    # Intake mode: "superset_link" or "dataset_selection".
    source_kind: str
    # Raw link or dataset reference supplied by the user.
    source_input: str
# [/DEF:StartSessionCommand:Class]
|
||||
|
||||
|
||||
# [DEF:StartSessionResult:Class]
# @COMPLEXITY: 2
# @PURPOSE: Session-start result carrying the persisted session and intake recovery metadata.
@dataclass
class StartSessionResult:
    """Result of ``DatasetReviewOrchestrator.start_session``.

    Carries the persisted session plus any Superset context that was
    parsed during intake and the warning findings produced for a
    partial recovery (empty for a fully clean start).
    """

    # The session as persisted by the repository (ownership already bound).
    session: DatasetReviewSession
    # Parsed Superset link context; None for dataset-selection intake.
    parsed_context: Optional[SupersetParsedContext] = None
    # Warning-level findings emitted for partial Superset recovery.
    findings: List[ValidationFinding] = field(default_factory=list)
# [/DEF:StartSessionResult:Class]
|
||||
|
||||
|
||||
# [DEF:DatasetReviewOrchestrator:Class]
# @COMPLEXITY: 5
# @PURPOSE: Coordinate safe session startup while preserving cross-user isolation and explicit partial recovery.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
# @RELATION: [DEPENDS_ON] ->[TaskManager]
# @RELATION: [DEPENDS_ON] ->[ConfigManager]
# @PRE: constructor dependencies are valid and tied to the current request/task scope.
# @POST: orchestrator instance can execute session-scoped mutations for one authenticated user.
# @SIDE_EFFECT: downstream operations may persist session/profile/finding state and enqueue background tasks.
# @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
# @INVARIANT: session ownership is preserved on every mutation and recovery remains explicit when partial.
class DatasetReviewOrchestrator:
    """Orchestrate dataset review session startup for one authenticated user.

    Validates intake, persists the session and its initial profile, projects
    partial Superset recovery into explicit warning findings, and optionally
    links a background recovery task when a task manager is available.
    """

    # [DEF:DatasetReviewOrchestrator.__init__:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Bind repository, config, and task dependencies required by the orchestration boundary.
    # @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
    # @RELATION: [DEPENDS_ON] ->[ConfigManager]
    def __init__(
        self,
        repository: DatasetReviewSessionRepository,
        config_manager: ConfigManager,
        task_manager: Optional[TaskManager] = None,
        semantic_resolver: Optional[SemanticSourceResolver] = None,
    ) -> None:
        """Bind collaborators; a default SemanticSourceResolver is created if none is injected."""
        self.repository = repository
        self.config_manager = config_manager
        # Optional: when absent, start_session continues synchronously without a recovery task.
        self.task_manager = task_manager
        self.semantic_resolver = semantic_resolver or SemanticSourceResolver()
    # [/DEF:DatasetReviewOrchestrator.__init__:Function]

    # [DEF:DatasetReviewOrchestrator.start_session:Function]
    # @COMPLEXITY: 5
    # @PURPOSE: Initialize a new session from a Superset link or dataset selection and trigger context recovery.
    # @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
    # @RELATION: [CALLS] ->[SupersetContextExtractor.parse_superset_link]
    # @RELATION: [CALLS] ->[create_task]
    # @PRE: source input is non-empty and environment is accessible.
    # @POST: session exists in persisted storage with intake/recovery state and task linkage when async work is required.
    # @SIDE_EFFECT: persists session and may enqueue recovery task.
    # @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
    # @INVARIANT: no cross-user session leakage occurs; session and follow-up task remain owned by the authenticated user.
    def start_session(self, command: StartSessionCommand) -> StartSessionResult:
        """Validate intake, persist the session/profile/findings, and link a recovery task.

        Raises:
            ValueError: empty source input, unsupported source kind, or unknown environment.
        """
        with belief_scope("DatasetReviewOrchestrator.start_session"):
            normalized_source_kind = str(command.source_kind or "").strip()
            normalized_source_input = str(command.source_input or "").strip()
            normalized_environment_id = str(command.environment_id or "").strip()

            if not normalized_source_input:
                logger.explore("Blocked dataset review session start due to empty source input")
                raise ValueError("source_input must be non-empty")

            if normalized_source_kind not in {"superset_link", "dataset_selection"}:
                logger.explore(
                    "Blocked dataset review session start due to unsupported source kind",
                    extra={"source_kind": normalized_source_kind},
                )
                raise ValueError("source_kind must be 'superset_link' or 'dataset_selection'")

            environment = self.config_manager.get_environment(normalized_environment_id)
            if environment is None:
                logger.explore(
                    "Blocked dataset review session start because environment was not found",
                    extra={"environment_id": normalized_environment_id},
                )
                raise ValueError("Environment not found")

            logger.reason(
                "Starting dataset review session",
                extra={
                    "user_id": command.user.id,
                    "environment_id": normalized_environment_id,
                    "source_kind": normalized_source_kind,
                },
            )

            parsed_context: Optional[SupersetParsedContext] = None
            findings: List[ValidationFinding] = []
            dataset_ref = normalized_source_input
            dataset_id: Optional[int] = None
            dashboard_id: Optional[int] = None
            readiness_state = ReadinessState.IMPORTING
            recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
            current_phase = SessionPhase.RECOVERY

            if normalized_source_kind == "superset_link":
                extractor = SupersetContextExtractor(environment)
                parsed_context = extractor.parse_superset_link(normalized_source_input)
                dataset_ref = parsed_context.dataset_ref
                dataset_id = parsed_context.dataset_id
                dashboard_id = parsed_context.dashboard_id

                if parsed_context.partial_recovery:
                    # Partial recovery stays usable but is surfaced explicitly as findings.
                    readiness_state = ReadinessState.RECOVERY_REQUIRED
                    recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
                    findings.extend(self._build_partial_recovery_findings(parsed_context))
                else:
                    readiness_state = ReadinessState.REVIEW_READY
            else:
                dataset_ref, dataset_id = self._parse_dataset_selection(normalized_source_input)
                readiness_state = ReadinessState.REVIEW_READY
                current_phase = SessionPhase.REVIEW

            session = DatasetReviewSession(
                user_id=command.user.id,
                environment_id=normalized_environment_id,
                source_kind=normalized_source_kind,
                source_input=normalized_source_input,
                dataset_ref=dataset_ref,
                dataset_id=dataset_id,
                dashboard_id=dashboard_id,
                readiness_state=readiness_state,
                recommended_action=recommended_action,
                status=SessionStatus.ACTIVE,
                current_phase=current_phase,
            )
            persisted_session = self.repository.create_session(session)

            profile = self._build_initial_profile(
                session_id=persisted_session.session_id,
                parsed_context=parsed_context,
                dataset_ref=dataset_ref,
            )
            persisted_session = self.repository.save_profile_and_findings(
                persisted_session.session_id,
                command.user.id,
                profile,
                findings,
            )

            active_task_id = self._enqueue_recovery_task(
                command=command,
                session=persisted_session,
                parsed_context=parsed_context,
            )
            if active_task_id:
                persisted_session.active_task_id = active_task_id
                self.repository.db.commit()
                self.repository.db.refresh(persisted_session)
                logger.reason(
                    "Linked recovery task to started dataset review session",
                    extra={"session_id": persisted_session.session_id, "task_id": active_task_id},
                )

            logger.reflect(
                "Dataset review session start completed",
                extra={
                    "session_id": persisted_session.session_id,
                    "dataset_ref": persisted_session.dataset_ref,
                    "dataset_id": persisted_session.dataset_id,
                    "dashboard_id": persisted_session.dashboard_id,
                    "readiness_state": persisted_session.readiness_state.value,
                    "active_task_id": persisted_session.active_task_id,
                    "finding_count": len(findings),
                },
            )
            return StartSessionResult(
                session=persisted_session,
                parsed_context=parsed_context,
                findings=findings,
            )
    # [/DEF:DatasetReviewOrchestrator.start_session:Function]

    # [DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Normalize dataset-selection payload into canonical session references.
    # @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
    def _parse_dataset_selection(self, source_input: str) -> tuple[str, Optional[int]]:
        """Return ``(dataset_ref, dataset_id)`` for a dataset-selection payload.

        Numeric input and the ``dataset:<digits>`` form are both canonicalized
        to ``dataset:{id}`` so downstream comparisons use one spelling; any
        other input is passed through stripped with no numeric id.

        Raises:
            ValueError: when the input is empty after stripping.
        """
        normalized = str(source_input or "").strip()
        if not normalized:
            raise ValueError("dataset selection input must be non-empty")

        if normalized.isdigit():
            dataset_id = int(normalized)
            return f"dataset:{dataset_id}", dataset_id

        if normalized.startswith("dataset:"):
            suffix = normalized.split(":", 1)[1].strip()
            if suffix.isdigit():
                # FIX: canonicalize the prefixed numeric form too (previously the
                # raw string, e.g. "dataset: 12", was returned verbatim while the
                # bare-digit branch produced "dataset:12").
                dataset_id = int(suffix)
                return f"dataset:{dataset_id}", dataset_id
            return normalized, None

        return normalized, None
    # [/DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]

    # [DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Create the first profile snapshot so exports and detail views remain usable immediately after intake.
    # @RELATION: [DEPENDS_ON] ->[DatasetProfile]
    def _build_initial_profile(
        self,
        session_id: str,
        parsed_context: Optional[SupersetParsedContext],
        dataset_ref: str,
    ) -> DatasetProfile:
        """Build the initial ``DatasetProfile`` from intake data.

        Confidence is MIXED when Superset recovery was partial, otherwise
        MOSTLY_CONFIRMED; a dotted ``dataset_ref`` contributes its first
        segment as the schema name and its last as the dataset name.
        """
        dataset_name = dataset_ref.split(".")[-1] if dataset_ref else "Unresolved dataset"
        business_summary = (
            f"Review session initialized for {dataset_ref}."
            if dataset_ref
            else "Review session initialized with unresolved dataset context."
        )
        confidence_state = (
            ConfidenceState.MIXED
            if parsed_context and parsed_context.partial_recovery
            else ConfidenceState.MOSTLY_CONFIRMED
        )
        return DatasetProfile(
            session_id=session_id,
            dataset_name=dataset_name or "Unresolved dataset",
            schema_name=dataset_ref.split(".")[0] if "." in dataset_ref else None,
            business_summary=business_summary,
            business_summary_source=BusinessSummarySource.IMPORTED,
            description="Initial review profile created from source intake.",
            dataset_type="unknown",
            is_sqllab_view=False,
            completeness_score=0.25,
            confidence_state=confidence_state,
            has_blocking_findings=False,
            has_warning_findings=bool(parsed_context and parsed_context.partial_recovery),
            manual_summary_locked=False,
        )
    # [/DEF:DatasetReviewOrchestrator._build_initial_profile:Function]

    # [DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Project partial Superset intake recovery into explicit findings without blocking session usability.
    # @RELATION: [DEPENDS_ON] ->[ValidationFinding]
    # @PRE: parsed_context.partial_recovery is true.
    # @POST: returns warning-level findings that preserve usable but incomplete state.
    # @SIDE_EFFECT: none beyond structured finding creation.
    # @DATA_CONTRACT: Input[SupersetParsedContext] -> Output[List[ValidationFinding]]
    def _build_partial_recovery_findings(
        self,
        parsed_context: SupersetParsedContext,
    ) -> List[ValidationFinding]:
        """Create one WARNING finding per unresolved Superset reference."""
        findings: List[ValidationFinding] = []
        for unresolved_ref in parsed_context.unresolved_references:
            findings.append(
                ValidationFinding(
                    area=FindingArea.SOURCE_INTAKE,
                    severity=FindingSeverity.WARNING,
                    code="PARTIAL_SUPERSET_RECOVERY",
                    title="Superset context recovered partially",
                    message=(
                        "Session remains usable, but some Superset context requires review: "
                        f"{unresolved_ref.replace('_', ' ')}."
                    ),
                    resolution_state=ResolutionState.OPEN,
                    caused_by_ref=unresolved_ref,
                )
            )
        return findings
    # [/DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]

    # [DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Link session start to observable async recovery when task infrastructure is available.
    # @RELATION: [CALLS] ->[create_task]
    # @PRE: session is already persisted.
    # @POST: returns task identifier when a task could be enqueued, otherwise None.
    # @SIDE_EFFECT: may create one background task for progressive recovery.
    # @DATA_CONTRACT: Input[StartSessionCommand,DatasetReviewSession,SupersetParsedContext|None] -> Output[task_id:str|None]
    def _enqueue_recovery_task(
        self,
        command: StartSessionCommand,
        session: DatasetReviewSession,
        parsed_context: Optional[SupersetParsedContext],
    ) -> Optional[str]:
        """Best-effort enqueue of the recovery task; returns the task id or None.

        Degrades gracefully: missing task manager, missing ``create_task``
        attribute, or an incompatible ``create_task`` signature all return
        None instead of failing the session start.
        """
        if self.task_manager is None:
            logger.reason(
                "Dataset review session started without task manager; continuing synchronously",
                extra={"session_id": session.session_id},
            )
            return None

        task_params: Dict[str, Any] = {
            "session_id": session.session_id,
            "user_id": command.user.id,
            "environment_id": session.environment_id,
            "source_kind": session.source_kind,
            "source_input": session.source_input,
            "dataset_ref": session.dataset_ref,
            "dataset_id": session.dataset_id,
            "dashboard_id": session.dashboard_id,
            "partial_recovery": bool(parsed_context and parsed_context.partial_recovery),
        }

        create_task = getattr(self.task_manager, "create_task", None)
        if create_task is None:
            logger.explore("Task manager has no create_task method; skipping recovery enqueue")
            return None

        try:
            task_object = create_task(
                plugin_id="dataset-review-recovery",
                params=task_params,
            )
        except TypeError:
            logger.explore(
                "Recovery task enqueue skipped because task manager create_task contract is incompatible",
                extra={"session_id": session.session_id},
            )
            return None

        task_id = getattr(task_object, "id", None)
        # FIX: compare against None rather than truthiness so a falsy-but-valid
        # task id (e.g. 0) is not silently dropped.
        return str(task_id) if task_id is not None else None
    # [/DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
# [/DEF:DatasetReviewOrchestrator:Class]
|
||||
|
||||
# [/DEF:DatasetReviewOrchestrator:Module]
|
||||
Reference in New Issue
Block a user