feat(us1): add dataset review orchestration automatic review slice

2026-03-17 10:57:49 +03:00
parent e916cb1f17
commit 023bacde39
24 changed files with 4870 additions and 131 deletions
--- a/backend/src/services/dataset_review/orchestrator.py
+++ b/backend/src/services/dataset_review/orchestrator.py
@@ -0,0 +1,386 @@
+# [DEF:DatasetReviewOrchestrator:Module]
+# @COMPLEXITY: 5
+# @SEMANTICS: dataset_review, orchestration, session_lifecycle, intake, recovery
+# @PURPOSE: Coordinate dataset review session startup and lifecycle-safe intake recovery for one authenticated user.
+# @LAYER: Domain
+# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
+# @RELATION: [DEPENDS_ON] ->[SemanticSourceResolver]
+# @RELATION: [DEPENDS_ON] ->[ClarificationEngine]
+# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
+# @RELATION: [DEPENDS_ON] ->[SupersetCompilationAdapter]
+# @RELATION: [DEPENDS_ON] ->[TaskManager]
+# @PRE: session mutations must execute inside a persisted session boundary scoped to one authenticated user.
+# @POST: state transitions are persisted atomically and emit observable progress for long-running steps.
+# @SIDE_EFFECT: creates task records, updates session aggregates, triggers upstream Superset calls, persists audit artifacts.
+# @DATA_CONTRACT: Input[SessionCommand] -> Output[DatasetReviewSession | CompiledPreview | DatasetRunContext]
+# @INVARIANT: Launch is blocked unless a current session has no open blocking findings, all launch-sensitive mappings are approved, and a non-stale Superset-generated compiled preview matches the current input fingerprint.
+
+from __future__ import annotations
+
+# [DEF:DatasetReviewOrchestrator.imports:Block]
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from src.core.config_manager import ConfigManager
+from src.core.logger import belief_scope, logger
+from src.core.task_manager import TaskManager
+from src.core.utils.superset_context_extractor import (
+    SupersetContextExtractor,
+    SupersetParsedContext,
+)
+from src.models.auth import User
+from src.models.dataset_review import (
+    BusinessSummarySource,
+    ConfidenceState,
+    DatasetProfile,
+    DatasetReviewSession,
+    FindingArea,
+    FindingSeverity,
+    RecommendedAction,
+    ReadinessState,
+    ResolutionState,
+    SessionPhase,
+    SessionStatus,
+    ValidationFinding,
+)
+from src.services.dataset_review.repositories.session_repository import (
+    DatasetReviewSessionRepository,
+)
+from src.services.dataset_review.semantic_resolver import SemanticSourceResolver
+# [/DEF:DatasetReviewOrchestrator.imports:Block]
+
+
+# [DEF:StartSessionCommand:Class]
+# @COMPLEXITY: 2
+# @PURPOSE: Typed input contract for starting a dataset review session.
+@dataclass
+class StartSessionCommand:
+    # Authenticated user starting the session; `user.id` is stored as the session owner.
+    user: User
+    # Identifier handed to ConfigManager.get_environment to look up the target environment.
+    environment_id: str
+    # Intake mode; start_session accepts only "superset_link" or "dataset_selection".
+    source_kind: str
+    # Raw intake payload: a Superset link or a dataset selection, depending on source_kind.
+    source_input: str
+# [/DEF:StartSessionCommand:Class]
+
+
+# [DEF:StartSessionResult:Class]
+# @COMPLEXITY: 2
+# @PURPOSE: Session-start result carrying the persisted session and intake recovery metadata.
+@dataclass
+class StartSessionResult:
+    # Session object as returned by the repository after profile/findings were saved.
+    session: DatasetReviewSession
+    # Parsed Superset link context; stays None for the "dataset_selection" intake path.
+    parsed_context: Optional[SupersetParsedContext] = None
+    # Findings produced during intake (partial-recovery warnings); empty when there are none.
+    findings: List[ValidationFinding] = field(default_factory=list)
+# [/DEF:StartSessionResult:Class]
+
+
+# [DEF:DatasetReviewOrchestrator:Class]
+# @COMPLEXITY: 5
+# @PURPOSE: Coordinate safe session startup while preserving cross-user isolation and explicit partial recovery.
+# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
+# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
+# @RELATION: [DEPENDS_ON] ->[TaskManager]
+# @RELATION: [DEPENDS_ON] ->[SessionRepo]
+# @RELATION: [DEPENDS_ON] ->[ConfigManager]
+# @PRE: constructor dependencies are valid and tied to the current request/task scope.
+# @POST: orchestrator instance can execute session-scoped mutations for one authenticated user.
+# @SIDE_EFFECT: downstream operations may persist session/profile/finding state and enqueue background tasks.
+# @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
+# @INVARIANT: session ownership is preserved on every mutation and recovery remains explicit when partial.
+class DatasetReviewOrchestrator:
+    # [DEF:DatasetReviewOrchestrator.__init__:Function]
+    # @COMPLEXITY: 3
+    # @PURPOSE: Bind repository, config, and task dependencies required by the orchestration boundary.
+    # @RELATION: [DEPENDS_ON] ->[SessionRepo]
+    # @RELATION: [DEPENDS_ON] ->[ConfigManager]
+    def __init__(
+        self,
+        repository: DatasetReviewSessionRepository,
+        config_manager: ConfigManager,
+        task_manager: Optional[TaskManager] = None,
+        semantic_resolver: Optional[SemanticSourceResolver] = None,
+    ) -> None:
+        """Store the collaborators used by the orchestration methods.
+
+        Args:
+            repository: Persistence gateway for review sessions.
+            config_manager: Source of environment configuration.
+            task_manager: Optional task backend; when None no recovery task
+                is enqueued.
+            semantic_resolver: Optional resolver; a default
+                ``SemanticSourceResolver`` is created when a falsy value is given.
+        """
+        # Any falsy resolver (not only None) is replaced by the default instance.
+        if not semantic_resolver:
+            semantic_resolver = SemanticSourceResolver()
+
+        self.semantic_resolver = semantic_resolver
+        self.task_manager = task_manager
+        self.config_manager = config_manager
+        self.repository = repository
+    # [/DEF:DatasetReviewOrchestrator.__init__:Function]
+
+    # [DEF:DatasetReviewOrchestrator.start_session:Function]
+    # @COMPLEXITY: 5
+    # @PURPOSE: Initialize a new session from a Superset link or dataset selection and trigger context recovery.
+    # @RELATION: [DEPENDS_ON] ->[SessionRepo]
+    # @RELATION: [CALLS] ->[SupersetContextExtractor.parse_superset_link]
+    # @RELATION: [CALLS] ->[create_task]
+    # @PRE: source input is non-empty and environment is accessible.
+    # @POST: session exists in persisted storage with intake/recovery state and task linkage when async work is required.
+    # @SIDE_EFFECT: persists session and may enqueue recovery task.
+    # @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
+    # @INVARIANT: no cross-user session leakage occurs; session and follow-up task remain owned by the authenticated user.
+    def start_session(self, command: StartSessionCommand) -> StartSessionResult:
+        """Validate the intake command, persist a new review session, and link a recovery task.
+
+        Steps, in order: normalize and validate the command, resolve the
+        environment, derive dataset references from the source input, persist
+        the session, persist the initial profile together with any intake
+        findings, then try to enqueue a recovery task and store its id on the
+        session.
+
+        Args:
+            command: Intake request carrying the user, environment id, source
+                kind and source input.
+
+        Returns:
+            StartSessionResult with the persisted session, the parsed Superset
+            context (None for dataset selections) and the intake findings.
+
+        Raises:
+            ValueError: If ``source_input`` is blank, ``source_kind`` is not
+                ``"superset_link"`` or ``"dataset_selection"``, or the
+                environment cannot be found.
+
+        Exceptions raised by the link extractor or the repository are not
+        caught here and propagate to the caller.
+        """
+        with belief_scope("DatasetReviewOrchestrator.start_session"):
+            # `or ""` turns None into an empty string so None and blank input fail the same checks.
+            normalized_source_kind = str(command.source_kind or "").strip()
+            normalized_source_input = str(command.source_input or "").strip()
+            normalized_environment_id = str(command.environment_id or "").strip()
+
+            if not normalized_source_input:
+                logger.explore("Blocked dataset review session start due to empty source input")
+                raise ValueError("source_input must be non-empty")
+
+            if normalized_source_kind not in {"superset_link", "dataset_selection"}:
+                logger.explore(
+                    "Blocked dataset review session start due to unsupported source kind",
+                    extra={"source_kind": normalized_source_kind},
+                )
+                raise ValueError("source_kind must be 'superset_link' or 'dataset_selection'")
+
+            environment = self.config_manager.get_environment(normalized_environment_id)
+            if environment is None:
+                logger.explore(
+                    "Blocked dataset review session start because environment was not found",
+                    extra={"environment_id": normalized_environment_id},
+                )
+                raise ValueError("Environment not found")
+
+            logger.reason(
+                "Starting dataset review session",
+                extra={
+                    "user_id": command.user.id,
+                    "environment_id": normalized_environment_id,
+                    "source_kind": normalized_source_kind,
+                },
+            )
+
+            # Defaults for the intake state. readiness_state is overwritten by every
+            # branch below, so IMPORTING never reaches the persisted session.
+            parsed_context: Optional[SupersetParsedContext] = None
+            findings: List[ValidationFinding] = []
+            dataset_ref = normalized_source_input
+            dataset_id: Optional[int] = None
+            dashboard_id: Optional[int] = None
+            readiness_state = ReadinessState.IMPORTING
+            recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
+            # NOTE(review): the phase stays RECOVERY for every superset_link intake,
+            # including fully recovered ones — confirm this is intended.
+            current_phase = SessionPhase.RECOVERY
+
+            if normalized_source_kind == "superset_link":
+                extractor = SupersetContextExtractor(environment)
+                parsed_context = extractor.parse_superset_link(normalized_source_input)
+                dataset_ref = parsed_context.dataset_ref
+                dataset_id = parsed_context.dataset_id
+                dashboard_id = parsed_context.dashboard_id
+
+                if parsed_context.partial_recovery:
+                    readiness_state = ReadinessState.RECOVERY_REQUIRED
+                    # Same value as the default above; restated for the partial-recovery case.
+                    recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
+                    findings.extend(self._build_partial_recovery_findings(parsed_context))
+                else:
+                    readiness_state = ReadinessState.REVIEW_READY
+            else:
+                # Dataset selections need no link parsing and start directly in the review phase.
+                dataset_ref, dataset_id = self._parse_dataset_selection(normalized_source_input)
+                readiness_state = ReadinessState.REVIEW_READY
+                current_phase = SessionPhase.REVIEW
+
+            session = DatasetReviewSession(
+                user_id=command.user.id,
+                environment_id=normalized_environment_id,
+                source_kind=normalized_source_kind,
+                source_input=normalized_source_input,
+                dataset_ref=dataset_ref,
+                dataset_id=dataset_id,
+                dashboard_id=dashboard_id,
+                readiness_state=readiness_state,
+                recommended_action=recommended_action,
+                status=SessionStatus.ACTIVE,
+                current_phase=current_phase,
+            )
+            # The session is persisted first because the profile below needs its session_id.
+            persisted_session = self.repository.create_session(session)
+
+            profile = self._build_initial_profile(
+                session_id=persisted_session.session_id,
+                parsed_context=parsed_context,
+                dataset_ref=dataset_ref,
+            )
+            # The user id is passed alongside the session id for the save call.
+            persisted_session = self.repository.save_profile_and_findings(
+                persisted_session.session_id,
+                command.user.id,
+                profile,
+                findings,
+            )
+
+            # Returns None when no task could be enqueued; the session is still returned.
+            active_task_id = self._enqueue_recovery_task(
+                command=command,
+                session=persisted_session,
+                parsed_context=parsed_context,
+            )
+            if active_task_id:
+                persisted_session.active_task_id = active_task_id
+                # NOTE(review): commits through the repository's `db` attribute directly
+                # rather than a repository method — confirm this is the intended
+                # transaction boundary.
+                self.repository.db.commit()
+                self.repository.db.refresh(persisted_session)
+                logger.reason(
+                    "Linked recovery task to started dataset review session",
+                    extra={"session_id": persisted_session.session_id, "task_id": active_task_id},
+                )
+
+            logger.reflect(
+                "Dataset review session start completed",
+                extra={
+                    "session_id": persisted_session.session_id,
+                    "dataset_ref": persisted_session.dataset_ref,
+                    "dataset_id": persisted_session.dataset_id,
+                    "dashboard_id": persisted_session.dashboard_id,
+                    "readiness_state": persisted_session.readiness_state.value,
+                    "active_task_id": persisted_session.active_task_id,
+                    "finding_count": len(findings),
+                },
+            )
+            # `findings` is the list built in this method, not one re-read from storage.
+            return StartSessionResult(
+                session=persisted_session,
+                parsed_context=parsed_context,
+                findings=findings,
+            )
+    # [/DEF:DatasetReviewOrchestrator.start_session:Function]
+
+    # [DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
+    # @COMPLEXITY: 3
+    # @PURPOSE: Normalize dataset-selection payload into canonical session references.
+    # @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
+    def _parse_dataset_selection(self, source_input: str) -> tuple[str, Optional[int]]:
+        """Normalize a dataset-selection payload into ``(dataset_ref, dataset_id)``.
+
+        Accepted shapes:
+          * ``"42"`` or ``"dataset:42"`` (whitespace and leading zeros allowed)
+            -> ``("dataset:42", 42)``; both spellings yield the same canonical ref.
+          * anything else, for example ``"schema.table"`` or ``"dataset:abc"``
+            -> ``(stripped_input, None)``.
+
+        Args:
+            source_input: Raw selection text; ``None`` is treated as empty.
+
+        Returns:
+            Tuple of the dataset reference and the numeric id when one could
+            be parsed, otherwise ``None`` for the id.
+
+        Raises:
+            ValueError: If the input is empty or whitespace only.
+        """
+        normalized = str(source_input or "").strip()
+        if not normalized:
+            raise ValueError("dataset selection input must be non-empty")
+
+        prefix = "dataset:"
+        if normalized.startswith(prefix):
+            candidate = normalized[len(prefix):].strip()
+        else:
+            candidate = normalized
+
+        # isdecimal() rather than isdigit(): isdigit() also accepts characters
+        # such as "²" that int() rejects, which would raise an unrelated ValueError.
+        if candidate.isdecimal():
+            dataset_id = int(candidate)
+            # Rebuild the ref from the parsed id so "7", "dataset:007" and
+            # "dataset: 7" all map to the same canonical reference.
+            return f"{prefix}{dataset_id}", dataset_id
+
+        return normalized, None
+    # [/DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
+
+    # [DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
+    # @COMPLEXITY: 3
+    # @PURPOSE: Create the first profile snapshot so exports and detail views remain usable immediately after intake.
+    # @RELATION: [DEPENDS_ON] ->[DatasetProfile]
+    def _build_initial_profile(
+        self,
+        session_id: str,
+        parsed_context: Optional[SupersetParsedContext],
+        dataset_ref: str,
+    ) -> DatasetProfile:
+        """Build the first profile snapshot for a freshly created session.
+
+        The dataset name is the text after the last ``.`` of ``dataset_ref``
+        and the schema name is the text before the first ``.`` (``None`` when
+        the reference has no dot). A missing or empty reference produces the
+        "Unresolved dataset" placeholder instead of failing.
+
+        Args:
+            session_id: Identifier of the already persisted session.
+            parsed_context: Parsed Superset context, or None for dataset
+                selections; only its ``partial_recovery`` flag is read.
+            dataset_ref: Dataset reference resolved during intake.
+
+        Returns:
+            A ``DatasetProfile`` with fixed starter values (completeness 0.25,
+            dataset type "unknown") and confidence/warning flags derived from
+            ``partial_recovery``.
+        """
+        # The name/summary logic already tolerated a falsy reference, but the
+        # schema lookup (`"." in dataset_ref`) raised TypeError on None.
+        # Normalising once keeps all three derivations consistent.
+        # NOTE(review): presumably dataset_ref can be None after a partial
+        # link recovery — confirm against SupersetParsedContext.
+        ref = dataset_ref or ""
+
+        schema_part, separator, _ = ref.partition(".")
+        schema_name = schema_part if separator else None
+        dataset_name = ref.split(".")[-1] or "Unresolved dataset"
+
+        if ref:
+            business_summary = f"Review session initialized for {ref}."
+        else:
+            business_summary = "Review session initialized with unresolved dataset context."
+
+        is_partial = bool(parsed_context and parsed_context.partial_recovery)
+        confidence_state = (
+            ConfidenceState.MIXED if is_partial else ConfidenceState.MOSTLY_CONFIRMED
+        )
+
+        return DatasetProfile(
+            session_id=session_id,
+            dataset_name=dataset_name,
+            schema_name=schema_name,
+            business_summary=business_summary,
+            business_summary_source=BusinessSummarySource.IMPORTED,
+            description="Initial review profile created from source intake.",
+            dataset_type="unknown",
+            is_sqllab_view=False,
+            completeness_score=0.25,
+            confidence_state=confidence_state,
+            has_blocking_findings=False,
+            has_warning_findings=is_partial,
+            manual_summary_locked=False,
+        )
+    # [/DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
+
+    # [DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
+    # @COMPLEXITY: 4
+    # @PURPOSE: Project partial Superset intake recovery into explicit findings without blocking session usability.
+    # @RELATION: [DEPENDS_ON] ->[ValidationFinding]
+    # @PRE: parsed_context.partial_recovery is true.
+    # @POST: returns warning-level findings that preserve usable but incomplete state.
+    # @SIDE_EFFECT: none beyond structured finding creation.
+    # @DATA_CONTRACT: Input[SupersetParsedContext] -> Output[List[ValidationFinding]]
+    def _build_partial_recovery_findings(
+        self,
+        parsed_context: SupersetParsedContext,
+    ) -> List[ValidationFinding]:
+        """Turn each unresolved Superset reference into one open warning finding.
+
+        Args:
+            parsed_context: Parsed link context; only
+                ``unresolved_references`` is read.
+
+        Returns:
+            One ``ValidationFinding`` per unresolved reference, in the same
+            order; an empty list when there are none.
+        """
+        return [
+            ValidationFinding(
+                area=FindingArea.SOURCE_INTAKE,
+                severity=FindingSeverity.WARNING,
+                code="PARTIAL_SUPERSET_RECOVERY",
+                title="Superset context recovered partially",
+                # Underscores are replaced with spaces for the human-readable
+                # message; the raw reference is kept in caused_by_ref.
+                message=(
+                    "Session remains usable, but some Superset context requires review: "
+                    f"{reference.replace('_', ' ')}."
+                ),
+                resolution_state=ResolutionState.OPEN,
+                caused_by_ref=reference,
+            )
+            for reference in parsed_context.unresolved_references
+        ]
+    # [/DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
+
+    # [DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
+    # @COMPLEXITY: 4
+    # @PURPOSE: Link session start to observable async recovery when task infrastructure is available.
+    # @RELATION: [CALLS] ->[create_task]
+    # @PRE: session is already persisted.
+    # @POST: returns task identifier when a task could be enqueued, otherwise None.
+    # @SIDE_EFFECT: may create one background task for progressive recovery.
+    # @DATA_CONTRACT: Input[StartSessionCommand,DatasetReviewSession,SupersetParsedContext|None] -> Output[task_id:str|None]
+    def _enqueue_recovery_task(
+        self,
+        command: StartSessionCommand,
+        session: DatasetReviewSession,
+        parsed_context: Optional[SupersetParsedContext],
+    ) -> Optional[str]:
+        """Try to enqueue one background recovery task for a persisted session.
+
+        Enqueueing is best-effort: every failure mode is logged and reported
+        as ``None`` so session start can continue without a task.
+
+        Args:
+            command: Original start command; supplies the owning user id.
+            session: Persisted session whose fields form the task payload.
+            parsed_context: Parsed link context or None; only its
+                ``partial_recovery`` flag is forwarded.
+
+        Returns:
+            The task id as a string, or ``None`` when there is no task
+            manager, it has no ``create_task``, the call signature is
+            incompatible, ``create_task`` returned a coroutine, or the
+            returned object carries no id.
+        """
+        # Function-scope import keeps this block self-contained.
+        import inspect
+
+        if self.task_manager is None:
+            logger.reason(
+                "Dataset review session started without task manager; continuing synchronously",
+                extra={"session_id": session.session_id},
+            )
+            return None
+
+        # Look the method up before building the payload so an unusable task
+        # manager exits early.
+        create_task = getattr(self.task_manager, "create_task", None)
+        if create_task is None:
+            logger.explore("Task manager has no create_task method; skipping recovery enqueue")
+            return None
+
+        task_params: Dict[str, Any] = {
+            "session_id": session.session_id,
+            "user_id": command.user.id,
+            "environment_id": session.environment_id,
+            "source_kind": session.source_kind,
+            "source_input": session.source_input,
+            "dataset_ref": session.dataset_ref,
+            "dataset_id": session.dataset_id,
+            "dashboard_id": session.dashboard_id,
+            "partial_recovery": bool(parsed_context and parsed_context.partial_recovery),
+        }
+
+        try:
+            task_object = create_task(
+                plugin_id="dataset-review-recovery",
+                params=task_params,
+            )
+        except TypeError:
+            # Deliberate best-effort: a mismatched create_task signature must
+            # not fail session start.
+            logger.explore(
+                "Recovery task enqueue skipped because task manager create_task contract is incompatible",
+                extra={"session_id": session.session_id},
+            )
+            return None
+
+        if inspect.iscoroutine(task_object):
+            # An async create_task hands back an un-awaited coroutine that has
+            # no `id`. This synchronous method cannot await it, so close it
+            # (avoids the "never awaited" RuntimeWarning) and report the skip
+            # explicitly instead of silently returning None.
+            task_object.close()
+            logger.explore(
+                "Recovery task enqueue skipped because task manager create_task returned a coroutine",
+                extra={"session_id": session.session_id},
+            )
+            return None
+
+        task_id = getattr(task_object, "id", None)
+        # Compare against None/"" instead of truthiness so a numeric id of 0
+        # is still reported.
+        if task_id is None or task_id == "":
+            return None
+        return str(task_id)
+    # [/DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
+# [/DEF:DatasetReviewOrchestrator:Class]
+
+# [/DEF:DatasetReviewOrchestrator:Module]