feat(us1): add dataset review orchestration automatic review slice

This commit is contained in:
2026-03-17 10:57:49 +03:00
parent e916cb1f17
commit 023bacde39
24 changed files with 4870 additions and 131 deletions

View File

@@ -0,0 +1,386 @@
# [DEF:DatasetReviewOrchestrator:Module]
# @COMPLEXITY: 5
# @SEMANTICS: dataset_review, orchestration, session_lifecycle, intake, recovery
# @PURPOSE: Coordinate dataset review session startup and lifecycle-safe intake recovery for one authenticated user.
# @LAYER: Domain
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
# @RELATION: [DEPENDS_ON] ->[SemanticSourceResolver]
# @RELATION: [DEPENDS_ON] ->[ClarificationEngine]
# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
# @RELATION: [DEPENDS_ON] ->[SupersetCompilationAdapter]
# @RELATION: [DEPENDS_ON] ->[TaskManager]
# @PRE: session mutations must execute inside a persisted session boundary scoped to one authenticated user.
# @POST: state transitions are persisted atomically and emit observable progress for long-running steps.
# @SIDE_EFFECT: creates task records, updates session aggregates, triggers upstream Superset calls, persists audit artifacts.
# @DATA_CONTRACT: Input[SessionCommand] -> Output[DatasetReviewSession | CompiledPreview | DatasetRunContext]
# @INVARIANT: Launch is blocked unless a current session has no open blocking findings, all launch-sensitive mappings are approved, and a non-stale Superset-generated compiled preview matches the current input fingerprint.
from __future__ import annotations
# [DEF:DatasetReviewOrchestrator.imports:Block]
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from src.core.config_manager import ConfigManager
from src.core.logger import belief_scope, logger
from src.core.task_manager import TaskManager
from src.core.utils.superset_context_extractor import (
SupersetContextExtractor,
SupersetParsedContext,
)
from src.models.auth import User
from src.models.dataset_review import (
BusinessSummarySource,
ConfidenceState,
DatasetProfile,
DatasetReviewSession,
FindingArea,
FindingSeverity,
RecommendedAction,
ReadinessState,
ResolutionState,
SessionPhase,
SessionStatus,
ValidationFinding,
)
from src.services.dataset_review.repositories.session_repository import (
DatasetReviewSessionRepository,
)
from src.services.dataset_review.semantic_resolver import SemanticSourceResolver
# [/DEF:DatasetReviewOrchestrator.imports:Block]
# [DEF:StartSessionCommand:Class]
# @COMPLEXITY: 2
# @PURPOSE: Typed input contract for starting a dataset review session.
@dataclass
class StartSessionCommand:
    """Input contract for DatasetReviewOrchestrator.start_session."""
    user: User  # authenticated user who will own the session
    environment_id: str  # environment key resolved via ConfigManager.get_environment
    source_kind: str  # expected values: "superset_link" or "dataset_selection"
    source_input: str  # raw Superset link or dataset-selection payload; must be non-empty
# [/DEF:StartSessionCommand:Class]
# [DEF:StartSessionResult:Class]
# @COMPLEXITY: 2
# @PURPOSE: Session-start result carrying the persisted session and intake recovery metadata.
@dataclass
class StartSessionResult:
    """Outcome of start_session: persisted session plus intake-recovery metadata."""
    session: DatasetReviewSession  # persisted aggregate returned by the repository
    parsed_context: Optional[SupersetParsedContext] = None  # only set for superset_link sources
    findings: List[ValidationFinding] = field(default_factory=list)  # warning findings from partial recovery
# [/DEF:StartSessionResult:Class]
# [DEF:DatasetReviewOrchestrator:Class]
# @COMPLEXITY: 5
# @PURPOSE: Coordinate safe session startup while preserving cross-user isolation and explicit partial recovery.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
# @RELATION: [DEPENDS_ON] ->[TaskManager]
# @RELATION: [DEPENDS_ON] ->[SessionRepo]
# @RELATION: [DEPENDS_ON] ->[ConfigManager]
# @PRE: constructor dependencies are valid and tied to the current request/task scope.
# @POST: orchestrator instance can execute session-scoped mutations for one authenticated user.
# @SIDE_EFFECT: downstream operations may persist session/profile/finding state and enqueue background tasks.
# @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
# @INVARIANT: session ownership is preserved on every mutation and recovery remains explicit when partial.
class DatasetReviewOrchestrator:
    """Coordinate dataset review session startup for one authenticated user.

    Wires the session repository, environment configuration, Superset link
    parsing, and optional background-task enqueueing behind a single
    ``start_session`` entry point. All persistence goes through the injected
    ``DatasetReviewSessionRepository``.
    """
    # [DEF:DatasetReviewOrchestrator.__init__:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Bind repository, config, and task dependencies required by the orchestration boundary.
    # @RELATION: [DEPENDS_ON] ->[SessionRepo]
    # @RELATION: [DEPENDS_ON] ->[ConfigManager]
    def __init__(
        self,
        repository: DatasetReviewSessionRepository,
        config_manager: ConfigManager,
        task_manager: Optional[TaskManager] = None,
        semantic_resolver: Optional[SemanticSourceResolver] = None,
    ) -> None:
        """Store collaborators.

        Args:
            repository: ownership-scoped persistence for session aggregates.
            config_manager: resolves environment ids to environment configs.
            task_manager: optional; when absent, recovery runs synchronously.
            semantic_resolver: optional; a default resolver is created when None.
        """
        self.repository = repository
        self.config_manager = config_manager
        self.task_manager = task_manager
        self.semantic_resolver = semantic_resolver or SemanticSourceResolver()
    # [/DEF:DatasetReviewOrchestrator.__init__:Function]
    # [DEF:DatasetReviewOrchestrator.start_session:Function]
    # @COMPLEXITY: 5
    # @PURPOSE: Initialize a new session from a Superset link or dataset selection and trigger context recovery.
    # @RELATION: [DEPENDS_ON] ->[SessionRepo]
    # @RELATION: [CALLS] ->[SupersetContextExtractor.parse_superset_link]
    # @RELATION: [CALLS] ->[create_task]
    # @PRE: source input is non-empty and environment is accessible.
    # @POST: session exists in persisted storage with intake/recovery state and task linkage when async work is required.
    # @SIDE_EFFECT: persists session and may enqueue recovery task.
    # @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
    # @INVARIANT: no cross-user session leakage occurs; session and follow-up task remain owned by the authenticated user.
    def start_session(self, command: StartSessionCommand) -> StartSessionResult:
        """Validate the command, persist a new session, and link optional recovery work.

        Returns:
            StartSessionResult with the persisted session, any parsed Superset
            context, and warning findings produced by partial recovery.

        Raises:
            ValueError: on empty source input, an unsupported source kind, or
                an environment id that ConfigManager cannot resolve.
        """
        with belief_scope("DatasetReviewOrchestrator.start_session"):
            # Normalize all user-provided strings before validation so that
            # whitespace-only input is rejected the same way as empty input.
            normalized_source_kind = str(command.source_kind or "").strip()
            normalized_source_input = str(command.source_input or "").strip()
            normalized_environment_id = str(command.environment_id or "").strip()
            if not normalized_source_input:
                logger.explore("Blocked dataset review session start due to empty source input")
                raise ValueError("source_input must be non-empty")
            if normalized_source_kind not in {"superset_link", "dataset_selection"}:
                logger.explore(
                    "Blocked dataset review session start due to unsupported source kind",
                    extra={"source_kind": normalized_source_kind},
                )
                raise ValueError("source_kind must be 'superset_link' or 'dataset_selection'")
            environment = self.config_manager.get_environment(normalized_environment_id)
            if environment is None:
                logger.explore(
                    "Blocked dataset review session start because environment was not found",
                    extra={"environment_id": normalized_environment_id},
                )
                raise ValueError("Environment not found")
            logger.reason(
                "Starting dataset review session",
                extra={
                    "user_id": command.user.id,
                    "environment_id": normalized_environment_id,
                    "source_kind": normalized_source_kind,
                },
            )
            # Defaults describe the "still importing" state; the branches below
            # upgrade them when intake can complete immediately.
            parsed_context: Optional[SupersetParsedContext] = None
            findings: List[ValidationFinding] = []
            dataset_ref = normalized_source_input
            dataset_id: Optional[int] = None
            dashboard_id: Optional[int] = None
            readiness_state = ReadinessState.IMPORTING
            recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
            current_phase = SessionPhase.RECOVERY
            if normalized_source_kind == "superset_link":
                extractor = SupersetContextExtractor(environment)
                parsed_context = extractor.parse_superset_link(normalized_source_input)
                dataset_ref = parsed_context.dataset_ref
                dataset_id = parsed_context.dataset_id
                dashboard_id = parsed_context.dashboard_id
                if parsed_context.partial_recovery:
                    readiness_state = ReadinessState.RECOVERY_REQUIRED
                    # NOTE(review): re-assigns the same value set as the default
                    # above; redundant but kept explicit for this branch.
                    recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
                    findings.extend(self._build_partial_recovery_findings(parsed_context))
                else:
                    readiness_state = ReadinessState.REVIEW_READY
            else:
                # dataset_selection: no Superset parsing needed, session can go
                # straight to the review phase.
                dataset_ref, dataset_id = self._parse_dataset_selection(normalized_source_input)
                readiness_state = ReadinessState.REVIEW_READY
                current_phase = SessionPhase.REVIEW
            session = DatasetReviewSession(
                user_id=command.user.id,
                environment_id=normalized_environment_id,
                source_kind=normalized_source_kind,
                source_input=normalized_source_input,
                dataset_ref=dataset_ref,
                dataset_id=dataset_id,
                dashboard_id=dashboard_id,
                readiness_state=readiness_state,
                recommended_action=recommended_action,
                status=SessionStatus.ACTIVE,
                current_phase=current_phase,
            )
            persisted_session = self.repository.create_session(session)
            # Build and persist the first profile snapshot so detail views and
            # exports work immediately after intake.
            profile = self._build_initial_profile(
                session_id=persisted_session.session_id,
                parsed_context=parsed_context,
                dataset_ref=dataset_ref,
            )
            persisted_session = self.repository.save_profile_and_findings(
                persisted_session.session_id,
                command.user.id,
                profile,
                findings,
            )
            active_task_id = self._enqueue_recovery_task(
                command=command,
                session=persisted_session,
                parsed_context=parsed_context,
            )
            if active_task_id:
                persisted_session.active_task_id = active_task_id
                # NOTE(review): reaches through the repository abstraction to
                # commit directly on its db handle; consider a repository
                # method for this mutation instead.
                self.repository.db.commit()
                self.repository.db.refresh(persisted_session)
                logger.reason(
                    "Linked recovery task to started dataset review session",
                    extra={"session_id": persisted_session.session_id, "task_id": active_task_id},
                )
            logger.reflect(
                "Dataset review session start completed",
                extra={
                    "session_id": persisted_session.session_id,
                    "dataset_ref": persisted_session.dataset_ref,
                    "dataset_id": persisted_session.dataset_id,
                    "dashboard_id": persisted_session.dashboard_id,
                    "readiness_state": persisted_session.readiness_state.value,
                    "active_task_id": persisted_session.active_task_id,
                    "finding_count": len(findings),
                },
            )
            return StartSessionResult(
                session=persisted_session,
                parsed_context=parsed_context,
                findings=findings,
            )
    # [/DEF:DatasetReviewOrchestrator.start_session:Function]
    # [DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Normalize dataset-selection payload into canonical session references.
    # @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
    def _parse_dataset_selection(self, source_input: str) -> tuple[str, Optional[int]]:
        """Normalize a dataset-selection payload into ``(dataset_ref, dataset_id)``.

        Accepted forms: a bare numeric id ("42" -> ("dataset:42", 42)), a
        "dataset:<id>" reference, or any other non-empty string kept verbatim
        with no numeric id.

        Raises:
            ValueError: when the input is empty after stripping.
        """
        normalized = str(source_input or "").strip()
        if not normalized:
            raise ValueError("dataset selection input must be non-empty")
        if normalized.isdigit():
            dataset_id = int(normalized)
            return f"dataset:{dataset_id}", dataset_id
        if normalized.startswith("dataset:"):
            suffix = normalized.split(":", 1)[1].strip()
            if suffix.isdigit():
                return normalized, int(suffix)
            # "dataset:" prefix with a non-numeric suffix: keep the ref, no id.
            return normalized, None
        return normalized, None
    # [/DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
    # [DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Create the first profile snapshot so exports and detail views remain usable immediately after intake.
    # @RELATION: [DEPENDS_ON] ->[DatasetProfile]
    def _build_initial_profile(
        self,
        session_id: str,
        parsed_context: Optional[SupersetParsedContext],
        dataset_ref: str,
    ) -> DatasetProfile:
        """Build the initial DatasetProfile for a freshly started session.

        Treats a dotted dataset_ref as "schema.name"; partial recovery lowers
        confidence to MIXED and flags warning findings on the profile.
        """
        # Last dotted segment is the dataset name; assumes "schema.table"-style
        # refs — TODO confirm for multi-dot refs.
        dataset_name = dataset_ref.split(".")[-1] if dataset_ref else "Unresolved dataset"
        business_summary = (
            f"Review session initialized for {dataset_ref}."
            if dataset_ref
            else "Review session initialized with unresolved dataset context."
        )
        confidence_state = (
            ConfidenceState.MIXED
            if parsed_context and parsed_context.partial_recovery
            else ConfidenceState.MOSTLY_CONFIRMED
        )
        return DatasetProfile(
            session_id=session_id,
            dataset_name=dataset_name or "Unresolved dataset",
            schema_name=dataset_ref.split(".")[0] if "." in dataset_ref else None,
            business_summary=business_summary,
            business_summary_source=BusinessSummarySource.IMPORTED,
            description="Initial review profile created from source intake.",
            dataset_type="unknown",
            is_sqllab_view=False,
            # Intake-only snapshot; 0.25 presumably reflects "intake complete,
            # review pending" — TODO confirm the completeness scale.
            completeness_score=0.25,
            confidence_state=confidence_state,
            has_blocking_findings=False,
            has_warning_findings=bool(parsed_context and parsed_context.partial_recovery),
            manual_summary_locked=False,
        )
    # [/DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
    # [DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Project partial Superset intake recovery into explicit findings without blocking session usability.
    # @RELATION: [DEPENDS_ON] ->[ValidationFinding]
    # @PRE: parsed_context.partial_recovery is true.
    # @POST: returns warning-level findings that preserve usable but incomplete state.
    # @SIDE_EFFECT: none beyond structured finding creation.
    # @DATA_CONTRACT: Input[SupersetParsedContext] -> Output[List[ValidationFinding]]
    def _build_partial_recovery_findings(
        self,
        parsed_context: SupersetParsedContext,
    ) -> List[ValidationFinding]:
        """Return one WARNING finding per unresolved Superset reference.

        Findings are warning-severity (never blocking) so the session stays
        usable while the gaps remain visible to the reviewer.
        """
        findings: List[ValidationFinding] = []
        for unresolved_ref in parsed_context.unresolved_references:
            findings.append(
                ValidationFinding(
                    area=FindingArea.SOURCE_INTAKE,
                    severity=FindingSeverity.WARNING,
                    code="PARTIAL_SUPERSET_RECOVERY",
                    title="Superset context recovered partially",
                    message=(
                        "Session remains usable, but some Superset context requires review: "
                        f"{unresolved_ref.replace('_', ' ')}."
                    ),
                    resolution_state=ResolutionState.OPEN,
                    caused_by_ref=unresolved_ref,
                )
            )
        return findings
    # [/DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
    # [DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Link session start to observable async recovery when task infrastructure is available.
    # @RELATION: [CALLS] ->[create_task]
    # @PRE: session is already persisted.
    # @POST: returns task identifier when a task could be enqueued, otherwise None.
    # @SIDE_EFFECT: may create one background task for progressive recovery.
    # @DATA_CONTRACT: Input[StartSessionCommand,DatasetReviewSession,SupersetParsedContext|None] -> Output[task_id:str|None]
    def _enqueue_recovery_task(
        self,
        command: StartSessionCommand,
        session: DatasetReviewSession,
        parsed_context: Optional[SupersetParsedContext],
    ) -> Optional[str]:
        """Best-effort enqueue of a background recovery task.

        Returns the task id as a string, or None when no task manager is
        configured, the manager lacks ``create_task``, or its signature is
        incompatible. Never raises for those soft-failure cases.
        """
        if self.task_manager is None:
            logger.reason(
                "Dataset review session started without task manager; continuing synchronously",
                extra={"session_id": session.session_id},
            )
            return None
        task_params: Dict[str, Any] = {
            "session_id": session.session_id,
            "user_id": command.user.id,
            "environment_id": session.environment_id,
            "source_kind": session.source_kind,
            "source_input": session.source_input,
            "dataset_ref": session.dataset_ref,
            "dataset_id": session.dataset_id,
            "dashboard_id": session.dashboard_id,
            "partial_recovery": bool(parsed_context and parsed_context.partial_recovery),
        }
        # Duck-typed lookup keeps this working against alternative task
        # manager implementations that lack create_task.
        create_task = getattr(self.task_manager, "create_task", None)
        if create_task is None:
            logger.explore("Task manager has no create_task method; skipping recovery enqueue")
            return None
        try:
            task_object = create_task(
                plugin_id="dataset-review-recovery",
                params=task_params,
            )
        except TypeError:
            # Only TypeError (signature mismatch) is treated as "no task
            # infrastructure"; any other exception propagates to the caller.
            logger.explore(
                "Recovery task enqueue skipped because task manager create_task contract is incompatible",
                extra={"session_id": session.session_id},
            )
            return None
        task_id = getattr(task_object, "id", None)
        return str(task_id) if task_id else None
    # [/DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
# [/DEF:DatasetReviewOrchestrator:Class]
# [/DEF:DatasetReviewOrchestrator:Module]

View File

@@ -8,6 +8,9 @@
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
# @PRE: repository operations execute within authenticated request or task scope.
# @POST: session aggregate reads are structurally consistent and writes preserve ownership and version semantics.
# @SIDE_EFFECT: reads and writes SQLAlchemy-backed session aggregates.
# @DATA_CONTRACT: Input[SessionMutation] -> Output[PersistedSessionAggregate]
# @INVARIANT: answers, mapping approvals, preview artifacts, and launch snapshots are never attributed to the wrong user or session.
from typing import Optional, List
from sqlalchemy import or_
@@ -22,27 +25,51 @@ from src.models.dataset_review import (
)
from src.core.logger import belief_scope
# [DEF:SessionRepo:Class]
# @COMPLEXITY: 4
# @PURPOSE: Enforce ownership-scoped persistence and retrieval for dataset review session aggregates.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [DatasetProfile]
# @RELATION: [DEPENDS_ON] -> [ValidationFinding]
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
# @PRE: constructor receives a live SQLAlchemy session and callers provide authenticated user scope for guarded reads and writes.
# @POST: repository methods return ownership-scoped aggregates or persisted child records without changing domain meaning.
# @SIDE_EFFECT: mutates and queries the persistence layer through the injected database session.
# @DATA_CONTRACT: Input[OwnedSessionQuery|SessionMutation] -> Output[PersistedSessionAggregate|PersistedChildRecord]
class DatasetReviewSessionRepository:
"""
@PURPOSE: Persist and retrieve dataset review session aggregates.
@INVARIANT: ownership_scope -> All operations must respect the session owner's user_id.
"""
    # [DEF:init_repo:Function]
    def __init__(self, db: Session):
        # Live SQLAlchemy session; every repository read/write goes through it.
        self.db = db
    # [/DEF:init_repo:Function]
# [DEF:create_sess:Function]
# @COMPLEXITY: 4
# @PURPOSE: Persist an initial dataset review session shell.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @PRE: session is a new aggregate root bound to the current ownership scope.
# @POST: session is committed, refreshed, and returned with persisted identifiers.
# @SIDE_EFFECT: inserts a session row and commits the active transaction.
# @DATA_CONTRACT: Input[DatasetReviewSession] -> Output[DatasetReviewSession]
def create_session(self, session: DatasetReviewSession) -> DatasetReviewSession:
"""
@PURPOSE: Persist initial session shell.
"""
with belief_scope("DatasetReviewSessionRepository.create_session"):
self.db.add(session)
self.db.commit()
self.db.refresh(session)
return session
# [/DEF:create_sess:Function]
# [DEF:load_detail:Function]
# @COMPLEXITY: 3
# @PURPOSE: Return the full session aggregate for API and frontend resume flows.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [SessionCollaborator]
def load_session_detail(self, session_id: str, user_id: str) -> Optional[DatasetReviewSession]:
"""
@PURPOSE: Return the full session aggregate for API/frontend use.
@PRE: user_id must match session owner or authorized collaborator.
"""
with belief_scope("DatasetReviewSessionRepository.load_session_detail"):
@@ -70,17 +97,25 @@ class DatasetReviewSessionRepository:
)
)\
.first()
# [/DEF:load_detail:Function]
# [DEF:save_prof_find:Function]
# @COMPLEXITY: 4
# @PURPOSE: Persist profile state and replace validation findings for an owned session in one transaction.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [DatasetProfile]
# @RELATION: [DEPENDS_ON] -> [ValidationFinding]
# @PRE: session_id belongs to user_id and the supplied profile/findings belong to the same aggregate scope.
# @POST: stored profile matches the current session and findings are replaced by the supplied collection.
# @SIDE_EFFECT: updates profile rows, deletes stale findings, inserts current findings, and commits the transaction.
# @DATA_CONTRACT: Input[ProfileAndFindingsMutation] -> Output[DatasetReviewSession]
def save_profile_and_findings(self, session_id: str, user_id: str, profile: DatasetProfile, findings: List[ValidationFinding]) -> DatasetReviewSession:
"""
@PURPOSE: Persist profile and validation state together.
"""
with belief_scope("DatasetReviewSessionRepository.save_profile_and_findings"):
session = self.db.query(DatasetReviewSession).filter(
DatasetReviewSession.session_id == session_id,
DatasetReviewSession.user_id == user_id
).first()
if not session:
raise ValueError("Session not found or access denied")
@@ -90,24 +125,31 @@ class DatasetReviewSessionRepository:
if existing_profile:
profile.profile_id = existing_profile.profile_id
self.db.merge(profile)
# Remove old findings for this session to avoid stale data
self.db.query(ValidationFinding).filter(
ValidationFinding.session_id == session_id
).delete()
# Add new findings
for finding in findings:
finding.session_id = session_id
self.db.add(finding)
self.db.commit()
return self.load_session_detail(session_id, user_id)
# [/DEF:save_prof_find:Function]
# [DEF:save_prev:Function]
# @COMPLEXITY: 4
# @PURPOSE: Persist a preview snapshot and mark prior session previews stale.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
# @PRE: session_id belongs to user_id and preview is prepared for the same session aggregate.
# @POST: preview is persisted and the session points to the latest preview identifier.
# @SIDE_EFFECT: updates prior preview statuses, inserts a preview row, mutates the parent session, and commits.
# @DATA_CONTRACT: Input[PreviewMutation] -> Output[CompiledPreview]
def save_preview(self, session_id: str, user_id: str, preview: CompiledPreview) -> CompiledPreview:
"""
@PURPOSE: Persist compiled preview attempt and mark older fingerprints stale.
"""
with belief_scope("DatasetReviewSessionRepository.save_preview"):
session = self.db.query(DatasetReviewSession).filter(
DatasetReviewSession.session_id == session_id,
@@ -125,15 +167,22 @@ class DatasetReviewSessionRepository:
self.db.add(preview)
self.db.flush()
session.last_preview_id = preview.preview_id
self.db.commit()
self.db.refresh(preview)
return preview
# [/DEF:save_prev:Function]
# [DEF:save_run_ctx:Function]
# @COMPLEXITY: 4
# @PURPOSE: Persist an immutable launch audit snapshot for an owned session.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [DatasetRunContext]
# @PRE: session_id belongs to user_id and run_context targets the same aggregate.
# @POST: run context is persisted and linked as the latest launch snapshot for the session.
# @SIDE_EFFECT: inserts a run-context row, mutates the parent session pointer, and commits.
# @DATA_CONTRACT: Input[RunContextMutation] -> Output[DatasetRunContext]
def save_run_context(self, session_id: str, user_id: str, run_context: DatasetRunContext) -> DatasetRunContext:
"""
@PURPOSE: Persist immutable launch audit snapshot.
"""
with belief_scope("DatasetReviewSessionRepository.save_run_context"):
session = self.db.query(DatasetReviewSession).filter(
DatasetReviewSession.session_id == session_id,
@@ -146,18 +195,22 @@ class DatasetReviewSessionRepository:
self.db.add(run_context)
self.db.flush()
session.last_run_context_id = run_context.run_context_id
self.db.commit()
self.db.refresh(run_context)
return run_context
# [/DEF:save_run_ctx:Function]
# [DEF:list_user_sess:Function]
# @COMPLEXITY: 3
# @PURPOSE: List review sessions owned by a specific user ordered by most recent update.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
def list_sessions_for_user(self, user_id: str) -> List[DatasetReviewSession]:
"""
@PURPOSE: List all review sessions owned by a user.
"""
with belief_scope("DatasetReviewSessionRepository.list_sessions_for_user"):
return self.db.query(DatasetReviewSession).filter(
DatasetReviewSession.user_id == user_id
).order_by(DatasetReviewSession.updated_at.desc()).all()
# [/DEF:list_user_sess:Function]
# [/DEF:SessionRepo:Class]
# [/DEF:DatasetReviewSessionRepository:Module]

View File

@@ -0,0 +1,342 @@
# [DEF:SemanticSourceResolver:Module]
# @COMPLEXITY: 4
# @SEMANTICS: dataset_review, semantic_resolution, dictionary, trusted_sources, ranking
# @PURPOSE: Resolve and rank semantic candidates from trusted dictionary-like sources before any inferred fallback.
# @LAYER: Domain
# @RELATION: [DEPENDS_ON] ->[LLMProviderService]
# @RELATION: [DEPENDS_ON] ->[SemanticSource]
# @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
# @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
# @PRE: selected source and target field set must be known.
# @POST: candidate ranking follows the configured confidence hierarchy and unresolved fuzzy matches remain reviewable.
# @SIDE_EFFECT: may create conflict findings and semantic candidate records.
# @INVARIANT: Manual overrides are never silently replaced by imported, inferred, or AI-generated values.
from __future__ import annotations
# [DEF:SemanticSourceResolver.imports:Block]
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Any, Dict, Iterable, List, Mapping, Optional
from src.core.logger import belief_scope, logger
from src.models.dataset_review import (
CandidateMatchType,
CandidateStatus,
FieldProvenance,
)
# [/DEF:SemanticSourceResolver.imports:Block]
# [DEF:DictionaryResolutionResult:Class]
# @COMPLEXITY: 2
# @PURPOSE: Carries field-level dictionary resolution output with explicit review and partial-recovery state.
@dataclass
class DictionaryResolutionResult:
    """Field-level outcome of resolving semantics against one trusted source."""
    source_ref: str  # identifier of the dictionary/reference source that was consulted
    resolved_fields: List[Dict[str, Any]] = field(default_factory=list)  # per-field resolution payloads
    unresolved_fields: List[str] = field(default_factory=list)  # field names with no trusted match
    partial_recovery: bool = False  # True when at least one field stayed unresolved
# [/DEF:DictionaryResolutionResult:Class]
# [DEF:SemanticSourceResolver:Class]
# @COMPLEXITY: 4
# @PURPOSE: Resolve semantic candidates from trusted sources while preserving manual locks and confidence ordering.
# @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
# @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
# @PRE: source payload and target field collection are provided by the caller.
# @POST: result contains confidence-ranked candidates and does not overwrite manual locks implicitly.
# @SIDE_EFFECT: emits semantic trace logs for ranking and fallback decisions.
class SemanticSourceResolver:
    """Resolve and rank semantic field candidates from trusted sources.

    Exact dictionary matches always outrank fuzzy matches, manually locked
    fields are never overwritten, and fields without a trusted match stay
    explicitly unresolved so the review flow can surface them.
    """
    # [DEF:SemanticSourceResolver.resolve_from_file:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Normalize uploaded semantic file records into field-level candidates.
    def resolve_from_file(self, source_payload: Mapping[str, Any], fields: Iterable[Mapping[str, Any]]) -> DictionaryResolutionResult:
        """Placeholder: returns an empty result carrying only the source ref.

        NOTE(review): `fields` is currently ignored; file-based resolution is
        not implemented yet.
        """
        return DictionaryResolutionResult(source_ref=str(source_payload.get("source_ref") or "uploaded_file"))
    # [/DEF:SemanticSourceResolver.resolve_from_file:Function]
    # [DEF:SemanticSourceResolver.resolve_from_dictionary:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Resolve candidates from connected tabular dictionary sources.
    # @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
    # @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
    # @PRE: dictionary source exists and fields contain stable field_name values.
    # @POST: returns confidence-ranked candidates where exact dictionary matches outrank fuzzy matches and unresolved fields stay explicit.
    # @SIDE_EFFECT: emits belief-state logs describing trusted-match and partial-recovery outcomes.
    # @DATA_CONTRACT: Input[source_payload:Mapping,fields:Iterable] -> Output[DictionaryResolutionResult]
    def resolve_from_dictionary(
        self,
        source_payload: Mapping[str, Any],
        fields: Iterable[Mapping[str, Any]],
    ) -> DictionaryResolutionResult:
        """Resolve each target field against a tabular dictionary source.

        Args:
            source_payload: must contain "source_ref" (str) and "rows"
                (non-empty list of row mappings).
            fields: target field mappings; each needs "field_name" and may
                carry "is_locked" to preserve manual overrides.

        Returns:
            DictionaryResolutionResult with per-field payloads; fields with no
            exact or fuzzy match are listed as unresolved and flip
            partial_recovery to True.

        Raises:
            ValueError: when source_ref is missing or rows are absent/empty.
        """
        with belief_scope("SemanticSourceResolver.resolve_from_dictionary"):
            source_ref = str(source_payload.get("source_ref") or "").strip()
            dictionary_rows = source_payload.get("rows")
            if not source_ref:
                logger.explore("Dictionary semantic source is missing source_ref")
                raise ValueError("Dictionary semantic source must include source_ref")
            if not isinstance(dictionary_rows, list) or not dictionary_rows:
                logger.explore(
                    "Dictionary semantic source has no usable rows",
                    extra={"source_ref": source_ref},
                )
                raise ValueError("Dictionary semantic source must include non-empty rows")
            logger.reason(
                "Resolving semantics from trusted dictionary source",
                extra={"source_ref": source_ref, "row_count": len(dictionary_rows)},
            )
            normalized_rows = [self._normalize_dictionary_row(row) for row in dictionary_rows if isinstance(row, Mapping)]
            # Index by normalized field_key for O(1) exact-match lookup.
            row_index = {
                row["field_key"]: row
                for row in normalized_rows
                if row.get("field_key")
            }
            resolved_fields: List[Dict[str, Any]] = []
            unresolved_fields: List[str] = []
            for raw_field in fields:
                field_name = str(raw_field.get("field_name") or "").strip()
                if not field_name:
                    continue
                is_locked = bool(raw_field.get("is_locked"))
                if is_locked:
                    # Manual overrides win unconditionally: no candidates are
                    # proposed for locked fields (class invariant).
                    logger.reason(
                        "Preserving manual lock during dictionary resolution",
                        extra={"field_name": field_name},
                    )
                    resolved_fields.append(
                        {
                            "field_name": field_name,
                            "applied_candidate": None,
                            "candidates": [],
                            "provenance": FieldProvenance.MANUAL_OVERRIDE.value,
                            "needs_review": False,
                            "has_conflict": False,
                            "is_locked": True,
                            "status": "preserved_manual",
                        }
                    )
                    continue
                exact_match = row_index.get(self._normalize_key(field_name))
                candidates: List[Dict[str, Any]] = []
                if exact_match is not None:
                    logger.reason(
                        "Resolved exact dictionary match",
                        extra={"field_name": field_name, "source_ref": source_ref},
                    )
                    candidates.append(
                        self._build_candidate_payload(
                            rank=1,
                            match_type=CandidateMatchType.EXACT,
                            confidence_score=1.0,
                            row=exact_match,
                        )
                    )
                else:
                    # Fall back to fuzzy matching only when no exact match exists.
                    fuzzy_matches = self._find_fuzzy_matches(field_name, normalized_rows)
                    for rank_offset, fuzzy_match in enumerate(fuzzy_matches, start=1):
                        candidates.append(
                            self._build_candidate_payload(
                                rank=rank_offset,
                                match_type=CandidateMatchType.FUZZY,
                                confidence_score=float(fuzzy_match["score"]),
                                row=fuzzy_match["row"],
                            )
                        )
                if not candidates:
                    unresolved_fields.append(field_name)
                    resolved_fields.append(
                        {
                            "field_name": field_name,
                            "applied_candidate": None,
                            "candidates": [],
                            "provenance": FieldProvenance.UNRESOLVED.value,
                            "needs_review": True,
                            "has_conflict": False,
                            "is_locked": False,
                            "status": "unresolved",
                        }
                    )
                    logger.explore(
                        "No trusted dictionary match found for field",
                        extra={"field_name": field_name, "source_ref": source_ref},
                    )
                    continue
                ranked_candidates = self.rank_candidates(candidates)
                applied_candidate = ranked_candidates[0]
                # Consistency fix: use the shared conflict predicate instead of
                # an inline length check.
                has_conflict = self.detect_conflicts(ranked_candidates)
                provenance = (
                    FieldProvenance.DICTIONARY_EXACT.value
                    if applied_candidate["match_type"] == CandidateMatchType.EXACT.value
                    else FieldProvenance.FUZZY_INFERRED.value
                )
                # Only exact matches are trusted enough to skip human review.
                needs_review = applied_candidate["match_type"] != CandidateMatchType.EXACT.value
                resolved_fields.append(
                    {
                        "field_name": field_name,
                        "applied_candidate": applied_candidate,
                        "candidates": ranked_candidates,
                        "provenance": provenance,
                        "needs_review": needs_review,
                        "has_conflict": has_conflict,
                        "is_locked": False,
                        "status": "resolved",
                    }
                )
            result = DictionaryResolutionResult(
                source_ref=source_ref,
                resolved_fields=resolved_fields,
                unresolved_fields=unresolved_fields,
                partial_recovery=bool(unresolved_fields),
            )
            logger.reflect(
                "Dictionary resolution completed",
                extra={
                    "source_ref": source_ref,
                    "resolved_fields": len(resolved_fields),
                    "unresolved_fields": len(unresolved_fields),
                    "partial_recovery": result.partial_recovery,
                },
            )
            return result
    # [/DEF:SemanticSourceResolver.resolve_from_dictionary:Function]
    # [DEF:SemanticSourceResolver.resolve_from_reference_dataset:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Reuse semantic metadata from trusted Superset datasets.
    def resolve_from_reference_dataset(
        self,
        source_payload: Mapping[str, Any],
        fields: Iterable[Mapping[str, Any]],
    ) -> DictionaryResolutionResult:
        """Placeholder: returns an empty result carrying only the source ref.

        NOTE(review): `fields` is currently ignored; reference-dataset
        resolution is not implemented yet.
        """
        return DictionaryResolutionResult(source_ref=str(source_payload.get("source_ref") or "reference_dataset"))
    # [/DEF:SemanticSourceResolver.resolve_from_reference_dataset:Function]
    # [DEF:SemanticSourceResolver.rank_candidates:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Apply confidence ordering and determine best candidate per field.
    # @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
    def rank_candidates(self, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Sort candidates by match-type priority, then confidence, then prior rank.

        Side effect: rewrites each candidate's "candidate_rank" in place to its
        1-based position in the returned order.
        """
        ranked = sorted(
            candidates,
            key=lambda candidate: (
                self._match_priority(candidate.get("match_type")),
                -float(candidate.get("confidence_score", 0.0)),
                int(candidate.get("candidate_rank", 999)),
            ),
        )
        for index, candidate in enumerate(ranked, start=1):
            candidate["candidate_rank"] = index
        return ranked
    # [/DEF:SemanticSourceResolver.rank_candidates:Function]
    # [DEF:SemanticSourceResolver.detect_conflicts:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Mark competing candidate sets that require explicit user review.
    def detect_conflicts(self, candidates: List[Dict[str, Any]]) -> bool:
        """Return True when more than one candidate competes for a field."""
        return len(candidates) > 1
    # [/DEF:SemanticSourceResolver.detect_conflicts:Function]
    # [DEF:SemanticSourceResolver.apply_field_decision:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Accept, reject, or manually override a field-level semantic value.
    def apply_field_decision(self, field_state: Mapping[str, Any], decision: Mapping[str, Any]) -> Dict[str, Any]:
        """Return a new dict: field_state overlaid with the decision's keys (decision wins)."""
        merged = dict(field_state)
        merged.update(decision)
        return merged
    # [/DEF:SemanticSourceResolver.apply_field_decision:Function]
    # [DEF:SemanticSourceResolver._normalize_dictionary_row:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Normalize one dictionary row into a consistent lookup structure.
    def _normalize_dictionary_row(self, row: Mapping[str, Any]) -> Dict[str, Any]:
        """Map heterogeneous row keys onto the canonical lookup structure.

        Accepts field_name/column_name/name/field for the identifier and
        verbose_name/label, display_format/format aliases for metadata.
        """
        field_name = (
            row.get("field_name")
            or row.get("column_name")
            or row.get("name")
            or row.get("field")
        )
        normalized_name = str(field_name or "").strip()
        return {
            "field_name": normalized_name,
            "field_key": self._normalize_key(normalized_name),
            "verbose_name": row.get("verbose_name") or row.get("label"),
            "description": row.get("description"),
            "display_format": row.get("display_format") or row.get("format"),
        }
    # [/DEF:SemanticSourceResolver._normalize_dictionary_row:Function]
    # [DEF:SemanticSourceResolver._find_fuzzy_matches:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Produce confidence-scored fuzzy matches while keeping them reviewable.
    def _find_fuzzy_matches(
        self,
        field_name: str,
        rows: List[Dict[str, Any]],
        *,
        min_score: float = 0.72,
        limit: int = 3,
    ) -> List[Dict[str, Any]]:
        """Score rows against field_name with SequenceMatcher.ratio.

        Generalized: the similarity threshold and result cap are keyword
        parameters; defaults (0.72, 3) preserve the previous hard-coded
        behavior.

        Returns:
            Up to `limit` {"row", "score"} dicts with score >= `min_score`,
            sorted by descending score (scores rounded to 3 decimals).
        """
        normalized_target = self._normalize_key(field_name)
        fuzzy_matches: List[Dict[str, Any]] = []
        for row in rows:
            candidate_key = str(row.get("field_key") or "")
            if not candidate_key:
                continue
            score = SequenceMatcher(None, normalized_target, candidate_key).ratio()
            if score < min_score:
                continue
            fuzzy_matches.append({"row": row, "score": round(score, 3)})
        fuzzy_matches.sort(key=lambda item: item["score"], reverse=True)
        return fuzzy_matches[:limit]
    # [/DEF:SemanticSourceResolver._find_fuzzy_matches:Function]
    # [DEF:SemanticSourceResolver._build_candidate_payload:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Project normalized dictionary rows into semantic candidate payloads.
    def _build_candidate_payload(
        self,
        rank: int,
        match_type: CandidateMatchType,
        confidence_score: float,
        row: Mapping[str, Any],
    ) -> Dict[str, Any]:
        """Build a PROPOSED-status candidate payload from a normalized row."""
        return {
            "candidate_rank": rank,
            "match_type": match_type.value,
            "confidence_score": confidence_score,
            "proposed_verbose_name": row.get("verbose_name"),
            "proposed_description": row.get("description"),
            "proposed_display_format": row.get("display_format"),
            "status": CandidateStatus.PROPOSED.value,
        }
    # [/DEF:SemanticSourceResolver._build_candidate_payload:Function]
    # [DEF:SemanticSourceResolver._match_priority:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Encode trusted-confidence ordering so exact dictionary reuse beats fuzzy invention.
    def _match_priority(self, match_type: Optional[str]) -> int:
        """Return sort priority (lower = more trusted); unknown types sink to 99."""
        priority = {
            CandidateMatchType.EXACT.value: 0,
            CandidateMatchType.REFERENCE.value: 1,
            CandidateMatchType.FUZZY.value: 2,
            CandidateMatchType.GENERATED.value: 3,
        }
        return priority.get(str(match_type or ""), 99)
    # [/DEF:SemanticSourceResolver._match_priority:Function]
    # [DEF:SemanticSourceResolver._normalize_key:Function]
    # @COMPLEXITY: 1
    # @PURPOSE: Normalize field identifiers for stable exact/fuzzy comparisons.
    def _normalize_key(self, value: str) -> str:
        """Lowercase, strip, and keep only alphanumerics and underscores."""
        return "".join(ch for ch in str(value or "").strip().lower() if ch.isalnum() or ch == "_")
    # [/DEF:SemanticSourceResolver._normalize_key:Function]
# [/DEF:SemanticSourceResolver:Class]
# [/DEF:SemanticSourceResolver:Module]