feat(us1): add dataset review orchestration (automatic review slice)
This commit is contained in:
386
backend/src/services/dataset_review/orchestrator.py
Normal file
386
backend/src/services/dataset_review/orchestrator.py
Normal file
@@ -0,0 +1,386 @@
|
||||
# [DEF:DatasetReviewOrchestrator:Module]
|
||||
# @COMPLEXITY: 5
|
||||
# @SEMANTICS: dataset_review, orchestration, session_lifecycle, intake, recovery
|
||||
# @PURPOSE: Coordinate dataset review session startup and lifecycle-safe intake recovery for one authenticated user.
|
||||
# @LAYER: Domain
|
||||
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticSourceResolver]
|
||||
# @RELATION: [DEPENDS_ON] ->[ClarificationEngine]
|
||||
# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
|
||||
# @RELATION: [DEPENDS_ON] ->[SupersetCompilationAdapter]
|
||||
# @RELATION: [DEPENDS_ON] ->[TaskManager]
|
||||
# @PRE: session mutations must execute inside a persisted session boundary scoped to one authenticated user.
|
||||
# @POST: state transitions are persisted atomically and emit observable progress for long-running steps.
|
||||
# @SIDE_EFFECT: creates task records, updates session aggregates, triggers upstream Superset calls, persists audit artifacts.
|
||||
# @DATA_CONTRACT: Input[SessionCommand] -> Output[DatasetReviewSession | CompiledPreview | DatasetRunContext]
|
||||
# @INVARIANT: Launch is blocked unless a current session has no open blocking findings, all launch-sensitive mappings are approved, and a non-stale Superset-generated compiled preview matches the current input fingerprint.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# [DEF:DatasetReviewOrchestrator.imports:Block]
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from src.core.config_manager import ConfigManager
|
||||
from src.core.logger import belief_scope, logger
|
||||
from src.core.task_manager import TaskManager
|
||||
from src.core.utils.superset_context_extractor import (
|
||||
SupersetContextExtractor,
|
||||
SupersetParsedContext,
|
||||
)
|
||||
from src.models.auth import User
|
||||
from src.models.dataset_review import (
|
||||
BusinessSummarySource,
|
||||
ConfidenceState,
|
||||
DatasetProfile,
|
||||
DatasetReviewSession,
|
||||
FindingArea,
|
||||
FindingSeverity,
|
||||
RecommendedAction,
|
||||
ReadinessState,
|
||||
ResolutionState,
|
||||
SessionPhase,
|
||||
SessionStatus,
|
||||
ValidationFinding,
|
||||
)
|
||||
from src.services.dataset_review.repositories.session_repository import (
|
||||
DatasetReviewSessionRepository,
|
||||
)
|
||||
from src.services.dataset_review.semantic_resolver import SemanticSourceResolver
|
||||
# [/DEF:DatasetReviewOrchestrator.imports:Block]
|
||||
|
||||
|
||||
# [DEF:StartSessionCommand:Class]
# @COMPLEXITY: 2
# @PURPOSE: Typed input contract for starting a dataset review session.
@dataclass
class StartSessionCommand:
    """Input contract for ``DatasetReviewOrchestrator.start_session``.

    Attributes:
        user: Authenticated user that will own the new session.
        environment_id: Identifier of the target environment (resolved via
            ``ConfigManager.get_environment`` during start).
        source_kind: Intake kind; ``start_session`` accepts only
            ``"superset_link"`` or ``"dataset_selection"``.
        source_input: Raw Superset link or dataset-selection payload.
    """

    user: User
    environment_id: str
    source_kind: str
    source_input: str
# [/DEF:StartSessionCommand:Class]
|
||||
|
||||
|
||||
# [DEF:StartSessionResult:Class]
# @COMPLEXITY: 2
# @PURPOSE: Session-start result carrying the persisted session and intake recovery metadata.
@dataclass
class StartSessionResult:
    """Outcome of a session start.

    Attributes:
        session: The persisted (committed and refreshed) review session.
        parsed_context: Superset link parse result; ``None`` for the
            dataset-selection intake path.
        findings: Warning-level findings produced when Superset context was
            only partially recovered; empty otherwise.
    """

    session: DatasetReviewSession
    parsed_context: Optional[SupersetParsedContext] = None
    findings: List[ValidationFinding] = field(default_factory=list)
# [/DEF:StartSessionResult:Class]
|
||||
|
||||
|
||||
# [DEF:DatasetReviewOrchestrator:Class]
|
||||
# @COMPLEXITY: 5
|
||||
# @PURPOSE: Coordinate safe session startup while preserving cross-user isolation and explicit partial recovery.
|
||||
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
|
||||
# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
|
||||
# @RELATION: [DEPENDS_ON] ->[TaskManager]
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticSourceResolver]
|
||||
# @RELATION: [DEPENDS_ON] ->[ConfigManager]
|
||||
# @PRE: constructor dependencies are valid and tied to the current request/task scope.
|
||||
# @POST: orchestrator instance can execute session-scoped mutations for one authenticated user.
|
||||
# @SIDE_EFFECT: downstream operations may persist session/profile/finding state and enqueue background tasks.
|
||||
# @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
|
||||
# @INVARIANT: session ownership is preserved on every mutation and recovery remains explicit when partial.
|
||||
class DatasetReviewOrchestrator:
|
||||
    # [DEF:DatasetReviewOrchestrator.__init__:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Bind repository, config, and task dependencies required by the orchestration boundary.
    # @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
    # @RELATION: [DEPENDS_ON] ->[ConfigManager]
    def __init__(
        self,
        repository: DatasetReviewSessionRepository,
        config_manager: ConfigManager,
        task_manager: Optional[TaskManager] = None,
        semantic_resolver: Optional[SemanticSourceResolver] = None,
    ) -> None:
        """Wire the orchestrator to its persistence, config, and task collaborators.

        Args:
            repository: Ownership-scoped session repository used for all reads/writes.
            config_manager: Source of environment configuration lookups.
            task_manager: Optional task infrastructure; when ``None`` the
                orchestrator continues synchronously without enqueuing recovery.
            semantic_resolver: Optional resolver override; a default
                ``SemanticSourceResolver`` is created when not supplied.
        """
        self.repository = repository
        self.config_manager = config_manager
        # May legitimately stay None; _enqueue_recovery_task degrades gracefully.
        self.task_manager = task_manager
        self.semantic_resolver = semantic_resolver or SemanticSourceResolver()
    # [/DEF:DatasetReviewOrchestrator.__init__:Function]
|
||||
|
||||
    # [DEF:DatasetReviewOrchestrator.start_session:Function]
    # @COMPLEXITY: 5
    # @PURPOSE: Initialize a new session from a Superset link or dataset selection and trigger context recovery.
    # @RELATION: [DEPENDS_ON] ->[SessionRepo]
    # @RELATION: [CALLS] ->[SupersetContextExtractor.parse_superset_link]
    # @RELATION: [CALLS] ->[create_task]
    # @PRE: source input is non-empty and environment is accessible.
    # @POST: session exists in persisted storage with intake/recovery state and task linkage when async work is required.
    # @SIDE_EFFECT: persists session and may enqueue recovery task.
    # @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
    # @INVARIANT: no cross-user session leakage occurs; session and follow-up task remain owned by the authenticated user.
    def start_session(self, command: StartSessionCommand) -> StartSessionResult:
        """Create and persist a new dataset review session for one user.

        Flow: validate inputs -> resolve intake (Superset link parse or
        dataset-selection normalization) -> persist the session shell ->
        persist the initial profile and any partial-recovery findings ->
        optionally link a background recovery task.

        Args:
            command: Typed start request (user, environment, source kind/input).

        Returns:
            StartSessionResult with the persisted session, optional parsed
            Superset context, and any partial-recovery findings.

        Raises:
            ValueError: On empty source input, unsupported source kind, or
                an unknown environment id.
        """
        with belief_scope("DatasetReviewOrchestrator.start_session"):
            # Normalize all caller-provided strings once, up front.
            normalized_source_kind = str(command.source_kind or "").strip()
            normalized_source_input = str(command.source_input or "").strip()
            normalized_environment_id = str(command.environment_id or "").strip()

            if not normalized_source_input:
                logger.explore("Blocked dataset review session start due to empty source input")
                raise ValueError("source_input must be non-empty")

            if normalized_source_kind not in {"superset_link", "dataset_selection"}:
                logger.explore(
                    "Blocked dataset review session start due to unsupported source kind",
                    extra={"source_kind": normalized_source_kind},
                )
                raise ValueError("source_kind must be 'superset_link' or 'dataset_selection'")

            environment = self.config_manager.get_environment(normalized_environment_id)
            if environment is None:
                logger.explore(
                    "Blocked dataset review session start because environment was not found",
                    extra={"environment_id": normalized_environment_id},
                )
                raise ValueError("Environment not found")

            logger.reason(
                "Starting dataset review session",
                extra={
                    "user_id": command.user.id,
                    "environment_id": normalized_environment_id,
                    "source_kind": normalized_source_kind,
                },
            )

            # Defaults assume the Superset-link path (recovery phase, importing);
            # the dataset-selection branch below overrides them.
            parsed_context: Optional[SupersetParsedContext] = None
            findings: List[ValidationFinding] = []
            dataset_ref = normalized_source_input
            dataset_id: Optional[int] = None
            dashboard_id: Optional[int] = None
            readiness_state = ReadinessState.IMPORTING
            recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
            current_phase = SessionPhase.RECOVERY

            if normalized_source_kind == "superset_link":
                extractor = SupersetContextExtractor(environment)
                parsed_context = extractor.parse_superset_link(normalized_source_input)
                dataset_ref = parsed_context.dataset_ref
                dataset_id = parsed_context.dataset_id
                dashboard_id = parsed_context.dashboard_id

                if parsed_context.partial_recovery:
                    # Partial recovery keeps the session usable but flags it
                    # explicitly via warning findings instead of blocking.
                    readiness_state = ReadinessState.RECOVERY_REQUIRED
                    recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
                    findings.extend(self._build_partial_recovery_findings(parsed_context))
                else:
                    readiness_state = ReadinessState.REVIEW_READY
            else:
                # Dataset selection skips recovery entirely and lands in review.
                dataset_ref, dataset_id = self._parse_dataset_selection(normalized_source_input)
                readiness_state = ReadinessState.REVIEW_READY
                current_phase = SessionPhase.REVIEW

            session = DatasetReviewSession(
                user_id=command.user.id,
                environment_id=normalized_environment_id,
                source_kind=normalized_source_kind,
                source_input=normalized_source_input,
                dataset_ref=dataset_ref,
                dataset_id=dataset_id,
                dashboard_id=dashboard_id,
                readiness_state=readiness_state,
                recommended_action=recommended_action,
                status=SessionStatus.ACTIVE,
                current_phase=current_phase,
            )
            persisted_session = self.repository.create_session(session)

            # Persist an initial profile immediately so detail views and
            # exports are usable right after intake.
            profile = self._build_initial_profile(
                session_id=persisted_session.session_id,
                parsed_context=parsed_context,
                dataset_ref=dataset_ref,
            )
            persisted_session = self.repository.save_profile_and_findings(
                persisted_session.session_id,
                command.user.id,
                profile,
                findings,
            )

            # Best-effort: returns None when no task infrastructure is wired.
            active_task_id = self._enqueue_recovery_task(
                command=command,
                session=persisted_session,
                parsed_context=parsed_context,
            )
            if active_task_id:
                persisted_session.active_task_id = active_task_id
                self.repository.db.commit()
                self.repository.db.refresh(persisted_session)
                logger.reason(
                    "Linked recovery task to started dataset review session",
                    extra={"session_id": persisted_session.session_id, "task_id": active_task_id},
                )

            logger.reflect(
                "Dataset review session start completed",
                extra={
                    "session_id": persisted_session.session_id,
                    "dataset_ref": persisted_session.dataset_ref,
                    "dataset_id": persisted_session.dataset_id,
                    "dashboard_id": persisted_session.dashboard_id,
                    "readiness_state": persisted_session.readiness_state.value,
                    "active_task_id": persisted_session.active_task_id,
                    "finding_count": len(findings),
                },
            )
            return StartSessionResult(
                session=persisted_session,
                parsed_context=parsed_context,
                findings=findings,
            )
    # [/DEF:DatasetReviewOrchestrator.start_session:Function]
|
||||
|
||||
# [DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
|
||||
# @COMPLEXITY: 3
|
||||
# @PURPOSE: Normalize dataset-selection payload into canonical session references.
|
||||
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
|
||||
def _parse_dataset_selection(self, source_input: str) -> tuple[str, Optional[int]]:
|
||||
normalized = str(source_input or "").strip()
|
||||
if not normalized:
|
||||
raise ValueError("dataset selection input must be non-empty")
|
||||
|
||||
if normalized.isdigit():
|
||||
dataset_id = int(normalized)
|
||||
return f"dataset:{dataset_id}", dataset_id
|
||||
|
||||
if normalized.startswith("dataset:"):
|
||||
suffix = normalized.split(":", 1)[1].strip()
|
||||
if suffix.isdigit():
|
||||
return normalized, int(suffix)
|
||||
return normalized, None
|
||||
|
||||
return normalized, None
|
||||
# [/DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
|
||||
|
||||
# [DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
|
||||
# @COMPLEXITY: 3
|
||||
# @PURPOSE: Create the first profile snapshot so exports and detail views remain usable immediately after intake.
|
||||
# @RELATION: [DEPENDS_ON] ->[DatasetProfile]
|
||||
def _build_initial_profile(
|
||||
self,
|
||||
session_id: str,
|
||||
parsed_context: Optional[SupersetParsedContext],
|
||||
dataset_ref: str,
|
||||
) -> DatasetProfile:
|
||||
dataset_name = dataset_ref.split(".")[-1] if dataset_ref else "Unresolved dataset"
|
||||
business_summary = (
|
||||
f"Review session initialized for {dataset_ref}."
|
||||
if dataset_ref
|
||||
else "Review session initialized with unresolved dataset context."
|
||||
)
|
||||
confidence_state = (
|
||||
ConfidenceState.MIXED
|
||||
if parsed_context and parsed_context.partial_recovery
|
||||
else ConfidenceState.MOSTLY_CONFIRMED
|
||||
)
|
||||
return DatasetProfile(
|
||||
session_id=session_id,
|
||||
dataset_name=dataset_name or "Unresolved dataset",
|
||||
schema_name=dataset_ref.split(".")[0] if "." in dataset_ref else None,
|
||||
business_summary=business_summary,
|
||||
business_summary_source=BusinessSummarySource.IMPORTED,
|
||||
description="Initial review profile created from source intake.",
|
||||
dataset_type="unknown",
|
||||
is_sqllab_view=False,
|
||||
completeness_score=0.25,
|
||||
confidence_state=confidence_state,
|
||||
has_blocking_findings=False,
|
||||
has_warning_findings=bool(parsed_context and parsed_context.partial_recovery),
|
||||
manual_summary_locked=False,
|
||||
)
|
||||
# [/DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
|
||||
|
||||
# [DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
|
||||
# @COMPLEXITY: 4
|
||||
# @PURPOSE: Project partial Superset intake recovery into explicit findings without blocking session usability.
|
||||
# @RELATION: [DEPENDS_ON] ->[ValidationFinding]
|
||||
# @PRE: parsed_context.partial_recovery is true.
|
||||
# @POST: returns warning-level findings that preserve usable but incomplete state.
|
||||
# @SIDE_EFFECT: none beyond structured finding creation.
|
||||
# @DATA_CONTRACT: Input[SupersetParsedContext] -> Output[List[ValidationFinding]]
|
||||
def _build_partial_recovery_findings(
|
||||
self,
|
||||
parsed_context: SupersetParsedContext,
|
||||
) -> List[ValidationFinding]:
|
||||
findings: List[ValidationFinding] = []
|
||||
for unresolved_ref in parsed_context.unresolved_references:
|
||||
findings.append(
|
||||
ValidationFinding(
|
||||
area=FindingArea.SOURCE_INTAKE,
|
||||
severity=FindingSeverity.WARNING,
|
||||
code="PARTIAL_SUPERSET_RECOVERY",
|
||||
title="Superset context recovered partially",
|
||||
message=(
|
||||
"Session remains usable, but some Superset context requires review: "
|
||||
f"{unresolved_ref.replace('_', ' ')}."
|
||||
),
|
||||
resolution_state=ResolutionState.OPEN,
|
||||
caused_by_ref=unresolved_ref,
|
||||
)
|
||||
)
|
||||
return findings
|
||||
# [/DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
|
||||
|
||||
# [DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
|
||||
# @COMPLEXITY: 4
|
||||
# @PURPOSE: Link session start to observable async recovery when task infrastructure is available.
|
||||
# @RELATION: [CALLS] ->[create_task]
|
||||
# @PRE: session is already persisted.
|
||||
# @POST: returns task identifier when a task could be enqueued, otherwise None.
|
||||
# @SIDE_EFFECT: may create one background task for progressive recovery.
|
||||
# @DATA_CONTRACT: Input[StartSessionCommand,DatasetReviewSession,SupersetParsedContext|None] -> Output[task_id:str|None]
|
||||
def _enqueue_recovery_task(
|
||||
self,
|
||||
command: StartSessionCommand,
|
||||
session: DatasetReviewSession,
|
||||
parsed_context: Optional[SupersetParsedContext],
|
||||
) -> Optional[str]:
|
||||
if self.task_manager is None:
|
||||
logger.reason(
|
||||
"Dataset review session started without task manager; continuing synchronously",
|
||||
extra={"session_id": session.session_id},
|
||||
)
|
||||
return None
|
||||
|
||||
task_params: Dict[str, Any] = {
|
||||
"session_id": session.session_id,
|
||||
"user_id": command.user.id,
|
||||
"environment_id": session.environment_id,
|
||||
"source_kind": session.source_kind,
|
||||
"source_input": session.source_input,
|
||||
"dataset_ref": session.dataset_ref,
|
||||
"dataset_id": session.dataset_id,
|
||||
"dashboard_id": session.dashboard_id,
|
||||
"partial_recovery": bool(parsed_context and parsed_context.partial_recovery),
|
||||
}
|
||||
|
||||
create_task = getattr(self.task_manager, "create_task", None)
|
||||
if create_task is None:
|
||||
logger.explore("Task manager has no create_task method; skipping recovery enqueue")
|
||||
return None
|
||||
|
||||
try:
|
||||
task_object = create_task(
|
||||
plugin_id="dataset-review-recovery",
|
||||
params=task_params,
|
||||
)
|
||||
except TypeError:
|
||||
logger.explore(
|
||||
"Recovery task enqueue skipped because task manager create_task contract is incompatible",
|
||||
extra={"session_id": session.session_id},
|
||||
)
|
||||
return None
|
||||
|
||||
task_id = getattr(task_object, "id", None)
|
||||
return str(task_id) if task_id else None
|
||||
# [/DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
|
||||
# [/DEF:DatasetReviewOrchestrator:Class]
|
||||
|
||||
# [/DEF:DatasetReviewOrchestrator:Module]
|
||||
@@ -8,6 +8,9 @@
|
||||
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
|
||||
# @PRE: repository operations execute within authenticated request or task scope.
|
||||
# @POST: session aggregate reads are structurally consistent and writes preserve ownership and version semantics.
|
||||
# @SIDE_EFFECT: reads and writes SQLAlchemy-backed session aggregates.
|
||||
# @DATA_CONTRACT: Input[SessionMutation] -> Output[PersistedSessionAggregate]
|
||||
# @INVARIANT: answers, mapping approvals, preview artifacts, and launch snapshots are never attributed to the wrong user or session.
|
||||
|
||||
from typing import Optional, List
|
||||
from sqlalchemy import or_
|
||||
@@ -22,27 +25,51 @@ from src.models.dataset_review import (
|
||||
)
|
||||
from src.core.logger import belief_scope
|
||||
|
||||
# [DEF:SessionRepo:Class]
|
||||
# @COMPLEXITY: 4
|
||||
# @PURPOSE: Enforce ownership-scoped persistence and retrieval for dataset review session aggregates.
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetProfile]
|
||||
# @RELATION: [DEPENDS_ON] -> [ValidationFinding]
|
||||
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
|
||||
# @PRE: constructor receives a live SQLAlchemy session and callers provide authenticated user scope for guarded reads and writes.
|
||||
# @POST: repository methods return ownership-scoped aggregates or persisted child records without changing domain meaning.
|
||||
# @SIDE_EFFECT: mutates and queries the persistence layer through the injected database session.
|
||||
# @DATA_CONTRACT: Input[OwnedSessionQuery|SessionMutation] -> Output[PersistedSessionAggregate|PersistedChildRecord]
|
||||
class DatasetReviewSessionRepository:
|
||||
"""
|
||||
@PURPOSE: Persist and retrieve dataset review session aggregates.
|
||||
@INVARIANT: ownership_scope -> All operations must respect the session owner's user_id.
|
||||
"""
|
||||
|
||||
    # [DEF:init_repo:Function]
    def __init__(self, db: Session):
        """Bind the repository to a live SQLAlchemy session.

        Args:
            db: Active SQLAlchemy ``Session``; all reads, writes, and commits
                in this repository go through it. Note that commits here also
                flush any other pending work on the same session.
        """
        self.db = db
    # [/DEF:init_repo:Function]
|
||||
|
||||
    # [DEF:create_sess:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Persist an initial dataset review session shell.
    # @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
    # @PRE: session is a new aggregate root bound to the current ownership scope.
    # @POST: session is committed, refreshed, and returned with persisted identifiers.
    # @SIDE_EFFECT: inserts a session row and commits the active transaction.
    # @DATA_CONTRACT: Input[DatasetReviewSession] -> Output[DatasetReviewSession]
    def create_session(self, session: DatasetReviewSession) -> DatasetReviewSession:
        """
        @PURPOSE: Persist initial session shell.

        Adds the aggregate, commits immediately, and refreshes the instance so
        database-generated identifiers/defaults are populated on return.
        NOTE(review): the commit finalizes the whole active transaction, so any
        other pending changes on the shared db session are persisted too.
        """
        with belief_scope("DatasetReviewSessionRepository.create_session"):
            self.db.add(session)
            self.db.commit()
            self.db.refresh(session)
            return session
    # [/DEF:create_sess:Function]
|
||||
|
||||
# [DEF:load_detail:Function]
|
||||
# @COMPLEXITY: 3
|
||||
# @PURPOSE: Return the full session aggregate for API and frontend resume flows.
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
|
||||
# @RELATION: [DEPENDS_ON] -> [SessionCollaborator]
|
||||
def load_session_detail(self, session_id: str, user_id: str) -> Optional[DatasetReviewSession]:
|
||||
"""
|
||||
@PURPOSE: Return the full session aggregate for API/frontend use.
|
||||
@PRE: user_id must match session owner or authorized collaborator.
|
||||
"""
|
||||
with belief_scope("DatasetReviewSessionRepository.load_session_detail"):
|
||||
@@ -70,17 +97,25 @@ class DatasetReviewSessionRepository:
|
||||
)
|
||||
)\
|
||||
.first()
|
||||
# [/DEF:load_detail:Function]
|
||||
|
||||
# [DEF:save_prof_find:Function]
|
||||
# @COMPLEXITY: 4
|
||||
# @PURPOSE: Persist profile state and replace validation findings for an owned session in one transaction.
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetProfile]
|
||||
# @RELATION: [DEPENDS_ON] -> [ValidationFinding]
|
||||
# @PRE: session_id belongs to user_id and the supplied profile/findings belong to the same aggregate scope.
|
||||
# @POST: stored profile matches the current session and findings are replaced by the supplied collection.
|
||||
# @SIDE_EFFECT: updates profile rows, deletes stale findings, inserts current findings, and commits the transaction.
|
||||
# @DATA_CONTRACT: Input[ProfileAndFindingsMutation] -> Output[DatasetReviewSession]
|
||||
def save_profile_and_findings(self, session_id: str, user_id: str, profile: DatasetProfile, findings: List[ValidationFinding]) -> DatasetReviewSession:
|
||||
"""
|
||||
@PURPOSE: Persist profile and validation state together.
|
||||
"""
|
||||
with belief_scope("DatasetReviewSessionRepository.save_profile_and_findings"):
|
||||
session = self.db.query(DatasetReviewSession).filter(
|
||||
DatasetReviewSession.session_id == session_id,
|
||||
DatasetReviewSession.user_id == user_id
|
||||
).first()
|
||||
|
||||
|
||||
if not session:
|
||||
raise ValueError("Session not found or access denied")
|
||||
|
||||
@@ -90,24 +125,31 @@ class DatasetReviewSessionRepository:
|
||||
if existing_profile:
|
||||
profile.profile_id = existing_profile.profile_id
|
||||
self.db.merge(profile)
|
||||
|
||||
|
||||
# Remove old findings for this session to avoid stale data
|
||||
self.db.query(ValidationFinding).filter(
|
||||
ValidationFinding.session_id == session_id
|
||||
).delete()
|
||||
|
||||
|
||||
# Add new findings
|
||||
for finding in findings:
|
||||
finding.session_id = session_id
|
||||
self.db.add(finding)
|
||||
|
||||
|
||||
self.db.commit()
|
||||
return self.load_session_detail(session_id, user_id)
|
||||
# [/DEF:save_prof_find:Function]
|
||||
|
||||
# [DEF:save_prev:Function]
|
||||
# @COMPLEXITY: 4
|
||||
# @PURPOSE: Persist a preview snapshot and mark prior session previews stale.
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
|
||||
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
|
||||
# @PRE: session_id belongs to user_id and preview is prepared for the same session aggregate.
|
||||
# @POST: preview is persisted and the session points to the latest preview identifier.
|
||||
# @SIDE_EFFECT: updates prior preview statuses, inserts a preview row, mutates the parent session, and commits.
|
||||
# @DATA_CONTRACT: Input[PreviewMutation] -> Output[CompiledPreview]
|
||||
def save_preview(self, session_id: str, user_id: str, preview: CompiledPreview) -> CompiledPreview:
|
||||
"""
|
||||
@PURPOSE: Persist compiled preview attempt and mark older fingerprints stale.
|
||||
"""
|
||||
with belief_scope("DatasetReviewSessionRepository.save_preview"):
|
||||
session = self.db.query(DatasetReviewSession).filter(
|
||||
DatasetReviewSession.session_id == session_id,
|
||||
@@ -125,15 +167,22 @@ class DatasetReviewSessionRepository:
|
||||
self.db.add(preview)
|
||||
self.db.flush()
|
||||
session.last_preview_id = preview.preview_id
|
||||
|
||||
|
||||
self.db.commit()
|
||||
self.db.refresh(preview)
|
||||
return preview
|
||||
# [/DEF:save_prev:Function]
|
||||
|
||||
# [DEF:save_run_ctx:Function]
|
||||
# @COMPLEXITY: 4
|
||||
# @PURPOSE: Persist an immutable launch audit snapshot for an owned session.
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetRunContext]
|
||||
# @PRE: session_id belongs to user_id and run_context targets the same aggregate.
|
||||
# @POST: run context is persisted and linked as the latest launch snapshot for the session.
|
||||
# @SIDE_EFFECT: inserts a run-context row, mutates the parent session pointer, and commits.
|
||||
# @DATA_CONTRACT: Input[RunContextMutation] -> Output[DatasetRunContext]
|
||||
def save_run_context(self, session_id: str, user_id: str, run_context: DatasetRunContext) -> DatasetRunContext:
|
||||
"""
|
||||
@PURPOSE: Persist immutable launch audit snapshot.
|
||||
"""
|
||||
with belief_scope("DatasetReviewSessionRepository.save_run_context"):
|
||||
session = self.db.query(DatasetReviewSession).filter(
|
||||
DatasetReviewSession.session_id == session_id,
|
||||
@@ -146,18 +195,22 @@ class DatasetReviewSessionRepository:
|
||||
self.db.add(run_context)
|
||||
self.db.flush()
|
||||
session.last_run_context_id = run_context.run_context_id
|
||||
|
||||
|
||||
self.db.commit()
|
||||
self.db.refresh(run_context)
|
||||
return run_context
|
||||
# [/DEF:save_run_ctx:Function]
|
||||
|
||||
# [DEF:list_user_sess:Function]
|
||||
# @COMPLEXITY: 3
|
||||
# @PURPOSE: List review sessions owned by a specific user ordered by most recent update.
|
||||
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
|
||||
def list_sessions_for_user(self, user_id: str) -> List[DatasetReviewSession]:
|
||||
"""
|
||||
@PURPOSE: List all review sessions owned by a user.
|
||||
"""
|
||||
with belief_scope("DatasetReviewSessionRepository.list_sessions_for_user"):
|
||||
return self.db.query(DatasetReviewSession).filter(
|
||||
DatasetReviewSession.user_id == user_id
|
||||
).order_by(DatasetReviewSession.updated_at.desc()).all()
|
||||
# [/DEF:list_user_sess:Function]
|
||||
# [/DEF:SessionRepo:Class]
|
||||
|
||||
# [/DEF:DatasetReviewSessionRepository:Module]
|
||||
342
backend/src/services/dataset_review/semantic_resolver.py
Normal file
342
backend/src/services/dataset_review/semantic_resolver.py
Normal file
@@ -0,0 +1,342 @@
|
||||
# [DEF:SemanticSourceResolver:Module]
|
||||
# @COMPLEXITY: 4
|
||||
# @SEMANTICS: dataset_review, semantic_resolution, dictionary, trusted_sources, ranking
|
||||
# @PURPOSE: Resolve and rank semantic candidates from trusted dictionary-like sources before any inferred fallback.
|
||||
# @LAYER: Domain
|
||||
# @RELATION: [DEPENDS_ON] ->[LLMProviderService]
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticSource]
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
|
||||
# @PRE: selected source and target field set must be known.
|
||||
# @POST: candidate ranking follows the configured confidence hierarchy and unresolved fuzzy matches remain reviewable.
|
||||
# @SIDE_EFFECT: may create conflict findings and semantic candidate records.
|
||||
# @INVARIANT: Manual overrides are never silently replaced by imported, inferred, or AI-generated values.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# [DEF:SemanticSourceResolver.imports:Block]
|
||||
from dataclasses import dataclass, field
|
||||
from difflib import SequenceMatcher
|
||||
from typing import Any, Dict, Iterable, List, Mapping, Optional
|
||||
|
||||
from src.core.logger import belief_scope, logger
|
||||
from src.models.dataset_review import (
|
||||
CandidateMatchType,
|
||||
CandidateStatus,
|
||||
FieldProvenance,
|
||||
)
|
||||
# [/DEF:SemanticSourceResolver.imports:Block]
|
||||
|
||||
|
||||
# [DEF:DictionaryResolutionResult:Class]
# @COMPLEXITY: 2
# @PURPOSE: Carries field-level dictionary resolution output with explicit review and partial-recovery state.
@dataclass
class DictionaryResolutionResult:
    """Field-level outcome of resolving semantics from one dictionary-like source.

    Attributes:
        source_ref: Stable reference of the semantic source that was resolved.
        resolved_fields: Per-field resolution payload dicts (candidates,
            provenance, review flags).
        unresolved_fields: Names of fields with no usable resolution.
        partial_recovery: Flag marking the resolution as incomplete —
            presumably set when unresolved_fields is non-empty; confirm
            against the producing resolver methods.
    """

    source_ref: str
    resolved_fields: List[Dict[str, Any]] = field(default_factory=list)
    unresolved_fields: List[str] = field(default_factory=list)
    partial_recovery: bool = False
# [/DEF:DictionaryResolutionResult:Class]
|
||||
|
||||
|
||||
# [DEF:SemanticSourceResolver:Class]
|
||||
# @COMPLEXITY: 4
|
||||
# @PURPOSE: Resolve semantic candidates from trusted sources while preserving manual locks and confidence ordering.
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
|
||||
# @PRE: source payload and target field collection are provided by the caller.
|
||||
# @POST: result contains confidence-ranked candidates and does not overwrite manual locks implicitly.
|
||||
# @SIDE_EFFECT: emits semantic trace logs for ranking and fallback decisions.
|
||||
class SemanticSourceResolver:
|
||||
# [DEF:SemanticSourceResolver.resolve_from_file:Function]
|
||||
# @COMPLEXITY: 2
|
||||
# @PURPOSE: Normalize uploaded semantic file records into field-level candidates.
|
||||
def resolve_from_file(self, source_payload: Mapping[str, Any], fields: Iterable[Mapping[str, Any]]) -> DictionaryResolutionResult:
|
||||
return DictionaryResolutionResult(source_ref=str(source_payload.get("source_ref") or "uploaded_file"))
|
||||
# [/DEF:SemanticSourceResolver.resolve_from_file:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver.resolve_from_dictionary:Function]
|
||||
# @COMPLEXITY: 4
|
||||
# @PURPOSE: Resolve candidates from connected tabular dictionary sources.
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
|
||||
# @PRE: dictionary source exists and fields contain stable field_name values.
|
||||
# @POST: returns confidence-ranked candidates where exact dictionary matches outrank fuzzy matches and unresolved fields stay explicit.
|
||||
# @SIDE_EFFECT: emits belief-state logs describing trusted-match and partial-recovery outcomes.
|
||||
# @DATA_CONTRACT: Input[source_payload:Mapping,fields:Iterable] -> Output[DictionaryResolutionResult]
|
||||
def resolve_from_dictionary(
|
||||
self,
|
||||
source_payload: Mapping[str, Any],
|
||||
fields: Iterable[Mapping[str, Any]],
|
||||
) -> DictionaryResolutionResult:
|
||||
with belief_scope("SemanticSourceResolver.resolve_from_dictionary"):
|
||||
source_ref = str(source_payload.get("source_ref") or "").strip()
|
||||
dictionary_rows = source_payload.get("rows")
|
||||
|
||||
if not source_ref:
|
||||
logger.explore("Dictionary semantic source is missing source_ref")
|
||||
raise ValueError("Dictionary semantic source must include source_ref")
|
||||
|
||||
if not isinstance(dictionary_rows, list) or not dictionary_rows:
|
||||
logger.explore(
|
||||
"Dictionary semantic source has no usable rows",
|
||||
extra={"source_ref": source_ref},
|
||||
)
|
||||
raise ValueError("Dictionary semantic source must include non-empty rows")
|
||||
|
||||
logger.reason(
|
||||
"Resolving semantics from trusted dictionary source",
|
||||
extra={"source_ref": source_ref, "row_count": len(dictionary_rows)},
|
||||
)
|
||||
|
||||
normalized_rows = [self._normalize_dictionary_row(row) for row in dictionary_rows if isinstance(row, Mapping)]
|
||||
row_index = {
|
||||
row["field_key"]: row
|
||||
for row in normalized_rows
|
||||
if row.get("field_key")
|
||||
}
|
||||
|
||||
resolved_fields: List[Dict[str, Any]] = []
|
||||
unresolved_fields: List[str] = []
|
||||
|
||||
for raw_field in fields:
|
||||
field_name = str(raw_field.get("field_name") or "").strip()
|
||||
if not field_name:
|
||||
continue
|
||||
|
||||
is_locked = bool(raw_field.get("is_locked"))
|
||||
if is_locked:
|
||||
logger.reason(
|
||||
"Preserving manual lock during dictionary resolution",
|
||||
extra={"field_name": field_name},
|
||||
)
|
||||
resolved_fields.append(
|
||||
{
|
||||
"field_name": field_name,
|
||||
"applied_candidate": None,
|
||||
"candidates": [],
|
||||
"provenance": FieldProvenance.MANUAL_OVERRIDE.value,
|
||||
"needs_review": False,
|
||||
"has_conflict": False,
|
||||
"is_locked": True,
|
||||
"status": "preserved_manual",
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
exact_match = row_index.get(self._normalize_key(field_name))
|
||||
candidates: List[Dict[str, Any]] = []
|
||||
|
||||
if exact_match is not None:
|
||||
logger.reason(
|
||||
"Resolved exact dictionary match",
|
||||
extra={"field_name": field_name, "source_ref": source_ref},
|
||||
)
|
||||
candidates.append(
|
||||
self._build_candidate_payload(
|
||||
rank=1,
|
||||
match_type=CandidateMatchType.EXACT,
|
||||
confidence_score=1.0,
|
||||
row=exact_match,
|
||||
)
|
||||
)
|
||||
else:
|
||||
fuzzy_matches = self._find_fuzzy_matches(field_name, normalized_rows)
|
||||
for rank_offset, fuzzy_match in enumerate(fuzzy_matches, start=1):
|
||||
candidates.append(
|
||||
self._build_candidate_payload(
|
||||
rank=rank_offset,
|
||||
match_type=CandidateMatchType.FUZZY,
|
||||
confidence_score=float(fuzzy_match["score"]),
|
||||
row=fuzzy_match["row"],
|
||||
)
|
||||
)
|
||||
|
||||
if not candidates:
|
||||
unresolved_fields.append(field_name)
|
||||
resolved_fields.append(
|
||||
{
|
||||
"field_name": field_name,
|
||||
"applied_candidate": None,
|
||||
"candidates": [],
|
||||
"provenance": FieldProvenance.UNRESOLVED.value,
|
||||
"needs_review": True,
|
||||
"has_conflict": False,
|
||||
"is_locked": False,
|
||||
"status": "unresolved",
|
||||
}
|
||||
)
|
||||
logger.explore(
|
||||
"No trusted dictionary match found for field",
|
||||
extra={"field_name": field_name, "source_ref": source_ref},
|
||||
)
|
||||
continue
|
||||
|
||||
ranked_candidates = self.rank_candidates(candidates)
|
||||
applied_candidate = ranked_candidates[0]
|
||||
has_conflict = len(ranked_candidates) > 1
|
||||
provenance = (
|
||||
FieldProvenance.DICTIONARY_EXACT.value
|
||||
if applied_candidate["match_type"] == CandidateMatchType.EXACT.value
|
||||
else FieldProvenance.FUZZY_INFERRED.value
|
||||
)
|
||||
needs_review = applied_candidate["match_type"] != CandidateMatchType.EXACT.value
|
||||
|
||||
resolved_fields.append(
|
||||
{
|
||||
"field_name": field_name,
|
||||
"applied_candidate": applied_candidate,
|
||||
"candidates": ranked_candidates,
|
||||
"provenance": provenance,
|
||||
"needs_review": needs_review,
|
||||
"has_conflict": has_conflict,
|
||||
"is_locked": False,
|
||||
"status": "resolved",
|
||||
}
|
||||
)
|
||||
|
||||
result = DictionaryResolutionResult(
|
||||
source_ref=source_ref,
|
||||
resolved_fields=resolved_fields,
|
||||
unresolved_fields=unresolved_fields,
|
||||
partial_recovery=bool(unresolved_fields),
|
||||
)
|
||||
logger.reflect(
|
||||
"Dictionary resolution completed",
|
||||
extra={
|
||||
"source_ref": source_ref,
|
||||
"resolved_fields": len(resolved_fields),
|
||||
"unresolved_fields": len(unresolved_fields),
|
||||
"partial_recovery": result.partial_recovery,
|
||||
},
|
||||
)
|
||||
return result
|
||||
# [/DEF:SemanticSourceResolver.resolve_from_dictionary:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver.resolve_from_reference_dataset:Function]
|
||||
# @COMPLEXITY: 2
|
||||
# @PURPOSE: Reuse semantic metadata from trusted Superset datasets.
|
||||
def resolve_from_reference_dataset(
|
||||
self,
|
||||
source_payload: Mapping[str, Any],
|
||||
fields: Iterable[Mapping[str, Any]],
|
||||
) -> DictionaryResolutionResult:
|
||||
return DictionaryResolutionResult(source_ref=str(source_payload.get("source_ref") or "reference_dataset"))
|
||||
# [/DEF:SemanticSourceResolver.resolve_from_reference_dataset:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver.rank_candidates:Function]
|
||||
# @COMPLEXITY: 3
|
||||
# @PURPOSE: Apply confidence ordering and determine best candidate per field.
|
||||
# @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
|
||||
def rank_candidates(self, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
ranked = sorted(
|
||||
candidates,
|
||||
key=lambda candidate: (
|
||||
self._match_priority(candidate.get("match_type")),
|
||||
-float(candidate.get("confidence_score", 0.0)),
|
||||
int(candidate.get("candidate_rank", 999)),
|
||||
),
|
||||
)
|
||||
for index, candidate in enumerate(ranked, start=1):
|
||||
candidate["candidate_rank"] = index
|
||||
return ranked
|
||||
# [/DEF:SemanticSourceResolver.rank_candidates:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver.detect_conflicts:Function]
|
||||
# @COMPLEXITY: 2
|
||||
# @PURPOSE: Mark competing candidate sets that require explicit user review.
|
||||
def detect_conflicts(self, candidates: List[Dict[str, Any]]) -> bool:
|
||||
return len(candidates) > 1
|
||||
# [/DEF:SemanticSourceResolver.detect_conflicts:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver.apply_field_decision:Function]
|
||||
# @COMPLEXITY: 2
|
||||
# @PURPOSE: Accept, reject, or manually override a field-level semantic value.
|
||||
def apply_field_decision(self, field_state: Mapping[str, Any], decision: Mapping[str, Any]) -> Dict[str, Any]:
|
||||
merged = dict(field_state)
|
||||
merged.update(decision)
|
||||
return merged
|
||||
# [/DEF:SemanticSourceResolver.apply_field_decision:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver._normalize_dictionary_row:Function]
|
||||
# @COMPLEXITY: 2
|
||||
# @PURPOSE: Normalize one dictionary row into a consistent lookup structure.
|
||||
def _normalize_dictionary_row(self, row: Mapping[str, Any]) -> Dict[str, Any]:
|
||||
field_name = (
|
||||
row.get("field_name")
|
||||
or row.get("column_name")
|
||||
or row.get("name")
|
||||
or row.get("field")
|
||||
)
|
||||
normalized_name = str(field_name or "").strip()
|
||||
return {
|
||||
"field_name": normalized_name,
|
||||
"field_key": self._normalize_key(normalized_name),
|
||||
"verbose_name": row.get("verbose_name") or row.get("label"),
|
||||
"description": row.get("description"),
|
||||
"display_format": row.get("display_format") or row.get("format"),
|
||||
}
|
||||
# [/DEF:SemanticSourceResolver._normalize_dictionary_row:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver._find_fuzzy_matches:Function]
|
||||
# @COMPLEXITY: 2
|
||||
# @PURPOSE: Produce confidence-scored fuzzy matches while keeping them reviewable.
|
||||
def _find_fuzzy_matches(self, field_name: str, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
normalized_target = self._normalize_key(field_name)
|
||||
fuzzy_matches: List[Dict[str, Any]] = []
|
||||
for row in rows:
|
||||
candidate_key = str(row.get("field_key") or "")
|
||||
if not candidate_key:
|
||||
continue
|
||||
score = SequenceMatcher(None, normalized_target, candidate_key).ratio()
|
||||
if score < 0.72:
|
||||
continue
|
||||
fuzzy_matches.append({"row": row, "score": round(score, 3)})
|
||||
fuzzy_matches.sort(key=lambda item: item["score"], reverse=True)
|
||||
return fuzzy_matches[:3]
|
||||
# [/DEF:SemanticSourceResolver._find_fuzzy_matches:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver._build_candidate_payload:Function]
|
||||
# @COMPLEXITY: 2
|
||||
# @PURPOSE: Project normalized dictionary rows into semantic candidate payloads.
|
||||
def _build_candidate_payload(
|
||||
self,
|
||||
rank: int,
|
||||
match_type: CandidateMatchType,
|
||||
confidence_score: float,
|
||||
row: Mapping[str, Any],
|
||||
) -> Dict[str, Any]:
|
||||
return {
|
||||
"candidate_rank": rank,
|
||||
"match_type": match_type.value,
|
||||
"confidence_score": confidence_score,
|
||||
"proposed_verbose_name": row.get("verbose_name"),
|
||||
"proposed_description": row.get("description"),
|
||||
"proposed_display_format": row.get("display_format"),
|
||||
"status": CandidateStatus.PROPOSED.value,
|
||||
}
|
||||
# [/DEF:SemanticSourceResolver._build_candidate_payload:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver._match_priority:Function]
|
||||
# @COMPLEXITY: 2
|
||||
# @PURPOSE: Encode trusted-confidence ordering so exact dictionary reuse beats fuzzy invention.
|
||||
def _match_priority(self, match_type: Optional[str]) -> int:
|
||||
priority = {
|
||||
CandidateMatchType.EXACT.value: 0,
|
||||
CandidateMatchType.REFERENCE.value: 1,
|
||||
CandidateMatchType.FUZZY.value: 2,
|
||||
CandidateMatchType.GENERATED.value: 3,
|
||||
}
|
||||
return priority.get(str(match_type or ""), 99)
|
||||
# [/DEF:SemanticSourceResolver._match_priority:Function]
|
||||
|
||||
# [DEF:SemanticSourceResolver._normalize_key:Function]
|
||||
# @COMPLEXITY: 1
|
||||
# @PURPOSE: Normalize field identifiers for stable exact/fuzzy comparisons.
|
||||
def _normalize_key(self, value: str) -> str:
|
||||
return "".join(ch for ch in str(value or "").strip().lower() if ch.isalnum() or ch == "_")
|
||||
# [/DEF:SemanticSourceResolver._normalize_key:Function]
|
||||
# [/DEF:SemanticSourceResolver:Class]
|
||||
|
||||
# [/DEF:SemanticSourceResolver:Module]
|
||||
Reference in New Issue
Block a user