feat(us1): add dataset review orchestration automatic review slice

This commit is contained in:
2026-03-17 10:57:49 +03:00
parent e916cb1f17
commit 023bacde39
24 changed files with 4870 additions and 131 deletions

View File

@@ -0,0 +1,386 @@
# [DEF:DatasetReviewOrchestrator:Module]
# @COMPLEXITY: 5
# @SEMANTICS: dataset_review, orchestration, session_lifecycle, intake, recovery
# @PURPOSE: Coordinate dataset review session startup and lifecycle-safe intake recovery for one authenticated user.
# @LAYER: Domain
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
# @RELATION: [DEPENDS_ON] ->[SemanticSourceResolver]
# @RELATION: [DEPENDS_ON] ->[ClarificationEngine]
# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
# @RELATION: [DEPENDS_ON] ->[SupersetCompilationAdapter]
# @RELATION: [DEPENDS_ON] ->[TaskManager]
# @PRE: session mutations must execute inside a persisted session boundary scoped to one authenticated user.
# @POST: state transitions are persisted atomically and emit observable progress for long-running steps.
# @SIDE_EFFECT: creates task records, updates session aggregates, triggers upstream Superset calls, persists audit artifacts.
# @DATA_CONTRACT: Input[SessionCommand] -> Output[DatasetReviewSession | CompiledPreview | DatasetRunContext]
# @INVARIANT: Launch is blocked unless a current session has no open blocking findings, all launch-sensitive mappings are approved, and a non-stale Superset-generated compiled preview matches the current input fingerprint.
from __future__ import annotations
# [DEF:DatasetReviewOrchestrator.imports:Block]
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from src.core.config_manager import ConfigManager
from src.core.logger import belief_scope, logger
from src.core.task_manager import TaskManager
from src.core.utils.superset_context_extractor import (
SupersetContextExtractor,
SupersetParsedContext,
)
from src.models.auth import User
from src.models.dataset_review import (
BusinessSummarySource,
ConfidenceState,
DatasetProfile,
DatasetReviewSession,
FindingArea,
FindingSeverity,
RecommendedAction,
ReadinessState,
ResolutionState,
SessionPhase,
SessionStatus,
ValidationFinding,
)
from src.services.dataset_review.repositories.session_repository import (
DatasetReviewSessionRepository,
)
from src.services.dataset_review.semantic_resolver import SemanticSourceResolver
# [/DEF:DatasetReviewOrchestrator.imports:Block]
# [DEF:StartSessionCommand:Class]
# @COMPLEXITY: 2
# @PURPOSE: Typed input contract for starting a dataset review session.
@dataclass
class StartSessionCommand:
    """Input contract for DatasetReviewOrchestrator.start_session."""
    user: User  # authenticated user who will own the session
    environment_id: str  # environment key resolved via ConfigManager.get_environment
    source_kind: str  # expected values: "superset_link" or "dataset_selection"
    source_input: str  # raw Superset link or dataset-selection payload; must be non-empty
# [/DEF:StartSessionCommand:Class]
# [DEF:StartSessionResult:Class]
# @COMPLEXITY: 2
# @PURPOSE: Session-start result carrying the persisted session and intake recovery metadata.
@dataclass
class StartSessionResult:
    """Outcome of start_session: persisted session plus intake-recovery metadata."""
    session: DatasetReviewSession  # persisted aggregate returned by the repository
    parsed_context: Optional[SupersetParsedContext] = None  # only set for superset_link sources
    findings: List[ValidationFinding] = field(default_factory=list)  # warning findings from partial recovery
# [/DEF:StartSessionResult:Class]
# [DEF:DatasetReviewOrchestrator:Class]
# @COMPLEXITY: 5
# @PURPOSE: Coordinate safe session startup while preserving cross-user isolation and explicit partial recovery.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
# @RELATION: [DEPENDS_ON] ->[SupersetContextExtractor]
# @RELATION: [DEPENDS_ON] ->[TaskManager]
# @RELATION: [DEPENDS_ON] ->[SessionRepo]
# @RELATION: [DEPENDS_ON] ->[ConfigManager]
# @PRE: constructor dependencies are valid and tied to the current request/task scope.
# @POST: orchestrator instance can execute session-scoped mutations for one authenticated user.
# @SIDE_EFFECT: downstream operations may persist session/profile/finding state and enqueue background tasks.
# @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
# @INVARIANT: session ownership is preserved on every mutation and recovery remains explicit when partial.
class DatasetReviewOrchestrator:
    """Coordinate dataset review session startup for one authenticated user.

    Wires the session repository, environment configuration, Superset link
    parsing, and optional background-task enqueueing behind a single
    ``start_session`` entry point. All persistence goes through the injected
    ``DatasetReviewSessionRepository``.
    """
    # [DEF:DatasetReviewOrchestrator.__init__:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Bind repository, config, and task dependencies required by the orchestration boundary.
    # @RELATION: [DEPENDS_ON] ->[SessionRepo]
    # @RELATION: [DEPENDS_ON] ->[ConfigManager]
    def __init__(
        self,
        repository: DatasetReviewSessionRepository,
        config_manager: ConfigManager,
        task_manager: Optional[TaskManager] = None,
        semantic_resolver: Optional[SemanticSourceResolver] = None,
    ) -> None:
        """Store collaborators.

        Args:
            repository: ownership-scoped persistence for session aggregates.
            config_manager: resolves environment ids to environment configs.
            task_manager: optional; when absent, recovery runs synchronously.
            semantic_resolver: optional; a default resolver is created when None.
        """
        self.repository = repository
        self.config_manager = config_manager
        self.task_manager = task_manager
        self.semantic_resolver = semantic_resolver or SemanticSourceResolver()
    # [/DEF:DatasetReviewOrchestrator.__init__:Function]
    # [DEF:DatasetReviewOrchestrator.start_session:Function]
    # @COMPLEXITY: 5
    # @PURPOSE: Initialize a new session from a Superset link or dataset selection and trigger context recovery.
    # @RELATION: [DEPENDS_ON] ->[SessionRepo]
    # @RELATION: [CALLS] ->[SupersetContextExtractor.parse_superset_link]
    # @RELATION: [CALLS] ->[create_task]
    # @PRE: source input is non-empty and environment is accessible.
    # @POST: session exists in persisted storage with intake/recovery state and task linkage when async work is required.
    # @SIDE_EFFECT: persists session and may enqueue recovery task.
    # @DATA_CONTRACT: Input[StartSessionCommand] -> Output[StartSessionResult]
    # @INVARIANT: no cross-user session leakage occurs; session and follow-up task remain owned by the authenticated user.
    def start_session(self, command: StartSessionCommand) -> StartSessionResult:
        """Validate the command, persist a new session, and link optional recovery work.

        Returns:
            StartSessionResult with the persisted session, any parsed Superset
            context, and warning findings produced by partial recovery.

        Raises:
            ValueError: on empty source input, an unsupported source kind, or
                an environment id that ConfigManager cannot resolve.
        """
        with belief_scope("DatasetReviewOrchestrator.start_session"):
            # Normalize all user-provided strings before validation so that
            # whitespace-only input is rejected the same way as empty input.
            normalized_source_kind = str(command.source_kind or "").strip()
            normalized_source_input = str(command.source_input or "").strip()
            normalized_environment_id = str(command.environment_id or "").strip()
            if not normalized_source_input:
                logger.explore("Blocked dataset review session start due to empty source input")
                raise ValueError("source_input must be non-empty")
            if normalized_source_kind not in {"superset_link", "dataset_selection"}:
                logger.explore(
                    "Blocked dataset review session start due to unsupported source kind",
                    extra={"source_kind": normalized_source_kind},
                )
                raise ValueError("source_kind must be 'superset_link' or 'dataset_selection'")
            environment = self.config_manager.get_environment(normalized_environment_id)
            if environment is None:
                logger.explore(
                    "Blocked dataset review session start because environment was not found",
                    extra={"environment_id": normalized_environment_id},
                )
                raise ValueError("Environment not found")
            logger.reason(
                "Starting dataset review session",
                extra={
                    "user_id": command.user.id,
                    "environment_id": normalized_environment_id,
                    "source_kind": normalized_source_kind,
                },
            )
            # Defaults describe the "still importing" state; the branches below
            # upgrade them when intake can complete immediately.
            parsed_context: Optional[SupersetParsedContext] = None
            findings: List[ValidationFinding] = []
            dataset_ref = normalized_source_input
            dataset_id: Optional[int] = None
            dashboard_id: Optional[int] = None
            readiness_state = ReadinessState.IMPORTING
            recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
            current_phase = SessionPhase.RECOVERY
            if normalized_source_kind == "superset_link":
                extractor = SupersetContextExtractor(environment)
                parsed_context = extractor.parse_superset_link(normalized_source_input)
                dataset_ref = parsed_context.dataset_ref
                dataset_id = parsed_context.dataset_id
                dashboard_id = parsed_context.dashboard_id
                if parsed_context.partial_recovery:
                    readiness_state = ReadinessState.RECOVERY_REQUIRED
                    # NOTE(review): re-assigns the same value set as the default
                    # above; redundant but kept explicit for this branch.
                    recommended_action = RecommendedAction.REVIEW_DOCUMENTATION
                    findings.extend(self._build_partial_recovery_findings(parsed_context))
                else:
                    readiness_state = ReadinessState.REVIEW_READY
            else:
                # dataset_selection: no Superset parsing needed, session can go
                # straight to the review phase.
                dataset_ref, dataset_id = self._parse_dataset_selection(normalized_source_input)
                readiness_state = ReadinessState.REVIEW_READY
                current_phase = SessionPhase.REVIEW
            session = DatasetReviewSession(
                user_id=command.user.id,
                environment_id=normalized_environment_id,
                source_kind=normalized_source_kind,
                source_input=normalized_source_input,
                dataset_ref=dataset_ref,
                dataset_id=dataset_id,
                dashboard_id=dashboard_id,
                readiness_state=readiness_state,
                recommended_action=recommended_action,
                status=SessionStatus.ACTIVE,
                current_phase=current_phase,
            )
            persisted_session = self.repository.create_session(session)
            # Build and persist the first profile snapshot so detail views and
            # exports work immediately after intake.
            profile = self._build_initial_profile(
                session_id=persisted_session.session_id,
                parsed_context=parsed_context,
                dataset_ref=dataset_ref,
            )
            persisted_session = self.repository.save_profile_and_findings(
                persisted_session.session_id,
                command.user.id,
                profile,
                findings,
            )
            active_task_id = self._enqueue_recovery_task(
                command=command,
                session=persisted_session,
                parsed_context=parsed_context,
            )
            if active_task_id:
                persisted_session.active_task_id = active_task_id
                # NOTE(review): reaches through the repository abstraction to
                # commit directly on its db handle; consider a repository
                # method for this mutation instead.
                self.repository.db.commit()
                self.repository.db.refresh(persisted_session)
                logger.reason(
                    "Linked recovery task to started dataset review session",
                    extra={"session_id": persisted_session.session_id, "task_id": active_task_id},
                )
            logger.reflect(
                "Dataset review session start completed",
                extra={
                    "session_id": persisted_session.session_id,
                    "dataset_ref": persisted_session.dataset_ref,
                    "dataset_id": persisted_session.dataset_id,
                    "dashboard_id": persisted_session.dashboard_id,
                    "readiness_state": persisted_session.readiness_state.value,
                    "active_task_id": persisted_session.active_task_id,
                    "finding_count": len(findings),
                },
            )
            return StartSessionResult(
                session=persisted_session,
                parsed_context=parsed_context,
                findings=findings,
            )
    # [/DEF:DatasetReviewOrchestrator.start_session:Function]
    # [DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Normalize dataset-selection payload into canonical session references.
    # @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
    def _parse_dataset_selection(self, source_input: str) -> tuple[str, Optional[int]]:
        """Normalize a dataset-selection payload into ``(dataset_ref, dataset_id)``.

        Accepted forms: a bare numeric id ("42" -> ("dataset:42", 42)), a
        "dataset:<id>" reference, or any other non-empty string kept verbatim
        with no numeric id.

        Raises:
            ValueError: when the input is empty after stripping.
        """
        normalized = str(source_input or "").strip()
        if not normalized:
            raise ValueError("dataset selection input must be non-empty")
        if normalized.isdigit():
            dataset_id = int(normalized)
            return f"dataset:{dataset_id}", dataset_id
        if normalized.startswith("dataset:"):
            suffix = normalized.split(":", 1)[1].strip()
            if suffix.isdigit():
                return normalized, int(suffix)
            # "dataset:" prefix with a non-numeric suffix: keep the ref, no id.
            return normalized, None
        return normalized, None
    # [/DEF:DatasetReviewOrchestrator._parse_dataset_selection:Function]
    # [DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Create the first profile snapshot so exports and detail views remain usable immediately after intake.
    # @RELATION: [DEPENDS_ON] ->[DatasetProfile]
    def _build_initial_profile(
        self,
        session_id: str,
        parsed_context: Optional[SupersetParsedContext],
        dataset_ref: str,
    ) -> DatasetProfile:
        """Build the initial DatasetProfile for a freshly started session.

        Treats a dotted dataset_ref as "schema.name"; partial recovery lowers
        confidence to MIXED and flags warning findings on the profile.
        """
        # Last dotted segment is the dataset name; assumes "schema.table"-style
        # refs — TODO confirm for multi-dot refs.
        dataset_name = dataset_ref.split(".")[-1] if dataset_ref else "Unresolved dataset"
        business_summary = (
            f"Review session initialized for {dataset_ref}."
            if dataset_ref
            else "Review session initialized with unresolved dataset context."
        )
        confidence_state = (
            ConfidenceState.MIXED
            if parsed_context and parsed_context.partial_recovery
            else ConfidenceState.MOSTLY_CONFIRMED
        )
        return DatasetProfile(
            session_id=session_id,
            dataset_name=dataset_name or "Unresolved dataset",
            schema_name=dataset_ref.split(".")[0] if "." in dataset_ref else None,
            business_summary=business_summary,
            business_summary_source=BusinessSummarySource.IMPORTED,
            description="Initial review profile created from source intake.",
            dataset_type="unknown",
            is_sqllab_view=False,
            # Intake-only snapshot; 0.25 presumably reflects "intake complete,
            # review pending" — TODO confirm the completeness scale.
            completeness_score=0.25,
            confidence_state=confidence_state,
            has_blocking_findings=False,
            has_warning_findings=bool(parsed_context and parsed_context.partial_recovery),
            manual_summary_locked=False,
        )
    # [/DEF:DatasetReviewOrchestrator._build_initial_profile:Function]
    # [DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Project partial Superset intake recovery into explicit findings without blocking session usability.
    # @RELATION: [DEPENDS_ON] ->[ValidationFinding]
    # @PRE: parsed_context.partial_recovery is true.
    # @POST: returns warning-level findings that preserve usable but incomplete state.
    # @SIDE_EFFECT: none beyond structured finding creation.
    # @DATA_CONTRACT: Input[SupersetParsedContext] -> Output[List[ValidationFinding]]
    def _build_partial_recovery_findings(
        self,
        parsed_context: SupersetParsedContext,
    ) -> List[ValidationFinding]:
        """Return one WARNING finding per unresolved Superset reference.

        Findings are warning-severity (never blocking) so the session stays
        usable while the gaps remain visible to the reviewer.
        """
        findings: List[ValidationFinding] = []
        for unresolved_ref in parsed_context.unresolved_references:
            findings.append(
                ValidationFinding(
                    area=FindingArea.SOURCE_INTAKE,
                    severity=FindingSeverity.WARNING,
                    code="PARTIAL_SUPERSET_RECOVERY",
                    title="Superset context recovered partially",
                    message=(
                        "Session remains usable, but some Superset context requires review: "
                        f"{unresolved_ref.replace('_', ' ')}."
                    ),
                    resolution_state=ResolutionState.OPEN,
                    caused_by_ref=unresolved_ref,
                )
            )
        return findings
    # [/DEF:DatasetReviewOrchestrator._build_partial_recovery_findings:Function]
    # [DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Link session start to observable async recovery when task infrastructure is available.
    # @RELATION: [CALLS] ->[create_task]
    # @PRE: session is already persisted.
    # @POST: returns task identifier when a task could be enqueued, otherwise None.
    # @SIDE_EFFECT: may create one background task for progressive recovery.
    # @DATA_CONTRACT: Input[StartSessionCommand,DatasetReviewSession,SupersetParsedContext|None] -> Output[task_id:str|None]
    def _enqueue_recovery_task(
        self,
        command: StartSessionCommand,
        session: DatasetReviewSession,
        parsed_context: Optional[SupersetParsedContext],
    ) -> Optional[str]:
        """Best-effort enqueue of a background recovery task.

        Returns the task id as a string, or None when no task manager is
        configured, the manager lacks ``create_task``, or its signature is
        incompatible. Never raises for those soft-failure cases.
        """
        if self.task_manager is None:
            logger.reason(
                "Dataset review session started without task manager; continuing synchronously",
                extra={"session_id": session.session_id},
            )
            return None
        task_params: Dict[str, Any] = {
            "session_id": session.session_id,
            "user_id": command.user.id,
            "environment_id": session.environment_id,
            "source_kind": session.source_kind,
            "source_input": session.source_input,
            "dataset_ref": session.dataset_ref,
            "dataset_id": session.dataset_id,
            "dashboard_id": session.dashboard_id,
            "partial_recovery": bool(parsed_context and parsed_context.partial_recovery),
        }
        # Duck-typed lookup keeps this working against alternative task
        # manager implementations that lack create_task.
        create_task = getattr(self.task_manager, "create_task", None)
        if create_task is None:
            logger.explore("Task manager has no create_task method; skipping recovery enqueue")
            return None
        try:
            task_object = create_task(
                plugin_id="dataset-review-recovery",
                params=task_params,
            )
        except TypeError:
            # Only TypeError (signature mismatch) is treated as "no task
            # infrastructure"; any other exception propagates to the caller.
            logger.explore(
                "Recovery task enqueue skipped because task manager create_task contract is incompatible",
                extra={"session_id": session.session_id},
            )
            return None
        task_id = getattr(task_object, "id", None)
        return str(task_id) if task_id else None
    # [/DEF:DatasetReviewOrchestrator._enqueue_recovery_task:Function]
# [/DEF:DatasetReviewOrchestrator:Class]
# [/DEF:DatasetReviewOrchestrator:Module]

View File

@@ -8,6 +8,9 @@
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
# @PRE: repository operations execute within authenticated request or task scope.
# @POST: session aggregate reads are structurally consistent and writes preserve ownership and version semantics.
# @SIDE_EFFECT: reads and writes SQLAlchemy-backed session aggregates.
# @DATA_CONTRACT: Input[SessionMutation] -> Output[PersistedSessionAggregate]
# @INVARIANT: answers, mapping approvals, preview artifacts, and launch snapshots are never attributed to the wrong user or session.
from typing import Optional, List
from sqlalchemy import or_
@@ -22,27 +25,51 @@ from src.models.dataset_review import (
)
from src.core.logger import belief_scope
# [DEF:SessionRepo:Class]
# @COMPLEXITY: 4
# @PURPOSE: Enforce ownership-scoped persistence and retrieval for dataset review session aggregates.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [DatasetProfile]
# @RELATION: [DEPENDS_ON] -> [ValidationFinding]
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
# @PRE: constructor receives a live SQLAlchemy session and callers provide authenticated user scope for guarded reads and writes.
# @POST: repository methods return ownership-scoped aggregates or persisted child records without changing domain meaning.
# @SIDE_EFFECT: mutates and queries the persistence layer through the injected database session.
# @DATA_CONTRACT: Input[OwnedSessionQuery|SessionMutation] -> Output[PersistedSessionAggregate|PersistedChildRecord]
class DatasetReviewSessionRepository:
"""
@PURPOSE: Persist and retrieve dataset review session aggregates.
@INVARIANT: ownership_scope -> All operations must respect the session owner's user_id.
"""
    # [DEF:init_repo:Function]
    def __init__(self, db: Session):
        # Live SQLAlchemy session; every repository read/write goes through it.
        self.db = db
    # [/DEF:init_repo:Function]
# [DEF:create_sess:Function]
# @COMPLEXITY: 4
# @PURPOSE: Persist an initial dataset review session shell.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @PRE: session is a new aggregate root bound to the current ownership scope.
# @POST: session is committed, refreshed, and returned with persisted identifiers.
# @SIDE_EFFECT: inserts a session row and commits the active transaction.
# @DATA_CONTRACT: Input[DatasetReviewSession] -> Output[DatasetReviewSession]
def create_session(self, session: DatasetReviewSession) -> DatasetReviewSession:
"""
@PURPOSE: Persist initial session shell.
"""
with belief_scope("DatasetReviewSessionRepository.create_session"):
self.db.add(session)
self.db.commit()
self.db.refresh(session)
return session
# [/DEF:create_sess:Function]
# [DEF:load_detail:Function]
# @COMPLEXITY: 3
# @PURPOSE: Return the full session aggregate for API and frontend resume flows.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [SessionCollaborator]
def load_session_detail(self, session_id: str, user_id: str) -> Optional[DatasetReviewSession]:
"""
@PURPOSE: Return the full session aggregate for API/frontend use.
@PRE: user_id must match session owner or authorized collaborator.
"""
with belief_scope("DatasetReviewSessionRepository.load_session_detail"):
@@ -70,17 +97,25 @@ class DatasetReviewSessionRepository:
)
)\
.first()
# [/DEF:load_detail:Function]
# [DEF:save_prof_find:Function]
# @COMPLEXITY: 4
# @PURPOSE: Persist profile state and replace validation findings for an owned session in one transaction.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [DatasetProfile]
# @RELATION: [DEPENDS_ON] -> [ValidationFinding]
# @PRE: session_id belongs to user_id and the supplied profile/findings belong to the same aggregate scope.
# @POST: stored profile matches the current session and findings are replaced by the supplied collection.
# @SIDE_EFFECT: updates profile rows, deletes stale findings, inserts current findings, and commits the transaction.
# @DATA_CONTRACT: Input[ProfileAndFindingsMutation] -> Output[DatasetReviewSession]
def save_profile_and_findings(self, session_id: str, user_id: str, profile: DatasetProfile, findings: List[ValidationFinding]) -> DatasetReviewSession:
"""
@PURPOSE: Persist profile and validation state together.
"""
with belief_scope("DatasetReviewSessionRepository.save_profile_and_findings"):
session = self.db.query(DatasetReviewSession).filter(
DatasetReviewSession.session_id == session_id,
DatasetReviewSession.user_id == user_id
).first()
if not session:
raise ValueError("Session not found or access denied")
@@ -90,24 +125,31 @@ class DatasetReviewSessionRepository:
if existing_profile:
profile.profile_id = existing_profile.profile_id
self.db.merge(profile)
# Remove old findings for this session to avoid stale data
self.db.query(ValidationFinding).filter(
ValidationFinding.session_id == session_id
).delete()
# Add new findings
for finding in findings:
finding.session_id = session_id
self.db.add(finding)
self.db.commit()
return self.load_session_detail(session_id, user_id)
# [/DEF:save_prof_find:Function]
# [DEF:save_prev:Function]
# @COMPLEXITY: 4
# @PURPOSE: Persist a preview snapshot and mark prior session previews stale.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
# @PRE: session_id belongs to user_id and preview is prepared for the same session aggregate.
# @POST: preview is persisted and the session points to the latest preview identifier.
# @SIDE_EFFECT: updates prior preview statuses, inserts a preview row, mutates the parent session, and commits.
# @DATA_CONTRACT: Input[PreviewMutation] -> Output[CompiledPreview]
def save_preview(self, session_id: str, user_id: str, preview: CompiledPreview) -> CompiledPreview:
"""
@PURPOSE: Persist compiled preview attempt and mark older fingerprints stale.
"""
with belief_scope("DatasetReviewSessionRepository.save_preview"):
session = self.db.query(DatasetReviewSession).filter(
DatasetReviewSession.session_id == session_id,
@@ -125,15 +167,22 @@ class DatasetReviewSessionRepository:
self.db.add(preview)
self.db.flush()
session.last_preview_id = preview.preview_id
self.db.commit()
self.db.refresh(preview)
return preview
# [/DEF:save_prev:Function]
# [DEF:save_run_ctx:Function]
# @COMPLEXITY: 4
# @PURPOSE: Persist an immutable launch audit snapshot for an owned session.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [DatasetRunContext]
# @PRE: session_id belongs to user_id and run_context targets the same aggregate.
# @POST: run context is persisted and linked as the latest launch snapshot for the session.
# @SIDE_EFFECT: inserts a run-context row, mutates the parent session pointer, and commits.
# @DATA_CONTRACT: Input[RunContextMutation] -> Output[DatasetRunContext]
def save_run_context(self, session_id: str, user_id: str, run_context: DatasetRunContext) -> DatasetRunContext:
"""
@PURPOSE: Persist immutable launch audit snapshot.
"""
with belief_scope("DatasetReviewSessionRepository.save_run_context"):
session = self.db.query(DatasetReviewSession).filter(
DatasetReviewSession.session_id == session_id,
@@ -146,18 +195,22 @@ class DatasetReviewSessionRepository:
self.db.add(run_context)
self.db.flush()
session.last_run_context_id = run_context.run_context_id
self.db.commit()
self.db.refresh(run_context)
return run_context
# [/DEF:save_run_ctx:Function]
# [DEF:list_user_sess:Function]
# @COMPLEXITY: 3
# @PURPOSE: List review sessions owned by a specific user ordered by most recent update.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
def list_sessions_for_user(self, user_id: str) -> List[DatasetReviewSession]:
"""
@PURPOSE: List all review sessions owned by a user.
"""
with belief_scope("DatasetReviewSessionRepository.list_sessions_for_user"):
return self.db.query(DatasetReviewSession).filter(
DatasetReviewSession.user_id == user_id
).order_by(DatasetReviewSession.updated_at.desc()).all()
# [/DEF:list_user_sess:Function]
# [/DEF:SessionRepo:Class]
# [/DEF:DatasetReviewSessionRepository:Module]

View File

@@ -0,0 +1,342 @@
# [DEF:SemanticSourceResolver:Module]
# @COMPLEXITY: 4
# @SEMANTICS: dataset_review, semantic_resolution, dictionary, trusted_sources, ranking
# @PURPOSE: Resolve and rank semantic candidates from trusted dictionary-like sources before any inferred fallback.
# @LAYER: Domain
# @RELATION: [DEPENDS_ON] ->[LLMProviderService]
# @RELATION: [DEPENDS_ON] ->[SemanticSource]
# @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
# @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
# @PRE: selected source and target field set must be known.
# @POST: candidate ranking follows the configured confidence hierarchy and unresolved fuzzy matches remain reviewable.
# @SIDE_EFFECT: may create conflict findings and semantic candidate records.
# @INVARIANT: Manual overrides are never silently replaced by imported, inferred, or AI-generated values.
from __future__ import annotations
# [DEF:SemanticSourceResolver.imports:Block]
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Any, Dict, Iterable, List, Mapping, Optional
from src.core.logger import belief_scope, logger
from src.models.dataset_review import (
CandidateMatchType,
CandidateStatus,
FieldProvenance,
)
# [/DEF:SemanticSourceResolver.imports:Block]
# [DEF:DictionaryResolutionResult:Class]
# @COMPLEXITY: 2
# @PURPOSE: Carries field-level dictionary resolution output with explicit review and partial-recovery state.
@dataclass
class DictionaryResolutionResult:
    """Field-level outcome of resolving semantics against one trusted source."""
    source_ref: str  # identifier of the dictionary/reference source that was consulted
    resolved_fields: List[Dict[str, Any]] = field(default_factory=list)  # per-field resolution payloads
    unresolved_fields: List[str] = field(default_factory=list)  # field names with no trusted match
    partial_recovery: bool = False  # True when at least one field stayed unresolved
# [/DEF:DictionaryResolutionResult:Class]
# [DEF:SemanticSourceResolver:Class]
# @COMPLEXITY: 4
# @PURPOSE: Resolve semantic candidates from trusted sources while preserving manual locks and confidence ordering.
# @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
# @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
# @PRE: source payload and target field collection are provided by the caller.
# @POST: result contains confidence-ranked candidates and does not overwrite manual locks implicitly.
# @SIDE_EFFECT: emits semantic trace logs for ranking and fallback decisions.
class SemanticSourceResolver:
    """Resolve and rank semantic field candidates from trusted sources.

    Exact dictionary matches always outrank fuzzy matches, manually locked
    fields are never overwritten, and fields without a trusted match stay
    explicitly unresolved so the review flow can surface them.
    """
    # [DEF:SemanticSourceResolver.resolve_from_file:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Normalize uploaded semantic file records into field-level candidates.
    def resolve_from_file(self, source_payload: Mapping[str, Any], fields: Iterable[Mapping[str, Any]]) -> DictionaryResolutionResult:
        """Placeholder: returns an empty result carrying only the source ref.

        NOTE(review): `fields` is currently ignored; file-based resolution is
        not implemented yet.
        """
        return DictionaryResolutionResult(source_ref=str(source_payload.get("source_ref") or "uploaded_file"))
    # [/DEF:SemanticSourceResolver.resolve_from_file:Function]
    # [DEF:SemanticSourceResolver.resolve_from_dictionary:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Resolve candidates from connected tabular dictionary sources.
    # @RELATION: [DEPENDS_ON] ->[SemanticFieldEntry]
    # @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
    # @PRE: dictionary source exists and fields contain stable field_name values.
    # @POST: returns confidence-ranked candidates where exact dictionary matches outrank fuzzy matches and unresolved fields stay explicit.
    # @SIDE_EFFECT: emits belief-state logs describing trusted-match and partial-recovery outcomes.
    # @DATA_CONTRACT: Input[source_payload:Mapping,fields:Iterable] -> Output[DictionaryResolutionResult]
    def resolve_from_dictionary(
        self,
        source_payload: Mapping[str, Any],
        fields: Iterable[Mapping[str, Any]],
    ) -> DictionaryResolutionResult:
        """Resolve each target field against a tabular dictionary source.

        Args:
            source_payload: must contain "source_ref" (str) and "rows"
                (non-empty list of row mappings).
            fields: target field mappings; each needs "field_name" and may
                carry "is_locked" to preserve manual overrides.

        Returns:
            DictionaryResolutionResult with per-field payloads; fields with no
            exact or fuzzy match are listed as unresolved and flip
            partial_recovery to True.

        Raises:
            ValueError: when source_ref is missing or rows are absent/empty.
        """
        with belief_scope("SemanticSourceResolver.resolve_from_dictionary"):
            source_ref = str(source_payload.get("source_ref") or "").strip()
            dictionary_rows = source_payload.get("rows")
            if not source_ref:
                logger.explore("Dictionary semantic source is missing source_ref")
                raise ValueError("Dictionary semantic source must include source_ref")
            if not isinstance(dictionary_rows, list) or not dictionary_rows:
                logger.explore(
                    "Dictionary semantic source has no usable rows",
                    extra={"source_ref": source_ref},
                )
                raise ValueError("Dictionary semantic source must include non-empty rows")
            logger.reason(
                "Resolving semantics from trusted dictionary source",
                extra={"source_ref": source_ref, "row_count": len(dictionary_rows)},
            )
            normalized_rows = [self._normalize_dictionary_row(row) for row in dictionary_rows if isinstance(row, Mapping)]
            # Index by normalized field_key for O(1) exact-match lookup.
            row_index = {
                row["field_key"]: row
                for row in normalized_rows
                if row.get("field_key")
            }
            resolved_fields: List[Dict[str, Any]] = []
            unresolved_fields: List[str] = []
            for raw_field in fields:
                field_name = str(raw_field.get("field_name") or "").strip()
                if not field_name:
                    continue
                is_locked = bool(raw_field.get("is_locked"))
                if is_locked:
                    # Manual overrides win unconditionally: no candidates are
                    # proposed for locked fields (class invariant).
                    logger.reason(
                        "Preserving manual lock during dictionary resolution",
                        extra={"field_name": field_name},
                    )
                    resolved_fields.append(
                        {
                            "field_name": field_name,
                            "applied_candidate": None,
                            "candidates": [],
                            "provenance": FieldProvenance.MANUAL_OVERRIDE.value,
                            "needs_review": False,
                            "has_conflict": False,
                            "is_locked": True,
                            "status": "preserved_manual",
                        }
                    )
                    continue
                exact_match = row_index.get(self._normalize_key(field_name))
                candidates: List[Dict[str, Any]] = []
                if exact_match is not None:
                    logger.reason(
                        "Resolved exact dictionary match",
                        extra={"field_name": field_name, "source_ref": source_ref},
                    )
                    candidates.append(
                        self._build_candidate_payload(
                            rank=1,
                            match_type=CandidateMatchType.EXACT,
                            confidence_score=1.0,
                            row=exact_match,
                        )
                    )
                else:
                    # Fall back to fuzzy matching only when no exact match exists.
                    fuzzy_matches = self._find_fuzzy_matches(field_name, normalized_rows)
                    for rank_offset, fuzzy_match in enumerate(fuzzy_matches, start=1):
                        candidates.append(
                            self._build_candidate_payload(
                                rank=rank_offset,
                                match_type=CandidateMatchType.FUZZY,
                                confidence_score=float(fuzzy_match["score"]),
                                row=fuzzy_match["row"],
                            )
                        )
                if not candidates:
                    unresolved_fields.append(field_name)
                    resolved_fields.append(
                        {
                            "field_name": field_name,
                            "applied_candidate": None,
                            "candidates": [],
                            "provenance": FieldProvenance.UNRESOLVED.value,
                            "needs_review": True,
                            "has_conflict": False,
                            "is_locked": False,
                            "status": "unresolved",
                        }
                    )
                    logger.explore(
                        "No trusted dictionary match found for field",
                        extra={"field_name": field_name, "source_ref": source_ref},
                    )
                    continue
                ranked_candidates = self.rank_candidates(candidates)
                applied_candidate = ranked_candidates[0]
                # Consistency fix: use the shared conflict predicate instead of
                # an inline length check.
                has_conflict = self.detect_conflicts(ranked_candidates)
                provenance = (
                    FieldProvenance.DICTIONARY_EXACT.value
                    if applied_candidate["match_type"] == CandidateMatchType.EXACT.value
                    else FieldProvenance.FUZZY_INFERRED.value
                )
                # Only exact matches are trusted enough to skip human review.
                needs_review = applied_candidate["match_type"] != CandidateMatchType.EXACT.value
                resolved_fields.append(
                    {
                        "field_name": field_name,
                        "applied_candidate": applied_candidate,
                        "candidates": ranked_candidates,
                        "provenance": provenance,
                        "needs_review": needs_review,
                        "has_conflict": has_conflict,
                        "is_locked": False,
                        "status": "resolved",
                    }
                )
            result = DictionaryResolutionResult(
                source_ref=source_ref,
                resolved_fields=resolved_fields,
                unresolved_fields=unresolved_fields,
                partial_recovery=bool(unresolved_fields),
            )
            logger.reflect(
                "Dictionary resolution completed",
                extra={
                    "source_ref": source_ref,
                    "resolved_fields": len(resolved_fields),
                    "unresolved_fields": len(unresolved_fields),
                    "partial_recovery": result.partial_recovery,
                },
            )
            return result
    # [/DEF:SemanticSourceResolver.resolve_from_dictionary:Function]
    # [DEF:SemanticSourceResolver.resolve_from_reference_dataset:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Reuse semantic metadata from trusted Superset datasets.
    def resolve_from_reference_dataset(
        self,
        source_payload: Mapping[str, Any],
        fields: Iterable[Mapping[str, Any]],
    ) -> DictionaryResolutionResult:
        """Placeholder: returns an empty result carrying only the source ref.

        NOTE(review): `fields` is currently ignored; reference-dataset
        resolution is not implemented yet.
        """
        return DictionaryResolutionResult(source_ref=str(source_payload.get("source_ref") or "reference_dataset"))
    # [/DEF:SemanticSourceResolver.resolve_from_reference_dataset:Function]
    # [DEF:SemanticSourceResolver.rank_candidates:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Apply confidence ordering and determine best candidate per field.
    # @RELATION: [DEPENDS_ON] ->[SemanticCandidate]
    def rank_candidates(self, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Sort candidates by match-type priority, then confidence, then prior rank.

        Side effect: rewrites each candidate's "candidate_rank" in place to its
        1-based position in the returned order.
        """
        ranked = sorted(
            candidates,
            key=lambda candidate: (
                self._match_priority(candidate.get("match_type")),
                -float(candidate.get("confidence_score", 0.0)),
                int(candidate.get("candidate_rank", 999)),
            ),
        )
        for index, candidate in enumerate(ranked, start=1):
            candidate["candidate_rank"] = index
        return ranked
    # [/DEF:SemanticSourceResolver.rank_candidates:Function]
    # [DEF:SemanticSourceResolver.detect_conflicts:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Mark competing candidate sets that require explicit user review.
    def detect_conflicts(self, candidates: List[Dict[str, Any]]) -> bool:
        """Return True when more than one candidate competes for a field."""
        return len(candidates) > 1
    # [/DEF:SemanticSourceResolver.detect_conflicts:Function]
    # [DEF:SemanticSourceResolver.apply_field_decision:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Accept, reject, or manually override a field-level semantic value.
    def apply_field_decision(self, field_state: Mapping[str, Any], decision: Mapping[str, Any]) -> Dict[str, Any]:
        """Return a new dict: field_state overlaid with the decision's keys (decision wins)."""
        merged = dict(field_state)
        merged.update(decision)
        return merged
    # [/DEF:SemanticSourceResolver.apply_field_decision:Function]
    # [DEF:SemanticSourceResolver._normalize_dictionary_row:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Normalize one dictionary row into a consistent lookup structure.
    def _normalize_dictionary_row(self, row: Mapping[str, Any]) -> Dict[str, Any]:
        """Map heterogeneous row keys onto the canonical lookup structure.

        Accepts field_name/column_name/name/field for the identifier and
        verbose_name/label, display_format/format aliases for metadata.
        """
        field_name = (
            row.get("field_name")
            or row.get("column_name")
            or row.get("name")
            or row.get("field")
        )
        normalized_name = str(field_name or "").strip()
        return {
            "field_name": normalized_name,
            "field_key": self._normalize_key(normalized_name),
            "verbose_name": row.get("verbose_name") or row.get("label"),
            "description": row.get("description"),
            "display_format": row.get("display_format") or row.get("format"),
        }
    # [/DEF:SemanticSourceResolver._normalize_dictionary_row:Function]
    # [DEF:SemanticSourceResolver._find_fuzzy_matches:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Produce confidence-scored fuzzy matches while keeping them reviewable.
    def _find_fuzzy_matches(
        self,
        field_name: str,
        rows: List[Dict[str, Any]],
        *,
        min_score: float = 0.72,
        limit: int = 3,
    ) -> List[Dict[str, Any]]:
        """Score rows against field_name with SequenceMatcher.ratio.

        Generalized: the similarity threshold and result cap are keyword
        parameters; defaults (0.72, 3) preserve the previous hard-coded
        behavior.

        Returns:
            Up to `limit` {"row", "score"} dicts with score >= `min_score`,
            sorted by descending score (scores rounded to 3 decimals).
        """
        normalized_target = self._normalize_key(field_name)
        fuzzy_matches: List[Dict[str, Any]] = []
        for row in rows:
            candidate_key = str(row.get("field_key") or "")
            if not candidate_key:
                continue
            score = SequenceMatcher(None, normalized_target, candidate_key).ratio()
            if score < min_score:
                continue
            fuzzy_matches.append({"row": row, "score": round(score, 3)})
        fuzzy_matches.sort(key=lambda item: item["score"], reverse=True)
        return fuzzy_matches[:limit]
    # [/DEF:SemanticSourceResolver._find_fuzzy_matches:Function]
    # [DEF:SemanticSourceResolver._build_candidate_payload:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Project normalized dictionary rows into semantic candidate payloads.
    def _build_candidate_payload(
        self,
        rank: int,
        match_type: CandidateMatchType,
        confidence_score: float,
        row: Mapping[str, Any],
    ) -> Dict[str, Any]:
        """Build a PROPOSED-status candidate payload from a normalized row."""
        return {
            "candidate_rank": rank,
            "match_type": match_type.value,
            "confidence_score": confidence_score,
            "proposed_verbose_name": row.get("verbose_name"),
            "proposed_description": row.get("description"),
            "proposed_display_format": row.get("display_format"),
            "status": CandidateStatus.PROPOSED.value,
        }
    # [/DEF:SemanticSourceResolver._build_candidate_payload:Function]
    # [DEF:SemanticSourceResolver._match_priority:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Encode trusted-confidence ordering so exact dictionary reuse beats fuzzy invention.
    def _match_priority(self, match_type: Optional[str]) -> int:
        """Return sort priority (lower = more trusted); unknown types sink to 99."""
        priority = {
            CandidateMatchType.EXACT.value: 0,
            CandidateMatchType.REFERENCE.value: 1,
            CandidateMatchType.FUZZY.value: 2,
            CandidateMatchType.GENERATED.value: 3,
        }
        return priority.get(str(match_type or ""), 99)
    # [/DEF:SemanticSourceResolver._match_priority:Function]
    # [DEF:SemanticSourceResolver._normalize_key:Function]
    # @COMPLEXITY: 1
    # @PURPOSE: Normalize field identifiers for stable exact/fuzzy comparisons.
    def _normalize_key(self, value: str) -> str:
        """Lowercase, strip, and keep only alphanumerics and underscores."""
        return "".join(ch for ch in str(value or "").strip().lower() if ch.isalnum() or ch == "_")
    # [/DEF:SemanticSourceResolver._normalize_key:Function]
# [/DEF:SemanticSourceResolver:Class]
# [/DEF:SemanticSourceResolver:Module]