feat: initial dataset review orchestration flow implementation

This commit is contained in:
2026-03-16 23:43:03 +03:00
parent 9cae07a3b4
commit f4416c3ebb
9 changed files with 1565 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
# [DEF:backend.src.services.dataset_review:Module]
#
# @SEMANTICS: dataset, review, orchestration
# @PURPOSE: Provides services for dataset-centered orchestration flow.
# @LAYER: Services
#
# [/DEF:backend.src.services.dataset_review:Module]

View File

@@ -0,0 +1,171 @@
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from src.models.mapping import Base, Environment
from src.models.auth import User
from src.models.dataset_review import (
DatasetReviewSession,
DatasetProfile,
ValidationFinding,
CompiledPreview,
DatasetRunContext,
BusinessSummarySource,
ConfidenceState,
FindingArea,
FindingSeverity,
ReadinessState,
RecommendedAction
)
from src.services.dataset_review.repositories.session_repository import DatasetReviewSessionRepository
# [DEF:SessionRepositoryTests:Module]
# @COMPLEXITY: 3
# @PURPOSE: Unit tests for DatasetReviewSessionRepository.
# @RELATION: TESTS -> [DatasetReviewSessionRepository]
@pytest.fixture
def db_session():
    # [DEF:db_session:Function]
    # @COMPLEXITY: 1
    # @RELATION: BINDS_TO -> [SessionRepositoryTests]
    # Build an in-memory SQLite database with the full schema, seed the
    # baseline user/environment rows every test relies on, then hand the
    # open session to the test and close it on teardown.
    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)
    factory = sessionmaker(bind=engine)
    db = factory()
    seed_user = User(id="user1", username="testuser", email="test@example.com", password_hash="pw")
    seed_env = Environment(id="env1", name="Prod", url="http://superset", credentials_id="cred1")
    db.add_all([seed_user, seed_env])
    db.commit()
    yield db
    db.close()
def test_create_session(db_session):
    # @PURPOSE: Verify session creation and persistence.
    repo = DatasetReviewSessionRepository(db_session)
    new_session = DatasetReviewSession(
        user_id="user1",
        environment_id="env1",
        source_kind="superset_link",
        source_input="http://link",
        dataset_ref="dataset1"
    )
    repo.create_session(new_session)
    # A primary key must have been assigned during the commit/refresh cycle.
    assert new_session.session_id is not None
    persisted = (
        db_session.query(DatasetReviewSession)
        .filter_by(session_id=new_session.session_id)
        .first()
    )
    assert persisted.user_id == "user1"
def test_load_session_detail_ownership(db_session):
    # @PURPOSE: Verify ownership enforcement in detail loading.
    repo = DatasetReviewSessionRepository(db_session)
    owned = DatasetReviewSession(
        user_id="user1", environment_id="env1", source_kind="superset_link",
        source_input="http://link", dataset_ref="dataset1"
    )
    repo.create_session(owned)
    # The owner can read the detail view.
    assert repo.load_session_detail(owned.session_id, "user1") is not None
    # Any other user gets nothing back instead of someone else's session.
    assert repo.load_session_detail(owned.session_id, "wrong_user") is None
def test_save_preview_marks_stale(db_session):
    # @PURPOSE: Verify that saving a new preview marks old ones as stale.
    repo = DatasetReviewSessionRepository(db_session)
    review = DatasetReviewSession(
        user_id="user1", environment_id="env1", source_kind="superset_link",
        source_input="http://link", dataset_ref="dataset1"
    )
    repo.create_session(review)
    first = CompiledPreview(session_id=review.session_id, preview_status="ready", preview_fingerprint="f1")
    repo.save_preview(review.session_id, "user1", first)
    second = CompiledPreview(session_id=review.session_id, preview_status="ready", preview_fingerprint="f2")
    repo.save_preview(review.session_id, "user1", second)
    # Re-read the first preview from the database to observe the bulk update.
    db_session.refresh(first)
    # Only the newest preview stays "ready"; earlier ones are demoted.
    assert first.preview_status == "stale"
    assert second.preview_status == "ready"
    assert review.last_preview_id == second.preview_id
def test_save_profile_and_findings(db_session):
    # @PURPOSE: Verify persistence of profile and findings.
    repo = DatasetReviewSessionRepository(db_session)
    review = DatasetReviewSession(
        user_id="user1", environment_id="env1", source_kind="superset_link",
        source_input="http://link", dataset_ref="dataset1"
    )
    repo.create_session(review)
    new_profile = DatasetProfile(
        session_id=review.session_id,
        dataset_name="Test DS",
        business_summary="Summary",
        business_summary_source=BusinessSummarySource.INFERRED,
        confidence_state=ConfidenceState.UNRESOLVED
    )
    blocking_finding = ValidationFinding(
        session_id=review.session_id,
        area=FindingArea.SOURCE_INTAKE,
        severity=FindingSeverity.BLOCKING,
        code="ERR1",
        title="Error",
        message="Failure"
    )
    repo.save_profile_and_findings(review.session_id, "user1", new_profile, [blocking_finding])
    reloaded = repo.load_session_detail(review.session_id, "user1")
    # Both halves of the aggregate came back attached to the session.
    assert reloaded.profile.dataset_name == "Test DS"
    assert len(reloaded.findings) == 1
    assert reloaded.findings[0].code == "ERR1"
def test_save_run_context(db_session):
    # @PURPOSE: Verify saving of run context.
    repo = DatasetReviewSessionRepository(db_session)
    review = DatasetReviewSession(
        user_id="user1", environment_id="env1", source_kind="superset_link",
        source_input="http://link", dataset_ref="dataset1"
    )
    repo.create_session(review)
    context = DatasetRunContext(
        session_id=review.session_id,
        dataset_ref="ds1",
        environment_id="env1",
        preview_id="p1",
        sql_lab_session_ref="s1",
        effective_filters={},
        template_params={},
        approved_mapping_ids=[],
        semantic_decision_refs=[],
        open_warning_refs=[],
        launch_status="success"
    )
    repo.save_run_context(review.session_id, "user1", context)
    # The session now points at the newly persisted launch snapshot.
    assert review.last_run_context_id == context.run_context_id
def test_list_sessions_for_user(db_session):
    # @PURPOSE: Verify listing of sessions by user.
    repo = DatasetReviewSessionRepository(db_session)
    mine = [
        DatasetReviewSession(user_id="user1", environment_id="env1", source_kind="k", source_input="i", dataset_ref=ref)
        for ref in ("r1", "r2")
    ]
    theirs = DatasetReviewSession(user_id="other", environment_id="env1", source_kind="k", source_input="i", dataset_ref="r3")
    db_session.add_all([*mine, theirs])
    db_session.commit()
    listed = repo.list_sessions_for_user("user1")
    # Only the requesting user's two sessions are returned.
    assert len(listed) == 2
    assert all(item.user_id == "user1" for item in listed)
# [/DEF:SessionRepositoryTests:Module]

View File

@@ -0,0 +1,146 @@
# [DEF:DatasetReviewSessionRepository:Module]
# @COMPLEXITY: 5
# @PURPOSE: Persist and retrieve dataset review session aggregates, including readiness, findings, semantic decisions, clarification state, previews, and run contexts.
# @LAYER: Domain
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @RELATION: [DEPENDS_ON] -> [DatasetProfile]
# @RELATION: [DEPENDS_ON] -> [ValidationFinding]
# @RELATION: [DEPENDS_ON] -> [CompiledPreview]
# @PRE: repository operations execute within authenticated request or task scope.
# @POST: session aggregate reads are structurally consistent and writes preserve ownership and version semantics.
from typing import Optional, List

from sqlalchemy.orm import Session, joinedload, selectinload

from src.core.logger import belief_scope
from src.models.dataset_review import (
    DatasetReviewSession,
    DatasetProfile,
    ValidationFinding,
    CompiledPreview,
    DatasetRunContext
)
class DatasetReviewSessionRepository:
    """
    @PURPOSE: Persist and retrieve dataset review session aggregates.
    @INVARIANT: ownership_scope -> All operations must respect the session owner's user_id.
    """

    def __init__(self, db: Session):
        # @PURPOSE: Bind the repository to an externally managed SQLAlchemy session.
        self.db = db

    def _get_owned_session(self, session_id: str, user_id: str) -> DatasetReviewSession:
        """
        @PURPOSE: Load a session while enforcing the ownership_scope invariant.
        @RAISES: ValueError when the session does not exist or belongs to another user.
        """
        session = self.db.query(DatasetReviewSession).filter(
            DatasetReviewSession.session_id == session_id,
            DatasetReviewSession.user_id == user_id
        ).first()
        if not session:
            # One message for both "missing" and "not yours" so callers cannot
            # probe for the existence of other users' sessions.
            raise ValueError("Session not found or access denied")
        return session

    def create_session(self, session: DatasetReviewSession) -> DatasetReviewSession:
        """
        @PURPOSE: Persist initial session shell.
        @POST: session has a database identity (refreshed after commit).
        """
        with belief_scope("DatasetReviewSessionRepository.create_session"):
            self.db.add(session)
            self.db.commit()
            self.db.refresh(session)
            return session

    def load_session_detail(self, session_id: str, user_id: str) -> Optional[DatasetReviewSession]:
        """
        @PURPOSE: Return the full session aggregate for API/frontend use.
        @PRE: user_id must match session owner or authorized collaborator.
        @POST: returns None (rather than raising) when the session is missing
               or owned by someone else.
        """
        with belief_scope("DatasetReviewSessionRepository.load_session_detail"):
            # Note: We check user_id to enforce the ownership_scope invariant.
            # Eager-loading note: stacking joinedload() on many one-to-many
            # collections JOINs them all into one statement, multiplying the
            # row count (cartesian product). selectinload() fetches each
            # collection with its own IN-query instead, yielding the same
            # object graph without the blow-up. `profile` appears to be a
            # to-one relationship (singular name) so a JOIN stays cheapest
            # for it -- TODO confirm against the model definitions.
            return self.db.query(DatasetReviewSession)\
                .options(
                    joinedload(DatasetReviewSession.profile),
                    selectinload(DatasetReviewSession.findings),
                    selectinload(DatasetReviewSession.collaborators),
                    selectinload(DatasetReviewSession.semantic_sources),
                    selectinload(DatasetReviewSession.semantic_fields),
                    selectinload(DatasetReviewSession.imported_filters),
                    selectinload(DatasetReviewSession.template_variables),
                    selectinload(DatasetReviewSession.execution_mappings),
                    selectinload(DatasetReviewSession.clarification_sessions),
                    selectinload(DatasetReviewSession.previews),
                    selectinload(DatasetReviewSession.run_contexts)
                )\
                .filter(DatasetReviewSession.session_id == session_id)\
                .filter(DatasetReviewSession.user_id == user_id)\
                .first()

    def save_profile_and_findings(self, session_id: str, user_id: str, profile: DatasetProfile, findings: List[ValidationFinding]) -> DatasetReviewSession:
        """
        @PURPOSE: Persist profile and validation state together.
        @RAISES: ValueError when the session is missing or not owned by user_id.
        """
        with belief_scope("DatasetReviewSessionRepository.save_profile_and_findings"):
            self._get_owned_session(session_id, user_id)
            if profile:
                self.db.merge(profile)
            # For findings, we might want to sync them (remove old ones if not
            # in new list, or update). Simplest for now: add/merge findings.
            for finding in findings:
                self.db.merge(finding)
            self.db.commit()
            return self.load_session_detail(session_id, user_id)

    def save_preview(self, session_id: str, user_id: str, preview: CompiledPreview) -> CompiledPreview:
        """
        @PURPOSE: Persist compiled preview attempt and mark older fingerprints stale.
        @RAISES: ValueError when the session is missing or not owned by user_id.
        @POST: session.last_preview_id points at the new preview.
        """
        with belief_scope("DatasetReviewSessionRepository.save_preview"):
            session = self._get_owned_session(session_id, user_id)
            # Demote every existing preview for this session; the new one is
            # added only afterwards, so it is untouched by this bulk update.
            self.db.query(CompiledPreview).filter(
                CompiledPreview.session_id == session_id
            ).update({"preview_status": "stale"})
            self.db.add(preview)
            self.db.flush()  # assigns preview.preview_id before we reference it
            session.last_preview_id = preview.preview_id
            self.db.commit()
            self.db.refresh(preview)
            return preview

    def save_run_context(self, session_id: str, user_id: str, run_context: DatasetRunContext) -> DatasetRunContext:
        """
        @PURPOSE: Persist immutable launch audit snapshot.
        @RAISES: ValueError when the session is missing or not owned by user_id.
        @POST: session.last_run_context_id points at the new run context.
        """
        with belief_scope("DatasetReviewSessionRepository.save_run_context"):
            session = self._get_owned_session(session_id, user_id)
            self.db.add(run_context)
            self.db.flush()  # assigns run_context.run_context_id
            session.last_run_context_id = run_context.run_context_id
            self.db.commit()
            self.db.refresh(run_context)
            return run_context

    def list_sessions_for_user(self, user_id: str) -> List[DatasetReviewSession]:
        """
        @PURPOSE: List all review sessions owned by a user, most recently updated first.
        """
        with belief_scope("DatasetReviewSessionRepository.list_sessions_for_user"):
            return self.db.query(DatasetReviewSession).filter(
                DatasetReviewSession.user_id == user_id
            ).order_by(DatasetReviewSession.updated_at.desc()).all()
# [/DEF:DatasetReviewSessionRepository:Module]