feat(us1): add dataset review orchestration automatic review slice

This commit is contained in:
2026-03-17 10:57:49 +03:00
parent e916cb1f17
commit 023bacde39
24 changed files with 4870 additions and 131 deletions

View File

@@ -0,0 +1,533 @@
# [DEF:DatasetReviewApi:Module]
# @COMPLEXITY: 4
# @SEMANTICS: dataset_review, api, session_lifecycle, exports, rbac, feature_flags
# @PURPOSE: Expose dataset review session lifecycle and export endpoints for backend US1.
# @LAYER: API
# @RELATION: [DEPENDS_ON] ->[AppDependencies]
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
# @RELATION: [DEPENDS_ON] ->[DatasetReviewOrchestrator]
# @PRE: Authenticated user and valid environment/session scope are required for all mutations and reads.
# @POST: Returns ownership-scoped session state and export payloads with feature-flag/RBAC enforcement.
# @SIDE_EFFECT: Persists session state and may enqueue recovery task.
# @DATA_CONTRACT: Input[HTTP Request] -> Output[SessionSummary | SessionDetail | ExportArtifactResponse | HTTP 204]
# @INVARIANT: No cross-user session leakage is allowed; export payloads only expose the current user's accessible session.
from __future__ import annotations
# [DEF:DatasetReviewApi.imports:Block]
import json
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, Depends, HTTPException, Query, Response, status
from pydantic import BaseModel, Field
from sqlalchemy.orm import Session
from src.core.database import get_db
from src.core.logger import belief_scope, logger
from src.dependencies import get_config_manager, get_current_user, get_task_manager, has_permission
from src.models.auth import User
from src.models.dataset_review import (
ArtifactFormat,
DatasetReviewSession,
RecommendedAction,
SessionStatus,
)
from src.schemas.dataset_review import SessionDetail, SessionSummary
from src.services.dataset_review.orchestrator import (
DatasetReviewOrchestrator,
StartSessionCommand,
)
from src.services.dataset_review.repositories.session_repository import (
DatasetReviewSessionRepository,
)
# [/DEF:DatasetReviewApi.imports:Block]
router = APIRouter(prefix="/api/dataset-orchestration", tags=["Dataset Orchestration"])
# [DEF:StartSessionRequest:Class]
# @COMPLEXITY: 2
# @PURPOSE: Request DTO for starting one dataset review session from a Superset link or dataset selection.
class StartSessionRequest(BaseModel):
source_kind: str = Field(..., pattern="^(superset_link|dataset_selection)$")
source_input: str = Field(..., min_length=1)
environment_id: str = Field(..., min_length=1)
# [/DEF:StartSessionRequest:Class]
# [DEF:UpdateSessionRequest:Class]
# @COMPLEXITY: 2
# @PURPOSE: Request DTO for lifecycle state updates on an existing session.
class UpdateSessionRequest(BaseModel):
status: SessionStatus
note: Optional[str] = None
# [/DEF:UpdateSessionRequest:Class]
# [DEF:SessionCollectionResponse:Class]
# @COMPLEXITY: 2
# @PURPOSE: Paginated ownership-scoped dataset review session collection response.
class SessionCollectionResponse(BaseModel):
items: List[SessionSummary]
total: int
page: int
page_size: int
has_next: bool
# [/DEF:SessionCollectionResponse:Class]
# [DEF:ExportArtifactResponse:Class]
# @COMPLEXITY: 2
# @PURPOSE: Inline export response for documentation or validation outputs without introducing unrelated persistence changes.
class ExportArtifactResponse(BaseModel):
artifact_id: str
session_id: str
artifact_type: str
format: str
storage_ref: str
created_by_user_id: str
created_at: Optional[str] = None
content: Dict[str, Any]
# [/DEF:ExportArtifactResponse:Class]
# [DEF:_require_auto_review_flag:Function]
# @COMPLEXITY: 3
# @PURPOSE: Guard US1 dataset review endpoints behind the configured feature flag.
# @RELATION: [DEPENDS_ON] ->[ConfigManager]
def _require_auto_review_flag(config_manager=Depends(get_config_manager)) -> bool:
with belief_scope("dataset_review.require_auto_review_flag"):
if not config_manager.get_config().settings.ff_dataset_auto_review:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Dataset auto review feature is disabled",
)
return True
# [/DEF:_require_auto_review_flag:Function]
# [DEF:_get_repository:Function]
# @COMPLEXITY: 2
# @PURPOSE: Build repository dependency for dataset review session aggregate access.
def _get_repository(db: Session = Depends(get_db)) -> DatasetReviewSessionRepository:
return DatasetReviewSessionRepository(db)
# [/DEF:_get_repository:Function]
# [DEF:_get_orchestrator:Function]
# @COMPLEXITY: 3
# @PURPOSE: Build orchestrator dependency for session lifecycle actions.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewOrchestrator]
def _get_orchestrator(
repository: DatasetReviewSessionRepository = Depends(_get_repository),
config_manager=Depends(get_config_manager),
task_manager=Depends(get_task_manager),
) -> DatasetReviewOrchestrator:
return DatasetReviewOrchestrator(
repository=repository,
config_manager=config_manager,
task_manager=task_manager,
)
# [/DEF:_get_orchestrator:Function]
# [DEF:_serialize_session_summary:Function]
# @COMPLEXITY: 3
# @PURPOSE: Map SQLAlchemy session aggregate root into stable API summary DTO.
# @RELATION: [DEPENDS_ON] ->[SessionSummary]
def _serialize_session_summary(session: DatasetReviewSession) -> SessionSummary:
return SessionSummary.model_validate(session, from_attributes=True)
# [/DEF:_serialize_session_summary:Function]
# [DEF:_serialize_session_detail:Function]
# @COMPLEXITY: 3
# @PURPOSE: Map SQLAlchemy session aggregate root into stable API detail DTO.
# @RELATION: [DEPENDS_ON] ->[SessionDetail]
def _serialize_session_detail(session: DatasetReviewSession) -> SessionDetail:
return SessionDetail.model_validate(session, from_attributes=True)
# [/DEF:_serialize_session_detail:Function]
# [DEF:_get_owned_session_or_404:Function]
# @COMPLEXITY: 4
# @PURPOSE: Resolve one session for current user or collaborator scope, returning 404 when inaccessible.
# @RELATION: [CALLS] ->[load_detail]
# @PRE: session_id is a non-empty identifier and current_user is authenticated.
# @POST: returns accessible session detail or raises HTTP 404 without leaking foreign-session existence.
# @SIDE_EFFECT: none.
# @DATA_CONTRACT: Input[session_id:str,current_user:User] -> Output[DatasetReviewSession|HTTPException]
def _get_owned_session_or_404(
repository: DatasetReviewSessionRepository,
session_id: str,
current_user: User,
) -> DatasetReviewSession:
with belief_scope("dataset_review.get_owned_session_or_404"):
session = repository.load_session_detail(session_id, current_user.id)
if session is None:
logger.explore(
"Dataset review session not found in current ownership scope",
extra={"session_id": session_id, "user_id": current_user.id},
)
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Session not found")
return session
# [/DEF:_get_owned_session_or_404:Function]
# [DEF:_build_documentation_export:Function]
# @COMPLEXITY: 3
# @PURPOSE: Produce session documentation export content from current persisted review state.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
def _build_documentation_export(session: DatasetReviewSession, export_format: ArtifactFormat) -> Dict[str, Any]:
profile = session.profile
findings = sorted(session.findings, key=lambda item: (item.severity.value, item.code))
if export_format == ArtifactFormat.MARKDOWN:
lines = [
f"# Dataset Review: {session.dataset_ref}",
"",
f"- Session ID: {session.session_id}",
f"- Environment: {session.environment_id}",
f"- Readiness: {session.readiness_state.value}",
f"- Recommended action: {session.recommended_action.value}",
"",
"## Business Summary",
profile.business_summary if profile else "No profile summary available.",
"",
"## Findings",
]
if findings:
for finding in findings:
lines.append(
f"- [{finding.severity.value}] {finding.title}: {finding.message}"
)
else:
lines.append("- No findings recorded.")
content = {"markdown": "\n".join(lines)}
storage_ref = f"inline://dataset-review/{session.session_id}/documentation.md"
else:
content = {
"session": _serialize_session_summary(session).model_dump(mode="json"),
"profile": profile and {
"dataset_name": profile.dataset_name,
"business_summary": profile.business_summary,
"confidence_state": profile.confidence_state.value,
"dataset_type": profile.dataset_type,
},
"findings": [
{
"code": finding.code,
"severity": finding.severity.value,
"title": finding.title,
"message": finding.message,
"resolution_state": finding.resolution_state.value,
}
for finding in findings
],
}
storage_ref = f"inline://dataset-review/{session.session_id}/documentation.json"
return {"storage_ref": storage_ref, "content": content}
# [/DEF:_build_documentation_export:Function]
# [DEF:_build_validation_export:Function]
# @COMPLEXITY: 3
# @PURPOSE: Produce validation-focused export content from persisted findings and readiness state.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
def _build_validation_export(session: DatasetReviewSession, export_format: ArtifactFormat) -> Dict[str, Any]:
findings = sorted(session.findings, key=lambda item: (item.severity.value, item.code))
if export_format == ArtifactFormat.MARKDOWN:
lines = [
f"# Validation Report: {session.dataset_ref}",
"",
f"- Session ID: {session.session_id}",
f"- Readiness: {session.readiness_state.value}",
"",
"## Findings",
]
if findings:
for finding in findings:
lines.append(
f"- `{finding.code}` [{finding.severity.value}] {finding.message}"
)
else:
lines.append("- No findings recorded.")
content = {"markdown": "\n".join(lines)}
storage_ref = f"inline://dataset-review/{session.session_id}/validation.md"
else:
content = {
"session_id": session.session_id,
"dataset_ref": session.dataset_ref,
"readiness_state": session.readiness_state.value,
"findings": [
{
"finding_id": finding.finding_id,
"area": finding.area.value,
"severity": finding.severity.value,
"code": finding.code,
"title": finding.title,
"message": finding.message,
"resolution_state": finding.resolution_state.value,
}
for finding in findings
],
}
storage_ref = f"inline://dataset-review/{session.session_id}/validation.json"
return {"storage_ref": storage_ref, "content": content}
# [/DEF:_build_validation_export:Function]
# [DEF:list_sessions:Function]
# @COMPLEXITY: 3
# @PURPOSE: List resumable dataset review sessions for the current user.
# @RELATION: [CALLS] ->[list_user_sess]
@router.get(
"/sessions",
response_model=SessionCollectionResponse,
dependencies=[
Depends(_require_auto_review_flag),
Depends(has_permission("dataset:session", "READ")),
],
)
async def list_sessions(
page: int = Query(1, ge=1),
page_size: int = Query(20, ge=1, le=100),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.list_sessions"):
sessions = repository.list_sessions_for_user(current_user.id)
start = (page - 1) * page_size
end = start + page_size
items = [_serialize_session_summary(session) for session in sessions[start:end]]
return SessionCollectionResponse(
items=items,
total=len(sessions),
page=page,
page_size=page_size,
has_next=end < len(sessions),
)
# [/DEF:list_sessions:Function]
# [DEF:start_session:Function]
# @COMPLEXITY: 4
# @PURPOSE: Start a new dataset review session from a Superset link or dataset selection.
# @RELATION: [CALLS] ->[DatasetReviewOrchestrator.start_session]
# @PRE: feature flag enabled, user authenticated, and request body valid.
# @POST: returns persisted session summary scoped to the authenticated user.
# @SIDE_EFFECT: persists session/profile/findings and may enqueue recovery task.
# @DATA_CONTRACT: Input[StartSessionRequest] -> Output[SessionSummary]
@router.post(
"/sessions",
response_model=SessionSummary,
status_code=status.HTTP_201_CREATED,
dependencies=[
Depends(_require_auto_review_flag),
Depends(has_permission("dataset:session", "MANAGE")),
],
)
async def start_session(
request: StartSessionRequest,
orchestrator: DatasetReviewOrchestrator = Depends(_get_orchestrator),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.start_session"):
try:
result = orchestrator.start_session(
StartSessionCommand(
user=current_user,
environment_id=request.environment_id,
source_kind=request.source_kind,
source_input=request.source_input,
)
)
except ValueError as exc:
logger.explore(
"Dataset review session start rejected",
extra={"user_id": current_user.id, "error": str(exc)},
)
detail = str(exc)
status_code = status.HTTP_404_NOT_FOUND if detail == "Environment not found" else status.HTTP_400_BAD_REQUEST
raise HTTPException(status_code=status_code, detail=detail) from exc
return _serialize_session_summary(result.session)
# [/DEF:start_session:Function]
# [DEF:get_session_detail:Function]
# @COMPLEXITY: 3
# @PURPOSE: Return the full accessible dataset review session aggregate for current user scope.
# @RELATION: [CALLS] ->[_get_owned_session_or_404]
@router.get(
"/sessions/{session_id}",
response_model=SessionDetail,
dependencies=[
Depends(_require_auto_review_flag),
Depends(has_permission("dataset:session", "READ")),
],
)
async def get_session_detail(
session_id: str,
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.get_session_detail"):
session = _get_owned_session_or_404(repository, session_id, current_user)
return _serialize_session_detail(session)
# [/DEF:get_session_detail:Function]
# [DEF:update_session:Function]
# @COMPLEXITY: 4
# @PURPOSE: Update resumable lifecycle status for an owned dataset review session.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
# @PRE: session is accessible to current user and requested status is allowed by lifecycle policy.
# @POST: returns updated summary without changing ownership or unrelated aggregates.
# @SIDE_EFFECT: mutates session lifecycle fields in persistence.
# @DATA_CONTRACT: Input[UpdateSessionRequest] -> Output[SessionSummary]
@router.patch(
"/sessions/{session_id}",
response_model=SessionSummary,
dependencies=[
Depends(_require_auto_review_flag),
Depends(has_permission("dataset:session", "MANAGE")),
],
)
async def update_session(
session_id: str,
request: UpdateSessionRequest,
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.update_session"):
session = _get_owned_session_or_404(repository, session_id, current_user)
if session.user_id != current_user.id:
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Only the owner can mutate session lifecycle")
session.status = request.status
if request.status == SessionStatus.PAUSED:
session.recommended_action = RecommendedAction.RESUME_SESSION
elif request.status in {SessionStatus.ARCHIVED, SessionStatus.CANCELLED, SessionStatus.COMPLETED}:
session.active_task_id = None
repository.db.commit()
repository.db.refresh(session)
return _serialize_session_summary(session)
# [/DEF:update_session:Function]
# [DEF:delete_session:Function]
# @COMPLEXITY: 4
# @PURPOSE: Archive or hard-delete a session owned by the current user.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
# @PRE: session is owner-scoped to current user.
# @POST: session is archived or deleted and no foreign-session existence is disclosed.
# @SIDE_EFFECT: mutates or deletes persisted session aggregate.
# @DATA_CONTRACT: Input[session_id:str,hard_delete:bool] -> Output[HTTP 204]
@router.delete(
"/sessions/{session_id}",
status_code=status.HTTP_204_NO_CONTENT,
dependencies=[
Depends(_require_auto_review_flag),
Depends(has_permission("dataset:session", "MANAGE")),
],
)
async def delete_session(
session_id: str,
hard_delete: bool = Query(False),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.delete_session"):
session = _get_owned_session_or_404(repository, session_id, current_user)
if session.user_id != current_user.id:
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Only the owner can delete a session")
if hard_delete:
repository.db.delete(session)
else:
session.status = SessionStatus.ARCHIVED
session.active_task_id = None
repository.db.commit()
return Response(status_code=status.HTTP_204_NO_CONTENT)
# [/DEF:delete_session:Function]
# [DEF:export_documentation:Function]
# @COMPLEXITY: 4
# @PURPOSE: Export documentation output for the current session in JSON or Markdown form.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
# @PRE: session is accessible to current user and requested format is supported.
# @POST: returns ownership-scoped export payload without fabricating unrelated artifacts.
# @SIDE_EFFECT: none beyond response construction.
# @DATA_CONTRACT: Input[session_id:str,format:ArtifactFormat] -> Output[ExportArtifactResponse]
@router.get(
"/sessions/{session_id}/exports/documentation",
response_model=ExportArtifactResponse,
dependencies=[
Depends(_require_auto_review_flag),
Depends(has_permission("dataset:session", "READ")),
],
)
async def export_documentation(
session_id: str,
format: ArtifactFormat = Query(ArtifactFormat.JSON),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.export_documentation"):
if format not in {ArtifactFormat.JSON, ArtifactFormat.MARKDOWN}:
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only json and markdown exports are supported")
session = _get_owned_session_or_404(repository, session_id, current_user)
export_payload = _build_documentation_export(session, format)
return ExportArtifactResponse(
artifact_id=f"documentation-{session.session_id}-{format.value}",
session_id=session.session_id,
artifact_type="documentation",
format=format.value,
storage_ref=export_payload["storage_ref"],
created_by_user_id=current_user.id,
content=export_payload["content"],
)
# [/DEF:export_documentation:Function]
# [DEF:export_validation:Function]
# @COMPLEXITY: 4
# @PURPOSE: Export validation findings for the current session in JSON or Markdown form.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
# @PRE: session is accessible to current user and requested format is supported.
# @POST: returns explicit validation export payload scoped to current user session access.
# @SIDE_EFFECT: none beyond response construction.
# @DATA_CONTRACT: Input[session_id:str,format:ArtifactFormat] -> Output[ExportArtifactResponse]
@router.get(
"/sessions/{session_id}/exports/validation",
response_model=ExportArtifactResponse,
dependencies=[
Depends(_require_auto_review_flag),
Depends(has_permission("dataset:session", "READ")),
],
)
async def export_validation(
session_id: str,
format: ArtifactFormat = Query(ArtifactFormat.JSON),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.export_validation"):
if format not in {ArtifactFormat.JSON, ArtifactFormat.MARKDOWN}:
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only json and markdown exports are supported")
session = _get_owned_session_or_404(repository, session_id, current_user)
export_payload = _build_validation_export(session, format)
return ExportArtifactResponse(
artifact_id=f"validation-{session.session_id}-{format.value}",
session_id=session.session_id,
artifact_type="validation_report",
format=format.value,
storage_ref=export_payload["storage_ref"],
created_by_user_id=current_user.id,
content=export_payload["content"],
)
# [/DEF:export_validation:Function]
# [/DEF:DatasetReviewApi:Module]