feat(ui): add chat-driven dataset review flow

Move dataset review clarification into the assistant workspace and
rework the review page into a chat-centric layout with execution rails.

Add session-scoped assistant actions for mappings, semantic fields,
and SQL preview generation. Introduce optimistic locking for dataset
review mutations, propagate session versions through API responses,
and mask imported filter values before assistant exposure.

Refresh tests, i18n, and spec artifacts to match the new workflow.

BREAKING CHANGE: dataset review mutation endpoints now require the
X-Session-Version header, and clarification is no longer handled
through ClarificationDialog-based flows.
This commit is contained in:
2026-03-26 13:33:12 +03:00
parent d7911fb2f1
commit 7c85552132
74 changed files with 6122 additions and 2970 deletions

View File

@@ -21,6 +21,20 @@ from pydantic import BaseModel
from src.api.routes import assistant as assistant_routes
from src.schemas.auth import User
from src.models.assistant import AssistantMessageRecord
from src.models.dataset_review import (
ApprovalState,
CandidateStatus,
DatasetReviewSession,
ExecutionMapping,
ImportedFilter,
MappingMethod,
SemanticCandidate,
SemanticFieldEntry,
ReadinessState,
RecommendedAction,
SessionPhase,
SessionStatus,
)
# [DEF:_run_async:Function]
@@ -167,6 +181,12 @@ class _FakeQuery:
def __init__(self, items):
self.items = items
def outerjoin(self, *args, **kwargs):
return self
def options(self, *args, **kwargs):
return self
def filter(self, *args, **kwargs):
# @INVARIANT: filter() is predicate-blind; returns all records regardless of user_id scope
return self
@@ -203,10 +223,13 @@ class _FakeQuery:
class _FakeDb:
def __init__(self):
self.added = []
self.dataset_sessions = {}
def query(self, model):
if model == AssistantMessageRecord:
return _FakeQuery([])
if model == DatasetReviewSession:
return _FakeQuery(list(self.dataset_sessions.values()))
return _FakeQuery([])
def add(self, obj):
@@ -240,6 +263,121 @@ def _clear_assistant_state():
# [/DEF:_clear_assistant_state:Function]
# [DEF:_dataset_review_session:Function]
# @RELATION: BINDS_TO -> [AssistantApiTests]
# @COMPLEXITY: 1
# @PURPOSE: Build minimal owned dataset-review session fixture for assistant scoped routing tests.
def _dataset_review_session():
    """Return a DatasetReviewSession owned by ``u-admin`` with the minimum
    aggregate surface the scoped assistant-routing tests need:

    * one recovered imported filter carrying an unmasked email value,
    * one pending execution mapping that requires explicit approval,
    * one conflicted semantic field with a single proposed candidate,
    * empty findings/previews/template/run collections.
    """
    # One shared timestamp keeps all audit fields consistent instead of
    # drifting by microseconds across repeated utcnow() calls.
    now = datetime.utcnow()
    session = DatasetReviewSession(
        session_id="sess-1",
        user_id="u-admin",
        environment_id="env-1",
        source_kind="superset_link",
        source_input="http://superset.local/dashboard/10",
        dataset_ref="public.sales",
        dataset_id=42,
        version=3,
        readiness_state=ReadinessState.MAPPING_REVIEW_NEEDED,
        recommended_action=RecommendedAction.APPROVE_MAPPING,
        status=SessionStatus.ACTIVE,
        current_phase=SessionPhase.MAPPING_REVIEW,
        created_at=now,
        updated_at=now,
        last_activity_at=now,
    )
    session.findings = []
    session.previews = []
    session.imported_filters = [
        ImportedFilter(
            filter_id="filter-1",
            session_id="sess-1",
            filter_name="email",
            display_name="Email",
            # Deliberately unmasked: masking tests assert the assistant layer
            # masks this value before persisting assistant-visible context.
            raw_value="john.doe@example.com",
            raw_value_masked=False,
            normalized_value="john.doe@example.com",
            source="manual",
            confidence_state="confirmed",
            requires_confirmation=False,
            recovery_status="recovered",
            notes=None,
            created_at=now,
            updated_at=now,
        )
    ]
    session.execution_mappings = [
        ExecutionMapping(
            mapping_id="map-1",
            session_id="sess-1",
            filter_id="filter-1",
            variable_id="var-1",
            mapping_method=MappingMethod.DIRECT_MATCH,
            raw_input_value="john.doe@example.com",
            effective_value="john.doe@example.com",
            transformation_note=None,
            warning_level=None,
            requires_explicit_approval=True,
            approval_state=ApprovalState.PENDING,
            approved_by_user_id=None,
            approved_at=None,
            created_at=now,
            updated_at=now,
        )
    ]
    # Single assignment (the previous redundant `semantic_fields = []`
    # immediately overwritten by this list has been removed).
    session.semantic_fields = [
        SemanticFieldEntry(
            field_id="field-1",
            session_id="sess-1",
            field_name="customer_name",
            field_kind="dimension",
            verbose_name="Customer name",
            description="Current semantic label",
            display_format="text",
            provenance="unresolved",
            source_id=None,
            source_version=None,
            confidence_rank=None,
            is_locked=False,
            has_conflict=True,
            needs_review=True,
            last_changed_by="system",
        )
    ]
    session.semantic_fields[0].candidates = [
        SemanticCandidate(
            candidate_id="cand-1",
            field_id="field-1",
            source_id=None,
            candidate_rank=1,
            match_type="exact",
            confidence_score=0.99,
            proposed_verbose_name="Customer legal name",
            proposed_description="Approved semantic wording",
            proposed_display_format="text",
            status=CandidateStatus.PROPOSED,
        )
    ]
    session.template_variables = []
    session.clarification_sessions = []
    session.run_contexts = []
    return session
# [/DEF:_dataset_review_session:Function]
# [DEF:_await_none:Function]
# @RELATION: BINDS_TO -> [AssistantApiTests]
# @COMPLEXITY: 1
# @PURPOSE: Async helper returning None for planner fallback tests.
async def _await_none(*args, **kwargs):
return None
# [/DEF:_await_none:Function]
# [DEF:test_unknown_command_returns_needs_clarification:Function]
# @RELATION: BINDS_TO -> [AssistantApiTests]
# @PURPOSE: Unknown command should return clarification state and unknown intent.
@@ -288,6 +426,139 @@ def test_capabilities_question_returns_successful_help(monkeypatch):
assert "я могу сделать" in resp.text.lower()
# [/DEF:test_capabilities_question_returns_successful_help:Function]
# [DEF:test_assistant_message_request_accepts_dataset_review_session_binding:Function]
# @RELATION: BINDS_TO -> [AssistantApiTests]
# @PURPOSE: Assistant request schema should accept active dataset review session binding for scoped orchestration.
def test_assistant_message_request_accepts_dataset_review_session_binding():
    """Schema smoke test: the optional session binding round-trips intact."""
    payload = {
        "message": "approve mappings",
        "dataset_review_session_id": "sess-1",
    }
    request = assistant_routes.AssistantMessageRequest(**payload)
    assert request.dataset_review_session_id == "sess-1"
# [/DEF:test_assistant_message_request_accepts_dataset_review_session_binding:Function]
# [DEF:test_dataset_review_scoped_message_uses_masked_filter_context:Function]
# @RELATION: BINDS_TO -> [AssistantApiTests]
# @PURPOSE: Session-scoped assistant context should mask imported-filter raw values before assistant-visible metadata is persisted.
def test_dataset_review_scoped_message_uses_masked_filter_context(monkeypatch):
    """A scoped message must persist assistant context with masked PII."""
    _clear_assistant_state()
    db = _FakeDb()
    db.dataset_sessions["sess-1"] = _dataset_review_session()
    req = assistant_routes.AssistantMessageRequest(
        message="show filters",
        dataset_review_session_id="sess-1",
    )
    # Fix: patch via monkeypatch instead of a bare module-attribute
    # assignment so the planner stub is restored after this test and does
    # not leak into subsequent assistant tests.
    monkeypatch.setattr(assistant_routes, "_plan_intent_with_llm", _await_none)

    async def _fake_dispatch_dataset_review_intent(
        intent, current_user, config_manager, db
    ):
        # Echo the summary so the response path completes deterministically.
        return str(intent["entities"]["summary"]), None, []

    monkeypatch.setattr(
        assistant_routes,
        "_dispatch_dataset_review_intent",
        _fake_dispatch_dataset_review_intent,
    )
    resp = _run_async(
        assistant_routes.send_message(
            req,
            current_user=_admin_user(),
            task_manager=_FakeTaskManager(),
            config_manager=_FakeConfigManager(),
            db=db,
        )
    )
    assert resp.state == "success"
    persisted_assistant = [
        item for item in db.added if getattr(item, "role", None) == "assistant"
    ][-1]
    imported_filters = persisted_assistant.payload["dataset_review_context"][
        "imported_filters"
    ]
    # The raw email must be masked and the masking flag flipped on.
    assert imported_filters[0]["raw_value"] == "***@example.com"
    assert imported_filters[0]["raw_value_masked"] is True
# [/DEF:test_dataset_review_scoped_message_uses_masked_filter_context:Function]
# [DEF:test_dataset_review_scoped_command_returns_confirmation_for_mapping_approval:Function]
# @RELATION: BINDS_TO -> [AssistantApiTests]
# @PURPOSE: Session-scoped assistant commands should route dataset-review mapping approvals into confirmation workflow with bound session metadata.
def test_dataset_review_scoped_command_returns_confirmation_for_mapping_approval(
    monkeypatch,
):
    """Mapping approval must enter the confirmation flow with session binding."""
    _clear_assistant_state()
    db = _FakeDb()
    db.dataset_sessions["sess-1"] = _dataset_review_session()
    req = assistant_routes.AssistantMessageRequest(
        message="approve mappings",
        dataset_review_session_id="sess-1",
    )
    # Fix: use the monkeypatch fixture rather than mutating the module
    # attribute directly; the previous bare assignment leaked the stub into
    # every later test in the session.
    monkeypatch.setattr(assistant_routes, "_plan_intent_with_llm", _await_none)
    resp = _run_async(
        assistant_routes.send_message(
            req,
            current_user=_admin_user(),
            task_manager=_FakeTaskManager(),
            config_manager=_FakeConfigManager(),
            db=db,
        )
    )
    assert resp.state == "needs_confirmation"
    assert resp.intent["operation"] == "dataset_review_approve_mappings"
    assert resp.intent["entities"]["dataset_review_session_id"] == "sess-1"
    # session_version must mirror the fixture's optimistic-lock version (3).
    assert resp.intent["entities"]["session_version"] == 3
    assert resp.intent["entities"]["mapping_ids"] == ["map-1"]
# [/DEF:test_dataset_review_scoped_command_returns_confirmation_for_mapping_approval:Function]
# [DEF:test_dataset_review_scoped_command_routes_field_semantics_update:Function]
# @RELATION: BINDS_TO -> [AssistantApiTests]
# @PURPOSE: Session-scoped assistant commands should route semantic field updates through explicit confirmation metadata.
def test_dataset_review_scoped_command_routes_field_semantics_update(monkeypatch):
    """Semantic override command must surface parsed entities for confirmation."""
    _clear_assistant_state()
    db = _FakeDb()
    db.dataset_sessions["sess-1"] = _dataset_review_session()
    req = assistant_routes.AssistantMessageRequest(
        message='set field semantics target=field:field-1 desc="Approved semantic wording" lock',
        dataset_review_session_id="sess-1",
    )
    # Fix: patch through monkeypatch so the planner stub is reverted after
    # the test; the prior direct assignment polluted module state globally.
    monkeypatch.setattr(assistant_routes, "_plan_intent_with_llm", _await_none)
    resp = _run_async(
        assistant_routes.send_message(
            req,
            current_user=_admin_user(),
            task_manager=_FakeTaskManager(),
            config_manager=_FakeConfigManager(),
            db=db,
        )
    )
    assert resp.state == "needs_confirmation"
    assert resp.intent["operation"] == "dataset_review_set_field_semantics"
    assert resp.intent["entities"]["dataset_review_session_id"] == "sess-1"
    assert resp.intent["entities"]["field_id"] == "field-1"
    assert resp.intent["entities"]["description"] == "Approved semantic wording"
    assert resp.intent["entities"]["lock_field"] is True
# [/DEF:test_dataset_review_scoped_command_routes_field_semantics_update:Function]
# [/DEF:test_capabilities_question_returns_successful_help:Function]
# [/DEF:AssistantApiTests:Module]

View File

@@ -65,6 +65,9 @@ from src.services.dataset_review.orchestrator import (
)
from src.services.dataset_review.semantic_resolver import SemanticSourceResolver
from src.services.dataset_review.event_logger import SessionEventLogger
from src.services.dataset_review.repositories.session_repository import (
DatasetReviewSessionVersionConflictError,
)
client = TestClient(app)
@@ -122,6 +125,7 @@ def _make_session():
dataset_ref="public.sales",
dataset_id=42,
dashboard_id=10,
version=0,
readiness_state=ReadinessState.REVIEW_READY,
recommended_action=RecommendedAction.REVIEW_DOCUMENTATION,
status=SessionStatus.ACTIVE,
@@ -853,6 +857,10 @@ def test_get_session_detail_export_and_lifecycle_endpoints(
repository = MagicMock()
repository.load_session_detail.return_value = session
repository.list_sessions_for_user.return_value = [session]
repository.require_session_version.side_effect = lambda current, expected: current
repository.bump_session_version.side_effect = lambda current: setattr(
current, "version", int(getattr(current, "version", 0) or 0) + 1
) or getattr(current, "version", 0)
repository.db = MagicMock()
repository.event_logger = MagicMock(spec=SessionEventLogger)
repository.event_logger.log_for_session.return_value = SimpleNamespace(
@@ -868,6 +876,7 @@ def test_get_session_detail_export_and_lifecycle_endpoints(
patch_response = client.patch(
"/api/dataset-orchestration/sessions/sess-1",
json={"status": "paused"},
headers={"X-Session-Version": "0"},
)
assert patch_response.status_code == 200
assert patch_response.json()["status"] == "paused"
@@ -885,13 +894,39 @@ def test_get_session_detail_export_and_lifecycle_endpoints(
assert validation_response.json()["artifact_type"] == "validation_report"
assert "Validation Report" in validation_response.json()["content"]["markdown"]
delete_response = client.delete("/api/dataset-orchestration/sessions/sess-1")
delete_response = client.delete(
"/api/dataset-orchestration/sessions/sess-1",
headers={"X-Session-Version": "1"},
)
assert delete_response.status_code == 204
# [/DEF:test_get_session_detail_export_and_lifecycle_endpoints:Function]
# [DEF:test_get_clarification_state_returns_empty_payload_when_session_has_no_record:Function]
# @PURPOSE: Clarification state endpoint should return a non-blocking empty payload when the session has no clarification aggregate yet.
def test_get_clarification_state_returns_empty_payload_when_session_has_no_record(
    dataset_review_api_dependencies,
):
    """A session without a clarification aggregate yields an empty 200 body."""
    repo = MagicMock()
    repo.load_session_detail.return_value = _make_us3_session()
    app.dependency_overrides[_get_repository] = lambda: repo
    response = client.get("/api/dataset-orchestration/sessions/sess-1/clarification")
    assert response.status_code == 200
    expected = {
        "clarification_session": None,
        "current_question": None,
    }
    assert response.json() == expected
# [/DEF:test_get_clarification_state_returns_empty_payload_when_session_has_no_record:Function]
# [DEF:test_us2_clarification_endpoints_persist_answer_and_feedback:Function]
# @RELATION: BINDS_TO -> DatasetReviewApiTests
# @PURPOSE: Clarification endpoints should expose one current question, persist the answer before advancement, and store feedback on the answer audit record.
@@ -901,6 +936,7 @@ def test_us2_clarification_endpoints_persist_answer_and_feedback(
session = _make_us2_session()
repository = MagicMock()
repository.load_session_detail.return_value = session
repository.require_session_version.side_effect = lambda current, expected: current
repository.db = MagicMock()
repository.db.commit.side_effect = lambda: None
repository.db.refresh.side_effect = lambda obj: None
@@ -933,6 +969,7 @@ def test_us2_clarification_endpoints_persist_answer_and_feedback(
"answer_kind": "selected",
"answer_value": "Revenue reporting",
},
headers={"X-Session-Version": "0"},
)
assert answer_response.status_code == 200
answer_payload = answer_response.json()
@@ -947,6 +984,7 @@ def test_us2_clarification_endpoints_persist_answer_and_feedback(
feedback_response = client.post(
"/api/dataset-orchestration/sessions/sess-1/clarification/questions/q-1/feedback",
json={"feedback": "up"},
headers={"X-Session-Version": "0"},
)
assert feedback_response.status_code == 200
assert feedback_response.json() == {"target_id": "q-1", "feedback": "up"}
@@ -965,6 +1003,10 @@ def test_us2_field_semantic_override_lock_unlock_and_feedback(
session = _make_us2_session()
repository = MagicMock()
repository.load_session_detail.return_value = session
repository.require_session_version.side_effect = lambda current, expected: current
repository.bump_session_version.side_effect = lambda current: setattr(
current, "version", int(getattr(current, "version", 0) or 0) + 1
) or getattr(current, "version", 0)
repository.db = MagicMock()
repository.db.commit.side_effect = lambda: None
repository.db.refresh.side_effect = lambda obj: None
@@ -984,6 +1026,7 @@ def test_us2_field_semantic_override_lock_unlock_and_feedback(
"description": "Manual business-approved description",
"display_format": "$,.0f",
},
headers={"X-Session-Version": "0"},
)
assert override_response.status_code == 200
override_payload = override_response.json()
@@ -991,7 +1034,8 @@ def test_us2_field_semantic_override_lock_unlock_and_feedback(
assert override_payload["is_locked"] is True
unlock_response = client.post(
"/api/dataset-orchestration/sessions/sess-1/fields/field-1/unlock"
"/api/dataset-orchestration/sessions/sess-1/fields/field-1/unlock",
headers={"X-Session-Version": "1"},
)
assert unlock_response.status_code == 200
assert unlock_response.json()["is_locked"] is False
@@ -999,6 +1043,7 @@ def test_us2_field_semantic_override_lock_unlock_and_feedback(
candidate_response = client.patch(
"/api/dataset-orchestration/sessions/sess-1/fields/field-1/semantic",
json={"candidate_id": "cand-1", "lock_field": True},
headers={"X-Session-Version": "2"},
)
assert candidate_response.status_code == 200
candidate_payload = candidate_response.json()
@@ -1013,6 +1058,7 @@ def test_us2_field_semantic_override_lock_unlock_and_feedback(
{"field_id": "field-1", "candidate_id": "cand-1", "lock_field": False}
]
},
headers={"X-Session-Version": "3"},
)
assert batch_response.status_code == 200
assert batch_response.json()[0]["field_id"] == "field-1"
@@ -1020,6 +1066,7 @@ def test_us2_field_semantic_override_lock_unlock_and_feedback(
feedback_response = client.post(
"/api/dataset-orchestration/sessions/sess-1/fields/field-1/feedback",
json={"feedback": "down"},
headers={"X-Session-Version": "4"},
)
assert feedback_response.status_code == 200
assert feedback_response.json() == {"target_id": "field-1", "feedback": "down"}
@@ -1052,6 +1099,10 @@ def test_us3_mapping_patch_approval_preview_and_launch_endpoints(
repository = MagicMock()
repository.load_session_detail.return_value = session
repository.require_session_version.side_effect = lambda current, expected: current
repository.bump_session_version.side_effect = lambda current: setattr(
current, "version", int(getattr(current, "version", 0) or 0) + 1
) or getattr(current, "version", 0)
repository.db = MagicMock()
repository.db.commit.side_effect = lambda: None
repository.db.refresh.side_effect = lambda obj: None
@@ -1100,6 +1151,21 @@ def test_us3_mapping_patch_approval_preview_and_launch_endpoints(
blocked_reasons=[],
)
def _assert_expected_preview_version(command):
assert command.expected_version == 3
return PreparePreviewResult(
session=session, preview=preview, blocked_reasons=[]
)
def _assert_expected_launch_version(command):
assert command.expected_version == 5
return LaunchDatasetResult(
session=session, run_context=run_context, blocked_reasons=[]
)
orchestrator.prepare_launch_preview.side_effect = _assert_expected_preview_version
orchestrator.launch_dataset.side_effect = _assert_expected_launch_version
app.dependency_overrides[_get_repository] = lambda: repository
app.dependency_overrides[_get_orchestrator] = lambda: orchestrator
@@ -1110,6 +1176,7 @@ def test_us3_mapping_patch_approval_preview_and_launch_endpoints(
"mapping_method": "manual_override",
"transformation_note": "Manual override for SQL Lab launch",
},
headers={"X-Session-Version": "0"},
)
assert patch_response.status_code == 200
patch_payload = patch_response.json()
@@ -1130,6 +1197,7 @@ def test_us3_mapping_patch_approval_preview_and_launch_endpoints(
approve_response = client.post(
"/api/dataset-orchestration/sessions/sess-1/mappings/map-1/approve",
json={"approval_note": "Approved after reviewing transformation"},
headers={"X-Session-Version": "1"},
)
assert approve_response.status_code == 200
approve_payload = approve_response.json()
@@ -1144,6 +1212,7 @@ def test_us3_mapping_patch_approval_preview_and_launch_endpoints(
batch_response = client.post(
"/api/dataset-orchestration/sessions/sess-1/mappings/approve-batch",
json={"mapping_ids": ["map-1"]},
headers={"X-Session-Version": "2"},
)
assert batch_response.status_code == 200
assert batch_response.json()[0]["mapping_id"] == "map-1"
@@ -1152,7 +1221,10 @@ def test_us3_mapping_patch_approval_preview_and_launch_endpoints(
assert list_response.status_code == 200
assert list_response.json()["items"][0]["mapping_id"] == "map-1"
preview_response = client.post("/api/dataset-orchestration/sessions/sess-1/preview")
preview_response = client.post(
"/api/dataset-orchestration/sessions/sess-1/preview",
headers={"X-Session-Version": "3"},
)
assert preview_response.status_code == 200
preview_payload = preview_response.json()
assert preview_payload["preview_id"] == "preview-1"
@@ -1160,33 +1232,44 @@ def test_us3_mapping_patch_approval_preview_and_launch_endpoints(
assert preview_payload["compiled_by"] == "superset"
assert "SELECT * FROM sales" in preview_payload["compiled_sql"]
orchestrator.prepare_launch_preview.return_value = PreparePreviewResult(
session=session,
preview=SimpleNamespace(
preview_id="preview-pending",
session_id="sess-1",
preview_status=PreviewStatus.PENDING,
compiled_sql=None,
preview_fingerprint="fingerprint-pending",
compiled_by="superset",
error_code=None,
error_details=None,
compiled_at=None,
created_at=datetime.now(timezone.utc),
),
blocked_reasons=[],
def _assert_expected_pending_preview_version(command):
assert command.expected_version == 4
return PreparePreviewResult(
session=session,
preview=SimpleNamespace(
preview_id="preview-pending",
session_id="sess-1",
preview_status=PreviewStatus.PENDING,
compiled_sql=None,
preview_fingerprint="fingerprint-pending",
compiled_by="superset",
error_code=None,
error_details=None,
compiled_at=None,
created_at=datetime.now(timezone.utc),
),
blocked_reasons=[],
)
orchestrator.prepare_launch_preview.side_effect = (
_assert_expected_pending_preview_version
)
preview_enqueue_response = client.post(
"/api/dataset-orchestration/sessions/sess-1/preview"
"/api/dataset-orchestration/sessions/sess-1/preview",
headers={"X-Session-Version": "4"},
)
assert preview_enqueue_response.status_code == 202
assert preview_enqueue_response.json() == {
"session_id": "sess-1",
"session_version": 3,
"preview_status": "pending",
"task_id": None,
}
launch_response = client.post("/api/dataset-orchestration/sessions/sess-1/launch")
launch_response = client.post(
"/api/dataset-orchestration/sessions/sess-1/launch",
headers={"X-Session-Version": "5"},
)
assert launch_response.status_code == 201
launch_payload = launch_response.json()
assert launch_payload["run_context"]["run_context_id"] == "run-1"
@@ -1201,6 +1284,105 @@ def test_us3_mapping_patch_approval_preview_and_launch_endpoints(
# [/DEF:test_us3_mapping_patch_approval_preview_and_launch_endpoints:Function]
# [DEF:test_us3_preview_response_propagates_refreshed_session_version_for_launch_follow_up:Function]
# @RELATION: BINDS_TO -> DatasetReviewApiTests
# @PURPOSE: Preview response should expose the refreshed session version so the normal preview-then-launch UI flow can satisfy optimistic locking without a forced full reload.
def test_us3_preview_response_propagates_refreshed_session_version_for_launch_follow_up(
    dataset_review_api_dependencies,
):
    """Preview bumps the session version; launch succeeds with that version."""
    session = _make_us3_session()
    session.version = 4
    repository = MagicMock()
    repository.load_session_detail.return_value = session

    def _require_session_version(current, expected):
        # Mirror the real repository guard: a stale expected version raises
        # the optimistic-lock conflict error.
        actual = int(getattr(current, "version", 0) or 0)
        if actual != expected:
            raise DatasetReviewSessionVersionConflictError(
                current.session_id,
                expected,
                actual,
            )
        return current

    repository.require_session_version.side_effect = _require_session_version
    repository.db = MagicMock()
    repository.event_logger = MagicMock(spec=SessionEventLogger)
    ready_preview = SimpleNamespace(
        preview_id="preview-2",
        session_id="sess-1",
        preview_status=PreviewStatus.READY,
        compiled_sql="SELECT * FROM sales WHERE country = 'FR'",
        preview_fingerprint="fingerprint-2",
        compiled_by="superset",
        error_code=None,
        error_details=None,
        compiled_at=datetime.now(timezone.utc),
        created_at=datetime.now(timezone.utc),
    )
    run_context = SimpleNamespace(
        run_context_id="run-2",
        session_id="sess-1",
        dataset_ref="public.sales",
        environment_id="env-1",
        preview_id="preview-2",
        sql_lab_session_ref="sql-lab-88",
        effective_filters=[{"mapping_id": "map-1", "effective_value": "FR"}],
        template_params={"country": "FR"},
        approved_mapping_ids=[],
        semantic_decision_refs=[],
        open_warning_refs=[],
        launch_status=LaunchStatus.STARTED,
        launch_error=None,
        created_at=datetime.now(timezone.utc),
    )
    orchestrator = MagicMock()

    def _preview_side_effect(command):
        assert command.expected_version == 4
        session.version = 5  # simulate the server-side optimistic-lock bump
        return PreparePreviewResult(
            session=session,
            preview=ready_preview,
            blocked_reasons=[],
        )

    def _launch_side_effect(command):
        # The launch must carry the refreshed version from the preview reply.
        assert command.expected_version == 5
        return LaunchDatasetResult(
            session=session,
            run_context=run_context,
            blocked_reasons=[],
        )

    orchestrator.prepare_launch_preview.side_effect = _preview_side_effect
    orchestrator.launch_dataset.side_effect = _launch_side_effect
    app.dependency_overrides[_get_repository] = lambda: repository
    app.dependency_overrides[_get_orchestrator] = lambda: orchestrator
    preview_response = client.post(
        "/api/dataset-orchestration/sessions/sess-1/preview",
        headers={"X-Session-Version": "4"},
    )
    assert preview_response.status_code == 200
    preview_payload = preview_response.json()
    assert preview_payload["session_version"] == 5
    launch_response = client.post(
        "/api/dataset-orchestration/sessions/sess-1/launch",
        headers={"X-Session-Version": str(preview_payload["session_version"])},
    )
    assert launch_response.status_code == 201
    assert launch_response.json()["run_context"]["run_context_id"] == "run-2"
# [/DEF:test_us3_preview_response_propagates_refreshed_session_version_for_launch_follow_up:Function]
# [DEF:test_us3_preview_endpoint_returns_failed_preview_without_false_dashboard_not_found_contract_drift:Function]
# @RELATION: BINDS_TO -> DatasetReviewApiTests
# @PURPOSE: Preview endpoint should preserve API contract and surface generic upstream preview failures without fabricating dashboard-not-found semantics for non-dashboard 404s.
@@ -1235,7 +1417,10 @@ def test_us3_preview_endpoint_returns_failed_preview_without_false_dashboard_not
app.dependency_overrides[_get_repository] = lambda: repository
app.dependency_overrides[_get_orchestrator] = lambda: orchestrator
response = client.post("/api/dataset-orchestration/sessions/sess-1/preview")
response = client.post(
"/api/dataset-orchestration/sessions/sess-1/preview",
headers={"X-Session-Version": "0"},
)
assert response.status_code == 200
payload = response.json()
@@ -1252,6 +1437,46 @@ def test_us3_preview_endpoint_returns_failed_preview_without_false_dashboard_not
# [/DEF:test_us3_preview_endpoint_returns_failed_preview_without_false_dashboard_not_found_contract_drift:Function]
# [DEF:test_mutation_endpoints_surface_session_version_conflict_payload:Function]
# @RELATION: BINDS_TO -> DatasetReviewApiTests
# @PURPOSE: Dataset review mutation endpoints should return deterministic 409 conflict semantics when optimistic-lock versions are stale.
def test_mutation_endpoints_surface_session_version_conflict_payload(
    dataset_review_api_dependencies,
):
    """A stale X-Session-Version must produce a structured 409 conflict body."""
    repository = MagicMock()
    repository.load_session_detail.return_value = _make_us3_session()
    # Force the repository guard to raise the optimistic-lock conflict.
    repository.require_session_version.side_effect = (
        DatasetReviewSessionVersionConflictError(
            session_id="sess-1",
            expected_version=2,
            actual_version=5,
        )
    )
    repository.db = MagicMock()
    repository.event_logger = MagicMock(spec=SessionEventLogger)
    app.dependency_overrides[_get_repository] = lambda: repository
    body = {
        "effective_value": "EU",
        "mapping_method": "manual_override",
    }
    response = client.patch(
        "/api/dataset-orchestration/sessions/sess-1/mappings/map-1",
        json=body,
        headers={"X-Session-Version": "2"},
    )
    assert response.status_code == 409
    detail = response.json()["detail"]
    assert detail["error_code"] == "session_version_conflict"
    assert detail["expected_version"] == 2
    assert detail["actual_version"] == 5
# [/DEF:test_mutation_endpoints_surface_session_version_conflict_payload:Function]
# [DEF:test_execution_snapshot_includes_recovered_imported_filters_without_template_mapping:Function]
# @RELATION: BINDS_TO -> DatasetReviewApiTests
# @PURPOSE: Recovered imported filters with values should flow into preview filter context even when no template variable mapping exists.

View File

@@ -15,7 +15,7 @@ import json
import re
import uuid
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple, cast
from fastapi import APIRouter, Depends, HTTPException, Query, status
from pydantic import BaseModel, Field
@@ -40,6 +40,9 @@ from ...services.llm_prompt_templates import (
resolve_bound_provider_id,
)
from ...core.superset_client import SupersetClient
from ...core.utils.superset_context_extractor import (
sanitize_imported_filter_for_assistant,
)
from ...plugins.llm_analysis.service import LLMClient
from ...plugins.llm_analysis.models import LLMProviderType
from ...schemas.auth import User
@@ -48,9 +51,25 @@ from ...models.assistant import (
AssistantConfirmationRecord,
AssistantMessageRecord,
)
from ...models.dataset_review import (
ApprovalState,
DatasetReviewSession,
ReadinessState,
RecommendedAction,
)
from ...services.dataset_review.orchestrator import (
DatasetReviewOrchestrator,
PreparePreviewCommand,
)
from ...services.dataset_review.repositories.session_repository import (
DatasetReviewSessionRepository,
DatasetReviewSessionVersionConflictError,
)
from .dataset_review import FieldSemanticUpdateRequest, _update_semantic_field_state
router = APIRouter(tags=["Assistant"])
git_service = GitService()
logger = cast(Any, logger)
# [DEF:AssistantMessageRequest:Class]
@@ -65,6 +84,7 @@ git_service = GitService()
class AssistantMessageRequest(BaseModel):
conversation_id: Optional[str] = None
message: str = Field(..., min_length=1, max_length=4000)
dataset_review_session_id: Optional[str] = None
# [/DEF:AssistantMessageRequest:Class]
@@ -159,6 +179,10 @@ INTENT_PERMISSION_CHECKS: Dict[str, List[Tuple[str, str]]] = {
"run_llm_validation": [("plugin:llm_dashboard_validation", "EXECUTE")],
"run_llm_documentation": [("plugin:llm_documentation", "EXECUTE")],
"get_health_summary": [("plugin:migration", "READ")],
"dataset_review_answer_context": [("dataset:session", "READ")],
"dataset_review_approve_mappings": [("dataset:session", "MANAGE")],
"dataset_review_set_field_semantics": [("dataset:session", "MANAGE")],
"dataset_review_generate_sql_preview": [("dataset:session", "MANAGE")],
}
@@ -1139,7 +1163,10 @@ def _has_any_permission(current_user: User, checks: List[Tuple[str, str]]) -> bo
# @PRE: current_user is authenticated; config/db are available.
# @POST: Returns list of executable tools filtered by permission and runtime availability.
def _build_tool_catalog(
current_user: User, config_manager: ConfigManager, db: Session
current_user: User,
config_manager: ConfigManager,
db: Session,
dataset_review_context: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
envs = config_manager.get_environments()
default_env_id = _get_default_environment_id(config_manager)
@@ -1271,6 +1298,61 @@ def _build_tool_catalog(
if checks and not _has_any_permission(current_user, checks):
continue
available.append(tool)
if dataset_review_context is not None:
dataset_tools: List[Dict[str, Any]] = [
{
"operation": "dataset_review_answer_context",
"domain": "dataset_review",
"description": "Answer questions using the currently bound dataset review session context",
"required_entities": ["dataset_review_session_id"],
"optional_entities": [],
"risk_level": "safe",
"requires_confirmation": False,
},
{
"operation": "dataset_review_approve_mappings",
"domain": "dataset_review",
"description": "Approve warning-sensitive execution mappings in the current dataset review session",
"required_entities": ["dataset_review_session_id", "session_version"],
"optional_entities": ["mapping_ids"],
"risk_level": "guarded",
"requires_confirmation": True,
},
{
"operation": "dataset_review_set_field_semantics",
"domain": "dataset_review",
"description": "Apply explicit semantic field override or candidate selection in the current dataset review session",
"required_entities": [
"dataset_review_session_id",
"session_version",
"field_id",
],
"optional_entities": [
"candidate_id",
"verbose_name",
"description",
"display_format",
"lock_field",
],
"risk_level": "guarded",
"requires_confirmation": True,
},
{
"operation": "dataset_review_generate_sql_preview",
"domain": "dataset_review",
"description": "Generate a Superset-compiled SQL preview for the current dataset review session",
"required_entities": ["dataset_review_session_id", "session_version"],
"optional_entities": [],
"risk_level": "guarded",
"requires_confirmation": True,
},
]
for tool in dataset_tools:
checks = INTENT_PERMISSION_CHECKS.get(tool["operation"], [])
if checks and not _has_any_permission(current_user, checks):
continue
available.append(tool)
return available
@@ -1301,7 +1383,602 @@ def _coerce_intent_entities(intent: Dict[str, Any]) -> Dict[str, Any]:
# Operations that are read-only and do not require confirmation.
_SAFE_OPS = {"show_capabilities", "get_task_status", "get_health_summary"}
_SAFE_OPS = {
"show_capabilities",
"get_task_status",
"get_health_summary",
"dataset_review_answer_context",
}
_DATASET_REVIEW_OPS = {
"dataset_review_approve_mappings",
"dataset_review_set_field_semantics",
"dataset_review_generate_sql_preview",
}
# [DEF:_serialize_dataset_review_context:Function]
# @COMPLEXITY: 4
# @PURPOSE: Build assistant-safe dataset-review context snapshot with masked imported-filter payloads for session-scoped assistant routing.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
def _serialize_dataset_review_context(
    session: DatasetReviewSession,
) -> Dict[str, Any]:
    """Build an assistant-safe, JSON-serializable snapshot of a review session.

    Enum attributes are unwrapped to their ``.value`` (defensively via
    ``getattr(x, "value", x)`` for most collections), and every imported-filter
    payload is passed through ``sanitize_imported_filter_for_assistant`` before
    exposure. Only the latest preview (last element of ``session.previews``) is
    surfaced; ``preview`` is None when no preview exists.
    """
    latest_preview = None
    previews = getattr(session, "previews", []) or []
    if previews:
        # NOTE(review): assumes previews are ordered oldest-first so the last
        # element is the most recent — confirm against the relationship's order_by.
        latest_preview = previews[-1]
    return {
        "session_id": session.session_id,
        "version": int(getattr(session, "version", 0) or 0),
        "dataset_ref": session.dataset_ref,
        "dataset_id": session.dataset_id,
        "environment_id": session.environment_id,
        # Top-level states access .value directly (enums are guaranteed here),
        # unlike the nested collections which tolerate plain strings.
        "readiness_state": session.readiness_state.value,
        "recommended_action": session.recommended_action.value,
        "status": session.status.value,
        "current_phase": session.current_phase.value,
        "findings": [
            {
                "finding_id": item.finding_id,
                "code": item.code,
                "severity": item.severity.value,
                "message": item.message,
                "resolution_state": item.resolution_state.value,
            }
            for item in getattr(session, "findings", [])
        ],
        # Raw filter values are masked by the sanitizer before the assistant
        # ever sees them.
        "imported_filters": [
            sanitize_imported_filter_for_assistant(
                {
                    "filter_id": item.filter_id,
                    "filter_name": item.filter_name,
                    "display_name": item.display_name,
                    "raw_value": item.raw_value,
                    "raw_value_masked": bool(getattr(item, "raw_value_masked", False)),
                    "normalized_value": item.normalized_value,
                    "source": getattr(item.source, "value", item.source),
                    "confidence_state": getattr(
                        item.confidence_state, "value", item.confidence_state
                    ),
                    "requires_confirmation": bool(item.requires_confirmation),
                    "recovery_status": getattr(
                        item.recovery_status, "value", item.recovery_status
                    ),
                    "notes": item.notes,
                }
            )
            for item in getattr(session, "imported_filters", [])
        ],
        "mappings": [
            {
                "mapping_id": item.mapping_id,
                "filter_id": item.filter_id,
                "variable_id": item.variable_id,
                "mapping_method": getattr(
                    item.mapping_method, "value", item.mapping_method
                ),
                "effective_value": item.effective_value,
                "approval_state": getattr(
                    item.approval_state, "value", item.approval_state
                ),
                "requires_explicit_approval": bool(item.requires_explicit_approval),
            }
            for item in getattr(session, "execution_mappings", [])
        ],
        "semantic_fields": [
            {
                "field_id": item.field_id,
                "field_name": item.field_name,
                "verbose_name": item.verbose_name,
                "description": item.description,
                "display_format": item.display_format,
                "provenance": getattr(item.provenance, "value", item.provenance),
                "is_locked": bool(item.is_locked),
                "needs_review": bool(item.needs_review),
                "candidates": [
                    {
                        "candidate_id": candidate.candidate_id,
                        "proposed_verbose_name": candidate.proposed_verbose_name,
                        "proposed_description": candidate.proposed_description,
                        "proposed_display_format": candidate.proposed_display_format,
                        "status": getattr(candidate.status, "value", candidate.status),
                    }
                    for candidate in getattr(item, "candidates", [])
                ],
            }
            for item in getattr(session, "semantic_fields", [])
        ],
        "preview": {
            "preview_id": latest_preview.preview_id,
            "preview_status": getattr(
                latest_preview.preview_status, "value", latest_preview.preview_status
            ),
            "compiled_sql": latest_preview.compiled_sql,
        }
        if latest_preview is not None
        else None,
    }
# [/DEF:_serialize_dataset_review_context:Function]
# [DEF:_load_dataset_review_context:Function]
# @COMPLEXITY: 4
# @PURPOSE: Load owner-scoped dataset-review context for assistant planning and grounded response generation.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
def _load_dataset_review_context(
    dataset_review_session_id: Optional[str],
    current_user: User,
    db: Session,
) -> Optional[Dict[str, Any]]:
    """Load the assistant-facing context for a bound review session, if any.

    Returns None when no session id was supplied (assistant not bound to a
    review session); raises HTTP 404 when the session does not exist or does
    not belong to the current user.
    """
    if not dataset_review_session_id:
        return None
    record = DatasetReviewSessionRepository(db).load_session_detail(
        dataset_review_session_id, current_user.id
    )
    # Defence in depth: the repository lookup is already user-scoped, but we
    # re-check ownership before serializing anything for the assistant.
    if record is None or record.user_id != current_user.id:
        raise HTTPException(status_code=404, detail="Dataset review session not found")
    return _serialize_dataset_review_context(record)
# [/DEF:_load_dataset_review_context:Function]
# [DEF:_extract_dataset_review_target:Function]
# @COMPLEXITY: 2
# @PURPOSE: Extract structured dataset-review focus target hints embedded in assistant prompts.
def _extract_dataset_review_target(message: str) -> Tuple[Optional[str], Optional[str]]:
match = re.search(
r"(?:target|focus)\s*[:=]\s*(field|mapping|finding|filter)[:=]([A-Za-z0-9._-]+)",
str(message or ""),
re.IGNORECASE,
)
if not match:
return None, None
return match.group(1).lower(), match.group(2)
# [/DEF:_extract_dataset_review_target:Function]
# [DEF:_match_dataset_review_field:Function]
# @COMPLEXITY: 2
# @PURPOSE: Resolve one semantic field from assistant-visible context by id or user-visible label.
def _match_dataset_review_field(
    dataset_context: Dict[str, Any],
    message: str,
) -> Optional[Dict[str, Any]]:
    """Resolve one semantic field from the assistant-visible context.

    Resolution order:
      1. An explicit ``target:field=<id>`` / ``focus:field=<id>`` hint wins and
         is matched strictly by ``field_id``.
      2. Otherwise the first field whose id, name, or verbose name appears as a
         substring of the lower-cased message is returned.

    Returns None when nothing matches.
    """
    target_kind, target_id = _extract_dataset_review_target(message)
    fields = dataset_context.get("semantic_fields", []) or []
    if target_kind == "field" and target_id:
        return next(
            (item for item in fields if str(item.get("field_id")) == str(target_id)),
            None,
        )
    normalized_message = str(message or "").lower()
    for field in fields:
        # Guard every identifier for truthiness: "" is a substring of any
        # message, so an unguarded empty field_id would make the first
        # id-less field match every prompt.
        field_id = str(field.get("field_id", "")).lower()
        if field_id and field_id in normalized_message:
            return field
        field_name = str(field.get("field_name", "")).lower()
        if field_name and field_name in normalized_message:
            return field
        verbose_name = str(field.get("verbose_name", "")).lower()
        if verbose_name and verbose_name in normalized_message:
            return field
    return None
# [/DEF:_match_dataset_review_field:Function]
# [DEF:_extract_quoted_segment:Function]
# @COMPLEXITY: 2
# @PURPOSE: Extract one quoted assistant command segment after a label token.
def _extract_quoted_segment(message: str, label: str) -> Optional[str]:
pattern = rf"{label}\s*[=:]?\s*[\"']([^\"']+)[\"']"
match = re.search(pattern, str(message or ""), re.IGNORECASE)
return match.group(1).strip() if match else None
# [/DEF:_extract_quoted_segment:Function]
# [DEF:_dataset_review_conflict_http_exception:Function]
# @COMPLEXITY: 2
# @PURPOSE: Convert dataset-review optimistic-lock conflicts into shared 409 assistant semantics.
def _dataset_review_conflict_http_exception(
    exc: DatasetReviewSessionVersionConflictError,
) -> HTTPException:
    """Translate an optimistic-lock conflict into the shared 409 payload."""
    conflict_detail = {
        "error_code": "session_version_conflict",
        "message": str(exc),
        "session_id": exc.session_id,
        "expected_version": exc.expected_version,
        "actual_version": exc.actual_version,
    }
    return HTTPException(
        status_code=status.HTTP_409_CONFLICT,
        detail=conflict_detail,
    )
# [/DEF:_dataset_review_conflict_http_exception:Function]
# [DEF:_plan_dataset_review_intent:Function]
# @COMPLEXITY: 3
# @PURPOSE: Parse session-scoped dataset-review assistant commands before falling back to generic assistant tool routing.
def _plan_dataset_review_intent(
    message: str,
    dataset_context: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Parse session-scoped dataset-review commands out of a chat message.

    Token matching is order-sensitive: approve-mappings, then SQL-preview,
    then field-semantics, then a generic context-summary fallback.  English
    and Russian trigger tokens are supported.  Returns an intent dict
    (domain/operation/entities/confidence/risk) or None so the caller can
    fall back to generic assistant routing.
    """
    lower = message.strip().lower()
    session_id = dataset_context["session_id"]
    session_version = int(dataset_context.get("version", 0) or 0)
    target_kind, target_id = _extract_dataset_review_target(message)
    if any(
        token in lower
        for token in [
            "approve mappings",
            "approve mapping",
            "подтверди мапп",
            "одобри мапп",
        ]
    ):
        # Collect only mappings that still need an explicit approval; already
        # approved ones are excluded so the dispatch step can 409 on "nothing
        # left to approve".
        pending_mapping_ids = [
            item["mapping_id"]
            for item in dataset_context.get("mappings", [])
            if item.get("requires_explicit_approval")
            and item.get("approval_state") != ApprovalState.APPROVED.value
        ]
        return {
            "domain": "dataset_review",
            "operation": "dataset_review_approve_mappings",
            "entities": {
                "dataset_review_session_id": session_id,
                "session_version": session_version,
                "mapping_ids": pending_mapping_ids,
            },
            "confidence": 0.95,
            "risk_level": "guarded",
            "requires_confirmation": True,
        }
    if any(
        token in lower
        for token in [
            "generate sql preview",
            "generate preview",
            "сгенерируй превью",
            "собери превью",
        ]
    ):
        return {
            "domain": "dataset_review",
            "operation": "dataset_review_generate_sql_preview",
            "entities": {
                "dataset_review_session_id": session_id,
                "session_version": session_version,
            },
            "confidence": 0.94,
            "risk_level": "guarded",
            "requires_confirmation": True,
        }
    if any(
        token in lower
        for token in [
            "set field semantics",
            "apply field semantics",
            "semantic override",
            "update semantic field",
            "установи семантик",
            "обнови семантик",
        ]
    ):
        field = _match_dataset_review_field(dataset_context, message)
        if field is None:
            return None
        candidate_id = None
        if any(
            token in lower
            for token in ["accept candidate", "apply candidate", "прими кандидат"]
        ):
            # "Accept candidate" always takes the first listed candidate;
            # there is no per-candidate selection syntax here.
            candidates = field.get("candidates") or []
            if candidates:
                candidate_id = candidates[0].get("candidate_id")
        verbose_name = _extract_quoted_segment(
            message, "verbose_name|verbose name|label"
        )
        description = _extract_quoted_segment(message, "description|desc")
        display_format = _extract_quoted_segment(
            message, "display_format|display format|format"
        )
        # " lock" keeps its leading space deliberately, so words like
        # "blocked"/"unlock" do not trigger field locking.
        lock_field = any(
            token in lower for token in [" lock", "lock it", "зафикс", "закреп"]
        )
        if not any([candidate_id, verbose_name, description, display_format]):
            return None
        return {
            "domain": "dataset_review",
            "operation": "dataset_review_set_field_semantics",
            "entities": {
                "dataset_review_session_id": session_id,
                "session_version": session_version,
                "field_id": field.get("field_id") or target_id,
                "candidate_id": candidate_id,
                "verbose_name": verbose_name,
                "description": description,
                "display_format": display_format,
                "lock_field": lock_field,
            },
            "confidence": 0.9,
            "risk_level": "guarded",
            "requires_confirmation": True,
        }
    if any(
        token in lower
        for token in [
            "filters",
            "фильтр",
            "mapping",
            "маппинг",
            "preview",
            "превью",
            "finding",
            "ошиб",
        ]
    ):
        findings_count = len(dataset_context.get("findings", []))
        mappings_count = len(dataset_context.get("mappings", []))
        filters_count = len(dataset_context.get("imported_filters", []))
        preview = dataset_context.get("preview") or {}
        preview_status = preview.get("preview_status") or "missing"
        # imported_filters were already masked at serialization time, so the
        # full (sanitized) list is safe to embed in the summary text.
        masked_filters = dataset_context.get("imported_filters", [])
        return {
            "domain": "dataset_review",
            "operation": "dataset_review_answer_context",
            "entities": {
                "dataset_review_session_id": session_id,
                "summary": (
                    f"Session {session_id}: readiness={dataset_context['readiness_state']}, "
                    f"recommended_action={dataset_context['recommended_action']}, "
                    f"filters={filters_count}, mappings={mappings_count}, findings={findings_count}, "
                    f"preview_status={preview_status}, imported_filters={masked_filters}"
                ),
            },
            "confidence": 0.8,
            "risk_level": "safe",
            "requires_confirmation": False,
        }
    return None
# [/DEF:_plan_dataset_review_intent:Function]
# [DEF:_dispatch_dataset_review_intent:Function]
# @COMPLEXITY: 4
# @PURPOSE: Route confirmed dataset-review assistant intents through existing backend dataset-review APIs and orchestration boundaries.
async def _dispatch_dataset_review_intent(
    intent: Dict[str, Any],
    current_user: User,
    config_manager: ConfigManager,
    db: Session,
) -> Tuple[str, Optional[str], List[AssistantAction]]:
    """Execute a confirmed dataset-review assistant intent.

    Validates session id/version entities (422 when missing), resolves the
    owner-scoped session (404 otherwise), enforces the optimistic-lock
    version (409 on conflict), then routes per operation:
    ``answer_context`` (read-only echo of the pre-built summary),
    ``approve_mappings``, ``set_field_semantics``, and
    ``generate_sql_preview``.  Returns ``(reply_text, task_id, actions)``;
    ``task_id`` is always None for these synchronous operations.
    """
    with belief_scope("assistant.dispatch_dataset_review_intent"):
        entities = intent.get("entities", {})
        session_id = entities.get("dataset_review_session_id")
        session_version = entities.get("session_version")
        if not session_id or session_version is None:
            raise HTTPException(
                status_code=422,
                detail="Missing dataset_review_session_id/session_version",
            )
        operation = str(intent.get("operation") or "")
        repository = DatasetReviewSessionRepository(db)
        # Read-only path: the summary was composed during planning, so no
        # session load or version check is needed here.
        if operation == "dataset_review_answer_context":
            summary = str(entities.get("summary") or "")
            logger.reflect(
                "Returned assistant-safe dataset review context summary",
                extra={"session_id": session_id, "operation": operation},
            )
            return summary, None, []
        session = repository.load_session_detail(session_id, current_user.id)
        if session is None or session.user_id != current_user.id:
            logger.explore(
                "Assistant dataset-review intent rejected because session was not found",
                extra={"session_id": session_id, "user_id": current_user.id},
            )
            raise HTTPException(
                status_code=404, detail="Dataset review session not found"
            )
        try:
            repository.require_session_version(session, int(session_version))
        except DatasetReviewSessionVersionConflictError as exc:
            logger.explore(
                "Assistant dataset-review intent rejected due to stale session version",
                extra={
                    "session_id": exc.session_id,
                    "expected_version": exc.expected_version,
                    "actual_version": exc.actual_version,
                    "operation": operation,
                },
            )
            raise _dataset_review_conflict_http_exception(exc) from exc
        logger.reason(
            "Dispatching confirmed assistant dataset-review intent",
            extra={
                "session_id": session_id,
                "session_version": session_version,
                "operation": operation,
            },
        )
        if operation == "dataset_review_approve_mappings":
            # dict.fromkeys deduplicates while preserving order.
            mapping_ids = list(dict.fromkeys(entities.get("mapping_ids") or []))
            if not mapping_ids:
                raise HTTPException(
                    status_code=409, detail="No pending mappings to approve"
                )
            updated_count = 0
            for mapping in session.execution_mappings:
                if mapping.mapping_id not in mapping_ids:
                    continue
                mapping.approval_state = ApprovalState.APPROVED
                mapping.approved_by_user_id = current_user.id
                # NOTE(review): naive-UTC timestamp via utcnow(); matches the
                # rest of this module but is deprecated in newer Pythons.
                mapping.approved_at = datetime.utcnow()
                updated_count += 1
            if updated_count == 0:
                raise HTTPException(
                    status_code=409, detail="No matching mappings available to approve"
                )
            session.last_activity_at = datetime.utcnow()
            if session.readiness_state == ReadinessState.MAPPING_REVIEW_NEEDED:
                session.recommended_action = RecommendedAction.GENERATE_SQL_PREVIEW
            repository.bump_session_version(session)
            repository.db.commit()
            repository.db.refresh(session)
            # Event is logged after the commit so it records the bumped version.
            repository.event_logger.log_for_session(
                session,
                actor_user_id=current_user.id,
                event_type="assistant_mapping_approval",
                event_summary="Assistant-approved warning-sensitive mappings persisted",
                event_details={
                    "mapping_ids": mapping_ids,
                    "count": updated_count,
                    "version": int(getattr(session, "version", 0) or 0),
                },
            )
            logger.reflect(
                "Assistant mapping approval persisted within optimistic-lock boundary",
                extra={
                    "session_id": session_id,
                    "updated_count": updated_count,
                    "version": int(getattr(session, "version", 0) or 0),
                },
            )
            return (
                f"Approved {updated_count} mapping(s) for dataset review session {session_id}.",
                None,
                [
                    AssistantAction(
                        type="focus_target",
                        label="Open mapping review",
                        target="mapping",
                    )
                ],
            )
        if operation == "dataset_review_set_field_semantics":
            field_id = str(entities.get("field_id") or "").strip()
            if not field_id:
                raise HTTPException(status_code=422, detail="Missing field_id")
            field = next(
                (item for item in session.semantic_fields if item.field_id == field_id),
                None,
            )
            if field is None:
                raise HTTPException(status_code=404, detail="Semantic field not found")
            update_request = FieldSemanticUpdateRequest(
                candidate_id=entities.get("candidate_id"),
                verbose_name=entities.get("verbose_name"),
                description=entities.get("description"),
                display_format=entities.get("display_format"),
                lock_field=bool(entities.get("lock_field", False)),
            )
            try:
                # Reuses the same state-transition helper as the HTTP endpoint,
                # tagged with "assistant" provenance.
                _update_semantic_field_state(
                    field, update_request, changed_by="assistant"
                )
            except HTTPException:
                raise
            except ValueError as exc:
                raise HTTPException(status_code=400, detail=str(exc)) from exc
            session.last_activity_at = datetime.utcnow()
            repository.bump_session_version(session)
            repository.db.commit()
            repository.db.refresh(session)
            repository.db.refresh(field)
            repository.event_logger.log_for_session(
                session,
                actor_user_id=current_user.id,
                event_type="assistant_field_semantics_updated",
                event_summary="Assistant semantic field update persisted",
                event_details={
                    "field_id": field.field_id,
                    "candidate_id": entities.get("candidate_id"),
                    "lock_field": bool(entities.get("lock_field", False)),
                    "version": int(getattr(session, "version", 0) or 0),
                },
            )
            logger.reflect(
                "Assistant semantic field update committed safely",
                extra={
                    "session_id": session_id,
                    "field_id": field_id,
                    "version": int(getattr(session, "version", 0) or 0),
                },
            )
            return (
                f"Updated semantic field {field.field_name} for dataset review session {session_id}.",
                None,
                [
                    AssistantAction(
                        type="focus_target",
                        label="Open semantic review",
                        target=f"field:{field.field_id}",
                    )
                ],
            )
        if operation == "dataset_review_generate_sql_preview":
            # Preview generation is delegated entirely to the orchestrator,
            # which performs its own version enforcement via expected_version.
            orchestrator = DatasetReviewOrchestrator(
                repository=repository,
                config_manager=config_manager,
            )
            result = orchestrator.prepare_launch_preview(
                PreparePreviewCommand(
                    user=current_user,
                    session_id=session_id,
                    expected_version=int(session_version),
                )
            )
            preview_status = getattr(
                result.preview.preview_status, "value", result.preview.preview_status
            )
            logger.reflect(
                "Assistant-triggered Superset preview generation completed",
                extra={
                    "session_id": session_id,
                    "preview_status": preview_status,
                },
            )
            return (
                f"SQL preview {preview_status} for dataset review session {session_id}.",
                None,
                [
                    AssistantAction(
                        type="focus_target",
                        label="Open SQL preview",
                        target="sql-preview",
                    )
                ],
            )
        raise HTTPException(
            status_code=400, detail="Unsupported dataset review operation"
        )
# [/DEF:_dispatch_dataset_review_intent:Function]
# [DEF:_confirmation_summary:Function]
@@ -1633,6 +2310,14 @@ async def _dispatch_intent(
operation = intent.get("operation")
entities = intent.get("entities", {})
if operation in _DATASET_REVIEW_OPS or operation == "dataset_review_answer_context":
return await _dispatch_dataset_review_intent(
intent,
current_user,
config_manager,
db,
)
if operation == "show_capabilities":
tools_catalog = _build_tool_catalog(current_user, config_manager, db)
labels = {
@@ -2041,6 +2726,11 @@ async def send_message(
):
with belief_scope("assistant.send_message"):
user_id = current_user.id
dataset_review_context = _load_dataset_review_context(
request.dataset_review_session_id,
current_user,
db,
)
conversation_id = _resolve_or_create_conversation(
user_id, request.conversation_id, db
)
@@ -2058,6 +2748,12 @@ async def send_message(
logger.warning(f"[assistant.planner][fallback] Planner error: {exc}")
if not intent:
intent = _parse_command(request.message, config_manager)
if dataset_review_context:
dataset_review_intent = _plan_dataset_review_intent(
request.message, dataset_review_context
)
if dataset_review_intent is not None:
intent = dataset_review_intent
confidence = float(intent.get("confidence", 0.0))
if intent.get("domain") == "unknown" or confidence < 0.6:
@@ -2078,6 +2774,7 @@ async def send_message(
"decision": "needs_clarification",
"message": request.message,
"intent": intent,
"dataset_review_session_id": request.dataset_review_session_id,
}
_audit(user_id, audit_payload)
_persist_audit(db, user_id, audit_payload, conversation_id)
@@ -2127,6 +2824,7 @@ async def send_message(
confirmation_id=confirmation_id,
metadata={
"intent": intent,
"dataset_review_context": dataset_review_context,
"actions": [
{
"type": "confirm",
@@ -2146,6 +2844,7 @@ async def send_message(
"message": request.message,
"intent": intent,
"confirmation_id": confirmation_id,
"dataset_review_session_id": request.dataset_review_session_id,
}
_audit(user_id, audit_payload)
_persist_audit(db, user_id, audit_payload, conversation_id)
@@ -2192,6 +2891,7 @@ async def send_message(
task_id=task_id,
metadata={
"intent": intent,
"dataset_review_context": dataset_review_context,
"actions": [a.model_dump() for a in actions],
},
)
@@ -2200,6 +2900,7 @@ async def send_message(
"message": request.message,
"intent": intent,
"task_id": task_id,
"dataset_review_session_id": request.dataset_review_session_id,
}
_audit(user_id, audit_payload)
_persist_audit(db, user_id, audit_payload, conversation_id)
@@ -2246,6 +2947,7 @@ async def send_message(
"message": request.message,
"intent": intent,
"error": text,
"dataset_review_session_id": request.dataset_review_session_id,
}
_audit(user_id, audit_payload)
_persist_audit(db, user_id, audit_payload, conversation_id)

View File

@@ -17,9 +17,9 @@ from __future__ import annotations
# [DEF:DatasetReviewApi.imports:Block]
import json
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, cast
from fastapi import APIRouter, Depends, HTTPException, Query, Response, status
from fastapi import APIRouter, Depends, Header, HTTPException, Query, Response, status
from pydantic import BaseModel, Field
from sqlalchemy.orm import Session
@@ -76,6 +76,7 @@ from src.services.dataset_review.orchestrator import (
)
from src.services.dataset_review.repositories.session_repository import (
DatasetReviewSessionRepository,
DatasetReviewSessionVersionConflictError,
)
# [/DEF:DatasetReviewApi.imports:Block]
@@ -193,7 +194,7 @@ class ClarificationSessionSummaryResponse(BaseModel):
# @COMPLEXITY: 2
# @PURPOSE: Response DTO for current clarification state and active question payload.
class ClarificationStateResponse(BaseModel):
clarification_session: ClarificationSessionSummaryResponse
clarification_session: Optional[ClarificationSessionSummaryResponse] = None
current_question: Optional[ClarificationQuestionDto] = None
@@ -271,6 +272,7 @@ class BatchApproveMappingRequest(BaseModel):
# @PURPOSE: Contract-compliant async preview trigger response exposing only enqueue state.
class PreviewEnqueueResultResponse(BaseModel):
session_id: str
session_version: Optional[int] = None
preview_status: str
task_id: Optional[str] = None
@@ -413,7 +415,9 @@ def _get_clarification_engine(
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
# @RELATION: [DEPENDS_ON] ->[SessionSummary]
def _serialize_session_summary(session: DatasetReviewSession) -> SessionSummary:
return SessionSummary.model_validate(session, from_attributes=True)
summary = SessionSummary.model_validate(session, from_attributes=True)
summary.session_version = summary.version
return summary
# [/DEF:_serialize_session_summary:Function]
@@ -424,18 +428,104 @@ def _serialize_session_summary(session: DatasetReviewSession) -> SessionSummary:
# @PURPOSE: Map SQLAlchemy session aggregate root into stable API detail DTO.
# @RELATION: [DEPENDS_ON] ->[SessionDetail]
def _serialize_session_detail(session: DatasetReviewSession) -> SessionDetail:
return SessionDetail.model_validate(session, from_attributes=True)
detail = SessionDetail.model_validate(session, from_attributes=True)
detail.session_version = detail.version
return detail
# [/DEF:_serialize_session_detail:Function]
# [DEF:_require_session_version_header:Function]
# @COMPLEXITY: 2
# @PURPOSE: Read the optimistic-lock session version header required by dataset review mutation endpoints.
def _require_session_version_header(
    session_version: int = Header(..., alias="X-Session-Version", ge=0),
) -> int:
    # FastAPI dependency: a missing X-Session-Version header or a negative
    # value is rejected by validation before this body runs; the parsed
    # non-negative int is simply passed through to the endpoint.
    return session_version
# [/DEF:_require_session_version_header:Function]
# [DEF:_enforce_session_version:Function]
# @COMPLEXITY: 4
# @PURPOSE: Convert repository optimistic-lock conflicts into deterministic HTTP 409 responses.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
def _enforce_session_version(
    repository: DatasetReviewSessionRepository,
    session: DatasetReviewSession,
    expected_version: int,
) -> DatasetReviewSession:
    """Assert the caller-supplied optimistic-lock version on *session*.

    Repository version conflicts are converted into deterministic HTTP 409
    responses carrying the shared conflict payload; on success the same
    session instance is returned.
    """
    try:
        repository.require_session_version(session, expected_version)
    except DatasetReviewSessionVersionConflictError as exc:
        conflict_payload = {
            "error_code": "session_version_conflict",
            "message": str(exc),
            "session_id": exc.session_id,
            "expected_version": exc.expected_version,
            "actual_version": exc.actual_version,
        }
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT, detail=conflict_payload
        ) from exc
    return session
# [/DEF:_enforce_session_version:Function]
# [DEF:_prepare_owned_session_mutation:Function]
# @COMPLEXITY: 4
# @PURPOSE: Resolve owner-scoped mutation session and enforce optimistic-lock version before changing dataset review state.
def _prepare_owned_session_mutation(
    repository: DatasetReviewSessionRepository,
    session_id: str,
    current_user: User,
    expected_version: int,
) -> DatasetReviewSession:
    """Resolve an owner-scoped session and enforce its optimistic-lock version.

    Delegates accessibility (404) and owner-mutation-scope checks to the
    existing helpers, then asserts *expected_version* (409 on conflict)
    before returning the session for mutation.
    """
    owned_session = _get_owned_session_or_404(repository, session_id, current_user)
    _require_owner_mutation_scope(owned_session, current_user)
    _enforce_session_version(repository, owned_session, expected_version)
    return owned_session
# [/DEF:_prepare_owned_session_mutation:Function]
# [DEF:_commit_owned_session_mutation:Function]
# @COMPLEXITY: 4
# @PURPOSE: Centralize dataset-review session version bumping and commit semantics for owner-scoped mutation endpoints.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSessionRepository]
def _commit_owned_session_mutation(
    repository: DatasetReviewSessionRepository,
    session: DatasetReviewSession,
    *,
    refresh_targets: Optional[List[Any]] = None,
) -> DatasetReviewSession:
    """Bump the session version, commit, and refresh the mutated objects.

    *refresh_targets* lists extra ORM objects to re-read from the database
    after the commit (the session itself is always refreshed first); the
    refreshed session is returned.
    """
    repository.bump_session_version(session)
    database = repository.db
    database.commit()
    for record in [session, *(refresh_targets or [])]:
        database.refresh(record)
    return session
# [/DEF:_commit_owned_session_mutation:Function]
# [DEF:_serialize_semantic_field:Function]
# @COMPLEXITY: 2
# @PURPOSE: Map one semantic field aggregate into stable field-level DTO output.
# @RELATION: [DEPENDS_ON] ->[SemanticFieldEntryDto]
def _serialize_semantic_field(field: SemanticFieldEntry) -> SemanticFieldEntryDto:
return SemanticFieldEntryDto.model_validate(field, from_attributes=True)
payload = SemanticFieldEntryDto.model_validate(field, from_attributes=True)
session_ref = getattr(field, "session", None)
version_value = getattr(session_ref, "version", None)
payload.session_version = (
int(version_value or 0) if version_value is not None else None
)
return payload
# [/DEF:_serialize_semantic_field:Function]
@@ -497,6 +587,20 @@ def _serialize_clarification_state(
# [/DEF:_serialize_clarification_state:Function]
# [DEF:_serialize_empty_clarification_state:Function]
# @COMPLEXITY: 2
# @PURPOSE: Return a stable empty clarification payload for sessions that have not started clarification yet.
# @RELATION: [DEPENDS_ON] ->[ClarificationStateResponse]
def _serialize_empty_clarification_state() -> ClarificationStateResponse:
    """Return the stable empty payload for sessions without clarification."""
    empty_state = ClarificationStateResponse(
        clarification_session=None, current_question=None
    )
    return empty_state
# [/DEF:_serialize_empty_clarification_state:Function]
# [DEF:_get_owned_session_or_404:Function]
# @COMPLEXITY: 4
# @PURPOSE: Resolve one session for current user or collaborator scope, returning 404 when inaccessible.
@@ -766,18 +870,51 @@ def _update_semantic_field_state(
# @PURPOSE: Map one persisted execution mapping into stable API DTO output.
# @RELATION: [DEPENDS_ON] ->[ExecutionMappingDto]
def _serialize_execution_mapping(mapping: ExecutionMapping) -> ExecutionMappingDto:
return ExecutionMappingDto.model_validate(mapping, from_attributes=True)
payload = ExecutionMappingDto.model_validate(mapping, from_attributes=True)
session_ref = getattr(mapping, "session", None)
version_value = getattr(session_ref, "version", None)
payload.session_version = (
int(version_value or 0) if version_value is not None else None
)
return payload
# [/DEF:_serialize_execution_mapping:Function]
# [DEF:_serialize_preview:Function]
# @COMPLEXITY: 2
# @PURPOSE: Map one persisted preview snapshot into stable API DTO output and surface the refreshed session version for follow-up optimistic-lock mutations.
# @RELATION: [DEPENDS_ON] ->[CompiledPreviewDto]
def _serialize_preview(
preview: CompiledPreview, *, session_version_fallback: Optional[int] = None
) -> CompiledPreviewDto:
payload = CompiledPreviewDto.model_validate(preview, from_attributes=True)
session_ref = getattr(preview, "session", None)
version_value = getattr(session_ref, "version", None)
if version_value is None:
version_value = session_version_fallback
payload.session_version = (
int(version_value or 0) if version_value is not None else None
)
return payload
# [/DEF:_serialize_preview:Function]
# [DEF:_serialize_run_context:Function]
# @COMPLEXITY: 2
# @PURPOSE: Map one persisted launch run context into stable API DTO output for SQL Lab handoff confirmation.
# @RELATION: [DEPENDS_ON] ->[DatasetRunContextDto]
def _serialize_run_context(run_context) -> DatasetRunContextDto:
return DatasetRunContextDto.model_validate(run_context, from_attributes=True)
payload = DatasetRunContextDto.model_validate(run_context, from_attributes=True)
session_ref = getattr(run_context, "session", None)
version_value = getattr(session_ref, "version", None)
payload.session_version = (
int(version_value or 0) if version_value is not None else None
)
return payload
# [/DEF:_serialize_run_context:Function]
@@ -1051,23 +1188,27 @@ async def get_session_detail(
async def update_session(
session_id: str,
request: UpdateSessionRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.update_session"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
session_record = cast(Any, session)
session.status = request.status
session_record.status = request.status
if request.status == SessionStatus.PAUSED:
session.recommended_action = RecommendedAction.RESUME_SESSION
session_record.recommended_action = RecommendedAction.RESUME_SESSION
elif request.status in {
SessionStatus.ARCHIVED,
SessionStatus.CANCELLED,
SessionStatus.COMPLETED,
}:
session.active_task_id = None
session_record.active_task_id = None
repository.bump_session_version(session)
repository.db.commit()
repository.db.refresh(session)
_record_session_event(
@@ -1076,7 +1217,10 @@ async def update_session(
current_user,
event_type="session_status_updated",
event_summary="Dataset review session lifecycle updated",
event_details={"status": session.status.value},
event_details={
"status": session_record.status.value,
"version": session_record.version,
},
)
return _serialize_session_summary(session)
@@ -1103,12 +1247,14 @@ async def update_session(
async def delete_session(
session_id: str,
hard_delete: bool = Query(False),
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.delete_session"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
if hard_delete:
_record_session_event(
@@ -1123,17 +1269,20 @@ async def delete_session(
repository.db.commit()
return Response(status_code=status.HTTP_204_NO_CONTENT)
session.status = SessionStatus.ARCHIVED
session.active_task_id = None
repository.db.commit()
repository.db.refresh(session)
session_record = cast(Any, session)
session_record.status = SessionStatus.ARCHIVED
session_record.active_task_id = None
_commit_owned_session_mutation(repository, session)
_record_session_event(
repository,
session,
current_user,
event_type="session_archived",
event_summary="Dataset review session archived",
event_details={"hard_delete": False},
event_details={
"hard_delete": False,
"version": session_record.version,
},
)
return Response(status_code=status.HTTP_204_NO_CONTENT)
@@ -1231,10 +1380,10 @@ async def export_validation(
# [DEF:get_clarification_state:Function]
# @COMPLEXITY: 4
# @PURPOSE: Return the current clarification session summary and one active question payload.
# @PURPOSE: Return the current clarification session summary and one active question payload, or an empty state when clarification has not started.
# @RELATION: [CALLS] ->[build_question_payload:Function]
# @PRE: Session is accessible to current user and clarification feature is enabled.
# @POST: Returns at most one active clarification question with why_it_matters, current_guess, and ordered options.
# @POST: Returns at most one active clarification question with why_it_matters, current_guess, and ordered options; sessions without a clarification record return a non-blocking empty state.
# @SIDE_EFFECT: May normalize clarification pointer and readiness state in persistence.
# @DATA_CONTRACT: Input[session_id:str] -> Output[ClarificationStateResponse]
@router.get(
@@ -1254,6 +1403,9 @@ async def get_clarification_state(
):
with belief_scope("dataset_review.get_clarification_state"):
session = _get_owned_session_or_404(repository, session_id, current_user)
if not session.clarification_sessions:
return _serialize_empty_clarification_state()
clarification_session = _get_latest_clarification_session_or_404(session)
current_question = clarification_engine.build_question_payload(session)
return _serialize_clarification_state(
@@ -1288,13 +1440,15 @@ async def get_clarification_state(
)
async def resume_clarification(
session_id: str,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
clarification_engine: ClarificationEngine = Depends(_get_clarification_engine),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.resume_clarification"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
clarification_session = _get_latest_clarification_session_or_404(session)
current_question = clarification_engine.build_question_payload(session)
return _serialize_clarification_state(
@@ -1330,13 +1484,15 @@ async def resume_clarification(
async def record_clarification_answer(
session_id: str,
request: ClarificationAnswerRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
clarification_engine: ClarificationEngine = Depends(_get_clarification_engine),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.record_clarification_answer"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
try:
result = clarification_engine.record_answer(
ClarificationAnswerCommand(
@@ -1385,16 +1541,18 @@ async def update_field_semantic(
session_id: str,
field_id: str,
request: FieldSemanticUpdateRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.update_field_semantic"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
field = _get_owned_field_or_404(session, field_id)
_update_semantic_field_state(field, request, changed_by="user")
repository.db.commit()
repository.db.refresh(field)
session_record = cast(Any, session)
_commit_owned_session_mutation(repository, session, refresh_targets=[field])
_record_session_event(
repository,
session,
@@ -1407,6 +1565,7 @@ async def update_field_semantic(
"is_locked": field.is_locked,
"source_id": field.source_id,
"source_version": field.source_version,
"version": session_record.version,
},
)
return _serialize_semantic_field(field)
@@ -1434,24 +1593,29 @@ async def update_field_semantic(
async def lock_field_semantic(
session_id: str,
field_id: str,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.lock_field_semantic"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
field = _get_owned_field_or_404(session, field_id)
field.is_locked = True
field.last_changed_by = "user"
repository.db.commit()
repository.db.refresh(field)
session_record = cast(Any, session)
_commit_owned_session_mutation(repository, session, refresh_targets=[field])
_record_session_event(
repository,
session,
current_user,
event_type="semantic_field_locked",
event_summary="Semantic field lock persisted",
event_details={"field_id": field.field_id},
event_details={
"field_id": field.field_id,
"version": session_record.version,
},
)
return _serialize_semantic_field(field)
@@ -1478,27 +1642,32 @@ async def lock_field_semantic(
async def unlock_field_semantic(
session_id: str,
field_id: str,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.unlock_field_semantic"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
field = _get_owned_field_or_404(session, field_id)
field.is_locked = False
field.last_changed_by = "user"
if field.provenance == FieldProvenance.MANUAL_OVERRIDE:
field.provenance = FieldProvenance.UNRESOLVED
field.needs_review = True
repository.db.commit()
repository.db.refresh(field)
session_record = cast(Any, session)
_commit_owned_session_mutation(repository, session, refresh_targets=[field])
_record_session_event(
repository,
session,
current_user,
event_type="semantic_field_unlocked",
event_summary="Semantic field unlock persisted",
event_details={"field_id": field.field_id},
event_details={
"field_id": field.field_id,
"version": session_record.version,
},
)
return _serialize_semantic_field(field)
@@ -1525,12 +1694,14 @@ async def unlock_field_semantic(
async def approve_batch_semantic_fields(
session_id: str,
request: BatchApproveSemanticRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.approve_batch_semantic_fields"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
updated_fields: List[SemanticFieldEntry] = []
for item in request.items:
@@ -1544,9 +1715,10 @@ async def approve_batch_semantic_fields(
)
updated_fields.append(updated_field)
repository.db.commit()
for field in updated_fields:
repository.db.refresh(field)
session_record = cast(Any, session)
_commit_owned_session_mutation(
repository, session, refresh_targets=list(updated_fields)
)
_record_session_event(
repository,
session,
@@ -1556,6 +1728,7 @@ async def approve_batch_semantic_fields(
event_details={
"field_ids": [field.field_id for field in updated_fields],
"count": len(updated_fields),
"version": session_record.version,
},
)
return [_serialize_semantic_field(field) for field in updated_fields]
@@ -1620,12 +1793,14 @@ async def update_execution_mapping(
session_id: str,
mapping_id: str,
request: UpdateExecutionMappingRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.update_execution_mapping"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
mapping = _get_owned_mapping_or_404(session, mapping_id)
if request.effective_value is None:
@@ -1657,8 +1832,8 @@ async def update_execution_mapping(
if preview.preview_status == PreviewStatus.READY:
preview.preview_status = PreviewStatus.STALE
repository.db.commit()
repository.db.refresh(mapping)
session_record = cast(Any, session)
_commit_owned_session_mutation(repository, session, refresh_targets=[mapping])
_record_session_event(
repository,
session,
@@ -1669,6 +1844,7 @@ async def update_execution_mapping(
"mapping_id": mapping.mapping_id,
"approval_state": mapping.approval_state.value,
"preview_state": "stale",
"version": session_record.version,
},
)
return _serialize_execution_mapping(mapping)
@@ -1698,12 +1874,14 @@ async def approve_execution_mapping(
session_id: str,
mapping_id: str,
request: ApproveMappingRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.approve_execution_mapping"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
mapping = _get_owned_mapping_or_404(session, mapping_id)
mapping.approval_state = ApprovalState.APPROVED
mapping.approved_by_user_id = current_user.id
@@ -1713,8 +1891,8 @@ async def approve_execution_mapping(
session.last_activity_at = datetime.utcnow()
if session.readiness_state == ReadinessState.MAPPING_REVIEW_NEEDED:
session.recommended_action = RecommendedAction.GENERATE_SQL_PREVIEW
repository.db.commit()
repository.db.refresh(mapping)
session_record = cast(Any, session)
_commit_owned_session_mutation(repository, session, refresh_targets=[mapping])
_record_session_event(
repository,
session,
@@ -1724,6 +1902,7 @@ async def approve_execution_mapping(
event_details={
"mapping_id": mapping.mapping_id,
"approval_state": mapping.approval_state.value,
"version": session_record.version,
},
)
return _serialize_execution_mapping(mapping)
@@ -1752,12 +1931,14 @@ async def approve_execution_mapping(
async def approve_batch_execution_mappings(
session_id: str,
request: BatchApproveMappingRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.approve_batch_execution_mappings"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
updated_mappings: List[ExecutionMapping] = []
for mapping_id in list(dict.fromkeys(request.mapping_ids)):
@@ -1773,9 +1954,10 @@ async def approve_batch_execution_mappings(
if session.readiness_state == ReadinessState.MAPPING_REVIEW_NEEDED:
session.recommended_action = RecommendedAction.GENERATE_SQL_PREVIEW
repository.db.commit()
for mapping in updated_mappings:
repository.db.refresh(mapping)
session_record = cast(Any, session)
_commit_owned_session_mutation(
repository, session, refresh_targets=list(updated_mappings)
)
_record_session_event(
repository,
session,
@@ -1785,6 +1967,7 @@ async def approve_batch_execution_mappings(
event_details={
"mapping_ids": [mapping.mapping_id for mapping in updated_mappings],
"count": len(updated_mappings),
"version": session_record.version,
},
)
return [_serialize_execution_mapping(mapping) for mapping in updated_mappings]
@@ -1814,14 +1997,20 @@ async def trigger_preview_generation(
session_id: str,
response: Response,
orchestrator: DatasetReviewOrchestrator = Depends(_get_orchestrator),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
session_version: int = Depends(_require_session_version_header),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.trigger_preview_generation"):
_prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
try:
result = orchestrator.prepare_launch_preview(
PreparePreviewCommand(
user=current_user,
session_id=session_id,
expected_version=session_version,
)
)
except ValueError as exc:
@@ -1839,12 +2028,16 @@ async def trigger_preview_generation(
response.status_code = status.HTTP_202_ACCEPTED
return PreviewEnqueueResultResponse(
session_id=result.session.session_id,
session_version=int(getattr(result.session, "version", 0) or 0),
preview_status=result.preview.preview_status.value,
task_id=None,
)
response.status_code = status.HTTP_200_OK
return CompiledPreviewDto.model_validate(result.preview, from_attributes=True)
return _serialize_preview(
result.preview,
session_version_fallback=int(getattr(result.session, "version", 0) or 0),
)
# [/DEF:trigger_preview_generation:Function]
@@ -1871,15 +2064,21 @@ async def trigger_preview_generation(
async def launch_dataset(
session_id: str,
orchestrator: DatasetReviewOrchestrator = Depends(_get_orchestrator),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
session_version: int = Depends(_require_session_version_header),
config_manager=Depends(get_config_manager),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.launch_dataset"):
_prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
try:
result = orchestrator.launch_dataset(
LaunchDatasetCommand(
user=current_user,
session_id=session_id,
expected_version=session_version,
)
)
except ValueError as exc:
@@ -1929,22 +2128,29 @@ async def record_field_feedback(
session_id: str,
field_id: str,
request: FeedbackRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.record_field_feedback"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
field = _get_owned_field_or_404(session, field_id)
field.user_feedback = request.feedback
repository.db.commit()
session_record = cast(Any, session)
_commit_owned_session_mutation(repository, session)
_record_session_event(
repository,
session,
current_user,
event_type="semantic_field_feedback_recorded",
event_summary="Semantic field feedback persisted",
event_details={"field_id": field.field_id, "feedback": request.feedback},
event_details={
"field_id": field.field_id,
"feedback": request.feedback,
"version": session_record.version,
},
)
return FeedbackResponse(target_id=field.field_id, feedback=request.feedback)
@@ -1973,12 +2179,14 @@ async def record_clarification_feedback(
session_id: str,
question_id: str,
request: FeedbackRequest,
session_version: int = Depends(_require_session_version_header),
repository: DatasetReviewSessionRepository = Depends(_get_repository),
current_user: User = Depends(get_current_user),
):
with belief_scope("dataset_review.record_clarification_feedback"):
session = _get_owned_session_or_404(repository, session_id, current_user)
_require_owner_mutation_scope(session, current_user)
session = _prepare_owned_session_mutation(
repository, session_id, current_user, session_version
)
clarification_session = _get_latest_clarification_session_or_404(session)
question = next(
(
@@ -1999,7 +2207,8 @@ async def record_clarification_feedback(
detail="Clarification answer not found",
)
question.answer.user_feedback = request.feedback
repository.db.commit()
session_record = cast(Any, session)
_commit_owned_session_mutation(repository, session)
_record_session_event(
repository,
session,
@@ -2009,6 +2218,7 @@ async def record_clarification_feedback(
event_details={
"question_id": question.question_id,
"feedback": request.feedback,
"version": session_record.version,
},
)
return FeedbackResponse(

View File

@@ -81,16 +81,49 @@ async def get_llm_status(
providers = service.get_all_providers()
active_provider = next((p for p in providers if p.is_active), None)
if not providers:
return {
"configured": False,
"reason": "no_providers_configured",
"provider_count": 0,
"active_provider_count": 0,
}
if not active_provider:
return {"configured": False, "reason": "no_active_provider"}
return {
"configured": False,
"reason": "no_active_provider",
"provider_count": len(providers),
"active_provider_count": 0,
"providers": [
{
"id": provider.id,
"name": provider.name,
"provider_type": provider.provider_type,
"is_active": bool(provider.is_active),
}
for provider in providers
],
}
api_key = service.get_decrypted_api_key(active_provider.id)
if not _is_valid_runtime_api_key(api_key):
return {"configured": False, "reason": "invalid_api_key"}
return {
"configured": False,
"reason": "invalid_api_key",
"provider_count": len(providers),
"active_provider_count": len([provider for provider in providers if provider.is_active]),
"provider_id": active_provider.id,
"provider_name": active_provider.name,
"provider_type": active_provider.provider_type,
"default_model": active_provider.default_model,
}
return {
"configured": True,
"reason": "ok",
"provider_count": len(providers),
"active_provider_count": len([provider for provider in providers if provider.is_active]),
"provider_id": active_provider.id,
"provider_name": active_provider.name,
"provider_type": active_provider.provider_type,

View File

@@ -18,6 +18,7 @@ from src.core.config_models import Environment
from src.core.utils.superset_context_extractor import (
SupersetContextExtractor,
SupersetParsedContext,
sanitize_imported_filter_for_assistant,
)
from src.models.filter_state import (
FilterState,
@@ -37,6 +38,8 @@ def _make_environment() -> Environment:
username="demo",
password="secret",
)
# [/DEF:_make_environment:Function]
@@ -60,9 +63,7 @@ def test_extract_native_filters_from_permalink():
"ownState": {},
},
"filter_date": {
"extraFormData": {
"time_range": "2020-01-01 : 2024-12-31"
},
"extraFormData": {"time_range": "2020-01-01 : 2024-12-31"},
"filterState": {"value": "2020-01-01 : 2024-12-31"},
"ownState": {},
},
@@ -81,9 +82,13 @@ def test_extract_native_filters_from_permalink():
assert "dataMask" in result
assert "filter_country" in result["dataMask"]
assert "filter_date" in result["dataMask"]
assert result["dataMask"]["filter_country"]["extraFormData"]["filters"][0]["val"] == ["DE", "FR"]
assert result["dataMask"]["filter_country"]["extraFormData"]["filters"][0][
"val"
] == ["DE", "FR"]
assert result["activeTabs"] == ["tab1", "tab2"]
assert result["anchor"] == "SECTION1"
# [/DEF:test_extract_native_filters_from_permalink]
@@ -110,6 +115,8 @@ def test_extract_native_filters_from_permalink_direct_response():
assert result["permalink_key"] == "direct-key"
assert "filter_1" in result["dataMask"]
# [/DEF:test_extract_native_filters_from_permalink_direct_response]
@@ -121,15 +128,19 @@ def test_extract_native_filters_from_key():
client.get_native_filter_state = MagicMock(
return_value={
"result": {
"value": json.dumps({
"filter_region": {
"id": "filter_region",
"extraFormData": {
"filters": [{"col": "region", "op": "IN", "val": ["EMEA"]}]
},
"filterState": {"value": ["EMEA"]},
"value": json.dumps(
{
"filter_region": {
"id": "filter_region",
"extraFormData": {
"filters": [
{"col": "region", "op": "IN", "val": ["EMEA"]}
]
},
"filterState": {"value": ["EMEA"]},
}
}
})
)
}
}
)
@@ -140,7 +151,11 @@ def test_extract_native_filters_from_key():
assert result["filter_state_key"] == "filter-state-key"
assert "dataMask" in result
assert "filter_region" in result["dataMask"]
assert result["dataMask"]["filter_region"]["extraFormData"]["filters"][0]["val"] == ["EMEA"]
assert result["dataMask"]["filter_region"]["extraFormData"]["filters"][0][
"val"
] == ["EMEA"]
# [/DEF:test_extract_native_filters_from_key]
@@ -152,11 +167,15 @@ def test_extract_native_filters_from_key_single_filter():
client.get_native_filter_state = MagicMock(
return_value={
"result": {
"value": json.dumps({
"id": "single_filter",
"extraFormData": {"filters": [{"col": "status", "op": "==", "val": "active"}]},
"filterState": {"value": "active"},
})
"value": json.dumps(
{
"id": "single_filter",
"extraFormData": {
"filters": [{"col": "status", "op": "==", "val": "active"}]
},
"filterState": {"value": "active"},
}
)
}
}
)
@@ -165,7 +184,12 @@ def test_extract_native_filters_from_key_single_filter():
assert "dataMask" in result
assert "single_filter" in result["dataMask"]
assert result["dataMask"]["single_filter"]["extraFormData"]["filters"][0]["col"] == "status"
assert (
result["dataMask"]["single_filter"]["extraFormData"]["filters"][0]["col"]
== "status"
)
# [/DEF:test_extract_native_filters_from_key_single_filter]
@@ -191,6 +215,8 @@ def test_extract_native_filters_from_key_dict_value():
assert "dataMask" in result
assert "filter_id" in result["dataMask"]
# [/DEF:test_extract_native_filters_from_key_dict_value]
@@ -209,6 +235,8 @@ def test_parse_dashboard_url_for_filters_permalink():
assert result["filter_type"] == "permalink"
assert result["filters"]["dataMask"]["f1"] == {}
# [/DEF:test_parse_dashboard_url_for_filters_permalink]
@@ -218,7 +246,11 @@ def test_parse_dashboard_url_for_filters_permalink():
def test_parse_dashboard_url_for_filters_native_key():
client = SupersetClient(_make_environment())
client.extract_native_filters_from_key = MagicMock(
return_value={"dataMask": {"f2": {}}, "dashboard_id": 42, "filter_state_key": "xyz"}
return_value={
"dataMask": {"f2": {}},
"dashboard_id": 42,
"filter_state_key": "xyz",
}
)
result = client.parse_dashboard_url_for_filters(
@@ -228,6 +260,8 @@ def test_parse_dashboard_url_for_filters_native_key():
assert result["filter_type"] == "native_filters_key"
assert result["dashboard_id"] == 42
assert result["filters"]["dataMask"]["f2"] == {}
# [/DEF:test_parse_dashboard_url_for_filters_native_key]
@@ -243,7 +277,11 @@ def test_parse_dashboard_url_for_filters_native_key_slug():
}
)
client.extract_native_filters_from_key = MagicMock(
return_value={"dataMask": {"f_slug": {}}, "dashboard_id": 99, "filter_state_key": "abc123"}
return_value={
"dataMask": {"f_slug": {}},
"dashboard_id": 99,
"filter_state_key": "abc123",
}
)
result = client.parse_dashboard_url_for_filters(
@@ -255,6 +293,8 @@ def test_parse_dashboard_url_for_filters_native_key_slug():
assert result["filters"]["dataMask"]["f_slug"] == {}
client.get_dashboard.assert_called_once_with("covid")
client.extract_native_filters_from_key.assert_called_once_with(99, "abc123")
# [/DEF:test_parse_dashboard_url_for_filters_native_key_slug]
@@ -271,6 +311,8 @@ def test_parse_dashboard_url_for_filters_native_key_slug_resolution_fails():
assert result["filter_type"] is None
assert result["dashboard_id"] is None
# [/DEF:test_parse_dashboard_url_for_filters_native_key_slug_resolution_fails]
@@ -287,6 +329,8 @@ def test_parse_dashboard_url_for_filters_native_filters_direct():
assert result["filter_type"] == "native_filters"
assert "dataMask" in result["filters"]
# [/DEF:test_parse_dashboard_url_for_filters_native_filters_direct]
@@ -302,6 +346,8 @@ def test_parse_dashboard_url_for_filters_no_filters():
assert result["filter_type"] is None
assert result["filters"] == {}
# [/DEF:test_parse_dashboard_url_for_filters_no_filters]
@@ -338,6 +384,8 @@ def test_extra_form_data_merge():
# New columns should be added
assert result["columns"] == ["col1", "col2"]
# [/DEF:test_extra_form_data_merge]
@@ -354,6 +402,8 @@ def test_filter_state_model():
assert state.extraFormData["filters"][0]["col"] == "x"
assert state.filterState["value"] == "y"
assert state.ownState["selectedValues"] == ["y"]
# [/DEF:test_filter_state_model]
@@ -371,6 +421,8 @@ def test_parsed_native_filters_model():
assert filters.has_filters() is True
assert filters.get_filter_count() == 1
assert filters.filter_type == "permalink"
# [/DEF:test_parsed_native_filters_model]
@@ -382,6 +434,8 @@ def test_parsed_native_filters_empty():
assert filters.has_filters() is False
assert filters.get_filter_count() == 0
# [/DEF:test_parsed_native_filters_empty]
@@ -392,13 +446,17 @@ def test_native_filter_data_mask_model():
data_mask = NativeFilterDataMask(
filters={
"filter_1": FilterState(extraFormData={"filters": []}, filterState={}),
"filter_2": FilterState(extraFormData={"time_range": "..."}, filterState={}),
"filter_2": FilterState(
extraFormData={"time_range": "..."}, filterState={}
),
}
)
assert data_mask.get_filter_ids() == ["filter_1", "filter_2"]
assert data_mask.get_extra_form_data("filter_1") == {"filters": []}
assert data_mask.get_extra_form_data("nonexistent") == {}
# [/DEF:test_native_filter_data_mask_model]
@@ -434,8 +492,12 @@ def test_recover_imported_filters_reconciles_raw_native_filter_ids_to_metadata_n
"display_name": "NATIVE_FILTER-EWNH3M70z",
"raw_value": ["DE", "FR"],
"normalized_value": {
"filter_clauses": [{"col": "country", "op": "IN", "val": ["DE", "FR"]}],
"extra_form_data": {"filters": [{"col": "country", "op": "IN", "val": ["DE", "FR"]}]},
"filter_clauses": [
{"col": "country", "op": "IN", "val": ["DE", "FR"]}
],
"extra_form_data": {
"filters": [{"col": "country", "op": "IN", "val": ["DE", "FR"]}]
},
"value_origin": "filter_state",
},
"source": "superset_native_filters_key",
@@ -455,9 +517,13 @@ def test_recover_imported_filters_reconciles_raw_native_filter_ids_to_metadata_n
assert result[0]["source"] == "superset_native_filters_key"
assert result[0]["normalized_value"] == {
"filter_clauses": [{"col": "country", "op": "IN", "val": ["DE", "FR"]}],
"extra_form_data": {"filters": [{"col": "country", "op": "IN", "val": ["DE", "FR"]}]},
"extra_form_data": {
"filters": [{"col": "country", "op": "IN", "val": ["DE", "FR"]}]
},
"value_origin": "filter_state",
}
# [/DEF:test_recover_imported_filters_reconciles_raw_native_filter_ids_to_metadata_names:Function]
@@ -514,6 +580,8 @@ def test_recover_imported_filters_collapses_state_and_metadata_duplicates_into_o
assert country_filter["recovery_status"] == "recovered"
assert region_filter["raw_value"] is None
assert region_filter["recovery_status"] == "partial"
# [/DEF:test_recover_imported_filters_collapses_state_and_metadata_duplicates_into_one_canonical_filter:Function]
@@ -559,13 +627,18 @@ def test_recover_imported_filters_preserves_unmatched_raw_native_filter_ids():
result = extractor.recover_imported_filters(parsed_context)
assert len(result) == 2
assert any(item["filter_name"] == "Country" and item["recovery_status"] == "partial" for item in result)
assert any(
item["filter_name"] == "Country" and item["recovery_status"] == "partial"
for item in result
)
assert any(
item["filter_name"] == "UNKNOWN_NATIVE_FILTER"
and item["raw_value"] == ["orphan"]
and item["source"] == "superset_native_filters_key"
for item in result
)
# [/DEF:test_recover_imported_filters_preserves_unmatched_raw_native_filter_ids:Function]
@@ -609,7 +682,30 @@ def test_extract_imported_filters_preserves_clause_level_native_filter_payload_f
"notes": "Recovered from Superset native_filters_key state",
}
]
# [/DEF:test_extract_imported_filters_preserves_clause_level_native_filter_payload_for_preview:Function]
# [/DEF:NativeFilterExtractionTests:Module]
# [DEF:test_sanitize_imported_filter_for_assistant_masks_sensitive_raw_values:Function]
# @RELATION: BINDS_TO -> NativeFilterExtractionTests
# @PURPOSE: Assistant-facing imported-filter payload should redact likely PII while preserving structure metadata.
def test_sanitize_imported_filter_for_assistant_masks_sensitive_raw_values():
    payload = {
        "filter_name": "customer_email",
        "raw_value": ["alice@example.com", "1234567890"],
        "normalized_value": {"filter_clauses": []},
        "source": "manual",
    }
    sanitized = sanitize_imported_filter_for_assistant(payload)
    # Email local part and all but the trailing digits must be redacted.
    assert sanitized["raw_value"] == ["***@example.com", "********90"]
    assert sanitized["raw_value_masked"] is True
    # Structural metadata passes through untouched.
    assert sanitized["normalized_value"] == {"filter_clauses": []}
# [/DEF:test_sanitize_imported_filter_for_assistant_masks_sensitive_raw_values:Function]
# [/DEF:NativeFilterExtractionTests:Module]

View File

@@ -1,5 +1,4 @@
# [DEF:AuthRepositoryModule:Module]
# @TIER: CRITICAL
# @COMPLEXITY: 5
# @SEMANTICS: auth, repository, database, user, role, permission
# @PURPOSE: Data access layer for authentication and user preference entities.

View File

@@ -467,6 +467,90 @@ def _ensure_filter_source_enum_values(bind_engine):
# [/DEF:_ensure_filter_source_enum_values:Function]
# [DEF:_ensure_dataset_review_session_columns:Function]
# @COMPLEXITY: 4
# @PURPOSE: Apply additive schema upgrades for dataset review persistence required by optimistic-lock and recovery metadata semantics.
# @PRE: bind_engine points to the application database where dataset review tables are stored.
# @POST: Missing additive columns across legacy dataset review tables are created without removing existing data.
# @SIDE_EFFECT: Executes ALTER TABLE statements against dataset review tables in the application database.
# @RELATION: [DEPENDS_ON] ->[DatasetReviewSession]
# @RELATION: [DEPENDS_ON] ->[ImportedFilter]
def _ensure_dataset_review_session_columns(bind_engine):
    with belief_scope("_ensure_dataset_review_session_columns"):
        schema_inspector = inspect(bind_engine)
        known_tables = set(schema_inspector.get_table_names())
        # Table -> [(column_name, ALTER statement)]; a statement runs only
        # when its column is absent, keeping the migration idempotent.
        migration_plan = {
            "dataset_review_sessions": [
                (
                    "version",
                    "ALTER TABLE dataset_review_sessions "
                    "ADD COLUMN version INTEGER NOT NULL DEFAULT 0",
                )
            ],
            "imported_filters": [
                (
                    "raw_value_masked",
                    "ALTER TABLE imported_filters "
                    "ADD COLUMN raw_value_masked BOOLEAN NOT NULL DEFAULT FALSE",
                )
            ],
        }
        for table_name, planned_columns in migration_plan.items():
            if table_name not in known_tables:
                # Fresh installs create the full schema elsewhere; nothing to patch.
                logger.reason(
                    "Dataset review table does not exist yet; skipping additive schema migration",
                    extra={"table": table_name},
                )
                continue
            present_columns = {
                str(column.get("name") or "").strip()
                for column in schema_inspector.get_columns(table_name)
            }
            pending_statements = [
                ddl
                for column_name, ddl in planned_columns
                if column_name not in present_columns
            ]
            if not pending_statements:
                logger.reason(
                    "Dataset review table schema already up to date",
                    extra={"table": table_name, "columns": sorted(present_columns)},
                )
                continue
            logger.reason(
                "Applying additive dataset review schema migration",
                extra={"table": table_name, "statements": pending_statements},
            )
            try:
                # One transaction per table so a failure leaves other tables intact.
                with bind_engine.begin() as connection:
                    for ddl in pending_statements:
                        connection.execute(text(ddl))
                logger.reflect(
                    "Dataset review additive schema migration completed",
                    extra={
                        "table": table_name,
                        "added_columns": [
                            ddl.split(" ADD COLUMN ", 1)[1].split()[0]
                            for ddl in pending_statements
                        ],
                    },
                )
            except Exception as migration_error:
                logger.explore(
                    "Dataset review additive schema migration failed",
                    extra={"table": table_name, "error": str(migration_error)},
                )
                raise
# [/DEF:_ensure_dataset_review_session_columns:Function]
# [DEF:init_db:Function]
# @COMPLEXITY: 3
# @PURPOSE: Initializes the database by creating all tables.
@@ -475,6 +559,7 @@ def _ensure_filter_source_enum_values(bind_engine):
# @SIDE_EFFECT: Creates physical database files if they don't exist.
# @RELATION: [CALLS] ->[ensure_connection_configs_table]
# @RELATION: [CALLS] ->[_ensure_filter_source_enum_values]
# @RELATION: [CALLS] ->[_ensure_dataset_review_session_columns]
def init_db():
with belief_scope("init_db"):
Base.metadata.create_all(bind=engine)
@@ -487,6 +572,7 @@ def init_db():
_ensure_auth_users_columns(auth_engine)
ensure_connection_configs_table(engine)
_ensure_filter_source_enum_values(engine)
_ensure_dataset_review_session_columns(engine)
# [/DEF:init_db:Function]

View File

@@ -1,5 +1,4 @@
# [DEF:TaskManager:Module]
# @TIER: CRITICAL
# @COMPLEXITY: 5
# @SEMANTICS: task, manager, lifecycle, execution, state
# @PURPOSE: Manages the lifecycle of tasks, including their creation, execution, and state tracking. It uses a thread pool to run plugins asynchronously.
@@ -41,7 +40,6 @@ from ..logger import logger, belief_scope, should_log_task_level
# [DEF:TaskManager:Class]
# @TIER: CRITICAL
# @COMPLEXITY: 5
# @SEMANTICS: task, manager, lifecycle, execution, state
# @PURPOSE: Manages the lifecycle of tasks, including their creation, execution, and state tracking.

View File

@@ -28,6 +28,106 @@ from src.core.superset_client import SupersetClient
logger = cast(Any, logger)
# Precompiled redaction patterns, applied in order by _mask_sensitive_text:
# emails first, then UUIDs, then long digit runs, then mixed identifiers.
_EMAIL_PATTERN = re.compile(
    r"(?P<local>[A-Za-z0-9._%+-]+)@(?P<domain>[A-Za-z0-9.-]+\.[A-Za-z]{2,})"
)
_UUID_PATTERN = re.compile(
    r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
)
# Six or more consecutive digits (account numbers, phone numbers, ids, ...).
_LONG_DIGIT_PATTERN = re.compile(r"\b\d{6,}\b")
# Tokens of 10+ identifier characters containing BOTH a letter and a digit.
# The letter/digit lookaheads are bounded to the token's own character class:
# the previous unbounded "(?=.*\d)" form scanned past the token, so a
# pure-alpha word was masked whenever any digit appeared later in the string.
_MIXED_IDENTIFIER_PATTERN = re.compile(
    r"\b(?=[A-Za-z0-9_-]{10,}\b)"
    r"(?=[A-Za-z0-9_-]*[A-Za-z])(?=[A-Za-z0-9_-]*\d)[A-Za-z0-9_-]+\b"
)


# [DEF:mask_pii_value:Function]
# @COMPLEXITY: 2
# @PURPOSE: Redact likely sensitive identifiers while preserving structural hints for assistant-facing dataset-review context.
def mask_pii_value(value: Any) -> tuple[Any, bool]:
    """Recursively mask likely PII inside ``value``.

    Strings are redacted via :func:`_mask_sensitive_text`; lists, tuples and
    dicts are walked recursively with their container type preserved. Any
    other type is returned unchanged.

    Returns:
        ``(masked_value, was_masked)`` where ``was_masked`` is True if any
        nested string was redacted.
    """
    if isinstance(value, str):
        return _mask_sensitive_text(value)
    if isinstance(value, (list, tuple)):
        # Tuples previously fell through unmasked; treat them like lists so
        # PII cannot leak through tuple-shaped payloads (type preserved).
        masked_any = False
        masked_items: List[Any] = []
        for item in value:
            masked_item, item_masked = mask_pii_value(item)
            masked_items.append(masked_item)
            masked_any = masked_any or item_masked
        if isinstance(value, tuple):
            return tuple(masked_items), masked_any
        return masked_items, masked_any
    if isinstance(value, dict):
        # Only values are masked; keys are assumed to be non-sensitive field
        # names — TODO confirm against imported-filter payload shapes.
        masked_any = False
        masked_dict: Dict[str, Any] = {}
        for key, item in value.items():
            masked_item, item_masked = mask_pii_value(item)
            masked_dict[key] = masked_item
            masked_any = masked_any or item_masked
        return masked_dict, masked_any
    return value, False
# [/DEF:mask_pii_value:Function]


# [DEF:sanitize_imported_filter_for_assistant:Function]
# @COMPLEXITY: 2
# @PURPOSE: Build assistant-safe imported-filter payload with raw-value redaction metadata when sensitive tokens are detected.
def sanitize_imported_filter_for_assistant(
    filter_payload: Dict[str, Any],
) -> Dict[str, Any]:
    """Return a deep copy of ``filter_payload`` safe for assistant exposure.

    The copy's ``raw_value`` is masked via :func:`mask_pii_value`, and
    ``raw_value_masked`` is set when either the input already flagged it or
    masking occurred here. The input payload is never mutated.
    """
    sanitized = deepcopy(filter_payload)
    masked_raw_value, was_masked = mask_pii_value(filter_payload.get("raw_value"))
    sanitized["raw_value"] = masked_raw_value
    # Sticky flag: never downgrade a payload that was already marked masked.
    sanitized["raw_value_masked"] = bool(
        filter_payload.get("raw_value_masked", False) or was_masked
    )
    return sanitized
# [/DEF:sanitize_imported_filter_for_assistant:Function]


# [DEF:_mask_sensitive_text:Function]
# @COMPLEXITY: 2
# @PURPOSE: Redact likely PII substrings from one string while preserving non-sensitive delimiters and token shape.
def _mask_sensitive_text(value: str) -> tuple[str, bool]:
    """Apply the module's redaction patterns to one string.

    Each replacement keeps enough structure to stay recognizable (email
    domain, UUID tail, last two digits, identifier edges). Returns the
    masked string and whether anything was redacted.
    """
    masked = value
    masked_any = False

    def _replace_email(match: re.Match[str]) -> str:
        nonlocal masked_any
        masked_any = True
        domain = match.group("domain")
        # Keep the domain as a structural hint; drop the local part.
        return f"***@{domain}"

    def _replace_uuid(match: re.Match[str]) -> str:
        nonlocal masked_any
        masked_any = True
        token = match.group(0)
        # Preserve the final 12-hex-digit group so logs stay correlatable.
        return "********-****-****-****-" + token[-12:]

    def _replace_long_digits(match: re.Match[str]) -> str:
        nonlocal masked_any
        masked_any = True
        token = match.group(0)
        return "*" * max(len(token) - 2, 1) + token[-2:]

    def _replace_mixed_identifier(match: re.Match[str]) -> str:
        nonlocal masked_any
        token = match.group(0)
        # Defensive guard for short all-caps acronyms; unreachable today
        # because the pattern already requires 10+ characters.
        if token.isupper() and len(token) <= 5:
            return token
        masked_any = True
        return token[:2] + "***" + token[-2:]

    masked = _EMAIL_PATTERN.sub(_replace_email, masked)
    masked = _UUID_PATTERN.sub(_replace_uuid, masked)
    masked = _LONG_DIGIT_PATTERN.sub(_replace_long_digits, masked)
    masked = _MIXED_IDENTIFIER_PATTERN.sub(_replace_mixed_identifier, masked)
    return masked, masked_any
# [/DEF:_mask_sensitive_text:Function]
# [DEF:SupersetParsedContext:Class]
# @COMPLEXITY: 2
@@ -44,6 +144,7 @@ class SupersetParsedContext:
imported_filters: List[Dict[str, Any]] = field(default_factory=list)
unresolved_references: List[str] = field(default_factory=list)
partial_recovery: bool = False
dataset_payload: Optional[Dict[str, Any]] = None
# [/DEF:SupersetParsedContext:Class]
@@ -313,11 +414,12 @@ class SupersetContextExtractor:
)
raise ValueError("Unsupported Superset link shape")
dataset_payload: Optional[Dict[str, Any]] = None
if dataset_id is not None:
try:
dataset_detail = self.client.get_dataset_detail(dataset_id)
table_name = str(dataset_detail.get("table_name") or "").strip()
schema_name = str(dataset_detail.get("schema") or "").strip()
dataset_payload = self.client.get_dataset_detail(dataset_id)
table_name = str(dataset_payload.get("table_name") or "").strip()
schema_name = str(dataset_payload.get("schema") or "").strip()
if table_name:
dataset_ref = (
f"{schema_name}.{table_name}" if schema_name else table_name
@@ -349,6 +451,7 @@ class SupersetContextExtractor:
imported_filters=imported_filters,
unresolved_references=unresolved_references,
partial_recovery=partial_recovery,
dataset_payload=dataset_payload,
)
logger.reflect(
"Superset link parsing completed",

View File

@@ -1,6 +1,4 @@
# [DEF:AuthModels:Module]
#
# @TIER: STANDARD
# @COMPLEXITY: 3
# @SEMANTICS: auth, models, user, role, permission, sqlalchemy
# @PURPOSE: SQLAlchemy models for multi-user authentication and authorization.

View File

@@ -1,6 +1,4 @@
# [DEF:DatasetReviewModels:Module]
#
# @TIER: STANDARD
# @COMPLEXITY: 3
# @SEMANTICS: dataset_review, session, profile, findings, semantics, clarification, execution, sqlalchemy
# @PURPOSE: SQLAlchemy models for the dataset review orchestration flow.
@@ -15,11 +13,24 @@ import uuid
import enum
from datetime import datetime
from typing import List, Optional
from sqlalchemy import Column, String, Integer, Boolean, DateTime, ForeignKey, Text, JSON, Float, Enum as SQLEnum, Table
from sqlalchemy import (
Column,
String,
Integer,
Boolean,
DateTime,
ForeignKey,
Text,
JSON,
Float,
Enum as SQLEnum,
Table,
)
from sqlalchemy.orm import relationship
from .mapping import Base
# [/SECTION]
# [DEF:SessionStatus:Class]
class SessionStatus(str, enum.Enum):
ACTIVE = "active"
@@ -27,8 +38,11 @@ class SessionStatus(str, enum.Enum):
COMPLETED = "completed"
ARCHIVED = "archived"
CANCELLED = "cancelled"
# [/DEF:SessionStatus:Class]
# [DEF:SessionPhase:Class]
class SessionPhase(str, enum.Enum):
INTAKE = "intake"
@@ -40,8 +54,11 @@ class SessionPhase(str, enum.Enum):
PREVIEW = "preview"
LAUNCH = "launch"
POST_RUN = "post_run"
# [/DEF:SessionPhase:Class]
# [DEF:ReadinessState:Class]
class ReadinessState(str, enum.Enum):
EMPTY = "empty"
@@ -57,8 +74,11 @@ class ReadinessState(str, enum.Enum):
RUN_IN_PROGRESS = "run_in_progress"
COMPLETED = "completed"
RECOVERY_REQUIRED = "recovery_required"
# [/DEF:ReadinessState:Class]
# [DEF:RecommendedAction:Class]
class RecommendedAction(str, enum.Enum):
IMPORT_FROM_SUPERSET = "import_from_superset"
@@ -72,29 +92,40 @@ class RecommendedAction(str, enum.Enum):
LAUNCH_DATASET = "launch_dataset"
RESUME_SESSION = "resume_session"
EXPORT_OUTPUTS = "export_outputs"
# [/DEF:RecommendedAction:Class]
# [DEF:SessionCollaboratorRole:Class]
# @PURPOSE: Role assigned to a collaborator on a dataset review session (stored via SQLEnum on SessionCollaborator.role).
class SessionCollaboratorRole(str, enum.Enum):
    """Collaborator role on a dataset review session; serializes as its string value."""
    VIEWER = "viewer"
    REVIEWER = "reviewer"
    APPROVER = "approver"
# [/DEF:SessionCollaboratorRole:Class]
# [DEF:SessionCollaborator:Class]
class SessionCollaborator(Base):
__tablename__ = "session_collaborators"
id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
user_id = Column(String, ForeignKey("users.id"), nullable=False)
role = Column(SQLEnum(SessionCollaboratorRole), nullable=False)
added_at = Column(DateTime, default=datetime.utcnow, nullable=False)
session = relationship("DatasetReviewSession", back_populates="collaborators")
user = relationship("User")
# [/DEF:SessionCollaborator:Class]
# [DEF:DatasetReviewSession:Class]
class DatasetReviewSession(Base):
__tablename__ = "dataset_review_sessions"
@@ -102,39 +133,84 @@ class DatasetReviewSession(Base):
session_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
user_id = Column(String, ForeignKey("users.id"), nullable=False)
environment_id = Column(String, ForeignKey("environments.id"), nullable=False)
source_kind = Column(String, nullable=False) # superset_link, dataset_selection
source_kind = Column(String, nullable=False) # superset_link, dataset_selection
source_input = Column(String, nullable=False)
dataset_ref = Column(String, nullable=False)
dataset_id = Column(Integer, nullable=True)
dashboard_id = Column(Integer, nullable=True)
readiness_state = Column(SQLEnum(ReadinessState), nullable=False, default=ReadinessState.EMPTY)
recommended_action = Column(SQLEnum(RecommendedAction), nullable=False, default=RecommendedAction.IMPORT_FROM_SUPERSET)
status = Column(SQLEnum(SessionStatus), nullable=False, default=SessionStatus.ACTIVE)
current_phase = Column(SQLEnum(SessionPhase), nullable=False, default=SessionPhase.INTAKE)
readiness_state = Column(
SQLEnum(ReadinessState), nullable=False, default=ReadinessState.EMPTY
)
recommended_action = Column(
SQLEnum(RecommendedAction),
nullable=False,
default=RecommendedAction.IMPORT_FROM_SUPERSET,
)
version = Column(Integer, nullable=False, default=0)
status = Column(
SQLEnum(SessionStatus), nullable=False, default=SessionStatus.ACTIVE
)
current_phase = Column(
SQLEnum(SessionPhase), nullable=False, default=SessionPhase.INTAKE
)
active_task_id = Column(String, nullable=True)
last_preview_id = Column(String, nullable=True)
last_run_context_id = Column(String, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at = Column(
DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
)
last_activity_at = Column(DateTime, default=datetime.utcnow, nullable=False)
closed_at = Column(DateTime, nullable=True)
owner = relationship("User")
collaborators = relationship("SessionCollaborator", back_populates="session", cascade="all, delete-orphan")
profile = relationship("DatasetProfile", back_populates="session", uselist=False, cascade="all, delete-orphan")
findings = relationship("ValidationFinding", back_populates="session", cascade="all, delete-orphan")
semantic_sources = relationship("SemanticSource", back_populates="session", cascade="all, delete-orphan")
semantic_fields = relationship("SemanticFieldEntry", back_populates="session", cascade="all, delete-orphan")
imported_filters = relationship("ImportedFilter", back_populates="session", cascade="all, delete-orphan")
template_variables = relationship("TemplateVariable", back_populates="session", cascade="all, delete-orphan")
execution_mappings = relationship("ExecutionMapping", back_populates="session", cascade="all, delete-orphan")
clarification_sessions = relationship("ClarificationSession", back_populates="session", cascade="all, delete-orphan")
previews = relationship("CompiledPreview", back_populates="session", cascade="all, delete-orphan")
run_contexts = relationship("DatasetRunContext", back_populates="session", cascade="all, delete-orphan")
export_artifacts = relationship("ExportArtifact", back_populates="session", cascade="all, delete-orphan")
events = relationship("SessionEvent", back_populates="session", cascade="all, delete-orphan")
collaborators = relationship(
"SessionCollaborator", back_populates="session", cascade="all, delete-orphan"
)
profile = relationship(
"DatasetProfile",
back_populates="session",
uselist=False,
cascade="all, delete-orphan",
)
findings = relationship(
"ValidationFinding", back_populates="session", cascade="all, delete-orphan"
)
semantic_sources = relationship(
"SemanticSource", back_populates="session", cascade="all, delete-orphan"
)
semantic_fields = relationship(
"SemanticFieldEntry", back_populates="session", cascade="all, delete-orphan"
)
imported_filters = relationship(
"ImportedFilter", back_populates="session", cascade="all, delete-orphan"
)
template_variables = relationship(
"TemplateVariable", back_populates="session", cascade="all, delete-orphan"
)
execution_mappings = relationship(
"ExecutionMapping", back_populates="session", cascade="all, delete-orphan"
)
clarification_sessions = relationship(
"ClarificationSession", back_populates="session", cascade="all, delete-orphan"
)
previews = relationship(
"CompiledPreview", back_populates="session", cascade="all, delete-orphan"
)
run_contexts = relationship(
"DatasetRunContext", back_populates="session", cascade="all, delete-orphan"
)
export_artifacts = relationship(
"ExportArtifact", back_populates="session", cascade="all, delete-orphan"
)
events = relationship(
"SessionEvent", back_populates="session", cascade="all, delete-orphan"
)
# [/DEF:DatasetReviewSession:Class]
# [DEF:BusinessSummarySource:Class]
class BusinessSummarySource(str, enum.Enum):
CONFIRMED = "confirmed"
@@ -142,8 +218,11 @@ class BusinessSummarySource(str, enum.Enum):
INFERRED = "inferred"
AI_DRAFT = "ai_draft"
MANUAL_OVERRIDE = "manual_override"
# [/DEF:BusinessSummarySource:Class]
# [DEF:ConfidenceState:Class]
class ConfidenceState(str, enum.Enum):
CONFIRMED = "confirmed"
@@ -151,21 +230,29 @@ class ConfidenceState(str, enum.Enum):
MIXED = "mixed"
LOW_CONFIDENCE = "low_confidence"
UNRESOLVED = "unresolved"
# [/DEF:ConfidenceState:Class]
# [DEF:DatasetProfile:Class]
class DatasetProfile(Base):
__tablename__ = "dataset_profiles"
profile_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False, unique=True)
session_id = Column(
String,
ForeignKey("dataset_review_sessions.session_id"),
nullable=False,
unique=True,
)
dataset_name = Column(String, nullable=False)
schema_name = Column(String, nullable=True)
database_name = Column(String, nullable=True)
business_summary = Column(Text, nullable=False)
business_summary_source = Column(SQLEnum(BusinessSummarySource), nullable=False)
description = Column(Text, nullable=True)
dataset_type = Column(String, nullable=True) # table, virtual, sqllab_view, unknown
dataset_type = Column(String, nullable=True) # table, virtual, sqllab_view, unknown
is_sqllab_view = Column(Boolean, nullable=False, default=False)
completeness_score = Column(Float, nullable=True)
confidence_state = Column(SQLEnum(ConfidenceState), nullable=False)
@@ -173,11 +260,16 @@ class DatasetProfile(Base):
has_warning_findings = Column(Boolean, nullable=False, default=False)
manual_summary_locked = Column(Boolean, nullable=False, default=False)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at = Column(
DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
)
session = relationship("DatasetReviewSession", back_populates="profile")
# [/DEF:DatasetProfile:Class]
# [DEF:FindingArea:Class]
class FindingArea(str, enum.Enum):
SOURCE_INTAKE = "source_intake"
@@ -189,15 +281,21 @@ class FindingArea(str, enum.Enum):
COMPILED_PREVIEW = "compiled_preview"
LAUNCH = "launch"
AUDIT = "audit"
# [/DEF:FindingArea:Class]
# [DEF:FindingSeverity:Class]
class FindingSeverity(str, enum.Enum):
BLOCKING = "blocking"
WARNING = "warning"
INFORMATIONAL = "informational"
# [/DEF:FindingSeverity:Class]
# [DEF:ResolutionState:Class]
class ResolutionState(str, enum.Enum):
OPEN = "open"
@@ -206,28 +304,38 @@ class ResolutionState(str, enum.Enum):
SKIPPED = "skipped"
DEFERRED = "deferred"
EXPERT_REVIEW = "expert_review"
# [/DEF:ResolutionState:Class]
# [DEF:ValidationFinding:Class]
class ValidationFinding(Base):
__tablename__ = "validation_findings"
finding_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
area = Column(SQLEnum(FindingArea), nullable=False)
severity = Column(SQLEnum(FindingSeverity), nullable=False)
code = Column(String, nullable=False)
title = Column(String, nullable=False)
message = Column(Text, nullable=False)
resolution_state = Column(SQLEnum(ResolutionState), nullable=False, default=ResolutionState.OPEN)
resolution_state = Column(
SQLEnum(ResolutionState), nullable=False, default=ResolutionState.OPEN
)
resolution_note = Column(Text, nullable=True)
caused_by_ref = Column(String, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
resolved_at = Column(DateTime, nullable=True)
session = relationship("DatasetReviewSession", back_populates="findings")
# [/DEF:ValidationFinding:Class]
# [DEF:SemanticSourceType:Class]
class SemanticSourceType(str, enum.Enum):
UPLOADED_FILE = "uploaded_file"
@@ -235,16 +343,22 @@ class SemanticSourceType(str, enum.Enum):
REFERENCE_DATASET = "reference_dataset"
NEIGHBOR_DATASET = "neighbor_dataset"
AI_GENERATED = "ai_generated"
# [/DEF:SemanticSourceType:Class]
# [DEF:TrustLevel:Class]
# @PURPOSE: Trust ranking recorded on SemanticSource.trust_level, from trusted down to machine-generated.
class TrustLevel(str, enum.Enum):
    """Trust ranking of a semantic source; serializes as its string value."""
    TRUSTED = "trusted"
    RECOMMENDED = "recommended"
    CANDIDATE = "candidate"
    GENERATED = "generated"
# [/DEF:TrustLevel:Class]
# [DEF:SemanticSourceStatus:Class]
class SemanticSourceStatus(str, enum.Enum):
AVAILABLE = "available"
@@ -253,34 +367,49 @@ class SemanticSourceStatus(str, enum.Enum):
REJECTED = "rejected"
PARTIAL = "partial"
FAILED = "failed"
# [/DEF:SemanticSourceStatus:Class]
# [DEF:SemanticSource:Class]
class SemanticSource(Base):
__tablename__ = "semantic_sources"
source_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
source_type = Column(SQLEnum(SemanticSourceType), nullable=False)
source_ref = Column(String, nullable=False)
source_version = Column(String, nullable=False)
display_name = Column(String, nullable=False)
trust_level = Column(SQLEnum(TrustLevel), nullable=False)
schema_overlap_score = Column(Float, nullable=True)
status = Column(SQLEnum(SemanticSourceStatus), nullable=False, default=SemanticSourceStatus.AVAILABLE)
status = Column(
SQLEnum(SemanticSourceStatus),
nullable=False,
default=SemanticSourceStatus.AVAILABLE,
)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
session = relationship("DatasetReviewSession", back_populates="semantic_sources")
# [/DEF:SemanticSource:Class]
# [DEF:FieldKind:Class]
# @PURPOSE: Kind of dataset field tracked by SemanticFieldEntry.field_kind.
class FieldKind(str, enum.Enum):
    """Classification of a semantic field entry; serializes as its string value."""
    COLUMN = "column"
    METRIC = "metric"
    FILTER_DIMENSION = "filter_dimension"
    PARAMETER = "parameter"
# [/DEF:FieldKind:Class]
# [DEF:FieldProvenance:Class]
class FieldProvenance(str, enum.Enum):
DICTIONARY_EXACT = "dictionary_exact"
@@ -289,57 +418,79 @@ class FieldProvenance(str, enum.Enum):
AI_GENERATED = "ai_generated"
MANUAL_OVERRIDE = "manual_override"
UNRESOLVED = "unresolved"
# [/DEF:FieldProvenance:Class]
# [DEF:SemanticFieldEntry:Class]
class SemanticFieldEntry(Base):
__tablename__ = "semantic_field_entries"
field_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
field_name = Column(String, nullable=False)
field_kind = Column(SQLEnum(FieldKind), nullable=False)
verbose_name = Column(String, nullable=True)
description = Column(Text, nullable=True)
display_format = Column(String, nullable=True)
provenance = Column(SQLEnum(FieldProvenance), nullable=False, default=FieldProvenance.UNRESOLVED)
provenance = Column(
SQLEnum(FieldProvenance), nullable=False, default=FieldProvenance.UNRESOLVED
)
source_id = Column(String, nullable=True)
source_version = Column(String, nullable=True)
confidence_rank = Column(Integer, nullable=True)
is_locked = Column(Boolean, nullable=False, default=False)
has_conflict = Column(Boolean, nullable=False, default=False)
needs_review = Column(Boolean, nullable=False, default=True)
last_changed_by = Column(String, nullable=False) # system, user, agent
user_feedback = Column(String, nullable=True) # up, down, null
last_changed_by = Column(String, nullable=False) # system, user, agent
user_feedback = Column(String, nullable=True) # up, down, null
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at = Column(
DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
)
session = relationship("DatasetReviewSession", back_populates="semantic_fields")
candidates = relationship("SemanticCandidate", back_populates="field", cascade="all, delete-orphan")
candidates = relationship(
"SemanticCandidate", back_populates="field", cascade="all, delete-orphan"
)
# [/DEF:SemanticFieldEntry:Class]
# [DEF:CandidateMatchType:Class]
# @PURPOSE: How a SemanticCandidate was matched to its field (SemanticCandidate.match_type).
class CandidateMatchType(str, enum.Enum):
    """Match provenance for a semantic candidate; serializes as its string value."""
    EXACT = "exact"
    REFERENCE = "reference"
    FUZZY = "fuzzy"
    GENERATED = "generated"
# [/DEF:CandidateMatchType:Class]
# [DEF:CandidateStatus:Class]
# @PURPOSE: Review lifecycle of a SemanticCandidate (SemanticCandidate.status, defaults to PROPOSED).
class CandidateStatus(str, enum.Enum):
    """Lifecycle state of a semantic candidate; serializes as its string value."""
    PROPOSED = "proposed"
    ACCEPTED = "accepted"
    REJECTED = "rejected"
    SUPERSEDED = "superseded"
# [/DEF:CandidateStatus:Class]
# [DEF:SemanticCandidate:Class]
class SemanticCandidate(Base):
__tablename__ = "semantic_candidates"
candidate_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
field_id = Column(String, ForeignKey("semantic_field_entries.field_id"), nullable=False)
field_id = Column(
String, ForeignKey("semantic_field_entries.field_id"), nullable=False
)
source_id = Column(String, nullable=True)
candidate_rank = Column(Integer, nullable=False)
match_type = Column(SQLEnum(CandidateMatchType), nullable=False)
@@ -347,12 +498,17 @@ class SemanticCandidate(Base):
proposed_verbose_name = Column(String, nullable=True)
proposed_description = Column(Text, nullable=True)
proposed_display_format = Column(String, nullable=True)
status = Column(SQLEnum(CandidateStatus), nullable=False, default=CandidateStatus.PROPOSED)
status = Column(
SQLEnum(CandidateStatus), nullable=False, default=CandidateStatus.PROPOSED
)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
field = relationship("SemanticFieldEntry", back_populates="candidates")
# [/DEF:SemanticCandidate:Class]
# [DEF:FilterSource:Class]
class FilterSource(str, enum.Enum):
SUPERSET_NATIVE = "superset_native"
@@ -361,8 +517,11 @@ class FilterSource(str, enum.Enum):
SUPERSET_NATIVE_FILTERS_KEY = "superset_native_filters_key"
MANUAL = "manual"
INFERRED = "inferred"
# [/DEF:FilterSource:Class]
# [DEF:FilterConfidenceState:Class]
class FilterConfidenceState(str, enum.Enum):
CONFIRMED = "confirmed"
@@ -370,25 +529,34 @@ class FilterConfidenceState(str, enum.Enum):
INFERRED = "inferred"
AI_DRAFT = "ai_draft"
UNRESOLVED = "unresolved"
# [/DEF:FilterConfidenceState:Class]
# [DEF:FilterRecoveryStatus:Class]
# @PURPOSE: Recovery outcome recorded on ImportedFilter.recovery_status after importing a filter.
class FilterRecoveryStatus(str, enum.Enum):
    """Recovery outcome for an imported filter; serializes as its string value."""
    RECOVERED = "recovered"
    PARTIAL = "partial"
    MISSING = "missing"
    CONFLICTED = "conflicted"
# [/DEF:FilterRecoveryStatus:Class]
# [DEF:ImportedFilter:Class]
class ImportedFilter(Base):
__tablename__ = "imported_filters"
filter_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
filter_name = Column(String, nullable=False)
display_name = Column(String, nullable=True)
raw_value = Column(JSON, nullable=False)
raw_value_masked = Column(Boolean, nullable=False, default=False)
normalized_value = Column(JSON, nullable=True)
source = Column(SQLEnum(FilterSource), nullable=False)
confidence_state = Column(SQLEnum(FilterConfidenceState), nullable=False)
@@ -396,19 +564,27 @@ class ImportedFilter(Base):
recovery_status = Column(SQLEnum(FilterRecoveryStatus), nullable=False)
notes = Column(Text, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at = Column(
DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
)
session = relationship("DatasetReviewSession", back_populates="imported_filters")
# [/DEF:ImportedFilter:Class]
# [DEF:VariableKind:Class]
class VariableKind(str, enum.Enum):
NATIVE_FILTER = "native_filter"
PARAMETER = "parameter"
DERIVED = "derived"
UNKNOWN = "unknown"
# [/DEF:VariableKind:Class]
# [DEF:MappingStatus:Class]
class MappingStatus(str, enum.Enum):
UNMAPPED = "unmapped"
@@ -416,55 +592,78 @@ class MappingStatus(str, enum.Enum):
APPROVED = "approved"
OVERRIDDEN = "overridden"
INVALID = "invalid"
# [/DEF:MappingStatus:Class]
# [DEF:TemplateVariable:Class]
class TemplateVariable(Base):
__tablename__ = "template_variables"
variable_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
variable_name = Column(String, nullable=False)
expression_source = Column(Text, nullable=False)
variable_kind = Column(SQLEnum(VariableKind), nullable=False)
is_required = Column(Boolean, nullable=False, default=True)
default_value = Column(JSON, nullable=True)
mapping_status = Column(SQLEnum(MappingStatus), nullable=False, default=MappingStatus.UNMAPPED)
mapping_status = Column(
SQLEnum(MappingStatus), nullable=False, default=MappingStatus.UNMAPPED
)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at = Column(
DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
)
session = relationship("DatasetReviewSession", back_populates="template_variables")
# [/DEF:TemplateVariable:Class]
# [DEF:MappingMethod:Class]
# @PURPOSE: How an ExecutionMapping was produced (ExecutionMapping.mapping_method).
class MappingMethod(str, enum.Enum):
    """Origin of a filter-to-variable mapping; serializes as its string value."""
    DIRECT_MATCH = "direct_match"
    HEURISTIC_MATCH = "heuristic_match"
    SEMANTIC_MATCH = "semantic_match"
    MANUAL_OVERRIDE = "manual_override"
# [/DEF:MappingMethod:Class]
# [DEF:MappingWarningLevel:Class]
# @PURPOSE: Severity attached to an ExecutionMapping (nullable ExecutionMapping.warning_level).
class MappingWarningLevel(str, enum.Enum):
    """Warning severity for an execution mapping; serializes as its string value."""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
# [/DEF:MappingWarningLevel:Class]
# [DEF:ApprovalState:Class]
# @PURPOSE: Explicit-approval state on ExecutionMapping.approval_state (defaults to NOT_REQUIRED).
class ApprovalState(str, enum.Enum):
    """Approval state of an execution mapping; serializes as its string value."""
    PENDING = "pending"
    APPROVED = "approved"
    REJECTED = "rejected"
    NOT_REQUIRED = "not_required"
# [/DEF:ApprovalState:Class]
# [DEF:ExecutionMapping:Class]
class ExecutionMapping(Base):
__tablename__ = "execution_mappings"
mapping_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
filter_id = Column(String, nullable=False)
variable_id = Column(String, nullable=False)
mapping_method = Column(SQLEnum(MappingMethod), nullable=False)
@@ -473,15 +672,22 @@ class ExecutionMapping(Base):
transformation_note = Column(Text, nullable=True)
warning_level = Column(SQLEnum(MappingWarningLevel), nullable=True)
requires_explicit_approval = Column(Boolean, nullable=False, default=False)
approval_state = Column(SQLEnum(ApprovalState), nullable=False, default=ApprovalState.NOT_REQUIRED)
approval_state = Column(
SQLEnum(ApprovalState), nullable=False, default=ApprovalState.NOT_REQUIRED
)
approved_by_user_id = Column(String, nullable=True)
approved_at = Column(DateTime, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at = Column(
DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
)
session = relationship("DatasetReviewSession", back_populates="execution_mappings")
# [/DEF:ExecutionMapping:Class]
# [DEF:ClarificationStatus:Class]
class ClarificationStatus(str, enum.Enum):
PENDING = "pending"
@@ -489,27 +695,49 @@ class ClarificationStatus(str, enum.Enum):
PAUSED = "paused"
COMPLETED = "completed"
CANCELLED = "cancelled"
# [/DEF:ClarificationStatus:Class]
# [DEF:ClarificationSession:Class]
class ClarificationSession(Base):
__tablename__ = "clarification_sessions"
clarification_session_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
status = Column(SQLEnum(ClarificationStatus), nullable=False, default=ClarificationStatus.PENDING)
clarification_session_id = Column(
String, primary_key=True, default=lambda: str(uuid.uuid4())
)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
status = Column(
SQLEnum(ClarificationStatus),
nullable=False,
default=ClarificationStatus.PENDING,
)
current_question_id = Column(String, nullable=True)
resolved_count = Column(Integer, nullable=False, default=0)
remaining_count = Column(Integer, nullable=False, default=0)
summary_delta = Column(Text, nullable=True)
started_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at = Column(
DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
)
completed_at = Column(DateTime, nullable=True)
session = relationship("DatasetReviewSession", back_populates="clarification_sessions")
questions = relationship("ClarificationQuestion", back_populates="clarification_session", cascade="all, delete-orphan")
session = relationship(
"DatasetReviewSession", back_populates="clarification_sessions"
)
questions = relationship(
"ClarificationQuestion",
back_populates="clarification_session",
cascade="all, delete-orphan",
)
# [/DEF:ClarificationSession:Class]
# [DEF:QuestionState:Class]
class QuestionState(str, enum.Enum):
OPEN = "open"
@@ -517,14 +745,21 @@ class QuestionState(str, enum.Enum):
SKIPPED = "skipped"
EXPERT_REVIEW = "expert_review"
SUPERSEDED = "superseded"
# [/DEF:QuestionState:Class]
# [DEF:ClarificationQuestion:Class]
class ClarificationQuestion(Base):
__tablename__ = "clarification_questions"
question_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
clarification_session_id = Column(String, ForeignKey("clarification_sessions.clarification_session_id"), nullable=False)
clarification_session_id = Column(
String,
ForeignKey("clarification_sessions.clarification_session_id"),
nullable=False,
)
topic_ref = Column(String, nullable=False)
question_text = Column(Text, nullable=False)
why_it_matters = Column(Text, nullable=False)
@@ -532,66 +767,103 @@ class ClarificationQuestion(Base):
priority = Column(Integer, nullable=False, default=0)
state = Column(SQLEnum(QuestionState), nullable=False, default=QuestionState.OPEN)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at = Column(
DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
)
clarification_session = relationship(
"ClarificationSession", back_populates="questions"
)
options = relationship(
"ClarificationOption", back_populates="question", cascade="all, delete-orphan"
)
answer = relationship(
"ClarificationAnswer",
back_populates="question",
uselist=False,
cascade="all, delete-orphan",
)
clarification_session = relationship("ClarificationSession", back_populates="questions")
options = relationship("ClarificationOption", back_populates="question", cascade="all, delete-orphan")
answer = relationship("ClarificationAnswer", back_populates="question", uselist=False, cascade="all, delete-orphan")
# [/DEF:ClarificationQuestion:Class]
# [DEF:ClarificationOption:Class]
class ClarificationOption(Base):
__tablename__ = "clarification_options"
option_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
question_id = Column(String, ForeignKey("clarification_questions.question_id"), nullable=False)
question_id = Column(
String, ForeignKey("clarification_questions.question_id"), nullable=False
)
label = Column(String, nullable=False)
value = Column(String, nullable=False)
is_recommended = Column(Boolean, nullable=False, default=False)
display_order = Column(Integer, nullable=False, default=0)
question = relationship("ClarificationQuestion", back_populates="options")
# [/DEF:ClarificationOption:Class]
# [DEF:AnswerKind:Class]
# @PURPOSE: How a clarification question was answered (ClarificationAnswer.answer_kind).
class AnswerKind(str, enum.Enum):
    """Kind of clarification answer; serializes as its string value."""
    SELECTED = "selected"
    CUSTOM = "custom"
    SKIPPED = "skipped"
    EXPERT_REVIEW = "expert_review"
# [/DEF:AnswerKind:Class]
# [DEF:ClarificationAnswer:Class]
class ClarificationAnswer(Base):
__tablename__ = "clarification_answers"
answer_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
question_id = Column(String, ForeignKey("clarification_questions.question_id"), nullable=False, unique=True)
question_id = Column(
String,
ForeignKey("clarification_questions.question_id"),
nullable=False,
unique=True,
)
answer_kind = Column(SQLEnum(AnswerKind), nullable=False)
answer_value = Column(Text, nullable=True)
answered_by_user_id = Column(String, nullable=False)
impact_summary = Column(Text, nullable=True)
user_feedback = Column(String, nullable=True) # up, down, null
user_feedback = Column(String, nullable=True) # up, down, null
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
question = relationship("ClarificationQuestion", back_populates="answer")
# [/DEF:ClarificationAnswer:Class]
# [DEF:PreviewStatus:Class]
class PreviewStatus(str, enum.Enum):
PENDING = "pending"
READY = "ready"
FAILED = "failed"
STALE = "stale"
# [/DEF:PreviewStatus:Class]
# [DEF:CompiledPreview:Class]
class CompiledPreview(Base):
__tablename__ = "compiled_previews"
preview_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
preview_status = Column(SQLEnum(PreviewStatus), nullable=False, default=PreviewStatus.PENDING)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
preview_status = Column(
SQLEnum(PreviewStatus), nullable=False, default=PreviewStatus.PENDING
)
compiled_sql = Column(Text, nullable=True)
preview_fingerprint = Column(String, nullable=False)
compiled_by = Column(String, nullable=False, default="superset")
@@ -601,21 +873,29 @@ class CompiledPreview(Base):
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
session = relationship("DatasetReviewSession", back_populates="previews")
# [/DEF:CompiledPreview:Class]
# [DEF:LaunchStatus:Class]
class LaunchStatus(str, enum.Enum):
STARTED = "started"
SUCCESS = "success"
FAILED = "failed"
# [/DEF:LaunchStatus:Class]
# [DEF:DatasetRunContext:Class]
class DatasetRunContext(Base):
__tablename__ = "dataset_run_contexts"
run_context_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
dataset_ref = Column(String, nullable=False)
environment_id = Column(String, nullable=False)
preview_id = Column(String, nullable=False)
@@ -630,14 +910,21 @@ class DatasetRunContext(Base):
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
session = relationship("DatasetReviewSession", back_populates="run_contexts")
# [/DEF:DatasetRunContext:Class]
# [DEF:SessionEvent:Class]
class SessionEvent(Base):
__tablename__ = "session_events"
session_event_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_event_id = Column(
String, primary_key=True, default=lambda: str(uuid.uuid4())
)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
actor_user_id = Column(String, ForeignKey("users.id"), nullable=False)
event_type = Column(String, nullable=False)
event_summary = Column(Text, nullable=False)
@@ -648,29 +935,40 @@ class SessionEvent(Base):
session = relationship("DatasetReviewSession", back_populates="events")
actor = relationship("User")
# [/DEF:SessionEvent:Class]
# [DEF:ArtifactType:Class]
class ArtifactType(str, enum.Enum):
DOCUMENTATION = "documentation"
VALIDATION_REPORT = "validation_report"
RUN_SUMMARY = "run_summary"
# [/DEF:ArtifactType:Class]
# [DEF:ArtifactFormat:Class]
class ArtifactFormat(str, enum.Enum):
JSON = "json"
MARKDOWN = "markdown"
CSV = "csv"
PDF = "pdf"
# [/DEF:ArtifactFormat:Class]
# [DEF:ExportArtifact:Class]
class ExportArtifact(Base):
__tablename__ = "export_artifacts"
artifact_id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
session_id = Column(String, ForeignKey("dataset_review_sessions.session_id"), nullable=False)
session_id = Column(
String, ForeignKey("dataset_review_sessions.session_id"), nullable=False
)
artifact_type = Column(SQLEnum(ArtifactType), nullable=False)
format = Column(SQLEnum(ArtifactFormat), nullable=False)
storage_ref = Column(String, nullable=False)
@@ -678,6 +976,8 @@ class ExportArtifact(Base):
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
session = relationship("DatasetReviewSession", back_populates="export_artifacts")
# [/DEF:ExportArtifact:Class]
# [/DEF:DatasetReviewModels:Module]
# [/DEF:DatasetReviewModels:Module]

View File

@@ -1,6 +1,5 @@
# [DEF:MappingModels:Module]
#
# @TIER: STANDARD
# @COMPLEXITY: 3
# @SEMANTICS: database, mapping, environment, migration, sqlalchemy, sqlite
# @PURPOSE: Defines the database schema for environment metadata and database mappings using SQLAlchemy.

View File

@@ -1,6 +1,5 @@
# [DEF:ProfileModels:Module]
#
# @TIER: STANDARD
# @COMPLEXITY: 3
# @SEMANTICS: profile, preferences, persistence, user, dashboard-filter, git, ui-preferences, sqlalchemy
# @PURPOSE: Defines persistent per-user profile settings for dashboard filter, Git identity/token, and UX preferences.

View File

@@ -42,10 +42,11 @@ from src.models.dataset_review import (
PreviewStatus,
LaunchStatus,
ArtifactType,
ArtifactFormat
ArtifactFormat,
)
# [/SECTION]
# [DEF:SessionCollaboratorDto:Class]
class SessionCollaboratorDto(BaseModel):
user_id: str
@@ -54,8 +55,11 @@ class SessionCollaboratorDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:SessionCollaboratorDto:Class]
# [DEF:DatasetProfileDto:Class]
class DatasetProfileDto(BaseModel):
profile_id: str
@@ -78,8 +82,11 @@ class DatasetProfileDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:DatasetProfileDto:Class]
# [DEF:ValidationFindingDto:Class]
class ValidationFindingDto(BaseModel):
finding_id: str
@@ -97,8 +104,11 @@ class ValidationFindingDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:ValidationFindingDto:Class]
# [DEF:SemanticSourceDto:Class]
class SemanticSourceDto(BaseModel):
source_id: str
@@ -114,8 +124,11 @@ class SemanticSourceDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:SemanticSourceDto:Class]
# [DEF:SemanticCandidateDto:Class]
class SemanticCandidateDto(BaseModel):
candidate_id: str
@@ -132,12 +145,16 @@ class SemanticCandidateDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:SemanticCandidateDto:Class]
# [DEF:SemanticFieldEntryDto:Class]
class SemanticFieldEntryDto(BaseModel):
field_id: str
session_id: str
session_version: Optional[int] = None
field_name: str
field_kind: FieldKind
verbose_name: Optional[str] = None
@@ -158,8 +175,11 @@ class SemanticFieldEntryDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:SemanticFieldEntryDto:Class]
# [DEF:ImportedFilterDto:Class]
class ImportedFilterDto(BaseModel):
filter_id: str
@@ -167,6 +187,7 @@ class ImportedFilterDto(BaseModel):
filter_name: str
display_name: Optional[str] = None
raw_value: Any
raw_value_masked: bool = False
normalized_value: Optional[Any] = None
source: FilterSource
confidence_state: FilterConfidenceState
@@ -178,8 +199,11 @@ class ImportedFilterDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:ImportedFilterDto:Class]
# [DEF:TemplateVariableDto:Class]
class TemplateVariableDto(BaseModel):
variable_id: str
@@ -195,12 +219,16 @@ class TemplateVariableDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:TemplateVariableDto:Class]
# [DEF:ExecutionMappingDto:Class]
class ExecutionMappingDto(BaseModel):
mapping_id: str
session_id: str
session_version: Optional[int] = None
filter_id: str
variable_id: str
mapping_method: MappingMethod
@@ -217,8 +245,11 @@ class ExecutionMappingDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:ExecutionMappingDto:Class]
# [DEF:ClarificationOptionDto:Class]
class ClarificationOptionDto(BaseModel):
option_id: str
@@ -230,8 +261,11 @@ class ClarificationOptionDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:ClarificationOptionDto:Class]
# [DEF:ClarificationAnswerDto:Class]
class ClarificationAnswerDto(BaseModel):
answer_id: str
@@ -245,8 +279,11 @@ class ClarificationAnswerDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:ClarificationAnswerDto:Class]
# [DEF:ClarificationQuestionDto:Class]
class ClarificationQuestionDto(BaseModel):
question_id: str
@@ -264,8 +301,11 @@ class ClarificationQuestionDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:ClarificationQuestionDto:Class]
# [DEF:ClarificationSessionDto:Class]
class ClarificationSessionDto(BaseModel):
clarification_session_id: str
@@ -282,12 +322,16 @@ class ClarificationSessionDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:ClarificationSessionDto:Class]
# [DEF:CompiledPreviewDto:Class]
class CompiledPreviewDto(BaseModel):
preview_id: str
session_id: str
session_version: Optional[int] = None
preview_status: PreviewStatus
compiled_sql: Optional[str] = None
preview_fingerprint: str
@@ -299,12 +343,16 @@ class CompiledPreviewDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:CompiledPreviewDto:Class]
# [DEF:DatasetRunContextDto:Class]
class DatasetRunContextDto(BaseModel):
run_context_id: str
session_id: str
session_version: Optional[int] = None
dataset_ref: str
environment_id: str
preview_id: str
@@ -320,8 +368,11 @@ class DatasetRunContextDto(BaseModel):
class Config:
from_attributes = True
# [/DEF:DatasetRunContextDto:Class]
# [DEF:SessionSummary:Class]
class SessionSummary(BaseModel):
session_id: str
@@ -331,6 +382,8 @@ class SessionSummary(BaseModel):
source_input: str
dataset_ref: str
dataset_id: Optional[int] = None
version: int = 0
session_version: int = 0
readiness_state: ReadinessState
recommended_action: RecommendedAction
status: SessionStatus
@@ -341,8 +394,11 @@ class SessionSummary(BaseModel):
class Config:
from_attributes = True
# [/DEF:SessionSummary:Class]
# [DEF:SessionDetail:Class]
class SessionDetail(SessionSummary):
collaborators: List[SessionCollaboratorDto] = []
@@ -357,8 +413,7 @@ class SessionDetail(SessionSummary):
previews: List[CompiledPreviewDto] = []
run_contexts: List[DatasetRunContextDto] = []
class Config:
from_attributes = True
# [/DEF:SessionDetail:Class]
# [/DEF:DatasetReviewSchemas:Module]
# [/DEF:DatasetReviewSchemas:Module]

View File

@@ -59,6 +59,8 @@ class ClarificationQuestionPayload:
priority: int
state: QuestionState
options: list[dict[str, object]] = field(default_factory=list)
# [/DEF:ClarificationQuestionPayload:Class]
@@ -71,6 +73,8 @@ class ClarificationStateResult:
current_question: Optional[ClarificationQuestionPayload]
session: DatasetReviewSession
changed_findings: List[ValidationFinding] = field(default_factory=list)
# [/DEF:ClarificationStateResult:Class]
@@ -84,6 +88,8 @@ class ClarificationAnswerCommand:
answer_kind: AnswerKind
answer_value: Optional[str]
user: User
# [/DEF:ClarificationAnswerCommand:Class]
@@ -102,6 +108,7 @@ class ClarificationEngine:
# @PURPOSE: Bind repository dependency for clarification persistence operations.
def __init__(self, repository: DatasetReviewSessionRepository) -> None:
self.repository = repository
# [/DEF:ClarificationEngine_init:Function]
# [DEF:build_question_payload:Function]
@@ -127,10 +134,17 @@ class ClarificationEngine:
return None
active_questions = [
question for question in clarification_session.questions
question
for question in clarification_session.questions
if question.state == QuestionState.OPEN
]
active_questions.sort(key=lambda item: (-int(item.priority), item.created_at, item.question_id))
active_questions.sort(
key=lambda item: (
-int(item.priority),
item.created_at,
item.question_id,
)
)
if not active_questions:
clarification_session.current_question_id = None
@@ -184,7 +198,11 @@ class ClarificationEngine:
}
for option in sorted(
selected_question.options,
key=lambda item: (item.display_order, item.label, item.option_id),
key=lambda item: (
item.display_order,
item.label,
item.option_id,
),
)
],
)
@@ -197,6 +215,7 @@ class ClarificationEngine:
},
)
return payload
# [/DEF:build_question_payload:Function]
# [DEF:record_answer:Function]
@@ -208,7 +227,9 @@ class ClarificationEngine:
# @POST: Answer row is persisted before current-question pointer advances; skipped/expert-review items remain unresolved and visible.
# @SIDE_EFFECT: Inserts answer row, mutates question/session states, updates clarification findings, and commits.
# @DATA_CONTRACT: Input[ClarificationAnswerCommand] -> Output[ClarificationStateResult]
def record_answer(self, command: ClarificationAnswerCommand) -> ClarificationStateResult:
def record_answer(
self, command: ClarificationAnswerCommand
) -> ClarificationStateResult:
with belief_scope("ClarificationEngine.record_answer"):
session = command.session
clarification_session = self._get_latest_clarification_session(session)
@@ -223,18 +244,27 @@ class ClarificationEngine:
if question is None:
logger.explore(
"Cannot record clarification answer for foreign or missing question",
extra={"session_id": session.session_id, "question_id": command.question_id},
extra={
"session_id": session.session_id,
"question_id": command.question_id,
},
)
raise ValueError("Clarification question not found")
if question.answer is not None:
logger.explore(
"Rejected duplicate clarification answer submission",
extra={"session_id": session.session_id, "question_id": command.question_id},
extra={
"session_id": session.session_id,
"question_id": command.question_id,
},
)
raise ValueError("Clarification question already answered")
if clarification_session.current_question_id and clarification_session.current_question_id != question.question_id:
if (
clarification_session.current_question_id
and clarification_session.current_question_id != question.question_id
):
logger.explore(
"Rejected answer for non-active clarification question",
extra={
@@ -243,9 +273,13 @@ class ClarificationEngine:
"current_question_id": clarification_session.current_question_id,
},
)
raise ValueError("Only the active clarification question can be answered")
raise ValueError(
"Only the active clarification question can be answered"
)
normalized_answer_value = self._normalize_answer_value(command.answer_kind, command.answer_value, question)
normalized_answer_value = self._normalize_answer_value(
command.answer_kind, command.answer_value, question
)
logger.reason(
"Persisting clarification answer before state advancement",
@@ -260,7 +294,9 @@ class ClarificationEngine:
answer_kind=command.answer_kind,
answer_value=normalized_answer_value,
answered_by_user_id=command.user.id,
impact_summary=self._build_impact_summary(question, command.answer_kind, normalized_answer_value),
impact_summary=self._build_impact_summary(
question, command.answer_kind, normalized_answer_value
),
)
self.repository.db.add(persisted_answer)
self.repository.db.flush()
@@ -284,15 +320,25 @@ class ClarificationEngine:
question.updated_at = datetime.utcnow()
self.repository.db.flush()
clarification_session.resolved_count = self._count_resolved_questions(clarification_session)
clarification_session.remaining_count = self._count_remaining_questions(clarification_session)
clarification_session.summary_delta = self.summarize_progress(clarification_session)
clarification_session.resolved_count = self._count_resolved_questions(
clarification_session
)
clarification_session.remaining_count = self._count_remaining_questions(
clarification_session
)
clarification_session.summary_delta = self.summarize_progress(
clarification_session
)
clarification_session.updated_at = datetime.utcnow()
next_question = self._select_next_open_question(clarification_session)
clarification_session.current_question_id = next_question.question_id if next_question else None
clarification_session.current_question_id = (
next_question.question_id if next_question else None
)
clarification_session.status = (
ClarificationStatus.ACTIVE if next_question else ClarificationStatus.COMPLETED
ClarificationStatus.ACTIVE
if next_question
else ClarificationStatus.COMPLETED
)
if clarification_session.status == ClarificationStatus.COMPLETED:
clarification_session.completed_at = datetime.utcnow()
@@ -304,7 +350,7 @@ class ClarificationEngine:
if clarification_session.current_question_id
else SessionPhase.REVIEW
)
session.last_activity_at = datetime.utcnow()
self.repository.bump_session_version(session)
self.repository.db.commit()
self.repository.db.refresh(session)
@@ -326,6 +372,7 @@ class ClarificationEngine:
session=session,
changed_findings=[changed_finding] if changed_finding else [],
)
# [/DEF:record_answer:Function]
# [DEF:summarize_progress:Function]
@@ -336,6 +383,7 @@ class ClarificationEngine:
resolved = self._count_resolved_questions(clarification_session)
remaining = self._count_remaining_questions(clarification_session)
return f"{resolved} resolved, {remaining} unresolved"
# [/DEF:summarize_progress:Function]
# [DEF:_get_latest_clarification_session:Function]
@@ -353,6 +401,7 @@ class ClarificationEngine:
reverse=True,
)
return ordered_sessions[0]
# [/DEF:_get_latest_clarification_session:Function]
# [DEF:_find_question:Function]
@@ -367,6 +416,7 @@ class ClarificationEngine:
if question.question_id == question_id:
return question
return None
# [/DEF:_find_question:Function]
# [DEF:_select_next_open_question:Function]
@@ -377,35 +427,46 @@ class ClarificationEngine:
clarification_session: ClarificationSession,
) -> Optional[ClarificationQuestion]:
open_questions = [
question for question in clarification_session.questions
question
for question in clarification_session.questions
if question.state == QuestionState.OPEN
]
if not open_questions:
return None
open_questions.sort(key=lambda item: (-int(item.priority), item.created_at, item.question_id))
open_questions.sort(
key=lambda item: (-int(item.priority), item.created_at, item.question_id)
)
return open_questions[0]
# [/DEF:_select_next_open_question:Function]
# [DEF:_count_resolved_questions:Function]
# @COMPLEXITY: 2
# @PURPOSE: Count questions whose answers fully resolved the ambiguity.
def _count_resolved_questions(self, clarification_session: ClarificationSession) -> int:
def _count_resolved_questions(
self, clarification_session: ClarificationSession
) -> int:
return sum(
1
for question in clarification_session.questions
if question.state == QuestionState.ANSWERED
)
# [/DEF:_count_resolved_questions:Function]
# [DEF:_count_remaining_questions:Function]
# @COMPLEXITY: 2
# @PURPOSE: Count questions still unresolved or deferred after clarification interaction.
def _count_remaining_questions(self, clarification_session: ClarificationSession) -> int:
def _count_remaining_questions(
self, clarification_session: ClarificationSession
) -> int:
return sum(
1
for question in clarification_session.questions
if question.state in {QuestionState.OPEN, QuestionState.SKIPPED, QuestionState.EXPERT_REVIEW}
if question.state
in {QuestionState.OPEN, QuestionState.SKIPPED, QuestionState.EXPERT_REVIEW}
)
# [/DEF:_count_remaining_questions:Function]
# [DEF:_normalize_answer_value:Function]
@@ -417,18 +478,28 @@ class ClarificationEngine:
answer_value: Optional[str],
question: ClarificationQuestion,
) -> Optional[str]:
normalized_answer_value = str(answer_value).strip() if answer_value is not None else None
if answer_kind in {AnswerKind.SELECTED, AnswerKind.CUSTOM} and not normalized_answer_value:
raise ValueError("answer_value is required for selected or custom clarification answers")
normalized_answer_value = (
str(answer_value).strip() if answer_value is not None else None
)
if (
answer_kind in {AnswerKind.SELECTED, AnswerKind.CUSTOM}
and not normalized_answer_value
):
raise ValueError(
"answer_value is required for selected or custom clarification answers"
)
if answer_kind == AnswerKind.SELECTED:
allowed_values = {option.value for option in question.options}
if normalized_answer_value not in allowed_values:
raise ValueError("answer_value must match one of the current clarification options")
raise ValueError(
"answer_value must match one of the current clarification options"
)
if answer_kind == AnswerKind.SKIPPED:
return normalized_answer_value or "skipped"
if answer_kind == AnswerKind.EXPERT_REVIEW:
return normalized_answer_value or "expert_review"
return normalized_answer_value
# [/DEF:_normalize_answer_value:Function]
# [DEF:_build_impact_summary:Function]
@@ -445,6 +516,7 @@ class ClarificationEngine:
if answer_kind == AnswerKind.EXPERT_REVIEW:
return f"Clarification for {question.topic_ref} was deferred for expert review."
return f"Clarification for {question.topic_ref} recorded as '{answer_value}'."
# [/DEF:_build_impact_summary:Function]
# [DEF:_upsert_clarification_finding:Function]
@@ -461,8 +533,10 @@ class ClarificationEngine:
caused_by_ref = f"clarification:{question.question_id}"
existing = next(
(
finding for finding in session.findings
if finding.area == FindingArea.CLARIFICATION and finding.caused_by_ref == caused_by_ref
finding
for finding in session.findings
if finding.area == FindingArea.CLARIFICATION
and finding.caused_by_ref == caused_by_ref
),
None,
)
@@ -478,7 +552,9 @@ class ClarificationEngine:
else:
resolution_state = ResolutionState.EXPERT_REVIEW
resolved_at = None
message = f"Clarification for '{question.topic_ref}' requires expert review."
message = (
f"Clarification for '{question.topic_ref}' requires expert review."
)
if existing is None:
existing = ValidationFinding(
@@ -513,6 +589,7 @@ class ClarificationEngine:
existing.title = "Clarification requires expert review"
return existing
# [/DEF:_upsert_clarification_finding:Function]
# [DEF:_derive_readiness_state:Function]
@@ -532,12 +609,15 @@ class ClarificationEngine:
return ReadinessState.CLARIFICATION_NEEDED
return ReadinessState.REVIEW_READY
# [/DEF:_derive_readiness_state:Function]
# [DEF:_derive_recommended_action:Function]
# @COMPLEXITY: 2
# @PURPOSE: Recompute next-action guidance after clarification mutations.
def _derive_recommended_action(self, session: DatasetReviewSession) -> RecommendedAction:
def _derive_recommended_action(
self, session: DatasetReviewSession
) -> RecommendedAction:
clarification_session = self._get_latest_clarification_session(session)
if clarification_session is None:
return session.recommended_action
@@ -546,7 +626,10 @@ class ClarificationEngine:
if clarification_session.remaining_count > 0:
return RecommendedAction.START_CLARIFICATION
return RecommendedAction.REVIEW_DOCUMENTATION
# [/DEF:_derive_recommended_action:Function]
# [/DEF:ClarificationEngine:Class]
# [/DEF:ClarificationEngine:Module]
# [/DEF:ClarificationEngine:Module]

View File

@@ -109,6 +109,7 @@ class StartSessionResult:
class PreparePreviewCommand:
user: User
session_id: str
expected_version: Optional[int] = None
# [/DEF:PreparePreviewCommand:Class]
@@ -134,6 +135,7 @@ class PreparePreviewResult:
class LaunchDatasetCommand:
user: User
session_id: str
expected_version: Optional[int] = None
# [/DEF:LaunchDatasetCommand:Class]
@@ -344,6 +346,7 @@ class DatasetReviewOrchestrator:
)
if active_task_id:
persisted_session.active_task_id = active_task_id
self.repository.bump_session_version(persisted_session)
self.repository.db.commit()
self.repository.db.refresh(persisted_session)
self.repository.event_logger.log_event(
@@ -410,6 +413,11 @@ class DatasetReviewOrchestrator:
)
raise ValueError("Session not found")
if command.expected_version is not None:
self.repository.require_session_version(
session, command.expected_version
)
if session.dataset_id is None:
raise ValueError("Preview requires a resolved dataset_id")
@@ -521,6 +529,11 @@ class DatasetReviewOrchestrator:
)
raise ValueError("Session not found")
if command.expected_version is not None:
self.repository.require_session_version(
session, command.expected_version
)
if session.dataset_id is None:
raise ValueError("Launch requires a resolved dataset_id")
@@ -761,6 +774,7 @@ class DatasetReviewOrchestrator:
filter_name=str(item.get("filter_name") or f"imported_filter_{index}"),
display_name=item.get("display_name"),
raw_value=item.get("raw_value"),
raw_value_masked=bool(item.get("raw_value_masked", False)),
normalized_value=item.get("normalized_value"),
source=FilterSource(
str(item.get("source") or FilterSource.SUPERSET_URL.value)
@@ -788,9 +802,11 @@ class DatasetReviewOrchestrator:
if session.dataset_id is not None:
try:
dataset_payload = extractor.client.get_dataset_detail(
session_record.dataset_id
)
dataset_payload = parsed_context.dataset_payload
if not isinstance(dataset_payload, dict):
dataset_payload = extractor.client.get_dataset_detail(
session_record.dataset_id
)
discovered_variables = extractor.discover_template_variables(
dataset_payload
)

View File

@@ -1,6 +1,7 @@
import pytest
from sqlalchemy import create_engine
from sqlalchemy import create_engine, inspect, text
from sqlalchemy.orm import sessionmaker
from src.core.database import _ensure_dataset_review_session_columns
from src.models.mapping import Base, Environment
from src.models.auth import User
from src.models.dataset_review import (
@@ -9,6 +10,7 @@ from src.models.dataset_review import (
ValidationFinding,
CompiledPreview,
DatasetRunContext,
ImportedFilter,
BusinessSummarySource,
ConfidenceState,
FindingArea,
@@ -20,6 +22,7 @@ from src.models.dataset_review import (
)
from src.services.dataset_review.repositories.session_repository import (
DatasetReviewSessionRepository,
DatasetReviewSessionVersionConflictError,
)
# [DEF:SessionRepositoryTests:Module]
@@ -77,11 +80,103 @@ def test_create_session(db_session):
.first()
)
assert loaded.user_id == "user1"
assert loaded.version == 0
# [/DEF:test_create_session:Function]
# [DEF:test_require_session_version_conflict:Function]
# @RELATION: BINDS_TO -> SessionRepositoryTests
# @PURPOSE: Verify optimistic-lock conflict is raised when caller version is stale.
def test_require_session_version_conflict(db_session):
repo = DatasetReviewSessionRepository(db_session)
session = DatasetReviewSession(
user_id="user1",
environment_id="env1",
source_kind="superset_link",
source_input="http://link",
dataset_ref="dataset1",
version=2,
)
repo.create_session(session)
with pytest.raises(DatasetReviewSessionVersionConflictError) as exc_info:
repo.require_session_version(session, 1)
assert exc_info.value.expected_version == 1
assert exc_info.value.actual_version == 2
# [/DEF:test_require_session_version_conflict:Function]
# [DEF:test_bump_session_version_updates_last_activity:Function]
# @RELATION: BINDS_TO -> SessionRepositoryTests
# @PURPOSE: Verify repository version bump increments monotonically and refreshes last activity.
def test_bump_session_version_updates_last_activity(db_session):
repo = DatasetReviewSessionRepository(db_session)
session = DatasetReviewSession(
user_id="user1",
environment_id="env1",
source_kind="superset_link",
source_input="http://link",
dataset_ref="dataset1",
)
repo.create_session(session)
before_activity = session.last_activity_at
next_version = repo.bump_session_version(session)
assert next_version == 1
assert session.version == 1
assert session.last_activity_at >= before_activity
# [/DEF:test_bump_session_version_updates_last_activity:Function]
# [DEF:test_save_recovery_state_preserves_raw_value_masked_flag:Function]
# @RELATION: BINDS_TO -> SessionRepositoryTests
# @PURPOSE: Verify imported-filter masking metadata persists with recovery bootstrap state.
def test_save_recovery_state_preserves_raw_value_masked_flag(db_session):
repo = DatasetReviewSessionRepository(db_session)
session = DatasetReviewSession(
user_id="user1",
environment_id="env1",
source_kind="superset_link",
source_input="http://link",
dataset_ref="dataset1",
)
repo.create_session(session)
imported_filter = ImportedFilter(
session_id=session.session_id,
filter_name="email",
raw_value="***@example.com",
raw_value_masked=True,
normalized_value=None,
source="manual",
confidence_state="confirmed",
requires_confirmation=False,
recovery_status="recovered",
)
updated = repo.save_recovery_state(
session.session_id,
"user1",
[imported_filter],
[],
[],
)
assert updated.imported_filters[0].raw_value_masked is True
assert updated.version == 1
# [/DEF:test_save_recovery_state_preserves_raw_value_masked_flag:Function]
# [DEF:test_load_session_detail_ownership:Function]
# @RELATION: BINDS_TO -> SessionRepositoryTests
def test_load_session_detail_ownership(db_session):
@@ -178,6 +273,48 @@ def test_save_preview_marks_stale(db_session):
# [/DEF:test_save_preview_marks_stale:Function]
# [DEF:test_save_preview_increments_session_version_once_per_call:Function]
# @RELATION: BINDS_TO -> SessionRepositoryTests
# @PURPOSE: Verify preview persistence itself contributes exactly one optimistic-lock version increment so higher orchestration layers do not need to bump again for the same preview mutation.
def test_save_preview_increments_session_version_once_per_call(db_session):
repo = DatasetReviewSessionRepository(db_session)
session = DatasetReviewSession(
user_id="user1",
environment_id="env1",
source_kind="superset_link",
source_input="http://link",
dataset_ref="dataset1",
)
repo.create_session(session)
repo.save_preview(
session.session_id,
"user1",
CompiledPreview(
session_id=session.session_id,
preview_status="ready",
preview_fingerprint="fp-ready",
),
)
db_session.refresh(session)
assert session.version == 1
repo.save_preview(
session.session_id,
"user1",
CompiledPreview(
session_id=session.session_id,
preview_status="pending",
preview_fingerprint="fp-pending",
),
)
db_session.refresh(session)
assert session.version == 2
# [/DEF:test_save_preview_increments_session_version_once_per_call:Function]
# [DEF:test_save_profile_and_findings:Function]
# @RELATION: BINDS_TO -> SessionRepositoryTests
def test_save_profile_and_findings(db_session):
@@ -272,6 +409,173 @@ def test_save_run_context(db_session):
# [/DEF:test_save_run_context:Function]
# [DEF:test_ensure_dataset_review_session_columns_adds_missing_legacy_columns:Function]
# @RELATION: BINDS_TO -> SessionRepositoryTests
# @PURPOSE: Verify additive dataset review migration creates missing legacy columns for session and imported-filter tables without dropping rows.
def test_ensure_dataset_review_session_columns_adds_missing_legacy_columns():
    # Simulate a pre-migration schema in an in-memory SQLite database:
    # dataset_review_sessions is created WITHOUT the `version` column and
    # imported_filters WITHOUT the `raw_value_masked` column.
    engine = create_engine("sqlite:///:memory:")
    with engine.begin() as connection:
        # Legacy sessions table (note: no `version` column).
        connection.execute(
            text(
                """
                CREATE TABLE dataset_review_sessions (
                    session_id VARCHAR PRIMARY KEY,
                    user_id VARCHAR NOT NULL,
                    environment_id VARCHAR NOT NULL,
                    source_kind VARCHAR NOT NULL,
                    source_input VARCHAR NOT NULL,
                    dataset_ref VARCHAR NOT NULL,
                    dataset_id INTEGER,
                    dashboard_id INTEGER,
                    readiness_state VARCHAR NOT NULL,
                    recommended_action VARCHAR NOT NULL,
                    status VARCHAR NOT NULL,
                    current_phase VARCHAR NOT NULL,
                    active_task_id VARCHAR,
                    last_preview_id VARCHAR,
                    last_run_context_id VARCHAR,
                    created_at DATETIME NOT NULL,
                    updated_at DATETIME NOT NULL,
                    last_activity_at DATETIME NOT NULL,
                    closed_at DATETIME
                )
                """
            )
        )
        # Legacy imported-filters table (note: no `raw_value_masked` column).
        connection.execute(
            text(
                """
                CREATE TABLE imported_filters (
                    filter_id VARCHAR PRIMARY KEY,
                    session_id VARCHAR NOT NULL,
                    filter_name VARCHAR NOT NULL,
                    display_name VARCHAR,
                    raw_value JSON NOT NULL,
                    normalized_value JSON,
                    source VARCHAR NOT NULL,
                    confidence_state VARCHAR NOT NULL,
                    requires_confirmation BOOLEAN NOT NULL DEFAULT FALSE,
                    recovery_status VARCHAR NOT NULL,
                    notes TEXT,
                    created_at DATETIME NOT NULL,
                    updated_at DATETIME NOT NULL
                )
                """
            )
        )
        # Seed one row per table so we can assert the migration is additive
        # (existing rows survive and get safe defaults for the new columns).
        connection.execute(
            text(
                """
                INSERT INTO dataset_review_sessions (
                    session_id,
                    user_id,
                    environment_id,
                    source_kind,
                    source_input,
                    dataset_ref,
                    dataset_id,
                    dashboard_id,
                    readiness_state,
                    recommended_action,
                    status,
                    current_phase,
                    active_task_id,
                    last_preview_id,
                    last_run_context_id,
                    created_at,
                    updated_at,
                    last_activity_at,
                    closed_at
                ) VALUES (
                    'sess-legacy',
                    'user1',
                    'env1',
                    'superset_link',
                    'http://link',
                    'dataset1',
                    NULL,
                    NULL,
                    'EMPTY',
                    'IMPORT_FROM_SUPERSET',
                    'ACTIVE',
                    'INTAKE',
                    NULL,
                    NULL,
                    NULL,
                    '2026-03-21 00:00:00',
                    '2026-03-21 00:00:00',
                    '2026-03-21 00:00:00',
                    NULL
                )
                """
            )
        )
        connection.execute(
            text(
                """
                INSERT INTO imported_filters (
                    filter_id,
                    session_id,
                    filter_name,
                    display_name,
                    raw_value,
                    normalized_value,
                    source,
                    confidence_state,
                    requires_confirmation,
                    recovery_status,
                    notes,
                    created_at,
                    updated_at
                ) VALUES (
                    'filter-legacy',
                    'sess-legacy',
                    'country',
                    'Country',
                    '"DE"',
                    NULL,
                    'MANUAL',
                    'CONFIRMED',
                    FALSE,
                    'RECOVERED',
                    NULL,
                    '2026-03-21 00:00:00',
                    '2026-03-21 00:00:00'
                )
                """
            )
        )
    # Run the additive migration under test.
    _ensure_dataset_review_session_columns(engine)
    # The new columns must now be present on both tables.
    inspector = inspect(engine)
    session_column_names = {
        column["name"] for column in inspector.get_columns("dataset_review_sessions")
    }
    imported_filter_column_names = {
        column["name"] for column in inspector.get_columns("imported_filters")
    }
    assert "version" in session_column_names
    assert "raw_value_masked" in imported_filter_column_names
    # Pre-existing rows must survive with safe defaults: version starts at 0,
    # raw_value_masked defaults to false (SQLite may surface it as 0).
    with engine.connect() as connection:
        version_value = connection.execute(
            text(
                "SELECT version FROM dataset_review_sessions WHERE session_id = 'sess-legacy'"
            )
        ).scalar_one()
        raw_value_masked = connection.execute(
            text(
                "SELECT raw_value_masked FROM imported_filters WHERE filter_id = 'filter-legacy'"
            )
        ).scalar_one()
    assert version_value == 0
    assert raw_value_masked in (False, 0)
# [/DEF:test_ensure_dataset_review_session_columns_adds_missing_legacy_columns:Function]
# [DEF:test_list_sessions_for_user:Function]
# @RELATION: BINDS_TO -> SessionRepositoryTests
def test_list_sessions_for_user(db_session):

View File

@@ -12,7 +12,8 @@
# @DATA_CONTRACT: Input[SessionMutation] -> Output[PersistedSessionAggregate]
# @INVARIANT: answers, mapping approvals, preview artifacts, and launch snapshots are never attributed to the wrong user or session.
from typing import Optional, List
from datetime import datetime
from typing import Any, Optional, List, cast
from sqlalchemy import or_
from sqlalchemy.orm import Session, joinedload
from src.models.dataset_review import (
@@ -33,6 +34,24 @@ from src.models.dataset_review import (
from src.core.logger import belief_scope, logger
from src.services.dataset_review.event_logger import SessionEventLogger
logger = cast(Any, logger)
# [DEF:DatasetReviewSessionVersionConflictError:Class]
# @COMPLEXITY: 2
# @PURPOSE: Signal optimistic-lock conflicts for dataset review session mutations.
class DatasetReviewSessionVersionConflictError(ValueError):
    """Raised when a session mutation carries a stale optimistic-lock version."""

    def __init__(self, session_id: str, expected_version: int, actual_version: int):
        # Keep the conflicting identifiers/versions so API layers can report them.
        self.session_id = session_id
        self.expected_version = expected_version
        self.actual_version = actual_version
        message = (
            f"Session version conflict: expected {expected_version}, actual {actual_version}"
        )
        super().__init__(message)
# [/DEF:DatasetReviewSessionVersionConflictError:Class]
# [DEF:DatasetReviewSessionRepository:Class]
# @COMPLEXITY: 4
@@ -120,6 +139,70 @@ class DatasetReviewSessionRepository:
# [/DEF:create_sess:Function]
# [DEF:require_session_version:Function]
# @COMPLEXITY: 4
# @PURPOSE: Enforce optimistic-lock version matching before a session mutation is persisted.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @PRE: session belongs to the current owner mutation scope and expected_version is the caller's last observed version.
# @POST: returns the same session when versions match; otherwise raises deterministic conflict error.
# @SIDE_EFFECT: none.
# @DATA_CONTRACT: Input[DatasetReviewSession,int] -> Output[DatasetReviewSession|DatasetReviewSessionVersionConflictError]
def require_session_version(
self, session: DatasetReviewSession, expected_version: int
) -> DatasetReviewSession:
with belief_scope("DatasetReviewSessionRepository.require_session_version"):
session_record = cast(Any, session)
actual_version = int(getattr(session_record, "version", 0) or 0)
logger.reason(
"Checking optimistic-lock version for dataset review mutation",
extra={
"session_id": session.session_id,
"expected_version": expected_version,
"actual_version": actual_version,
},
)
if actual_version != expected_version:
logger.explore(
"Rejected dataset review mutation due to stale session version",
extra={
"session_id": session.session_id,
"expected_version": expected_version,
"actual_version": actual_version,
},
)
raise DatasetReviewSessionVersionConflictError(
str(session_record.session_id), expected_version, actual_version
)
logger.reflect(
"Optimistic-lock version accepted for dataset review mutation",
extra={"session_id": session.session_id, "version": actual_version},
)
return session
# [/DEF:require_session_version:Function]
# [DEF:bump_session_version:Function]
# @COMPLEXITY: 4
# @PURPOSE: Increment optimistic-lock version after a successful session mutation is assembled.
# @RELATION: [DEPENDS_ON] -> [DatasetReviewSession]
# @PRE: session mutation has passed guards and will be committed in the current transaction.
# @POST: session version increments monotonically and last_activity_at reflects the mutation time.
# @SIDE_EFFECT: mutates the in-memory session aggregate before commit.
# @DATA_CONTRACT: Input[DatasetReviewSession] -> Output[int]
def bump_session_version(self, session: DatasetReviewSession) -> int:
with belief_scope("DatasetReviewSessionRepository.bump_session_version"):
session_record = cast(Any, session)
next_version = int(getattr(session_record, "version", 0) or 0) + 1
session_record.version = next_version
session_record.last_activity_at = datetime.utcnow()
logger.reflect(
"Prepared incremented dataset review session version",
extra={"session_id": session.session_id, "version": next_version},
)
return next_version
# [/DEF:bump_session_version:Function]
# [DEF:load_detail:Function]
# @COMPLEXITY: 4
# @PURPOSE: Return the full session aggregate for API and frontend resume flows.
@@ -200,6 +283,7 @@ class DatasetReviewSessionRepository:
) -> DatasetReviewSession:
with belief_scope("DatasetReviewSessionRepository.save_profile_and_findings"):
session = self._get_owned_session(session_id, user_id)
session_record = cast(Any, session)
logger.reason(
"Persisting dataset profile and replacing validation findings",
extra={
@@ -225,14 +309,17 @@ class DatasetReviewSessionRepository:
).delete()
for finding in findings:
finding.session_id = session_id
finding_record = cast(Any, finding)
finding_record.session_id = session_id
self.db.add(finding)
self.bump_session_version(session)
self.db.commit()
logger.reflect(
"Dataset profile and validation findings committed",
extra={
"session_id": session.session_id,
"version": session_record.version,
"user_id": user_id,
"findings_count": len(findings),
},
@@ -257,6 +344,7 @@ class DatasetReviewSessionRepository:
) -> DatasetReviewSession:
with belief_scope("DatasetReviewSessionRepository.save_recovery_state"):
session = self._get_owned_session(session_id, user_id)
session_record = cast(Any, session)
logger.reason(
"Persisting dataset review recovery bootstrap state",
extra={
@@ -279,24 +367,29 @@ class DatasetReviewSessionRepository:
).delete()
for imported_filter in imported_filters:
imported_filter.session_id = session_id
imported_filter_record = cast(Any, imported_filter)
imported_filter_record.session_id = session_id
self.db.add(imported_filter)
for template_variable in template_variables:
template_variable.session_id = session_id
template_variable_record = cast(Any, template_variable)
template_variable_record.session_id = session_id
self.db.add(template_variable)
self.db.flush()
for execution_mapping in execution_mappings:
execution_mapping.session_id = session_id
execution_mapping_record = cast(Any, execution_mapping)
execution_mapping_record.session_id = session_id
self.db.add(execution_mapping)
self.bump_session_version(session)
self.db.commit()
logger.reflect(
"Dataset review recovery bootstrap state committed",
extra={
"session_id": session.session_id,
"version": session_record.version,
"user_id": user_id,
"imported_filters_count": len(imported_filters),
"template_variables_count": len(template_variables),
@@ -321,6 +414,7 @@ class DatasetReviewSessionRepository:
) -> CompiledPreview:
with belief_scope("DatasetReviewSessionRepository.save_preview"):
session = self._get_owned_session(session_id, user_id)
session_record = cast(Any, session)
logger.reason(
"Persisting compiled preview and staling previous preview snapshots",
extra={"session_id": session_id, "user_id": user_id},
@@ -332,14 +426,16 @@ class DatasetReviewSessionRepository:
self.db.add(preview)
self.db.flush()
session.last_preview_id = preview.preview_id
session_record.last_preview_id = preview.preview_id
self.bump_session_version(session)
self.db.commit()
self.db.refresh(preview)
logger.reflect(
"Compiled preview committed as latest session preview",
extra={
"session_id": session.session_id,
"version": session_record.version,
"preview_id": preview.preview_id,
"user_id": user_id,
},
@@ -362,6 +458,7 @@ class DatasetReviewSessionRepository:
) -> DatasetRunContext:
with belief_scope("DatasetReviewSessionRepository.save_run_context"):
session = self._get_owned_session(session_id, user_id)
session_record = cast(Any, session)
logger.reason(
"Persisting dataset run context audit snapshot",
extra={"session_id": session_id, "user_id": user_id},
@@ -369,14 +466,16 @@ class DatasetReviewSessionRepository:
self.db.add(run_context)
self.db.flush()
session.last_run_context_id = run_context.run_context_id
session_record.last_run_context_id = run_context.run_context_id
self.bump_session_version(session)
self.db.commit()
self.db.refresh(run_context)
logger.reflect(
"Dataset run context committed as latest launch snapshot",
extra={
"session_id": session.session_id,
"version": session_record.version,
"run_context_id": run_context.run_context_id,
"user_id": user_id,
},