feat(us1): add automatic review slice to dataset review orchestration

This commit is contained in:
2026-03-17 10:57:49 +03:00
parent e916cb1f17
commit 023bacde39
24 changed files with 4870 additions and 131 deletions

View File

@@ -0,0 +1,334 @@
# [DEF:SupersetContextExtractor:Module]
# @COMPLEXITY: 4
# @SEMANTICS: dataset_review, superset, link_parsing, context_recovery, partial_recovery
# @PURPOSE: Recover dataset and dashboard context from Superset links while preserving explicit partial-recovery markers.
# @LAYER: Infra
# @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient:Class]
# @RELATION: [DEPENDS_ON] ->[ImportedFilter]
# @RELATION: [DEPENDS_ON] ->[TemplateVariable]
# @PRE: Superset link or dataset reference must be parseable enough to resolve an environment-scoped target resource.
# @POST: Returns the best available recovered context with explicit provenance and partial-recovery markers when necessary.
# @SIDE_EFFECT: Performs upstream Superset API reads.
# @INVARIANT: Partial recovery is surfaced explicitly and never misrepresented as fully confirmed context.
from __future__ import annotations
# [DEF:SupersetContextExtractor.imports:Block]
import json
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from urllib.parse import parse_qs, unquote, urlparse
from src.core.config_models import Environment
from src.core.logger import belief_scope, logger
from src.core.superset_client import SupersetClient
# [/DEF:SupersetContextExtractor.imports:Block]
# [DEF:SupersetParsedContext:Class]
# @COMPLEXITY: 2
# @PURPOSE: Normalized output of Superset link parsing for session intake and recovery.
@dataclass
class SupersetParsedContext:
    # Original link as supplied by the caller (whitespace-trimmed).
    source_url: str
    # Canonical dataset reference. Observed formats: "schema.table" or "table"
    # (from dataset-detail lookup), "dataset:<id>", "chart:<id>", or
    # "unresolved" when no identifier could be confirmed.
    dataset_ref: str
    # Numeric Superset identifiers when the URL path (or a dashboard-detail
    # API lookup) yields them; None otherwise.
    dataset_id: Optional[int] = None
    dashboard_id: Optional[int] = None
    chart_id: Optional[int] = None
    # One of "dataset", "dashboard", "chart", or "unknown".
    resource_type: str = "unknown"
    # Decoded query-string state keyed by parameter name; structured keys
    # (native_filters, form_data, ...) are JSON-decoded when possible.
    query_state: Dict[str, Any] = field(default_factory=dict)
    # Filter entries recovered from URL state (native filters / extra_filters).
    imported_filters: List[Dict[str, Any]] = field(default_factory=list)
    # Marker strings naming exactly what could not be resolved, e.g.
    # "dashboard_dataset_id_missing" or "chart_dataset_binding_unresolved".
    unresolved_references: List[str] = field(default_factory=list)
    # True whenever any identifier or binding could not be fully confirmed;
    # invariant: partial recovery is surfaced, never hidden.
    partial_recovery: bool = False
# [/DEF:SupersetParsedContext:Class]
# [DEF:SupersetContextExtractor:Class]
# @COMPLEXITY: 4
# @PURPOSE: Parse supported Superset URLs and recover canonical dataset/dashboard references for review-session intake.
# @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient]
# @PRE: constructor receives a configured environment with a usable Superset base URL.
# @POST: extractor instance is ready to parse links against one Superset environment.
# @SIDE_EFFECT: downstream parse operations may call Superset APIs through SupersetClient.
class SupersetContextExtractor:
    # Matches Jinja variable/callable openings such as "{{ ds" or
    # "{{ filter_values" and captures the dotted reference name.
    _JINJA_VARIABLE_PATTERN = re.compile(r"\{\{\s*([A-Za-z_][\w.]*)")

    # [DEF:SupersetContextExtractor.__init__:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Bind extractor to one Superset environment and client instance.
    def __init__(self, environment: Environment, client: Optional[SupersetClient] = None) -> None:
        self.environment = environment
        # Allow client injection (tests/reuse); default to one bound to this environment.
        self.client = client or SupersetClient(environment)
    # [/DEF:SupersetContextExtractor.__init__:Function]

    # [DEF:SupersetContextExtractor.parse_superset_link:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Extract candidate identifiers and query state from supported Superset URLs.
    # @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient]
    # @PRE: link is a non-empty Superset URL compatible with the configured environment.
    # @POST: returns resolved dataset/dashboard context, preserving explicit partial-recovery state if some identifiers cannot be confirmed.
    # @SIDE_EFFECT: may issue Superset API reads to resolve dataset references from dashboard or chart URLs.
    # @DATA_CONTRACT: Input[link:str] -> Output[SupersetParsedContext]
    def parse_superset_link(self, link: str) -> SupersetParsedContext:
        with belief_scope("SupersetContextExtractor.parse_superset_link"):
            normalized_link = str(link or "").strip()
            if not normalized_link:
                logger.explore("Rejected empty Superset link during intake")
                raise ValueError("Superset link must be non-empty")
            parsed_url = urlparse(normalized_link)
            # Only absolute http(s) URLs are parseable against an environment base.
            if parsed_url.scheme not in {"http", "https"} or not parsed_url.netloc:
                logger.explore(
                    "Superset link is not a parseable absolute URL",
                    extra={"link": normalized_link},
                )
                raise ValueError("Superset link must be an absolute http(s) URL")
            logger.reason(
                "Parsing Superset link for dataset review intake",
                extra={"path": parsed_url.path, "query": parsed_url.query},
            )
            path_parts = [part for part in parsed_url.path.split("/") if part]
            query_params = parse_qs(parsed_url.query, keep_blank_values=True)
            query_state = self._decode_query_state(query_params)
            dataset_id = self._extract_numeric_identifier(path_parts, "dataset")
            dashboard_id = self._extract_numeric_identifier(path_parts, "dashboard")
            chart_id = self._extract_numeric_identifier(path_parts, "chart")
            resource_type = "unknown"
            dataset_ref: Optional[str] = None
            partial_recovery = False
            unresolved_references: List[str] = []
            if dataset_id is not None:
                # Direct dataset URL: fully resolved locally, no API call needed here.
                resource_type = "dataset"
                dataset_ref = f"dataset:{dataset_id}"
                logger.reason(
                    "Resolved direct dataset link",
                    extra={"dataset_id": dataset_id},
                )
            elif dashboard_id is not None:
                # Dashboard URL: resolve its bound dataset(s) via the Superset API.
                resource_type = "dashboard"
                logger.reason(
                    "Resolving dashboard-bound dataset from Superset",
                    extra={"dashboard_id": dashboard_id},
                )
                dashboard_detail = self.client.get_dashboard_detail(dashboard_id)
                datasets = dashboard_detail.get("datasets") or []
                if datasets:
                    # Pick the first bound dataset; more than one means the
                    # choice is ambiguous and must be marked partial.
                    first_dataset = datasets[0]
                    resolved_dataset_id = first_dataset.get("id")
                    if resolved_dataset_id is not None:
                        dataset_id = int(resolved_dataset_id)
                        dataset_ref = f"dataset:{dataset_id}"
                        logger.reason(
                            "Recovered dataset reference from dashboard context",
                            extra={
                                "dashboard_id": dashboard_id,
                                "dataset_id": dataset_id,
                                "dataset_count": len(datasets),
                            },
                        )
                        if len(datasets) > 1:
                            partial_recovery = True
                            unresolved_references.append("multiple_dashboard_datasets")
                    else:
                        partial_recovery = True
                        unresolved_references.append("dashboard_dataset_id_missing")
                else:
                    partial_recovery = True
                    unresolved_references.append("dashboard_dataset_binding_missing")
            elif chart_id is not None:
                # Chart URL: the chart->dataset binding is not resolved here, so
                # the session starts in explicit partial-recovery state.
                resource_type = "chart"
                partial_recovery = True
                unresolved_references.append("chart_dataset_binding_unresolved")
                dataset_ref = f"chart:{chart_id}"
                logger.reason(
                    "Accepted chart link with explicit partial recovery",
                    extra={"chart_id": chart_id},
                )
            else:
                logger.explore(
                    "Unsupported Superset link shape encountered",
                    extra={"path": parsed_url.path},
                )
                raise ValueError("Unsupported Superset link shape")
            if dataset_id is not None:
                # Best-effort canonicalization to "schema.table"; a lookup
                # failure downgrades to partial recovery but keeps the session usable.
                try:
                    dataset_detail = self.client.get_dataset_detail(dataset_id)
                    table_name = str(dataset_detail.get("table_name") or "").strip()
                    schema_name = str(dataset_detail.get("schema") or "").strip()
                    if table_name:
                        dataset_ref = (
                            f"{schema_name}.{table_name}" if schema_name else table_name
                        )
                        logger.reason(
                            "Canonicalized dataset reference from dataset detail",
                            extra={"dataset_ref": dataset_ref, "dataset_id": dataset_id},
                        )
                except Exception as exc:
                    partial_recovery = True
                    unresolved_references.append("dataset_detail_lookup_failed")
                    logger.explore(
                        "Dataset detail lookup failed during link parsing; keeping session usable",
                        extra={"dataset_id": dataset_id, "error": str(exc)},
                    )
            imported_filters = self._extract_imported_filters(query_state)
            result = SupersetParsedContext(
                source_url=normalized_link,
                dataset_ref=dataset_ref or "unresolved",
                dataset_id=dataset_id,
                dashboard_id=dashboard_id,
                chart_id=chart_id,
                resource_type=resource_type,
                query_state=query_state,
                imported_filters=imported_filters,
                unresolved_references=unresolved_references,
                partial_recovery=partial_recovery,
            )
            logger.reflect(
                "Superset link parsing completed",
                extra={
                    "dataset_ref": result.dataset_ref,
                    "dataset_id": result.dataset_id,
                    "dashboard_id": result.dashboard_id,
                    "chart_id": result.chart_id,
                    "partial_recovery": result.partial_recovery,
                    "unresolved_references": result.unresolved_references,
                    "imported_filters": len(result.imported_filters),
                },
            )
            return result
    # [/DEF:SupersetContextExtractor.parse_superset_link:Function]

    # [DEF:SupersetContextExtractor.recover_imported_filters:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Build imported filter entries from URL state and Superset-side saved context.
    def recover_imported_filters(self, parsed_context: SupersetParsedContext) -> List[Dict[str, Any]]:
        # Defensive copy so callers cannot mutate the parsed context in place.
        return list(parsed_context.imported_filters)
    # [/DEF:SupersetContextExtractor.recover_imported_filters:Function]

    # [DEF:SupersetContextExtractor.discover_template_variables:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Detect runtime variables and Jinja references from dataset query-bearing fields.
    # @PRE: dataset_payload is a Superset dataset-detail-like mapping; missing fields are tolerated.
    # @POST: returns one entry per (variable, source field) pair; empty list when no Jinja references exist.
    # @DATA_CONTRACT: Input[dataset_payload:Dict] -> Output[List[{name, source_field, kind, requires_confirmation}]]
    def discover_template_variables(self, dataset_payload: Dict[str, Any]) -> List[Dict[str, Any]]:
        # NOTE(review): assumes the Superset dataset-detail shape — top-level
        # "sql" plus "columns"/"metrics" entries carrying "expression" — confirm
        # against the deployed Superset API version.
        query_sources: List[tuple] = []
        sql_text = dataset_payload.get("sql")
        if isinstance(sql_text, str):
            query_sources.append(("sql", sql_text))
        for collection_name in ("columns", "metrics"):
            entries = dataset_payload.get(collection_name)
            if not isinstance(entries, list):
                continue
            for index, entry in enumerate(entries):
                if not isinstance(entry, dict):
                    continue
                expression = entry.get("expression")
                if isinstance(expression, str) and expression:
                    query_sources.append((f"{collection_name}[{index}].expression", expression))
        discovered: List[Dict[str, Any]] = []
        seen: set = set()
        for field_name, text in query_sources:
            for match in self._JINJA_VARIABLE_PATTERN.finditer(text):
                variable_name = match.group(1)
                dedup_key = (variable_name, field_name)
                # Report each variable once per source field.
                if dedup_key in seen:
                    continue
                seen.add(dedup_key)
                discovered.append(
                    {
                        "name": variable_name,
                        "source_field": field_name,
                        "kind": "jinja_variable",
                        # Runtime values are never fabricated; the reviewer must confirm them.
                        "requires_confirmation": True,
                    }
                )
        return discovered
    # [/DEF:SupersetContextExtractor.discover_template_variables:Function]

    # [DEF:SupersetContextExtractor.build_recovery_summary:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Summarize recovered, partial, and unresolved context for session state and UX.
    def build_recovery_summary(self, parsed_context: SupersetParsedContext) -> Dict[str, Any]:
        return {
            "dataset_ref": parsed_context.dataset_ref,
            "dataset_id": parsed_context.dataset_id,
            "dashboard_id": parsed_context.dashboard_id,
            "chart_id": parsed_context.chart_id,
            "partial_recovery": parsed_context.partial_recovery,
            # Copy the list so the summary cannot alias parsed-context state.
            "unresolved_references": list(parsed_context.unresolved_references),
            "imported_filter_count": len(parsed_context.imported_filters),
        }
    # [/DEF:SupersetContextExtractor.build_recovery_summary:Function]

    # [DEF:SupersetContextExtractor._extract_numeric_identifier:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Extract a numeric identifier from a REST-like Superset URL path.
    # @POST: returns the integer that directly follows resource_name in the path, or None.
    def _extract_numeric_identifier(self, path_parts: List[str], resource_name: str) -> Optional[int]:
        # EAFP: index() raising ValueError already covers the "not present"
        # case, so no separate membership pre-check is needed.
        try:
            resource_index = path_parts.index(resource_name)
        except ValueError:
            return None
        if resource_index + 1 >= len(path_parts):
            return None
        candidate = str(path_parts[resource_index + 1]).strip()
        # isdigit() rejects signs, decimals, and mixed tokens (e.g. "12abc").
        return int(candidate) if candidate.isdigit() else None
    # [/DEF:SupersetContextExtractor._extract_numeric_identifier:Function]

    # [DEF:SupersetContextExtractor._decode_query_state:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Decode query-string structures used by Superset URL state transport.
    # @POST: structured keys are JSON-decoded when possible; all other values kept as unquoted strings.
    def _decode_query_state(self, query_params: Dict[str, List[str]]) -> Dict[str, Any]:
        query_state: Dict[str, Any] = {}
        for key, values in query_params.items():
            if not values:
                continue
            # parse_qs collects repeats into a list; the last occurrence wins.
            raw_value = values[-1]
            decoded_value = unquote(raw_value)
            if key in {"native_filters", "native_filters_key", "form_data", "q"}:
                try:
                    query_state[key] = json.loads(decoded_value)
                    continue
                except Exception:
                    # Undecodable structured state is preserved raw rather than dropped.
                    logger.explore(
                        "Failed to decode structured Superset query state; preserving raw value",
                        extra={"key": key},
                    )
            query_state[key] = decoded_value
        return query_state
    # [/DEF:SupersetContextExtractor._decode_query_state:Function]

    # [DEF:SupersetContextExtractor._extract_imported_filters:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Normalize imported filters from decoded query state without fabricating missing values.
    # @POST: entries with a missing value are marked "partial" and require confirmation.
    def _extract_imported_filters(self, query_state: Dict[str, Any]) -> List[Dict[str, Any]]:
        imported_filters: List[Dict[str, Any]] = []
        native_filters_payload = query_state.get("native_filters")
        if isinstance(native_filters_payload, list):
            for index, item in enumerate(native_filters_payload):
                if not isinstance(item, dict):
                    continue
                filter_name = (
                    item.get("filter_name")
                    or item.get("column")
                    or item.get("name")
                    or f"native_filter_{index}"
                )
                imported_filters.append(
                    {
                        "filter_name": str(filter_name),
                        "raw_value": item.get("value"),
                        "display_name": item.get("label") or item.get("name"),
                        "source": "superset_url",
                        "recovery_status": "recovered"
                        if item.get("value") is not None
                        else "partial",
                        "requires_confirmation": item.get("value") is None,
                        "notes": "Recovered from Superset native filter URL state",
                    }
                )
        form_data_payload = query_state.get("form_data")
        if isinstance(form_data_payload, dict):
            extra_filters = form_data_payload.get("extra_filters")
            # Defensive: a malformed (non-list) payload yields no filters.
            if not isinstance(extra_filters, list):
                extra_filters = []
            for index, item in enumerate(extra_filters):
                if not isinstance(item, dict):
                    continue
                filter_name = item.get("col") or item.get("column") or f"extra_filter_{index}"
                imported_filters.append(
                    {
                        "filter_name": str(filter_name),
                        "raw_value": item.get("val"),
                        "display_name": item.get("label"),
                        "source": "superset_url",
                        "recovery_status": "recovered"
                        if item.get("val") is not None
                        else "partial",
                        "requires_confirmation": item.get("val") is None,
                        "notes": "Recovered from Superset form_data extra_filters",
                    }
                )
        return imported_filters
    # [/DEF:SupersetContextExtractor._extract_imported_filters:Function]
# [/DEF:SupersetContextExtractor:Class]
# [/DEF:SupersetContextExtractor:Module]