feat(us1): add dataset review orchestration automatic review slice
This commit is contained in:
334
backend/src/core/utils/superset_context_extractor.py
Normal file
334
backend/src/core/utils/superset_context_extractor.py
Normal file
@@ -0,0 +1,334 @@
# [DEF:SupersetContextExtractor:Module]
# @COMPLEXITY: 4
# @SEMANTICS: dataset_review, superset, link_parsing, context_recovery, partial_recovery
# @PURPOSE: Recover dataset and dashboard context from Superset links while preserving explicit partial-recovery markers.
# @LAYER: Infra
# @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient:Class]
# @RELATION: [DEPENDS_ON] ->[ImportedFilter]
# @RELATION: [DEPENDS_ON] ->[TemplateVariable]
# @PRE: Superset link or dataset reference must be parseable enough to resolve an environment-scoped target resource.
# @POST: Returns the best available recovered context with explicit provenance and partial-recovery markers when necessary.
# @SIDE_EFFECT: Performs upstream Superset API reads.
# @INVARIANT: Partial recovery is surfaced explicitly and never misrepresented as fully confirmed context.

from __future__ import annotations
|
||||
|
||||
# [DEF:SupersetContextExtractor.imports:Block]
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
from urllib.parse import parse_qs, unquote, urlparse
|
||||
|
||||
from src.core.config_models import Environment
|
||||
from src.core.logger import belief_scope, logger
|
||||
from src.core.superset_client import SupersetClient
|
||||
# [/DEF:SupersetContextExtractor.imports:Block]
|
||||
|
||||
|
||||
# [DEF:SupersetParsedContext:Class]
# @COMPLEXITY: 2
# @PURPOSE: Normalized output of Superset link parsing for session intake and recovery.
@dataclass
class SupersetParsedContext:
    """Normalized result of parsing one Superset link.

    Carries the resolved identifiers, decoded URL state, and explicit
    partial-recovery markers so downstream session intake never has to
    guess how much context was actually confirmed.
    """

    # Original (normalized) link the context was recovered from.
    source_url: str
    # Canonical dataset reference (e.g. "schema.table", "dataset:<id>") or "unresolved".
    dataset_ref: str
    dataset_id: Optional[int] = None
    dashboard_id: Optional[int] = None
    chart_id: Optional[int] = None
    # One of "dataset", "dashboard", "chart", or "unknown".
    resource_type: str = "unknown"
    # Decoded query-string state (native_filters, form_data, ...).
    query_state: Dict[str, Any] = field(default_factory=dict)
    # Normalized filter entries recovered from URL state.
    imported_filters: List[Dict[str, Any]] = field(default_factory=list)
    # Machine-readable markers naming what could not be confirmed.
    unresolved_references: List[str] = field(default_factory=list)
    # True whenever any part of the context could not be fully confirmed.
    partial_recovery: bool = False
# [/DEF:SupersetParsedContext:Class]
|
||||
|
||||
|
||||
# [DEF:SupersetContextExtractor:Class]
# @COMPLEXITY: 4
# @PURPOSE: Parse supported Superset URLs and recover canonical dataset/dashboard references for review-session intake.
# @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient]
# @PRE: constructor receives a configured environment with a usable Superset base URL.
# @POST: extractor instance is ready to parse links against one Superset environment.
# @SIDE_EFFECT: downstream parse operations may call Superset APIs through SupersetClient.
class SupersetContextExtractor:
    """Recover dataset/dashboard review context from Superset links.

    Supported link shapes are absolute http(s) URLs whose path contains a
    REST-like ``dataset/<id>``, ``dashboard/<id>`` or ``chart/<id>`` pair;
    anything else raises ``ValueError``. Whenever some identifier cannot be
    confirmed, the result carries ``partial_recovery=True`` plus a
    machine-readable marker in ``unresolved_references`` instead of guessing.
    """

    # [DEF:SupersetContextExtractor.__init__:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Bind extractor to one Superset environment and client instance.
    def __init__(self, environment: Environment, client: Optional[SupersetClient] = None) -> None:
        """Bind to *environment*; build a default SupersetClient when none is injected."""
        self.environment = environment
        # Client injection keeps the extractor testable without network access.
        self.client = client or SupersetClient(environment)
    # [/DEF:SupersetContextExtractor.__init__:Function]

    # [DEF:SupersetContextExtractor.parse_superset_link:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Extract candidate identifiers and query state from supported Superset URLs.
    # @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient]
    # @PRE: link is a non-empty Superset URL compatible with the configured environment.
    # @POST: returns resolved dataset/dashboard context, preserving explicit partial-recovery state if some identifiers cannot be confirmed.
    # @SIDE_EFFECT: may issue Superset API reads to resolve dataset references from dashboard or chart URLs.
    # @DATA_CONTRACT: Input[link:str] -> Output[SupersetParsedContext]
    def parse_superset_link(self, link: str) -> SupersetParsedContext:
        """Parse *link* and return the best recoverable review context.

        Raises:
            ValueError: if the link is empty, not an absolute http(s) URL,
                or its path matches no supported resource shape.
        """
        with belief_scope("SupersetContextExtractor.parse_superset_link"):
            normalized_link = str(link or "").strip()
            if not normalized_link:
                logger.explore("Rejected empty Superset link during intake")
                raise ValueError("Superset link must be non-empty")

            parsed_url = urlparse(normalized_link)
            if parsed_url.scheme not in {"http", "https"} or not parsed_url.netloc:
                logger.explore(
                    "Superset link is not a parseable absolute URL",
                    extra={"link": normalized_link},
                )
                raise ValueError("Superset link must be an absolute http(s) URL")

            logger.reason(
                "Parsing Superset link for dataset review intake",
                extra={"path": parsed_url.path, "query": parsed_url.query},
            )

            path_parts = [part for part in parsed_url.path.split("/") if part]
            query_params = parse_qs(parsed_url.query, keep_blank_values=True)
            query_state = self._decode_query_state(query_params)

            dataset_id = self._extract_numeric_identifier(path_parts, "dataset")
            dashboard_id = self._extract_numeric_identifier(path_parts, "dashboard")
            chart_id = self._extract_numeric_identifier(path_parts, "chart")

            resource_type = "unknown"
            dataset_ref: Optional[str] = None
            partial_recovery = False
            unresolved_references: List[str] = []

            if dataset_id is not None:
                # Direct dataset link: fully confirmed context, no API call needed here.
                resource_type = "dataset"
                dataset_ref = f"dataset:{dataset_id}"
                logger.reason(
                    "Resolved direct dataset link",
                    extra={"dataset_id": dataset_id},
                )
            elif dashboard_id is not None:
                resource_type = "dashboard"
                logger.reason(
                    "Resolving dashboard-bound dataset from Superset",
                    extra={"dashboard_id": dashboard_id},
                )
                dashboard_detail = self.client.get_dashboard_detail(dashboard_id)
                datasets = dashboard_detail.get("datasets") or []
                if datasets:
                    # Take the first bound dataset; more than one makes the
                    # choice ambiguous and must be flagged as partial recovery.
                    first_dataset = datasets[0]
                    resolved_dataset_id = first_dataset.get("id")
                    if resolved_dataset_id is not None:
                        dataset_id = int(resolved_dataset_id)
                        dataset_ref = f"dataset:{dataset_id}"
                        logger.reason(
                            "Recovered dataset reference from dashboard context",
                            extra={
                                "dashboard_id": dashboard_id,
                                "dataset_id": dataset_id,
                                "dataset_count": len(datasets),
                            },
                        )
                        if len(datasets) > 1:
                            partial_recovery = True
                            unresolved_references.append("multiple_dashboard_datasets")
                    else:
                        partial_recovery = True
                        unresolved_references.append("dashboard_dataset_id_missing")
                else:
                    partial_recovery = True
                    unresolved_references.append("dashboard_dataset_binding_missing")
            elif chart_id is not None:
                # Chart links are accepted, but the chart->dataset binding is not
                # resolved here, so the context is explicitly partial.
                resource_type = "chart"
                partial_recovery = True
                unresolved_references.append("chart_dataset_binding_unresolved")
                dataset_ref = f"chart:{chart_id}"
                logger.reason(
                    "Accepted chart link with explicit partial recovery",
                    extra={"chart_id": chart_id},
                )
            else:
                logger.explore(
                    "Unsupported Superset link shape encountered",
                    extra={"path": parsed_url.path},
                )
                raise ValueError("Unsupported Superset link shape")

            if dataset_id is not None:
                # Best-effort canonicalization to "schema.table"; a lookup failure
                # must not kill the session, only mark the context as partial.
                try:
                    dataset_detail = self.client.get_dataset_detail(dataset_id)
                    table_name = str(dataset_detail.get("table_name") or "").strip()
                    schema_name = str(dataset_detail.get("schema") or "").strip()
                    if table_name:
                        dataset_ref = (
                            f"{schema_name}.{table_name}" if schema_name else table_name
                        )
                        logger.reason(
                            "Canonicalized dataset reference from dataset detail",
                            extra={"dataset_ref": dataset_ref, "dataset_id": dataset_id},
                        )
                except Exception as exc:
                    partial_recovery = True
                    unresolved_references.append("dataset_detail_lookup_failed")
                    logger.explore(
                        "Dataset detail lookup failed during link parsing; keeping session usable",
                        extra={"dataset_id": dataset_id, "error": str(exc)},
                    )

            imported_filters = self._extract_imported_filters(query_state)
            result = SupersetParsedContext(
                source_url=normalized_link,
                dataset_ref=dataset_ref or "unresolved",
                dataset_id=dataset_id,
                dashboard_id=dashboard_id,
                chart_id=chart_id,
                resource_type=resource_type,
                query_state=query_state,
                imported_filters=imported_filters,
                unresolved_references=unresolved_references,
                partial_recovery=partial_recovery,
            )
            logger.reflect(
                "Superset link parsing completed",
                extra={
                    "dataset_ref": result.dataset_ref,
                    "dataset_id": result.dataset_id,
                    "dashboard_id": result.dashboard_id,
                    "chart_id": result.chart_id,
                    "partial_recovery": result.partial_recovery,
                    "unresolved_references": result.unresolved_references,
                    "imported_filters": len(result.imported_filters),
                },
            )
            return result
    # [/DEF:SupersetContextExtractor.parse_superset_link:Function]

    # [DEF:SupersetContextExtractor.recover_imported_filters:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Build imported filter entries from URL state and Superset-side saved context.
    def recover_imported_filters(self, parsed_context: SupersetParsedContext) -> List[Dict[str, Any]]:
        """Return a defensive copy of the filters already recovered during parsing."""
        return list(parsed_context.imported_filters)
    # [/DEF:SupersetContextExtractor.recover_imported_filters:Function]

    # [DEF:SupersetContextExtractor.discover_template_variables:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Detect runtime variables and Jinja references from dataset query-bearing fields.
    def discover_template_variables(self, dataset_payload: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Stub: template-variable discovery is not implemented yet; always returns []."""
        return []
    # [/DEF:SupersetContextExtractor.discover_template_variables:Function]

    # [DEF:SupersetContextExtractor.build_recovery_summary:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Summarize recovered, partial, and unresolved context for session state and UX.
    def build_recovery_summary(self, parsed_context: SupersetParsedContext) -> Dict[str, Any]:
        """Flatten *parsed_context* into a plain dict for session state and UX."""
        return {
            "dataset_ref": parsed_context.dataset_ref,
            "dataset_id": parsed_context.dataset_id,
            "dashboard_id": parsed_context.dashboard_id,
            "chart_id": parsed_context.chart_id,
            "partial_recovery": parsed_context.partial_recovery,
            "unresolved_references": list(parsed_context.unresolved_references),
            "imported_filter_count": len(parsed_context.imported_filters),
        }
    # [/DEF:SupersetContextExtractor.build_recovery_summary:Function]

    # [DEF:SupersetContextExtractor._extract_numeric_identifier:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Extract a numeric identifier from a REST-like Superset URL path.
    def _extract_numeric_identifier(self, path_parts: List[str], resource_name: str) -> Optional[int]:
        """Return the int immediately following *resource_name* in *path_parts*, else None.

        Fix: the previous version did a redundant ``resource_name not in
        path_parts`` membership scan before calling ``list.index`` inside a
        ``try/except ValueError`` that already covers the absent case; a
        single EAFP lookup is equivalent and avoids the double scan.
        """
        try:
            resource_index = path_parts.index(resource_name)
        except ValueError:
            return None

        if resource_index + 1 >= len(path_parts):
            return None

        candidate = str(path_parts[resource_index + 1]).strip()
        if not candidate.isdigit():
            return None
        return int(candidate)
    # [/DEF:SupersetContextExtractor._extract_numeric_identifier:Function]

    # [DEF:SupersetContextExtractor._decode_query_state:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Decode query-string structures used by Superset URL state transport.
    def _decode_query_state(self, query_params: Dict[str, List[str]]) -> Dict[str, Any]:
        """Decode ``parse_qs`` output; JSON-decode known structured keys when possible.

        For repeated query keys only the last value wins. Structured keys
        that fail JSON decoding are preserved as their raw unquoted string
        rather than being dropped.
        """
        query_state: Dict[str, Any] = {}
        for key, values in query_params.items():
            if not values:
                continue
            raw_value = values[-1]  # last occurrence wins for repeated keys
            decoded_value = unquote(raw_value)
            if key in {"native_filters", "native_filters_key", "form_data", "q"}:
                try:
                    query_state[key] = json.loads(decoded_value)
                    continue
                except Exception:
                    logger.explore(
                        "Failed to decode structured Superset query state; preserving raw value",
                        extra={"key": key},
                    )
            query_state[key] = decoded_value
        return query_state
    # [/DEF:SupersetContextExtractor._decode_query_state:Function]

    # [DEF:SupersetContextExtractor._extract_imported_filters:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Normalize imported filters from decoded query state without fabricating missing values.
    def _extract_imported_filters(self, query_state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Normalize ``native_filters`` and ``form_data.extra_filters`` entries.

        Entries whose value is missing are marked ``"partial"`` with
        ``requires_confirmation=True`` instead of fabricating a value.
        """
        imported_filters: List[Dict[str, Any]] = []

        native_filters_payload = query_state.get("native_filters")
        if isinstance(native_filters_payload, list):
            for index, item in enumerate(native_filters_payload):
                if not isinstance(item, dict):
                    continue
                # Best available name, falling back to a positional placeholder.
                filter_name = (
                    item.get("filter_name")
                    or item.get("column")
                    or item.get("name")
                    or f"native_filter_{index}"
                )
                imported_filters.append(
                    {
                        "filter_name": str(filter_name),
                        "raw_value": item.get("value"),
                        "display_name": item.get("label") or item.get("name"),
                        "source": "superset_url",
                        "recovery_status": "recovered"
                        if item.get("value") is not None
                        else "partial",
                        "requires_confirmation": item.get("value") is None,
                        "notes": "Recovered from Superset native filter URL state",
                    }
                )

        form_data_payload = query_state.get("form_data")
        if isinstance(form_data_payload, dict):
            extra_filters = form_data_payload.get("extra_filters") or []
            for index, item in enumerate(extra_filters):
                if not isinstance(item, dict):
                    continue
                filter_name = item.get("col") or item.get("column") or f"extra_filter_{index}"
                imported_filters.append(
                    {
                        "filter_name": str(filter_name),
                        "raw_value": item.get("val"),
                        "display_name": item.get("label"),
                        "source": "superset_url",
                        "recovery_status": "recovered"
                        if item.get("val") is not None
                        else "partial",
                        "requires_confirmation": item.get("val") is None,
                        "notes": "Recovered from Superset form_data extra_filters",
                    }
                )

        return imported_filters
    # [/DEF:SupersetContextExtractor._extract_imported_filters:Function]
# [/DEF:SupersetContextExtractor:Class]
|
||||
|
||||
# [/DEF:SupersetContextExtractor:Module]
|
||||
Reference in New Issue
Block a user