feat(us1): add automatic review slice to dataset review orchestration

This commit is contained in:
2026-03-17 10:57:49 +03:00
parent e916cb1f17
commit 023bacde39
24 changed files with 4870 additions and 131 deletions

View File

@@ -0,0 +1,334 @@
# [DEF:SupersetContextExtractor:Module]
# @COMPLEXITY: 4
# @SEMANTICS: dataset_review, superset, link_parsing, context_recovery, partial_recovery
# @PURPOSE: Recover dataset and dashboard context from Superset links while preserving explicit partial-recovery markers.
# @LAYER: Infra
# @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient:Class]
# @RELATION: [DEPENDS_ON] ->[ImportedFilter]
# @RELATION: [DEPENDS_ON] ->[TemplateVariable]
# @PRE: Superset link or dataset reference must be parseable enough to resolve an environment-scoped target resource.
# @POST: Returns the best available recovered context with explicit provenance and partial-recovery markers when necessary.
# @SIDE_EFFECT: Performs upstream Superset API reads.
# @INVARIANT: Partial recovery is surfaced explicitly and never misrepresented as fully confirmed context.
from __future__ import annotations
# [DEF:SupersetContextExtractor.imports:Block]
import json
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from urllib.parse import parse_qs, unquote, urlparse
from src.core.config_models import Environment
from src.core.logger import belief_scope, logger
from src.core.superset_client import SupersetClient
# [/DEF:SupersetContextExtractor.imports:Block]
# [DEF:SupersetParsedContext:Class]
# @COMPLEXITY: 2
# @PURPOSE: Normalized output of Superset link parsing for session intake and recovery.
@dataclass
class SupersetParsedContext:
    # Original link as supplied by the caller (whitespace-trimmed).
    source_url: str
    # Canonical dataset reference. Observed formats: "schema.table" or "table"
    # (from dataset-detail lookup), "dataset:<id>", "chart:<id>", or
    # "unresolved" when no identifier could be confirmed.
    dataset_ref: str
    # Numeric Superset identifiers when the URL path (or a dashboard-detail
    # API lookup) yields them; None otherwise.
    dataset_id: Optional[int] = None
    dashboard_id: Optional[int] = None
    chart_id: Optional[int] = None
    # One of "dataset", "dashboard", "chart", or "unknown".
    resource_type: str = "unknown"
    # Decoded query-string state keyed by parameter name; structured keys
    # (native_filters, form_data, ...) are JSON-decoded when possible.
    query_state: Dict[str, Any] = field(default_factory=dict)
    # Filter entries recovered from URL state (native filters / extra_filters).
    imported_filters: List[Dict[str, Any]] = field(default_factory=list)
    # Marker strings naming exactly what could not be resolved, e.g.
    # "dashboard_dataset_id_missing" or "chart_dataset_binding_unresolved".
    unresolved_references: List[str] = field(default_factory=list)
    # True whenever any identifier or binding could not be fully confirmed;
    # invariant: partial recovery is surfaced, never hidden.
    partial_recovery: bool = False
# [/DEF:SupersetParsedContext:Class]
# [DEF:SupersetContextExtractor:Class]
# @COMPLEXITY: 4
# @PURPOSE: Parse supported Superset URLs and recover canonical dataset/dashboard references for review-session intake.
# @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient]
# @PRE: constructor receives a configured environment with a usable Superset base URL.
# @POST: extractor instance is ready to parse links against one Superset environment.
# @SIDE_EFFECT: downstream parse operations may call Superset APIs through SupersetClient.
class SupersetContextExtractor:
    # Matches Jinja variable/callable openings such as "{{ ds" or
    # "{{ filter_values" and captures the dotted reference name.
    _JINJA_VARIABLE_PATTERN = re.compile(r"\{\{\s*([A-Za-z_][\w.]*)")

    # [DEF:SupersetContextExtractor.__init__:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Bind extractor to one Superset environment and client instance.
    def __init__(self, environment: Environment, client: Optional[SupersetClient] = None) -> None:
        self.environment = environment
        # Allow client injection (tests/reuse); default to one bound to this environment.
        self.client = client or SupersetClient(environment)
    # [/DEF:SupersetContextExtractor.__init__:Function]

    # [DEF:SupersetContextExtractor.parse_superset_link:Function]
    # @COMPLEXITY: 4
    # @PURPOSE: Extract candidate identifiers and query state from supported Superset URLs.
    # @RELATION: [CALLS] ->[backend.src.core.superset_client.SupersetClient]
    # @PRE: link is a non-empty Superset URL compatible with the configured environment.
    # @POST: returns resolved dataset/dashboard context, preserving explicit partial-recovery state if some identifiers cannot be confirmed.
    # @SIDE_EFFECT: may issue Superset API reads to resolve dataset references from dashboard or chart URLs.
    # @DATA_CONTRACT: Input[link:str] -> Output[SupersetParsedContext]
    def parse_superset_link(self, link: str) -> SupersetParsedContext:
        with belief_scope("SupersetContextExtractor.parse_superset_link"):
            normalized_link = str(link or "").strip()
            if not normalized_link:
                logger.explore("Rejected empty Superset link during intake")
                raise ValueError("Superset link must be non-empty")
            parsed_url = urlparse(normalized_link)
            # Only absolute http(s) URLs are parseable against an environment base.
            if parsed_url.scheme not in {"http", "https"} or not parsed_url.netloc:
                logger.explore(
                    "Superset link is not a parseable absolute URL",
                    extra={"link": normalized_link},
                )
                raise ValueError("Superset link must be an absolute http(s) URL")
            logger.reason(
                "Parsing Superset link for dataset review intake",
                extra={"path": parsed_url.path, "query": parsed_url.query},
            )
            path_parts = [part for part in parsed_url.path.split("/") if part]
            query_params = parse_qs(parsed_url.query, keep_blank_values=True)
            query_state = self._decode_query_state(query_params)
            dataset_id = self._extract_numeric_identifier(path_parts, "dataset")
            dashboard_id = self._extract_numeric_identifier(path_parts, "dashboard")
            chart_id = self._extract_numeric_identifier(path_parts, "chart")
            resource_type = "unknown"
            dataset_ref: Optional[str] = None
            partial_recovery = False
            unresolved_references: List[str] = []
            if dataset_id is not None:
                # Direct dataset URL: fully resolved locally, no API call needed here.
                resource_type = "dataset"
                dataset_ref = f"dataset:{dataset_id}"
                logger.reason(
                    "Resolved direct dataset link",
                    extra={"dataset_id": dataset_id},
                )
            elif dashboard_id is not None:
                # Dashboard URL: resolve its bound dataset(s) via the Superset API.
                resource_type = "dashboard"
                logger.reason(
                    "Resolving dashboard-bound dataset from Superset",
                    extra={"dashboard_id": dashboard_id},
                )
                dashboard_detail = self.client.get_dashboard_detail(dashboard_id)
                datasets = dashboard_detail.get("datasets") or []
                if datasets:
                    # Pick the first bound dataset; more than one means the
                    # choice is ambiguous and must be marked partial.
                    first_dataset = datasets[0]
                    resolved_dataset_id = first_dataset.get("id")
                    if resolved_dataset_id is not None:
                        dataset_id = int(resolved_dataset_id)
                        dataset_ref = f"dataset:{dataset_id}"
                        logger.reason(
                            "Recovered dataset reference from dashboard context",
                            extra={
                                "dashboard_id": dashboard_id,
                                "dataset_id": dataset_id,
                                "dataset_count": len(datasets),
                            },
                        )
                        if len(datasets) > 1:
                            partial_recovery = True
                            unresolved_references.append("multiple_dashboard_datasets")
                    else:
                        partial_recovery = True
                        unresolved_references.append("dashboard_dataset_id_missing")
                else:
                    partial_recovery = True
                    unresolved_references.append("dashboard_dataset_binding_missing")
            elif chart_id is not None:
                # Chart URL: the chart->dataset binding is not resolved here, so
                # the session starts in explicit partial-recovery state.
                resource_type = "chart"
                partial_recovery = True
                unresolved_references.append("chart_dataset_binding_unresolved")
                dataset_ref = f"chart:{chart_id}"
                logger.reason(
                    "Accepted chart link with explicit partial recovery",
                    extra={"chart_id": chart_id},
                )
            else:
                logger.explore(
                    "Unsupported Superset link shape encountered",
                    extra={"path": parsed_url.path},
                )
                raise ValueError("Unsupported Superset link shape")
            if dataset_id is not None:
                # Best-effort canonicalization to "schema.table"; a lookup
                # failure downgrades to partial recovery but keeps the session usable.
                try:
                    dataset_detail = self.client.get_dataset_detail(dataset_id)
                    table_name = str(dataset_detail.get("table_name") or "").strip()
                    schema_name = str(dataset_detail.get("schema") or "").strip()
                    if table_name:
                        dataset_ref = (
                            f"{schema_name}.{table_name}" if schema_name else table_name
                        )
                        logger.reason(
                            "Canonicalized dataset reference from dataset detail",
                            extra={"dataset_ref": dataset_ref, "dataset_id": dataset_id},
                        )
                except Exception as exc:
                    partial_recovery = True
                    unresolved_references.append("dataset_detail_lookup_failed")
                    logger.explore(
                        "Dataset detail lookup failed during link parsing; keeping session usable",
                        extra={"dataset_id": dataset_id, "error": str(exc)},
                    )
            imported_filters = self._extract_imported_filters(query_state)
            result = SupersetParsedContext(
                source_url=normalized_link,
                dataset_ref=dataset_ref or "unresolved",
                dataset_id=dataset_id,
                dashboard_id=dashboard_id,
                chart_id=chart_id,
                resource_type=resource_type,
                query_state=query_state,
                imported_filters=imported_filters,
                unresolved_references=unresolved_references,
                partial_recovery=partial_recovery,
            )
            logger.reflect(
                "Superset link parsing completed",
                extra={
                    "dataset_ref": result.dataset_ref,
                    "dataset_id": result.dataset_id,
                    "dashboard_id": result.dashboard_id,
                    "chart_id": result.chart_id,
                    "partial_recovery": result.partial_recovery,
                    "unresolved_references": result.unresolved_references,
                    "imported_filters": len(result.imported_filters),
                },
            )
            return result
    # [/DEF:SupersetContextExtractor.parse_superset_link:Function]

    # [DEF:SupersetContextExtractor.recover_imported_filters:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Build imported filter entries from URL state and Superset-side saved context.
    def recover_imported_filters(self, parsed_context: SupersetParsedContext) -> List[Dict[str, Any]]:
        # Defensive copy so callers cannot mutate the parsed context in place.
        return list(parsed_context.imported_filters)
    # [/DEF:SupersetContextExtractor.recover_imported_filters:Function]

    # [DEF:SupersetContextExtractor.discover_template_variables:Function]
    # @COMPLEXITY: 3
    # @PURPOSE: Detect runtime variables and Jinja references from dataset query-bearing fields.
    # @PRE: dataset_payload is a Superset dataset-detail-like mapping; missing fields are tolerated.
    # @POST: returns one entry per (variable, source field) pair; empty list when no Jinja references exist.
    # @DATA_CONTRACT: Input[dataset_payload:Dict] -> Output[List[{name, source_field, kind, requires_confirmation}]]
    def discover_template_variables(self, dataset_payload: Dict[str, Any]) -> List[Dict[str, Any]]:
        # NOTE(review): assumes the Superset dataset-detail shape — top-level
        # "sql" plus "columns"/"metrics" entries carrying "expression" — confirm
        # against the deployed Superset API version.
        query_sources: List[tuple] = []
        sql_text = dataset_payload.get("sql")
        if isinstance(sql_text, str):
            query_sources.append(("sql", sql_text))
        for collection_name in ("columns", "metrics"):
            entries = dataset_payload.get(collection_name)
            if not isinstance(entries, list):
                continue
            for index, entry in enumerate(entries):
                if not isinstance(entry, dict):
                    continue
                expression = entry.get("expression")
                if isinstance(expression, str) and expression:
                    query_sources.append((f"{collection_name}[{index}].expression", expression))
        discovered: List[Dict[str, Any]] = []
        seen: set = set()
        for field_name, text in query_sources:
            for match in self._JINJA_VARIABLE_PATTERN.finditer(text):
                variable_name = match.group(1)
                dedup_key = (variable_name, field_name)
                # Report each variable once per source field.
                if dedup_key in seen:
                    continue
                seen.add(dedup_key)
                discovered.append(
                    {
                        "name": variable_name,
                        "source_field": field_name,
                        "kind": "jinja_variable",
                        # Runtime values are never fabricated; the reviewer must confirm them.
                        "requires_confirmation": True,
                    }
                )
        return discovered
    # [/DEF:SupersetContextExtractor.discover_template_variables:Function]

    # [DEF:SupersetContextExtractor.build_recovery_summary:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Summarize recovered, partial, and unresolved context for session state and UX.
    def build_recovery_summary(self, parsed_context: SupersetParsedContext) -> Dict[str, Any]:
        return {
            "dataset_ref": parsed_context.dataset_ref,
            "dataset_id": parsed_context.dataset_id,
            "dashboard_id": parsed_context.dashboard_id,
            "chart_id": parsed_context.chart_id,
            "partial_recovery": parsed_context.partial_recovery,
            # Copy the list so the summary cannot alias parsed-context state.
            "unresolved_references": list(parsed_context.unresolved_references),
            "imported_filter_count": len(parsed_context.imported_filters),
        }
    # [/DEF:SupersetContextExtractor.build_recovery_summary:Function]

    # [DEF:SupersetContextExtractor._extract_numeric_identifier:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Extract a numeric identifier from a REST-like Superset URL path.
    # @POST: returns the integer that directly follows resource_name in the path, or None.
    def _extract_numeric_identifier(self, path_parts: List[str], resource_name: str) -> Optional[int]:
        # EAFP: index() raising ValueError already covers the "not present"
        # case, so no separate membership pre-check is needed.
        try:
            resource_index = path_parts.index(resource_name)
        except ValueError:
            return None
        if resource_index + 1 >= len(path_parts):
            return None
        candidate = str(path_parts[resource_index + 1]).strip()
        # isdigit() rejects signs, decimals, and mixed tokens (e.g. "12abc").
        return int(candidate) if candidate.isdigit() else None
    # [/DEF:SupersetContextExtractor._extract_numeric_identifier:Function]

    # [DEF:SupersetContextExtractor._decode_query_state:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Decode query-string structures used by Superset URL state transport.
    # @POST: structured keys are JSON-decoded when possible; all other values kept as unquoted strings.
    def _decode_query_state(self, query_params: Dict[str, List[str]]) -> Dict[str, Any]:
        query_state: Dict[str, Any] = {}
        for key, values in query_params.items():
            if not values:
                continue
            # parse_qs collects repeats into a list; the last occurrence wins.
            raw_value = values[-1]
            decoded_value = unquote(raw_value)
            if key in {"native_filters", "native_filters_key", "form_data", "q"}:
                try:
                    query_state[key] = json.loads(decoded_value)
                    continue
                except Exception:
                    # Undecodable structured state is preserved raw rather than dropped.
                    logger.explore(
                        "Failed to decode structured Superset query state; preserving raw value",
                        extra={"key": key},
                    )
            query_state[key] = decoded_value
        return query_state
    # [/DEF:SupersetContextExtractor._decode_query_state:Function]

    # [DEF:SupersetContextExtractor._extract_imported_filters:Function]
    # @COMPLEXITY: 2
    # @PURPOSE: Normalize imported filters from decoded query state without fabricating missing values.
    # @POST: entries with a missing value are marked "partial" and require confirmation.
    def _extract_imported_filters(self, query_state: Dict[str, Any]) -> List[Dict[str, Any]]:
        imported_filters: List[Dict[str, Any]] = []
        native_filters_payload = query_state.get("native_filters")
        if isinstance(native_filters_payload, list):
            for index, item in enumerate(native_filters_payload):
                if not isinstance(item, dict):
                    continue
                filter_name = (
                    item.get("filter_name")
                    or item.get("column")
                    or item.get("name")
                    or f"native_filter_{index}"
                )
                imported_filters.append(
                    {
                        "filter_name": str(filter_name),
                        "raw_value": item.get("value"),
                        "display_name": item.get("label") or item.get("name"),
                        "source": "superset_url",
                        "recovery_status": "recovered"
                        if item.get("value") is not None
                        else "partial",
                        "requires_confirmation": item.get("value") is None,
                        "notes": "Recovered from Superset native filter URL state",
                    }
                )
        form_data_payload = query_state.get("form_data")
        if isinstance(form_data_payload, dict):
            extra_filters = form_data_payload.get("extra_filters")
            # Defensive: a malformed (non-list) payload yields no filters.
            if not isinstance(extra_filters, list):
                extra_filters = []
            for index, item in enumerate(extra_filters):
                if not isinstance(item, dict):
                    continue
                filter_name = item.get("col") or item.get("column") or f"extra_filter_{index}"
                imported_filters.append(
                    {
                        "filter_name": str(filter_name),
                        "raw_value": item.get("val"),
                        "display_name": item.get("label"),
                        "source": "superset_url",
                        "recovery_status": "recovered"
                        if item.get("val") is not None
                        else "partial",
                        "requires_confirmation": item.get("val") is None,
                        "notes": "Recovered from Superset form_data extra_filters",
                    }
                )
        return imported_filters
    # [/DEF:SupersetContextExtractor._extract_imported_filters:Function]
# [/DEF:SupersetContextExtractor:Class]
# [/DEF:SupersetContextExtractor:Module]