ECP

milind-jain-uipath · milind-jain-uipath · commit 12b31a2d0807 · 2026-03-24T10:43:14.000+05:30
diff --git a/src/uipath_langchain/agent/react/init_node.py b/src/uipath_langchain/agent/react/init_node.py
@@ -22,34 +22,31 @@ async def _build_schema_context(entities: list) -> str:
         format_schemas_for_context,
     )
 
+    ecp_enabled = False
     try:
         from uipath.core.feature_flags import FeatureFlags
 
-        flag_value = FeatureFlags.is_flag_enabled("EnableEntityContextPackEnrichment")
-        with open("/tmp/init_node_debug.log", "a") as _dbg:
-            _dbg.write(f"[ECP] flag value: {flag_value}\n")
-            _dbg.write(f"[ECP] all flags: {FeatureFlags._flags if hasattr(FeatureFlags, '_flags') else 'no _flags attr'}\n")
+        ecp_enabled = FeatureFlags.is_flag_enabled(
+            "EnableEntityContextPackEnrichment"
+        )
+    except Exception:
+        logger.info("Feature flags unavailable, using basic schema context")
 
-        if flag_value:
+    if ecp_enabled:
+        try:
             from uipath_langchain.agent.tools.datafabric_tool import (
                 build_entity_context_packs,
                 format_ecp_for_context,
             )
 
-            with open("/tmp/init_node_debug.log", "a") as _dbg:
-                _dbg.write("[ECP] Building enriched ECPs\n")
+            logger.info("Building enriched Entity Context Packs")
             context_packs = await build_entity_context_packs(entities)
-            with open("/tmp/init_node_ecp.json", "w") as _ef:
-                import json
-                _ef.write(json.dumps([p.to_dict() for p in context_packs], indent=2, default=str))
             return format_ecp_for_context(context_packs)
-    except Exception as e:
-        with open("/tmp/init_node_debug.log", "a") as _dbg:
-            _dbg.write(f"[ECP] EXCEPTION: {type(e).__name__}: {e}\n")
-        logger.warning(
-            "ECP enrichment failed, falling back to basic schema",
-            exc_info=True,
-        )
+        except Exception:
+            logger.warning(
+                "ECP enrichment failed, falling back to basic schema",
+                exc_info=True,
+            )
 
     return format_schemas_for_context(entities)
 
@@ -64,9 +61,6 @@ def create_init_node(
     async def graph_state_init(state: Any) -> Any:
         # --- Data Fabric schema fetch (INIT-time) ---
         schema_context: str | None = None
-        # Debug: write to file since robot swallows stdout/stderr
-        with open("/tmp/init_node_debug.log", "a") as _dbg:
-            _dbg.write(f"[INIT_NODE] resources_for_init present: {resources_for_init is not None}\n")
         if resources_for_init:
             from uipath_langchain.agent.tools.datafabric_tool import (
                 fetch_entity_schemas,
@@ -76,27 +70,13 @@ async def graph_state_init(state: Any) -> Any:
             entity_identifiers = get_datafabric_entity_identifiers_from_resources(
                 resources_for_init
             )
-            with open("/tmp/init_node_debug.log", "a") as _dbg:
-                _dbg.write(f"[INIT_NODE] entity_identifiers: {entity_identifiers}\n")
             if entity_identifiers:
                 logger.info(
                     "Fetching Data Fabric schemas for %d identifier(s)",
                     len(entity_identifiers),
                 )
                 entities = await fetch_entity_schemas(entity_identifiers)
-                with open("/tmp/init_node_debug.log", "a") as _dbg:
-                    _dbg.write(f"[INIT_NODE] fetched {len(entities)} entities\n")
                 schema_context = await _build_schema_context(entities)
-                with open("/tmp/init_node_debug.log", "a") as _dbg:
-                    _dbg.write(f"[INIT_NODE] schema_context length: {len(schema_context) if schema_context else 0}\n")
-                with open("/tmp/init_node_schema.txt", "w") as _sf:
-                    _sf.write(schema_context or "")
-                if schema_context:
-                    logger.info(
-                        "Schema context length: %d chars, starts with: %.200s",
-                        len(schema_context),
-                        schema_context,
-                    )
 
         # --- Resolve messages ---
         resolved_messages: Sequence[SystemMessage | HumanMessage] | Overwrite
@@ -110,10 +90,13 @@ async def graph_state_init(state: Any) -> Any:
         else:
             resolved_messages = list(messages)
 
-        # Debug: dump the full system prompt the LLM will see
-        _msgs = resolved_messages.value if isinstance(resolved_messages, Overwrite) else resolved_messages
+        # Log the full system prompt for debugging
+        _msgs = (
+            resolved_messages.value
+            if isinstance(resolved_messages, Overwrite)
+            else resolved_messages
+        )
         for _m in _msgs:
             if isinstance(_m, SystemMessage):
-                with open("/tmp/init_node_full_system_prompt.txt", "w") as _fp:
-                    _fp.write(str(_m.content))
+                logger.debug("Full system prompt:\n%s", _m.content)
                 break
diff --git a/src/uipath_langchain/agent/tools/datafabric_tool/entity_context_pack.py b/src/uipath_langchain/agent/tools/datafabric_tool/entity_context_pack.py
@@ -1,4 +1,4 @@
-"""Entity Context Pack - Rich metadata for prod Text2SQL optimization.
+"""Entity Context Pack — rich metadata for Text2SQL optimization.
 
 Builds ECPs from Data Fabric entity metadata + sample data at INIT time.
 No LLM generation — synonyms from field.description, samples from DF API,
@@ -10,16 +10,12 @@
 import logging
 import re
 from dataclasses import dataclass, field
-from functools import lru_cache
-from pathlib import Path
 from typing import Any
 
-from uipath.platform.entities import Entity, FieldMetadata
+from uipath.platform.entities import Entity
 
 logger = logging.getLogger(__name__)
 
-_PROMPTS_DIR = Path(__file__).parent
-
 # --- Type classification sets ---
 
 _NUMERIC_TYPES = frozenset({
@@ -38,6 +34,7 @@
 
 
 # --- Dataclasses ---
+# to_dict() omits falsy/None optional fields to save tokens; QueryCapabilities emits all lists.
 
 
 @dataclass
@@ -57,11 +54,7 @@ class ColumnContext:
     reference_entity: str | None = None
 
     def to_dict(self) -> dict[str, Any]:
-        """Serialize to JSON-compatible dict."""
-        d: dict[str, Any] = {
-            "name": self.name,
-            "type": self.type,
-        }
+        d: dict[str, Any] = {"name": self.name, "type": self.type}
         if self.description:
             d["description"] = self.description
         if self.synonyms:
@@ -74,12 +67,76 @@ def to_dict(self) -> dict[str, Any]:
             d["is_foreign_key"] = True
             if self.reference_entity:
                 d["reference_entity"] = self.reference_entity
-        d["is_numeric"] = self.is_numeric
-        d["is_temporal"] = self.is_temporal
-        d["is_categorical"] = self.is_categorical
+        if self.is_numeric:
+            d["is_numeric"] = True
+        if self.is_temporal:
+            d["is_temporal"] = True
+        if self.is_categorical:
+            d["is_categorical"] = True
         return d
 
 
+@dataclass
+class QueryCapabilities:
+    """Structured SQL capabilities for LLM parsing.
+
+    Intentionally duplicates some sql_constraints.txt content in a
+    machine-parseable format alongside the free-text rules.
+    """
+
+    allowed_clauses: list[str] = field(default_factory=lambda: [
+        "SELECT", "WHERE", "GROUP BY", "HAVING", "ORDER BY",
+        "LIMIT", "OFFSET", "DISTINCT", "LEFT JOIN",
+    ])
+    allowed_aggregations: list[str] = field(default_factory=lambda: [
+        "COUNT(column_name)", "SUM", "AVG", "MIN", "MAX",
+    ])
+    allowed_expressions: list[str] = field(default_factory=lambda: [
+        "CASE/WHEN", "CAST", "COALESCE", "NULLIF",
+        "ROUND", "ABS", "LOWER", "UPPER", "TRIM",
+        "arithmetic (+, -, *, /)", "string concat (||)",
+    ])
+    allowed_predicates: list[str] = field(default_factory=lambda: [
+        "=", "<>", ">", "<", ">=", "<=",
+        "BETWEEN", "IN", "LIKE", "IS NULL", "IS NOT NULL",
+        "AND", "OR",
+    ])
+    disallowed: list[str] = field(default_factory=lambda: [
+        "SELECT *",
+        "COUNT(*) — use COUNT(column_name)",
+        "COUNT(DISTINCT ...) — no DISTINCT in aggregates",
+        "subqueries in any clause",
+        "UNION / INTERSECT / EXCEPT",
+        "CTE (WITH clause)",
+        "window functions (ROW_NUMBER, RANK, PARTITION BY)",
+        "RIGHT JOIN / FULL OUTER JOIN / CROSS JOIN",
+        "self-joins",
+        "more than 4 tables in JOIN chain",
+        "INSERT / UPDATE / DELETE / DDL",
+        "ORDER BY columns not in SELECT",
+        "HAVING without GROUP BY",
+        "OFFSET without LIMIT",
+    ])
+    critical_rules: list[str] = field(default_factory=lambda: [
+        "ALWAYS use explicit column names — never SELECT *",
+        "Use COUNT(column_name) — never COUNT(*) or COUNT(1)",
+        "LIMIT is REQUIRED on every query without a WHERE clause",
+        "All non-aggregated columns in SELECT must appear in GROUP BY",
+        "Only LEFT JOIN is supported",
+        "Maximum 4 tables in a JOIN chain",
+    ])
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "allowed_clauses": self.allowed_clauses,
+            "allowed_aggregations": self.allowed_aggregations,
+            "allowed_expressions": self.allowed_expressions,
+            "allowed_predicates": self.allowed_predicates,
+            "disallowed": self.disallowed,
+            "critical_rules": self.critical_rules,
+        }
+
+
 @dataclass
 class EntityContextPack:
     """Complete context for a single entity."""
@@ -89,9 +146,9 @@ class EntityContextPack:
     description: str | None = None
     columns: list[ColumnContext] = field(default_factory=list)
     row_count: int | None = None
+    query_capabilities: QueryCapabilities = field(default_factory=QueryCapabilities)
 
     def to_dict(self) -> dict[str, Any]:
-        """Serialize to JSON-compatible dict."""
         d: dict[str, Any] = {
             "entity_name": self.entity_name,
             "display_name": self.display_name,
@@ -101,6 +158,7 @@ def to_dict(self) -> dict[str, Any]:
         if self.row_count is not None:
             d["row_count"] = self.row_count
         d["columns"] = [c.to_dict() for c in self.columns]
+        d["query_capabilities"] = self.query_capabilities.to_dict()
         return d
 
 
@@ -110,11 +168,7 @@ def to_dict(self) -> dict[str, Any]:
 def classify_field_type(sql_type_name: str) -> tuple[bool, bool, bool]:
     """Classify a SQL type into (is_numeric, is_temporal, is_categorical)."""
     t = sql_type_name.lower().strip()
-    return (
-        t in _NUMERIC_TYPES,
-        t in _TEMPORAL_TYPES,
-        t in _CATEGORICAL_TYPES,
-    )
+    return (t in _NUMERIC_TYPES, t in _TEMPORAL_TYPES, t in _CATEGORICAL_TYPES)
 
 
 def extract_synonyms(field_name: str, description: str | None) -> list[str]:
@@ -129,18 +183,17 @@ def extract_synonyms(field_name: str, description: str | None) -> list[str]:
     name_lower = field_name.lower()
     synonyms: set[str] = set()
 
-    # Extract parenthetical content: "Total enrollment (K-12 students)"
     parens = re.findall(r"\(([^)]+)\)", description)
     for p in parens:
         p_stripped = p.strip()
         if p_stripped and p_stripped.lower() != name_lower:
             synonyms.add(p_stripped)
 
-    # Split on delimiters
-    parts = re.split(r"[,;]|\bor\b|\baka\b|\balso known as\b", description, flags=re.IGNORECASE)
+    parts = re.split(
+        r"[,;]|\bor\b|\baka\b|\balso known as\b", description, flags=re.IGNORECASE
+    )
     for part in parts:
         token = part.strip().strip(".")
-        # Only keep short phrases (likely synonyms, not full sentences)
         if (
             token
             and len(token.split()) <= 4
@@ -152,7 +205,7 @@ def extract_synonyms(field_name: str, description: str | None) -> list[str]:
     return sorted(synonyms)
 
 
-async def fetch_sample_rows(
+async def _fetch_sample_rows(
     entity_key: str, limit: int = 5
 ) -> list[dict[str, Any]]:
     """Fetch sample rows from Data Fabric using list_records API."""
@@ -162,8 +215,8 @@ async def fetch_sample_rows(
     try:
         records = await sdk.entities.list_records_async(entity_key, limit=limit)
         return [record.model_dump(exclude={"id"}) for record in records]
-    except Exception as e:
-        logger.warning(f"Failed to fetch sample rows for '{entity_key}': {e}")
+    except Exception:
+        logger.warning("Failed to fetch sample rows for '%s'", entity_key, exc_info=True)
         return []
 
 
@@ -186,10 +239,12 @@ def _extract_column_examples(
     return examples
 
 
+# --- Builders ---
+
+
 async def build_entity_context_pack(entity: Entity) -> EntityContextPack:
     """Build a full ECP from an Entity, including sample data from DF API."""
-    # Fetch sample rows concurrently with building column metadata
-    sample_rows = await fetch_sample_rows(entity.id)
+    sample_rows = await _fetch_sample_rows(entity.id)
 
     columns: list[ColumnContext] = []
     for f in entity.fields or []:
@@ -236,9 +291,7 @@ async def build_entity_context_packs(
     packs: list[EntityContextPack] = []
     for i, result in enumerate(results):
         if isinstance(result, Exception):
-            logger.warning(
-                f"Failed to build ECP for '{entities[i].name}': {result}"
-            )
+            logger.warning("Failed to build ECP for '%s': %s", entities[i].name, result)
         else:
             packs.append(result)
     return packs
@@ -247,46 +300,31 @@ async def build_entity_context_packs(
 # --- Formatting ---
 
 
-@lru_cache(maxsize=1)
-def _load_sql_constraints() -> str:
-    """Load SQL constraints from sql_constraints.txt."""
-    constraints_path = _PROMPTS_DIR / "sql_constraints.txt"
-    try:
-        return constraints_path.read_text(encoding="utf-8")
-    except FileNotFoundError:
-        logger.warning(f"SQL constraints file not found: {constraints_path}")
-        return ""
-
-
 def format_ecp_for_context(context_packs: list[EntityContextPack]) -> str:
     """Format ECPs as JSON for injection into agent system prompt.
 
-    Produces: SQL constraints + ECP JSON block.
-    The system_prompt.txt (SQL expert guidelines) is NOT included here —
-    it goes into the Studio Web system message at design time.
+    Produces: SQL generation guidelines + SQL constraints + ECP JSON block.
     """
     if not context_packs:
         return ""
 
+    from .datafabric_tool import _load_sql_constraints, _load_system_prompt
+
     lines: list[str] = []
 
+    system_prompt = _load_system_prompt()
+    if system_prompt:
+        lines.extend(["## SQL Query Generation Guidelines", "", system_prompt, ""])
+
     sql_constraints = _load_sql_constraints()
     if sql_constraints:
-        lines.append("## SQL Constraints")
-        lines.append("")
-        lines.append(sql_constraints)
-        lines.append("")
+        lines.extend(["## SQL Constraints", "", sql_constraints, ""])
 
     ecp_json = json.dumps(
         [pack.to_dict() for pack in context_packs],
         indent=2,
         default=str,
     )
-
-    lines.append("## Entity Context Packs")
-    lines.append("")
-    lines.append("```json")
-    lines.append(ecp_json)
-    lines.append("```")
+    lines.extend(["## Entity Context Packs", "", "```json", ecp_json, "```"])
 
     return "\n".join(lines)