Spaces:

AshenH
/

ALM_LLM

Sleeping

App Files Community

AshenH committed on Oct 14

Commit

2e1969a

verified ·

1 Parent(s): d861fbf

Update tools/sql_tool.py

Browse files

Files changed (1) hide show

tools/sql_tool.py +132 -429

tools/sql_tool.py CHANGED Viewed

@@ -1,440 +1,143 @@
-# space/tools/sql_tool.py
 import os
 import re
-import json
-import logging
-import pandas as pd
-from typing import Optional
-from utils.config import AppConfig
-from utils.tracing import Tracer
-logger = logging.getLogger(__name__)
-RESERVED_MD_WORKSPACE_NAMES = {"", "workspace", "default"}
-MAX_QUERY_LENGTH = 50000
-MAX_RESULT_ROWS = 100000
-class SQLToolError(Exception):
-    """Custom exception for SQL tool errors."""
-    pass
 class SQLTool:
     """
-    SQL execution tool supporting BigQuery and MotherDuck backends.
-    Includes input validation, error handling, and secure query execution.
     """
-    def __init__(self, cfg: AppConfig, tracer: Tracer):
-        self.cfg = cfg
-        self.tracer = tracer
-        self.backend = cfg.sql_backend
-        self.client = None
-        logger.info(f"Initializing SQLTool with backend: {self.backend}")
-        try:
-            if self.backend == "bigquery":
-                self._init_bigquery()
-            elif self.backend == "motherduck":
-                self._init_motherduck()
-            else:
-                raise SQLToolError(f"Unknown SQL backend: {self.backend}")
-            logger.info(f"SQLTool initialized successfully with {self.backend}")
-        except Exception as e:
-            logger.error(f"Failed to initialize SQLTool: {e}")
-            raise SQLToolError(f"SQL backend initialization failed: {e}") from e
-    def _init_bigquery(self):
-        """Initialize BigQuery client with service account credentials."""
-        try:
-            from google.cloud import bigquery
-            from google.oauth2 import service_account
-            key_json = os.getenv("GCP_SERVICE_ACCOUNT_JSON")
-            if not key_json:
-                raise SQLToolError(
-                    "Missing GCP_SERVICE_ACCOUNT_JSON environment variable. "
-                    "Please configure BigQuery credentials."
-                )
-            # Parse credentials
-            try:
-                if key_json.strip().startswith("{"):
-                    info = json.loads(key_json)
-                else:
-                    # Assume it's a file path
-                    with open(key_json, 'r') as f:
-                        info = json.load(f)
-            except json.JSONDecodeError as e:
-                raise SQLToolError(f"Invalid JSON in GCP_SERVICE_ACCOUNT_JSON: {e}")
-            except FileNotFoundError:
-                raise SQLToolError(f"GCP service account file not found: {key_json}")
-            # Validate required fields
-            required_fields = ["type", "project_id", "private_key", "client_email"]
-            missing = [f for f in required_fields if f not in info]
-            if missing:
-                raise SQLToolError(
-                    f"GCP service account JSON missing required fields: {missing}"
-                )
-            creds = service_account.Credentials.from_service_account_info(info)
-            project = self.cfg.gcp_project or info.get("project_id")
-            if not project:
-                raise SQLToolError("GCP project ID not specified in config or credentials")
-            self.client = bigquery.Client(credentials=creds, project=project)
-            logger.info(f"BigQuery client initialized for project: {project}")
-        except ImportError as e:
-            raise SQLToolError(
-                "BigQuery dependencies not installed. "
-                "Install with: pip install google-cloud-bigquery"
-            ) from e
-    def _init_motherduck(self):
-        """Initialize MotherDuck/DuckDB client with version validation."""
-        try:
-            import duckdb
-            # Version compatibility check - be more flexible
-            version = duckdb.__version__
-            logger.info(f"DuckDB version: {version}")
-            # Warn if not on recommended version, but don't fail
-            if not version.startswith("1.3"):
-                logger.warning(
-                    f"DuckDB {version} detected. Recommended: 1.3.x for MotherDuck compatibility. "
-                    "Some features may not work as expected."
-                )
-            # Get configuration
-            token = (self.cfg.motherduck_token or os.getenv("MOTHERDUCK_TOKEN") or "").strip()
-            if not token:
-                raise SQLToolError(
-                    "Missing MOTHERDUCK_TOKEN. "
-                    "Get your token from: https://motherduck.com/docs/key-tasks/authenticating-to-motherduck"
-                )
-            db_name = (self.cfg.motherduck_db or "workspace").strip()
-            allow_create = os.getenv("ALLOW_CREATE_DB", "true").lower() == "true"
-            # Connect based on database name
-            if db_name in RESERVED_MD_WORKSPACE_NAMES:
-                # Workspace mode - no specific database context
-                connection_string = f"md:?motherduck_token={token}"
-                logger.info("Connecting to MotherDuck workspace")
-                self.client = duckdb.connect(connection_string)
-            else:
-                # Try connecting to specific database
-                try:
-                    connection_string = f"md:{db_name}?motherduck_token={token}"
-                    logger.info(f"Connecting to MotherDuck database: {db_name}")
-                    self.client = duckdb.connect(connection_string)
-                except Exception as db_err:
-                    logger.warning(f"Direct connection to '{db_name}' failed: {db_err}")
-                    # Fallback: connect to workspace and setup database
-                    connection_string = f"md:?motherduck_token={token}"
-                    self.client = duckdb.connect(connection_string)
-                    self._ensure_db_context(db_name, allow_create)
-            # Test connection
-            try:
-                self.client.execute("SELECT 1").fetchone()
-                logger.info("MotherDuck connection test successful")
-            except Exception as e:
-                raise SQLToolError(f"MotherDuck connection test failed: {e}")
-        except ImportError as e:
-            raise SQLToolError(
-                "DuckDB not installed. Install with: pip install duckdb"
-            ) from e
-    def _ensure_db_context(self, db_name: str, allow_create: bool):
-        """
-        Ensure database context is set for MotherDuck.
-        Creates database if it doesn't exist and allow_create is True.
-        """
-        if db_name in RESERVED_MD_WORKSPACE_NAMES:
-            return
-        safe_name = self._quote_ident(db_name)
-        # Try to USE the database first
-        try:
-            self.client.execute(f"USE {safe_name};")
-            logger.info(f"Using existing database: {db_name}")
-            return
-        except Exception as use_err:
-            logger.info(f"Database '{db_name}' not found: {use_err}")
-            if not allow_create:
-                raise SQLToolError(
-                    f"Database '{db_name}' does not exist and ALLOW_CREATE_DB is disabled. "
-                    f"Either create the database manually or set ALLOW_CREATE_DB=true"
-                )
-        # Attempt to create and use the database
-        try:
-            logger.info(f"Creating database: {db_name}")
-            self.client.execute(f"CREATE DATABASE IF NOT EXISTS {safe_name};")
-            self.client.execute(f"USE {safe_name};")
-            logger.info(f"Database '{db_name}' created and selected")
-        except Exception as create_err:
-            raise SQLToolError(
-                f"Failed to create database '{db_name}': {create_err}"
-            ) from create_err
-    @staticmethod
-    def _quote_ident(name: str) -> str:
-        """
-        Safely quote SQL identifiers.
-        Replaces non-alphanumeric characters with underscores.
-        """
-        if not name:
-            return "unnamed"
-        # Remove dangerous characters
-        safe = re.sub(r"[^a-zA-Z0-9_]", "_", name)
-        # Ensure it doesn't start with a number
-        if safe[0].isdigit():
-            safe = "_" + safe
-        return safe
-    def _validate_sql(self, sql: str) -> tuple[bool, str]:
-        """
-        Validate SQL query for basic safety.
-        Returns (is_valid, error_message).
-        """
-        if not sql or not sql.strip():
-            return False, "Empty SQL query"
-        if len(sql) > MAX_QUERY_LENGTH:
-            return False, f"Query too long (max {MAX_QUERY_LENGTH} characters)"
-        # Dangerous patterns check
-        sql_lower = sql.lower()
-        # Block multiple statements (simple check)
-        if sql.count(';') > 1:
-            return False, "Multiple SQL statements not allowed"
-        # Block dangerous keywords in non-SELECT queries
-        dangerous_patterns = [
-            (r'\bdrop\s+table\b', "DROP TABLE"),
-            (r'\bdrop\s+database\b', "DROP DATABASE"),
-            (r'\bdelete\s+from\b', "DELETE FROM"),
-            (r'\btruncate\b', "TRUNCATE"),
-            (r'\bexec\s*\(', "EXEC"),
-            (r'\bexecute\s*\(', "EXECUTE"),
-        ]
-        for pattern, name in dangerous_patterns:
-            if re.search(pattern, sql_lower):
-                logger.warning(f"Blocked query with {name} pattern")
-                return False, f"Query contains blocked operation: {name}"
-        return True, ""
-    def _nl_to_sql(self, message: str) -> str:
-        """
-        Convert natural language to SQL query.
-        IMPORTANT: This is a simple heuristic template system.
-        For production, either:
-        1. Replace table/column names with your actual schema, OR
-        2. Integrate a proper NL2SQL model (e.g., T5, CodeGen, GPT), OR
-        3. Have users write SQL directly
-        To customize: Set these environment variables or edit the code:
-        - SQL_DEFAULT_SCHEMA (default: "analytics")
-        - SQL_DEFAULT_TABLE (default: "fact_table")
-        """
-        m = message.lower()
-        # Get configurable defaults
-        default_schema = os.getenv("SQL_DEFAULT_SCHEMA", "analytics")
-        default_table = os.getenv("SQL_DEFAULT_TABLE", "fact_table")
-        full_table = f"{default_schema}.{default_table}"
-        # If it's already SQL, return as-is (after validation)
-        if re.match(r'^\s*select\s', m, re.IGNORECASE):
-            return message.strip()
-        # Special keyword: show tables/schemas
-        if any(keyword in m for keyword in ["show tables", "list tables", "available tables", "what tables"]):
-            return """
-SELECT table_schema, table_name, table_type
-FROM information_schema.tables
-WHERE table_schema NOT IN ('information_schema', 'pg_catalog')
-ORDER BY table_schema, table_name
-LIMIT 100;
-"""
-        if any(keyword in m for keyword in ["show schemas", "list schemas", "available schemas"]):
-            return """
-SELECT DISTINCT table_schema
-FROM information_schema.tables
-WHERE table_schema NOT IN ('information_schema', 'pg_catalog')
-ORDER BY table_schema;
-"""
-        if "show columns" in m or "describe table" in m or "table structure" in m:
-            # Try to extract table name from message
-            return f"""
-SELECT column_name, data_type, is_nullable
-FROM information_schema.columns
-WHERE table_schema = '{default_schema}'
-ORDER BY ordinal_position
-LIMIT 100;
-"""
-        # Template-based generation (customize for your schema)
-        if "avg" in m or "average" in m:
-            if "by month" in m or "monthly" in m:
-                return f"""
-SELECT
-    DATE_TRUNC('month', date_col) AS month,
-    AVG(metric_col) AS avg_metric
-FROM {full_table}
-GROUP BY 1
-ORDER BY 1 DESC
-LIMIT 100;
-"""
-        if "top" in m:
-            # Extract number if present
-            match = re.search(r'top\s+(\d+)', m)
-            limit = match.group(1) if match else "10"
-            return f"""
-SELECT *
-FROM {full_table}
-ORDER BY metric_col DESC
-LIMIT {limit};
-"""
-        if "count" in m:
-            return f"""
-SELECT
-    category_col,
-    COUNT(*) AS count
-FROM {full_table}
-GROUP BY 1
-ORDER BY 2 DESC
-LIMIT 100;
-"""
-        # Default fallback - show available tables instead of failing
-        logger.warning(
-            f"Could not generate specific SQL for query: '{message}'. "
-            f"Returning list of available tables. "
-            f"Configure SQL_DEFAULT_SCHEMA and SQL_DEFAULT_TABLE or write SQL directly."
-        )
-        return """
-SELECT
-    table_schema,
-    table_name,
-    table_type,
-    'Run: SELECT * FROM ' || table_schema || '.' || table_name || ' LIMIT 5' as example_query
-FROM information_schema.tables
-WHERE table_schema NOT IN ('information_schema', 'pg_catalog')
-ORDER BY table_schema, table_name
-LIMIT 50;
-"""
-    def run(self, message: str) -> pd.DataFrame:
         """
-        Execute SQL query from natural language or SQL statement.
-        Args:
-            message: Natural language query or SQL statement
-        Returns:
-            DataFrame with query results
-        Raises:
-            SQLToolError: If query execution fails
         """
-        try:
-            # Convert to SQL
-            sql = self._nl_to_sql(message)
-            logger.info(f"Generated SQL query (first 200 chars): {sql[:200]}")
-            # Validate SQL
-            is_valid, error_msg = self._validate_sql(sql)
-            if not is_valid:
-                raise SQLToolError(f"Invalid SQL query: {error_msg}")
-            # Log query attempt
-            self.tracer.trace_event("sql_query", {
-                "sql": sql[:1000],  # Limit logged SQL length
-                "backend": self.backend,
-                "message": message[:500]
-            })
-            # Execute based on backend
-            if self.backend == "bigquery":
-                result = self._execute_bigquery(sql)
-            else:  # motherduck
-                result = self._execute_duckdb(sql)
-            # Validate result
-            if not isinstance(result, pd.DataFrame):
-                raise SQLToolError("Query did not return a DataFrame")
-            # Check result size
-            if len(result) > MAX_RESULT_ROWS:
-                logger.warning(f"Result truncated from {len(result)} to {MAX_RESULT_ROWS} rows")
-                result = result.head(MAX_RESULT_ROWS)
-            logger.info(f"Query successful: {len(result)} rows, {len(result.columns)} columns")
-            self.tracer.trace_event("sql_success", {
-                "rows": len(result),
-                "columns": len(result.columns)
-            })
-            return result
-        except SQLToolError:
-            raise
-        except Exception as e:
-            error_msg = f"SQL execution failed: {str(e)}"
-            logger.error(error_msg)
-            self.tracer.trace_event("sql_error", {"error": error_msg})
-            raise SQLToolError(error_msg) from e
-    def _execute_bigquery(self, sql: str) -> pd.DataFrame:
-        """Execute query on BigQuery."""
-        try:
-            query_job = self.client.query(sql)
-            df = query_job.to_dataframe()
-            return df
-        except Exception as e:
-            raise SQLToolError(f"BigQuery execution error: {str(e)}") from e
-    def _execute_duckdb(self, sql: str) -> pd.DataFrame:
-        """Execute query on DuckDB/MotherDuck."""
-        try:
-            result = self.client.execute(sql)
-            df = result.fetch_df()
-            return df
-        except Exception as e:
-            raise SQLToolError(f"DuckDB execution error: {str(e)}") from e
-    def test_connection(self) -> bool:
-        """Test database connection."""
-        try:
-            test_query = "SELECT 1 AS test"
-            result = self.run(test_query)
-            return len(result) == 1 and result.iloc[0, 0] == 1
-        except Exception as e:
-            logger.error(f"Connection test failed: {e}")
-            return False

+# tools/sql_tool.py
 import os
 import re
+import duckdb
+from typing import Optional, Tuple
# Path of the DuckDB database file; the DUCKDB_PATH environment variable
# overrides the bundled default.
DUCKDB_PATH = os.environ.get("DUCKDB_PATH", "alm.duckdb")

# Schema and table targeted when a caller does not name one. The defaults
# point at the real dataset and can be overridden through the environment
# (e.g. Space secrets) via SQL_DEFAULT_SCHEMA / SQL_DEFAULT_TABLE.
DEFAULT_SCHEMA = os.environ.get("SQL_DEFAULT_SCHEMA", "main")
DEFAULT_TABLE = os.environ.get("SQL_DEFAULT_TABLE", "masterdataset_v")
def _full_table(schema: Optional[str] = None, table: Optional[str] = None) -> str:
    """Return the dotted ``schema.table`` name used in generated SQL.

    A falsy ``schema`` or ``table`` (``None`` or ``""``) is replaced by the
    module-level ``DEFAULT_SCHEMA`` / ``DEFAULT_TABLE`` respectively. The
    parts are joined as-is; no quoting or validation is applied.
    """
    resolved_schema = schema if schema else DEFAULT_SCHEMA
    resolved_table = table if table else DEFAULT_TABLE
    return "{}.{}".format(resolved_schema, resolved_table)
 class SQLTool:
     """
+    Minimal NL→SQL helper wired to main.masterdataset_v with a DuckDB runner.
     """
+    def __init__(self, db_path: Optional[str] = None):
+        self.db_path = db_path or DUCKDB_PATH
+        self.con = duckdb.connect(self.db_path)
+    def run_sql(self, sql: str):
+        return self.con.execute(sql).df()
+    # -------------------------
+    # NL → SQL
+    # -------------------------
+    def _nl_to_sql(self, message: str, schema: Optional[str] = None, table: Optional[str] = None) -> Tuple[str, str]:
         """
+        Returns (sql, rationale). Very small template library covering your common queries.
+        Falls back to SHOW TABLES if no match.
         """
+        full_table = _full_table(schema, table)
+        m = message.strip().lower()
+        # Common synonyms
+        def has_any(txt, words):
+            return any(w in txt for w in words)
+        # Extract a "top N"
+        limit = None
+        m_top = re.search(r"\btop\s+(\d+)", m)
+        if m_top:
+            limit = int(m_top.group(1))
+        # 1) Top N FDs by Portfolio_value
+        if has_any(m, ["fd", "fixed deposit", "deposits"]) and has_any(m, ["top", "largest", "biggest"]) and has_any(m, ["portfolio value", "portfolio_value"]):
+            n = limit or 10
+            sql = f"""
+            SELECT contract_number, Portfolio_value, Interest_rate, currency, segments
+            FROM {full_table}
+            WHERE lower(product) = 'fd'
+            ORDER BY Portfolio_value DESC
+            LIMIT {n};
+            """
+            why = f"Top {n} fixed deposits by Portfolio_value from {full_table}"
+            return sql, why
+        # 2) Top N Assets by Portfolio_value
+        if has_any(m, ["asset", "loan", "advances"]) and has_any(m, ["top", "largest", "biggest"]) and has_any(m, ["portfolio value", "portfolio_value"]):
+            n = limit or 10
+            sql = f"""
+            SELECT contract_number, Portfolio_value, Interest_rate, currency, segments
+            FROM {full_table}
+            WHERE lower(product) = 'assets'
+            ORDER BY Portfolio_value DESC
+            LIMIT {n};
+            """
+            why = f"Top {n} assets by Portfolio_value from {full_table}"
+            return sql, why
+        # 3) Aggregate (SUM/AVG) by segment or currency
+        if has_any(m, ["sum", "total", "avg", "average"]) and has_any(m, ["segment", "currency"]):
+            agg = "SUM" if has_any(m, ["sum", "total"]) else "AVG"
+            dim = "segments" if "segment" in m else "currency"
+            sql = f"""
+            SELECT {dim}, {agg}(Portfolio_value) AS {agg.lower()}_Portfolio_value
+            FROM {full_table}
+            GROUP BY 1
+            ORDER BY 2 DESC;
+            """
+            why = f"{agg} Portfolio_value grouped by {dim} from {full_table}"
+            return sql, why
+        # 4) Filter by product, currency, or segment
+        product = None
+        if "fd" in m or "deposit" in m:
+            product = "fd"
+        elif "asset" in m or "loan" in m or "advance" in m:
+            product = "assets"
+        parts = [f"SELECT * FROM {full_table} WHERE 1=1"]
+        why_parts = [f"Filtered rows from {full_table}"]
+        if product:
+            parts.append(f"AND lower(product) = '{product}'")
+            why_parts.append(f"product = {product}")
+        # currency filter like: "in lkr", "currency usd"
+        cur = None
+        cur_match = re.search(r"\b(currency|in)\s+([a-z]{3})\b", m)
+        if cur_match:
+            cur = cur_match.group(2).upper()
+        if cur:
+            parts.append(f"AND upper(currency) = '{cur}'")
+            why_parts.append(f"currency = {cur}")
+        # segment filter like: "segment retail" or "for corporate"
+        seg_match = re.search(r"(segment|for)\s+([a-z0-9_\- ]+)", m)
+        if seg_match:
+            seg = seg_match.group(2).strip()
+            if seg:
+                parts.append(f"AND lower(segments) LIKE '%{seg.lower()}%'")
+                why_parts.append(f"segments like '{seg}'")
+        # maybe a limit
+        if limit:
+            parts.append(f"LIMIT {limit}")
+        fallback_sql = " ".join(parts) + ";"
+        fallback_why = "; ".join(why_parts)
+        if fallback_sql:
+            return fallback_sql, fallback_why
+        # 5) Super fallback: show sample rows
+        return f"SELECT * FROM {full_table} LIMIT 20;", f"Default sample from {full_table}"
+    # Public helpers
+    def query_from_nl(self, message: str):
+        sql, why = self._nl_to_sql(message)
+        df = self.run_sql(sql)
+        return df, sql, why
+    def table_exists(self, schema: Optional[str] = None, table: Optional[str] = None) -> bool:
+        schema = schema or DEFAULT_SCHEMA
+        table  = table or DEFAULT_TABLE
+        q = f"SELECT COUNT(*) AS n FROM information_schema.tables WHERE table_schema = '{schema}' AND table_name = '{table}';"
+        n = self.con.execute(q).fetchone()[0]
+        return n > 0