AshenH committed on
Commit 6c6d38f · verified · 1 Parent(s): da25b2a

Update tools/sql_tool.py

Files changed (1)
  tools/sql_tool.py  +338 −90
tools/sql_tool.py CHANGED
@@ -2,138 +2,386 @@
  import os
  import re
  import json
  import pandas as pd
  from utils.config import AppConfig
  from utils.tracing import Tracer


- RESERVED_MD_WORKSPACE_NAMES = {"", "workspace", "default"}  # treat these as workspace/no-DB context


  class SQLTool:
      def __init__(self, cfg: AppConfig, tracer: Tracer):
          self.cfg = cfg
          self.tracer = tracer
-         self.backend = cfg.sql_backend  # "bigquery" or "motherduck"
-
-         # ---------------- BIGQUERY BACKEND ----------------
-         if self.backend == "bigquery":
              from google.cloud import bigquery
              from google.oauth2 import service_account
-
              key_json = os.getenv("GCP_SERVICE_ACCOUNT_JSON")
              if not key_json:
-                 raise RuntimeError("Missing GCP_SERVICE_ACCOUNT_JSON secret")
-
-             info = json.loads(key_json) if key_json.strip().startswith("{") else {}
              creds = service_account.Credentials.from_service_account_info(info)
-             self.client = bigquery.Client(credentials=creds, project=cfg.gcp_project)
-
-         # ---------------- MOTHERDUCK BACKEND ----------------
-         elif self.backend == "motherduck":
              import duckdb
-
-             # MotherDuck extension compatibility: widely supported ABI is DuckDB 1.3.2
-             if not duckdb.__version__.startswith("1.3.2"):
-                 raise RuntimeError(
-                     f"Incompatible DuckDB version {duckdb.__version__}. "
-                     "Pin duckdb==1.3.2 in requirements.txt and redeploy."
                  )
-
              token = (self.cfg.motherduck_token or os.getenv("MOTHERDUCK_TOKEN") or "").strip()
-             db_name = (self.cfg.motherduck_db or "workspace").strip()
-             allow_create = (os.getenv("ALLOW_CREATE_DB", "true").lower() == "true")
              if not token:
-                 raise RuntimeError("Missing MOTHERDUCK_TOKEN")
-
-             # Workspace vs concrete DB handling
              if db_name in RESERVED_MD_WORKSPACE_NAMES:
-                 # Connect to workspace; caller should fully-qualify tables if needed
-                 self.client = duckdb.connect(f"md:?motherduck_token={token}")
-                 # No USE/CREATE in workspace mode
              else:
-                 # Try direct connection to the DB (preferred)
                  try:
-                     self.client = duckdb.connect(f"md:{db_name}?motherduck_token={token}")
-                 except Exception:
-                     # Fallback: connect to workspace, then USE/CREATE the DB if permitted
-                     self.client = duckdb.connect(f"md:?motherduck_token={token}")
                      self._ensure_db_context(db_name, allow_create)
-
-         else:
-             raise RuntimeError(f"Unknown SQL backend: {self.backend}")
-
-     # ----- MotherDuck helpers -----
      def _ensure_db_context(self, db_name: str, allow_create: bool):
          """
-         Try to USE the target DB; if it doesn't exist and allow_create=True, create it and USE it.
-         Skips reserved workspace names.
          """
          if db_name in RESERVED_MD_WORKSPACE_NAMES:
-             # No-op for workspace/default
              return
-
-         # Attempt USE first
          try:
-             self.client.execute(f"USE {self._quote_ident(db_name)};")
              return
          except Exception as use_err:
              if not allow_create:
-                 raise RuntimeError(
-                     f"Database '{db_name}' not found and ALLOW_CREATE_DB is false. "
-                     f"Original error: {use_err}"
                  )
-
-             # Attempt CREATE then USE
              try:
-                 # CREATE DATABASE <name>; is supported on MotherDuck for valid names (not 'default')
-                 self.client.execute(f"CREATE DATABASE {self._quote_ident(db_name)};")
-                 self.client.execute(f"USE {self._quote_ident(db_name)};")
             except Exception as create_err:
-                 raise RuntimeError(
-                     f"Could not create or use database '{db_name}'. "
-                     f"Original errors: CREATE: {create_err}"
-                 )
-
      @staticmethod
      def _quote_ident(name: str) -> str:
          """
-         Very light identifier quoting. Replace non [a-zA-Z0-9_] with underscore.
          """
-         safe = re.sub(r"[^a-zA-Z0-9_]", "_", (name or ""))
          return safe
-
-     # ----- NL → SQL heuristic (toy example; edit to your schema) -----
      def _nl_to_sql(self, message: str) -> str:
          m = message.lower()
-
-         # Example DuckDB/MotherDuck flavor of DATE_TRUNC
-         if "avg" in m and " by " in m:
-             return (
-                 "-- Example template; edit to your schema/columns\n"
-                 "SELECT DATE_TRUNC('month', date_col) AS month,\n"
-                 " AVG(metric) AS avg_metric\n"
-                 "FROM analytics.table\n"
-                 "GROUP BY 1\n"
-                 "ORDER BY 1;"
-             )
-
-         # If user typed SQL already, run it as-is
-         if re.match(r"^\\s*select ", m):
-             return message
-
-         # Fallback
-         return "SELECT * FROM analytics.table LIMIT 100;"
-
-     # ----- Execute -----
      def run(self, message: str) -> pd.DataFrame:
-         sql = self._nl_to_sql(message)
          try:
-             self.tracer.trace_event("sql_query", {"sql": sql, "backend": self.backend})
-         except Exception:
-             pass
-
-         if self.backend == "bigquery":
-             return self.client.query(sql).to_dataframe()
-         else:
-             return self.client.execute(sql).fetch_df()

  import os
  import re
  import json
+ import logging
  import pandas as pd
+ from typing import Optional
  from utils.config import AppConfig
  from utils.tracing import Tracer

+ logger = logging.getLogger(__name__)

+ RESERVED_MD_WORKSPACE_NAMES = {"", "workspace", "default"}
+ MAX_QUERY_LENGTH = 50000
+ MAX_RESULT_ROWS = 100000
+
+
+ class SQLToolError(Exception):
+     """Custom exception for SQL tool errors."""
+     pass


  class SQLTool:
+     """
+     SQL execution tool supporting BigQuery and MotherDuck backends.
+     Includes input validation, error handling, and secure query execution.
+     """
+
      def __init__(self, cfg: AppConfig, tracer: Tracer):
          self.cfg = cfg
          self.tracer = tracer
+         self.backend = cfg.sql_backend
+         self.client = None
+
+         logger.info(f"Initializing SQLTool with backend: {self.backend}")
+
+         try:
+             if self.backend == "bigquery":
+                 self._init_bigquery()
+             elif self.backend == "motherduck":
+                 self._init_motherduck()
+             else:
+                 raise SQLToolError(f"Unknown SQL backend: {self.backend}")
+
+             logger.info(f"SQLTool initialized successfully with {self.backend}")
+
+         except Exception as e:
+             logger.error(f"Failed to initialize SQLTool: {e}")
+             raise SQLToolError(f"SQL backend initialization failed: {e}") from e
+
+     def _init_bigquery(self):
+         """Initialize BigQuery client with service account credentials."""
+         try:
              from google.cloud import bigquery
              from google.oauth2 import service_account
+
              key_json = os.getenv("GCP_SERVICE_ACCOUNT_JSON")
              if not key_json:
+                 raise SQLToolError(
+                     "Missing GCP_SERVICE_ACCOUNT_JSON environment variable. "
+                     "Please configure BigQuery credentials."
+                 )
+
+             # Parse credentials
+             try:
+                 if key_json.strip().startswith("{"):
+                     info = json.loads(key_json)
+                 else:
+                     # Assume it's a file path
+                     with open(key_json, 'r') as f:
+                         info = json.load(f)
+             except json.JSONDecodeError as e:
+                 raise SQLToolError(f"Invalid JSON in GCP_SERVICE_ACCOUNT_JSON: {e}")
+             except FileNotFoundError:
+                 raise SQLToolError(f"GCP service account file not found: {key_json}")
+
+             # Validate required fields
+             required_fields = ["type", "project_id", "private_key", "client_email"]
+             missing = [f for f in required_fields if f not in info]
+             if missing:
+                 raise SQLToolError(
+                     f"GCP service account JSON missing required fields: {missing}"
+                 )
+
              creds = service_account.Credentials.from_service_account_info(info)
+             project = self.cfg.gcp_project or info.get("project_id")
+
+             if not project:
+                 raise SQLToolError("GCP project ID not specified in config or credentials")
+
+             self.client = bigquery.Client(credentials=creds, project=project)
+             logger.info(f"BigQuery client initialized for project: {project}")
+
+         except ImportError as e:
+             raise SQLToolError(
+                 "BigQuery dependencies not installed. "
+                 "Install with: pip install google-cloud-bigquery"
+             ) from e
+
+     def _init_motherduck(self):
+         """Initialize MotherDuck/DuckDB client with version validation."""
+         try:
              import duckdb
+
+             # Version compatibility check - be more flexible
+             version = duckdb.__version__
+             logger.info(f"DuckDB version: {version}")
+
+             # Warn if not on recommended version, but don't fail
+             if not version.startswith("1.3"):
+                 logger.warning(
+                     f"DuckDB {version} detected. Recommended: 1.3.x for MotherDuck compatibility. "
+                     "Some features may not work as expected."
                  )
+
+             # Get configuration
              token = (self.cfg.motherduck_token or os.getenv("MOTHERDUCK_TOKEN") or "").strip()
              if not token:
+                 raise SQLToolError(
+                     "Missing MOTHERDUCK_TOKEN. "
+                     "Get your token from: https://motherduck.com/docs/key-tasks/authenticating-to-motherduck"
+                 )
+
+             db_name = (self.cfg.motherduck_db or "workspace").strip()
+             allow_create = os.getenv("ALLOW_CREATE_DB", "true").lower() == "true"
+
+             # Connect based on database name
              if db_name in RESERVED_MD_WORKSPACE_NAMES:
+                 # Workspace mode - no specific database context
+                 connection_string = f"md:?motherduck_token={token}"
+                 logger.info("Connecting to MotherDuck workspace")
+                 self.client = duckdb.connect(connection_string)
              else:
+                 # Try connecting to specific database
                  try:
+                     connection_string = f"md:{db_name}?motherduck_token={token}"
+                     logger.info(f"Connecting to MotherDuck database: {db_name}")
+                     self.client = duckdb.connect(connection_string)
+                 except Exception as db_err:
+                     logger.warning(f"Direct connection to '{db_name}' failed: {db_err}")
+
+                     # Fallback: connect to workspace and setup database
+                     connection_string = f"md:?motherduck_token={token}"
+                     self.client = duckdb.connect(connection_string)
                      self._ensure_db_context(db_name, allow_create)
+
+             # Test connection
+             try:
+                 self.client.execute("SELECT 1").fetchone()
+                 logger.info("MotherDuck connection test successful")
+             except Exception as e:
+                 raise SQLToolError(f"MotherDuck connection test failed: {e}")
+
+         except ImportError as e:
+             raise SQLToolError(
+                 "DuckDB not installed. Install with: pip install duckdb"
+             ) from e
+
      def _ensure_db_context(self, db_name: str, allow_create: bool):
          """
+         Ensure database context is set for MotherDuck.
+         Creates database if it doesn't exist and allow_create is True.
          """
          if db_name in RESERVED_MD_WORKSPACE_NAMES:
              return
+
+         safe_name = self._quote_ident(db_name)
+
+         # Try to USE the database first
          try:
+             self.client.execute(f"USE {safe_name};")
+             logger.info(f"Using existing database: {db_name}")
              return
          except Exception as use_err:
+             logger.info(f"Database '{db_name}' not found: {use_err}")
+
              if not allow_create:
+                 raise SQLToolError(
+                     f"Database '{db_name}' does not exist and ALLOW_CREATE_DB is disabled. "
+                     f"Either create the database manually or set ALLOW_CREATE_DB=true"
                  )
+
+             # Attempt to create and use the database
              try:
+                 logger.info(f"Creating database: {db_name}")
+                 self.client.execute(f"CREATE DATABASE IF NOT EXISTS {safe_name};")
+                 self.client.execute(f"USE {safe_name};")
+                 logger.info(f"Database '{db_name}' created and selected")
              except Exception as create_err:
+                 raise SQLToolError(
+                     f"Failed to create database '{db_name}': {create_err}"
+                 ) from create_err
+
      @staticmethod
      def _quote_ident(name: str) -> str:
          """
+         Safely quote SQL identifiers.
+         Replaces non-alphanumeric characters with underscores.
          """
+         if not name:
+             return "unnamed"
+
+         # Remove dangerous characters
+         safe = re.sub(r"[^a-zA-Z0-9_]", "_", name)
+
+         # Ensure it doesn't start with a number
+         if safe[0].isdigit():
+             safe = "_" + safe
+
          return safe
+
+     def _validate_sql(self, sql: str) -> tuple[bool, str]:
+         """
+         Validate SQL query for basic safety.
+         Returns (is_valid, error_message).
+         """
+         if not sql or not sql.strip():
+             return False, "Empty SQL query"
+
+         if len(sql) > MAX_QUERY_LENGTH:
+             return False, f"Query too long (max {MAX_QUERY_LENGTH} characters)"
+
+         # Dangerous patterns check
+         sql_lower = sql.lower()
+
+         # Block multiple statements (simple check)
+         if sql.count(';') > 1:
+             return False, "Multiple SQL statements not allowed"
+
+         # Block dangerous keywords in non-SELECT queries
+         dangerous_patterns = [
+             (r'\bdrop\s+table\b', "DROP TABLE"),
+             (r'\bdrop\s+database\b', "DROP DATABASE"),
+             (r'\bdelete\s+from\b', "DELETE FROM"),
+             (r'\btruncate\b', "TRUNCATE"),
+             (r'\bexec\s*\(', "EXEC"),
+             (r'\bexecute\s*\(', "EXECUTE"),
+         ]
+
+         for pattern, name in dangerous_patterns:
+             if re.search(pattern, sql_lower):
+                 logger.warning(f"Blocked query with {name} pattern")
+                 return False, f"Query contains blocked operation: {name}"
+
+         return True, ""
+
      def _nl_to_sql(self, message: str) -> str:
+         """
+         Convert natural language to SQL query.
+         This is a simple heuristic - replace with proper NL2SQL model for production.
+         """
          m = message.lower()
+
+         # If it's already SQL, return as-is (after validation)
+         if re.match(r'^\s*select\s', m, re.IGNORECASE):
+             return message.strip()
+
+         # Template-based generation (customize for your schema)
+         if "avg" in m or "average" in m:
+             if "by month" in m or "monthly" in m:
+                 return """
+                     SELECT
+                         DATE_TRUNC('month', date_col) AS month,
+                         AVG(metric_col) AS avg_metric
+                     FROM analytics.fact_table
+                     GROUP BY 1
+                     ORDER BY 1 DESC
+                     LIMIT 100;
+                 """
+
+         if "top" in m:
+             # Extract number if present
+             match = re.search(r'top\s+(\d+)', m)
+             limit = match.group(1) if match else "10"
+             return f"""
+                 SELECT *
+                 FROM analytics.fact_table
+                 ORDER BY metric_col DESC
+                 LIMIT {limit};
+             """
+
+         if "count" in m:
+             return """
+                 SELECT
+                     category_col,
+                     COUNT(*) AS count
+                 FROM analytics.fact_table
+                 GROUP BY 1
+                 ORDER BY 2 DESC
+                 LIMIT 100;
+             """
+
+         # Default fallback
+         return """
+             SELECT *
+             FROM analytics.fact_table
+             LIMIT 100;
+         """
+
      def run(self, message: str) -> pd.DataFrame:
+         """
+         Execute SQL query from natural language or SQL statement.
+
+         Args:
+             message: Natural language query or SQL statement
+
+         Returns:
+             DataFrame with query results
+
+         Raises:
+             SQLToolError: If query execution fails
+         """
          try:
+             # Convert to SQL
+             sql = self._nl_to_sql(message)
+             logger.info(f"Generated SQL query (first 200 chars): {sql[:200]}")
+
+             # Validate SQL
+             is_valid, error_msg = self._validate_sql(sql)
+             if not is_valid:
+                 raise SQLToolError(f"Invalid SQL query: {error_msg}")
+
+             # Log query attempt
+             self.tracer.trace_event("sql_query", {
+                 "sql": sql[:1000],  # Limit logged SQL length
+                 "backend": self.backend,
+                 "message": message[:500]
+             })
+
+             # Execute based on backend
+             if self.backend == "bigquery":
+                 result = self._execute_bigquery(sql)
+             else:  # motherduck
+                 result = self._execute_duckdb(sql)
+
+             # Validate result
+             if not isinstance(result, pd.DataFrame):
+                 raise SQLToolError("Query did not return a DataFrame")
+
+             # Check result size
+             if len(result) > MAX_RESULT_ROWS:
+                 logger.warning(f"Result truncated from {len(result)} to {MAX_RESULT_ROWS} rows")
+                 result = result.head(MAX_RESULT_ROWS)
+
+             logger.info(f"Query successful: {len(result)} rows, {len(result.columns)} columns")
+             self.tracer.trace_event("sql_success", {
+                 "rows": len(result),
+                 "columns": len(result.columns)
+             })
+
+             return result
+
+         except SQLToolError:
+             raise
+         except Exception as e:
+             error_msg = f"SQL execution failed: {str(e)}"
+             logger.error(error_msg)
+             self.tracer.trace_event("sql_error", {"error": error_msg})
+             raise SQLToolError(error_msg) from e
+
+     def _execute_bigquery(self, sql: str) -> pd.DataFrame:
+         """Execute query on BigQuery."""
+         try:
+             query_job = self.client.query(sql)
+             df = query_job.to_dataframe()
+             return df
+         except Exception as e:
+             raise SQLToolError(f"BigQuery execution error: {str(e)}") from e
+
+     def _execute_duckdb(self, sql: str) -> pd.DataFrame:
+         """Execute query on DuckDB/MotherDuck."""
+         try:
+             result = self.client.execute(sql)
+             df = result.fetch_df()
+             return df
+         except Exception as e:
+             raise SQLToolError(f"DuckDB execution error: {str(e)}") from e
+
+     def test_connection(self) -> bool:
+         """Test database connection."""
+         try:
+             test_query = "SELECT 1 AS test"
+             result = self.run(test_query)
+             return len(result) == 1 and result.iloc[0, 0] == 1
+         except Exception as e:
+             logger.error(f"Connection test failed: {e}")
+             return False
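
For reference, a minimal usage sketch of the updated class. It assumes AppConfig can be built with keyword fields matching the attributes SQLTool reads (sql_backend, motherduck_token, motherduck_db, gcp_project) and that Tracer() takes no arguments; the actual constructors in utils/ may differ.

import os

from utils.config import AppConfig
from utils.tracing import Tracer
from tools.sql_tool import SQLTool, SQLToolError

# Hypothetical construction: keyword names mirror the attributes SQLTool reads;
# the real AppConfig/Tracer signatures may differ.
cfg = AppConfig(
    sql_backend="motherduck",                        # or "bigquery"
    motherduck_token=os.environ["MOTHERDUCK_TOKEN"],
    motherduck_db="analytics",
    gcp_project=None,
)
tracer = Tracer()

try:
    tool = SQLTool(cfg, tracer)      # raises SQLToolError if the backend cannot be initialized
    if tool.test_connection():
        # Plain SELECT statements pass through _nl_to_sql unchanged;
        # natural-language prompts fall back to the built-in templates.
        df = tool.run("SELECT 42 AS answer")
        print(df)
except SQLToolError as err:
    print(f"SQL tool failed: {err}")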