Spaces:

AshenH
/

ALM_LLM

Sleeping

File size: 5,467 Bytes

b07564d
f4dc602
 
b07564d
f4dc602
 
 
 
e002acf
85b8a4e
 
 
f4dc602
 
 
 
 
e002acf
85b8a4e
f4dc602
 
 
2336094
f4dc602
 
 
e002acf
b07564d
e002acf
f4dc602
e002acf
85b8a4e
e002acf
b07564d
2336094
 
9d6bac9
 
b07564d
 
9d6bac9
e002acf
2336094
 
 
e002acf
 
 
85b8a4e
 
 
2336094
85b8a4e
 
 
 
 
 
 
 
2336094
 
f4dc602
2336094
f4dc602
2336094
 
 
 
85b8a4e
2336094
85b8a4e
 
 
 
 
2336094
 
85b8a4e
2336094
 
 
 
 
 
85b8a4e
 
 
 
 
 
 
 
 
 
 
2336094
 
 
e002acf
85b8a4e
e002acf
85b8a4e
2336094
 
 
 
f4dc602
e002acf
2336094
f4dc602
e002acf
2336094
 
 
 
 
b07564d
e002acf
 
2336094
85b8a4e
f4dc602
e002acf
2336094
e002acf
f4dc602
2336094
f4dc602
 
b07564d
 
 
 
e002acf
f4dc602
54614e9
f4dc602
54614e9

# space/tools/sql_tool.py
import os
import re
import json
import pandas as pd
from utils.config import AppConfig
from utils.tracing import Tracer


# MotherDuck database names that mean "no specific database". When the
# configured name is one of these, SQLTool connects to the workspace and
# skips any USE / CREATE DATABASE step.
RESERVED_MD_WORKSPACE_NAMES = {"", "workspace", "default"}


class SQLTool:
    """Turn a chat message into SQL and run it on the configured backend.

    The backend is chosen by ``cfg.sql_backend`` and must be ``"bigquery"``
    or ``"motherduck"``. The connection is opened eagerly in ``__init__`` and
    stored on ``self.client``.
    """

    def __init__(self, cfg: AppConfig, tracer: Tracer):
        """Open a client for the backend named in ``cfg.sql_backend``.

        BigQuery reads service-account credentials from the
        ``GCP_SERVICE_ACCOUNT_JSON`` environment variable. MotherDuck reads
        its token from ``cfg.motherduck_token`` or ``MOTHERDUCK_TOKEN`` and
        its database name from ``cfg.motherduck_db``.

        Raises:
            RuntimeError: if a required secret is missing, the installed
                DuckDB version is not 1.3.2, the target database can be
                neither used nor created, or the backend name is unknown.
        """
        self.cfg = cfg
        self.tracer = tracer
        self.backend = cfg.sql_backend  # "bigquery" or "motherduck"

        # ---------------- BIGQUERY BACKEND ----------------
        if self.backend == "bigquery":
            # Imported lazily so the MotherDuck path does not need the
            # Google client libraries installed.
            from google.cloud import bigquery
            from google.oauth2 import service_account

            key_json = os.getenv("GCP_SERVICE_ACCOUNT_JSON")
            if not key_json:
                raise RuntimeError("Missing GCP_SERVICE_ACCOUNT_JSON secret")

            # NOTE(review): a value that is not a JSON object becomes {} here
            # and is rejected by from_service_account_info() below.
            info = json.loads(key_json) if key_json.strip().startswith("{") else {}
            creds = service_account.Credentials.from_service_account_info(info)
            self.client = bigquery.Client(credentials=creds, project=cfg.gcp_project)

        # ---------------- MOTHERDUCK BACKEND ----------------
        elif self.backend == "motherduck":
            import duckdb

            # MotherDuck extension compatibility: widely supported ABI is DuckDB 1.3.2
            if not duckdb.__version__.startswith("1.3.2"):
                raise RuntimeError(
                    f"Incompatible DuckDB version {duckdb.__version__}. "
                    "Pin duckdb==1.3.2 in requirements.txt and redeploy."
                )

            token = (self.cfg.motherduck_token or os.getenv("MOTHERDUCK_TOKEN") or "").strip()
            db_name = (self.cfg.motherduck_db or "workspace").strip()
            allow_create = (os.getenv("ALLOW_CREATE_DB", "true").lower() == "true")
            if not token:
                raise RuntimeError("Missing MOTHERDUCK_TOKEN")

            # Workspace vs concrete DB handling
            if db_name in RESERVED_MD_WORKSPACE_NAMES:
                # Connect to workspace; caller should fully-qualify tables if needed.
                # No USE/CREATE in workspace mode.
                self.client = duckdb.connect(f"md:?motherduck_token={token}")
            else:
                # Try direct connection to the DB (preferred)
                try:
                    self.client = duckdb.connect(f"md:{db_name}?motherduck_token={token}")
                except Exception:
                    # Fallback: connect to workspace, then USE/CREATE the DB if permitted
                    self.client = duckdb.connect(f"md:?motherduck_token={token}")
                    self._ensure_db_context(db_name, allow_create)

        else:
            raise RuntimeError(f"Unknown SQL backend: {self.backend}")

    # ----- MotherDuck helpers -----
    def _ensure_db_context(self, db_name: str, allow_create: bool):
        """Make ``db_name`` the current database on ``self.client``.

        Tries ``USE`` first. If that fails and ``allow_create`` is true, runs
        ``CREATE DATABASE`` followed by ``USE``. Names listed in
        ``RESERVED_MD_WORKSPACE_NAMES`` are a no-op.

        Raises:
            RuntimeError: if ``USE`` fails and creation is not allowed, or if
                the create-then-use attempt fails. The underlying exception
                is attached as ``__cause__``.
        """
        if db_name in RESERVED_MD_WORKSPACE_NAMES:
            # No-op for workspace/default
            return

        ident = self._quote_ident(db_name)

        # Attempt USE first
        try:
            self.client.execute(f"USE {ident};")
            return
        except Exception as err:
            if not allow_create:
                raise RuntimeError(
                    f"Database '{db_name}' not found and ALLOW_CREATE_DB is false. "
                    f"Original error: {err}"
                ) from err
            # Python unbinds the `as` name when the handler exits, so keep a
            # reference for the combined error message below.
            use_err = err

        # Attempt CREATE then USE
        try:
            # CREATE DATABASE <name>; is supported on MotherDuck for valid names (not 'default')
            self.client.execute(f"CREATE DATABASE {ident};")
            self.client.execute(f"USE {ident};")
        except Exception as create_err:
            raise RuntimeError(
                f"Could not create or use database '{db_name}'. "
                f"Original errors: USE: {use_err}; CREATE: {create_err}"
            ) from create_err

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return ``name`` with every character outside ``[a-zA-Z0-9_]``
        replaced by an underscore.

        Despite the method name, no quotes are added: the result is a
        sanitised bare identifier. ``None`` or an empty string yields ``""``.

        NOTE(review): the sanitised name can differ from the configured one
        (e.g. ``my-db`` becomes ``my_db``), so the fallback USE/CREATE may
        target a different database than the direct ``md:<name>`` connection
        attempted first. Confirm this is intended.
        """
        return re.sub(r"[^a-zA-Z0-9_]", "_", (name or ""))

    # ----- NL → SQL heuristic (toy example; edit to your schema) -----
    def _nl_to_sql(self, message: str) -> str:
        """Map a chat message to a SQL string.

        Rules, in priority order:

        1. A message that already starts with ``SELECT`` (any case, leading
           whitespace allowed) is returned unchanged.
        2. A message containing both ``avg`` and `` by `` yields a monthly
           average template query.
        3. Anything else yields a generic ``LIMIT 100`` preview query.
        """
        m = message.lower()

        # If user typed SQL already, run it as-is. This is checked first so
        # that a query such as "SELECT avg(x) ... GROUP BY y" is not replaced
        # by the template below.
        # NOTE(review): only the leading keyword is inspected; a message like
        # "select 1; drop table t" passes through untouched. Restrict the
        # connection's privileges if messages come from untrusted users.
        if re.match(r"\s*select\s", m):
            return message

        # Example DuckDB/MotherDuck flavor of DATE_TRUNC
        if "avg" in m and " by " in m:
            return (
                "-- Example template; edit to your schema/columns\n"
                "SELECT DATE_TRUNC('month', date_col) AS month,\n"
                "       AVG(metric) AS avg_metric\n"
                "FROM analytics.table\n"
                "GROUP BY 1\n"
                "ORDER BY 1;"
            )

        # Fallback
        return "SELECT * FROM analytics.table LIMIT 100;"

    # ----- Execute -----
    def run(self, message: str) -> pd.DataFrame:
        """Translate ``message`` to SQL, trace it, execute it, and return
        the result as a DataFrame.

        Errors raised by the backend while executing the query propagate to
        the caller.
        """
        sql = self._nl_to_sql(message)
        try:
            self.tracer.trace_event("sql_query", {"sql": sql, "backend": self.backend})
        except Exception:
            # Tracing is best-effort: a tracer failure must not block the query.
            pass

        if self.backend == "bigquery":
            return self.client.query(sql).to_dataframe()
        else:
            return self.client.execute(sql).fetch_df()