Spaces:

AshenH
/

ALM_LLM

Sleeping

App Files Files Community

AshenH committed on Oct 8

Commit

2336094

verified ·

1 Parent(s): 52a979b

Update tools/sql_tool.py

Browse files

Files changed (1) hide show

tools/sql_tool.py +69 -28

tools/sql_tool.py CHANGED Viewed

@@ -16,68 +16,110 @@ class SQLTool:
         if self.backend == "bigquery":
             from google.cloud import bigquery
             from google.oauth2 import service_account
             key_json = os.getenv("GCP_SERVICE_ACCOUNT_JSON")
             if not key_json:
                 raise RuntimeError("Missing GCP_SERVICE_ACCOUNT_JSON secret")
-            # Accept full JSON string from Space Secret
             info = json.loads(key_json) if key_json.strip().startswith("{") else {}
             creds = service_account.Credentials.from_service_account_info(info)
             self.client = bigquery.Client(credentials=creds, project=cfg.gcp_project)
         elif self.backend == "motherduck":
             import duckdb
-            # MotherDuck currently supports DuckDB 1.3.2 broadly across hosts
             if not duckdb.__version__.startswith("1.3.2"):
                 raise RuntimeError(
                     f"Incompatible DuckDB version {duckdb.__version__}. "
                     "Pin duckdb==1.3.2 in requirements.txt and redeploy."
                 )
-            token = self.cfg.motherduck_token or os.getenv("MOTHERDUCK_TOKEN")
-            db_name = my_db
             if not token:
                 raise RuntimeError("Missing MOTHERDUCK_TOKEN")
-            # Easiest, correct way: connect directly to MotherDuck database.
-            # This will auto-download/load the extension; no manual INSTALL/LOAD/ATTACH needed.
-            # Valid URIs include:
-            #   "md:"                  -> connects to workspace (all DBs)
-            #   f"md:{db_name}"        -> connects to a specific DB
-            #   f"md:{db_name}?motherduck_token=..." -> with token in URI
-            uri = f"md:{db_name}?motherduck_token={token}"
-            self.client = duckdb.connect(uri)
-            # Optional: set a default database context (USE) if you connected to 'md:' (workspace)
-            # if db_name in ("", "workspace"):
-            #     self.client.execute("USE your_database;")
         else:
-            raise RuntimeError("Unknown SQL backend")
-    def _nl_to_sql(self, message: str) -> str:
         """
-        Minimal NL2SQL heuristic; replace with your own mapping or LLM prompt.
-        Edit table/column names to your schema.
         """
         m = message.lower()
-        # Simple example (DuckDB/MotherDuck DATE_TRUNC flavor)
         if "avg" in m and " by " in m:
             return (
-                "-- Example template; edit me\n"
-                "SELECT DATE_TRUNC('month', date_col) AS month, "
-                "AVG(metric) AS avg_metric "
-                "FROM analytics.table "
-                "GROUP BY 1 "
                 "ORDER BY 1;"
             )
-        # Pass-through if the user typed SQL explicitly
         if re.match(r"^\s*select ", m):
             return message
         return "SELECT * FROM analytics.table LIMIT 100;"
     def run(self, message: str) -> pd.DataFrame:
         sql = self._nl_to_sql(message)
         try:
@@ -88,5 +130,4 @@ class SQLTool:
         if self.backend == "bigquery":
             return self.client.query(sql).to_dataframe()
         else:
-            # DuckDB (MotherDuck)
             return self.client.execute(sql).fetch_df()

         if self.backend == "bigquery":
             from google.cloud import bigquery
             from google.oauth2 import service_account
             key_json = os.getenv("GCP_SERVICE_ACCOUNT_JSON")
             if not key_json:
                 raise RuntimeError("Missing GCP_SERVICE_ACCOUNT_JSON secret")
             info = json.loads(key_json) if key_json.strip().startswith("{") else {}
             creds = service_account.Credentials.from_service_account_info(info)
             self.client = bigquery.Client(credentials=creds, project=cfg.gcp_project)
         elif self.backend == "motherduck":
             import duckdb
+            # MotherDuck extension compatibility: widely supported ABI is DuckDB 1.3.2
             if not duckdb.__version__.startswith("1.3.2"):
                 raise RuntimeError(
                     f"Incompatible DuckDB version {duckdb.__version__}. "
                     "Pin duckdb==1.3.2 in requirements.txt and redeploy."
                 )
+            token = (self.cfg.motherduck_token or os.getenv("MOTHERDUCK_TOKEN") or "").strip()
+            db_name = (self.cfg.motherduck_db or "workspace").strip()
+            allow_create = (os.getenv("ALLOW_CREATE_DB", "true").lower() == "true")
             if not token:
                 raise RuntimeError("Missing MOTHERDUCK_TOKEN")
+            # Primary path: connect directly to the database
+            # Correct formats: "md:" (workspace) or "md:<dbname>" (specific DB)
+            try:
+                uri = f"md:{db_name}?motherduck_token={token}" if db_name and db_name != "workspace" else f"md:?motherduck_token={token}"
+                self.client = duckdb.connect(uri)
+                # If we connected to workspace explicitly, set DB context if provided
+                if db_name and db_name != "workspace":
+                    # Ensure we are actually in the right DB context
+                    self._ensure_db_context(db_name, allow_create)
+            except Exception as e:
+                # Fallback: connect to workspace, then create/use DB if needed
+                self.client = duckdb.connect(f"md:?motherduck_token={token}")
+                if not db_name or db_name == "workspace":
+                    # Using workspace only, caller must fully-qualify schema.table in queries
+                    pass
+                else:
+                    self._ensure_db_context(db_name, allow_create)
         else:
+            raise RuntimeError(f"Unknown SQL backend: {self.backend}")
+    # ----- MotherDuck helpers -----
+    def _ensure_db_context(self, db_name: str, allow_create: bool):
+        """
+        Try to USE the target DB; if it doesn't exist and allow_create=True, create it and USE it.
+        """
+        # DuckDB/MotherDuck: USE <db_name>;
+        try:
+            self.client.execute(f"USE {self._quote_ident(db_name)};")
+        except Exception as use_err:
+            if not allow_create:
+                raise RuntimeError(
+                    f"Database '{db_name}' not found and ALLOW_CREATE_DB is false. "
+                    f"Original error: {use_err}"
+                )
+            # Attempt to create then USE
+            try:
+                self.client.execute(f"CREATE DATABASE {self._quote_ident(db_name)};")
+                self.client.execute(f"USE {self._quote_ident(db_name)};")
+            except Exception as create_err:
+                raise RuntimeError(
+                    f"Could not create or use database '{db_name}'. "
+                    f"Original errors: USE: {use_err} | CREATE: {create_err}"
+                )
+    @staticmethod
+    def _quote_ident(name: str) -> str:
         """
+        Very light identifier quoting. Adjust if you allow special chars.
         """
+        if not name:
+            return name
+        # basic guard; you can tighten rules for your org naming conventions
+        safe = re.sub(r"[^a-zA-Z0-9_]", "_", name)
+        return safe
+    # ----- NL → SQL heuristic (toy example; edit to your schema) -----
+    def _nl_to_sql(self, message: str) -> str:
         m = message.lower()
+        # Example DuckDB/MotherDuck flavor of DATE_TRUNC
         if "avg" in m and " by " in m:
             return (
+                "-- Example template; edit to your schema/columns\n"
+                "SELECT DATE_TRUNC('month', date_col) AS month,\n"
+                "       AVG(metric) AS avg_metric\n"
+                "FROM analytics.table\n"
+                "GROUP BY 1\n"
                 "ORDER BY 1;"
             )
+        # If user typed SQL already, run it as-is
         if re.match(r"^\s*select ", m):
             return message
+        # Fallback
         return "SELECT * FROM analytics.table LIMIT 100;"
+    # ----- Execute -----
     def run(self, message: str) -> pd.DataFrame:
         sql = self._nl_to_sql(message)
         try:
         if self.backend == "bigquery":
             return self.client.query(sql).to_dataframe()
         else:
             return self.client.execute(sql).fetch_df()