Spaces:

Roxanne-WANG
/

LangSQL

Paused

App Files Files Community

Roxanne-WANG committed on Apr 20

Commit

abb320a

1 Parent(s): 1c20d2c

update model weight

Browse files

Files changed (23) hide show

build_whoosh_index.py +92 -0
db_contents_index/singer/{write.lock → MAIN_WRITELOCK} +0 -0
db_contents_index/singer/MAIN_qq60yoh2am2v4iv7.seg +0 -0
db_contents_index/singer/_0.fdm +0 -0
db_contents_index/singer/_0.fdt +0 -0
db_contents_index/singer/_0.fdx +0 -0
db_contents_index/singer/_0.fnm +0 -0
db_contents_index/singer/_0.nvd +0 -0
db_contents_index/singer/_0.nvm +0 -0
db_contents_index/singer/_0.si +0 -0
db_contents_index/singer/_0.tvd +0 -0
db_contents_index/singer/_0.tvm +0 -0
db_contents_index/singer/_0.tvx +0 -0
db_contents_index/singer/_0_Lucene90_0.doc +0 -0
db_contents_index/singer/_0_Lucene90_0.dvd +0 -0
db_contents_index/singer/_0_Lucene90_0.dvm +0 -0
db_contents_index/singer/_0_Lucene90_0.pos +0 -0
db_contents_index/singer/_0_Lucene90_0.tim +0 -0
db_contents_index/singer/_0_Lucene90_0.tip +0 -0
db_contents_index/singer/_0_Lucene90_0.tmd +0 -0
db_contents_index/singer/_MAIN_1.toc +0 -0
db_contents_index/singer/segments_1 +0 -0
text2sql.py +9 -11

build_whoosh_index.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import os
+import sqlite3
+from whoosh import index
+from whoosh.fields import Schema, ID, TEXT
def extract_contents_from_db(db_path, max_len=25):
    """
    Extract short text values from every user table of a SQLite database.

    Each non-null value is converted with ``str()`` and stripped of
    surrounding whitespace; values that end up empty or longer than
    ``max_len`` characters are dropped. Within one column, each resulting
    text is emitted only once.

    Args:
        db_path: Path to the SQLite database file.
        max_len: Maximum length (in characters) of a stripped value to keep.

    Returns:
        List of tuples [(doc_id, text), ...], where doc_id has the form
        "<table>-<column>-<hash(text)>".
    """

    def quote_ident(name):
        # Standard SQL identifier quoting: wrap in double quotes and double
        # any embedded quote so unusual table/column names cannot break
        # the statement.
        return '"' + name.replace('"', '""') + '"'

    docs = []
    conn = sqlite3.connect(db_path)
    try:
        # Materialise the table list before issuing further queries.
        # Iterating a cursor while re-executing on that same cursor resets
        # it, which would end the table loop after the first table.
        # conn.execute() creates a fresh cursor for every statement.
        table_names = [
            row[0]
            for row in conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            ).fetchall()
        ]
        for table_name in table_names:
            # Names starting with "sqlite_" are reserved for SQLite's own
            # bookkeeping tables (e.g. sqlite_sequence); skip them.
            if table_name.startswith("sqlite_"):
                continue
            table = quote_ident(table_name)
            # PRAGMA table_info returns rows like (cid, name, type, ...);
            # the column name is at index 1.
            cols = [
                row[1]
                for row in conn.execute(f"PRAGMA table_info({table})").fetchall()
            ]
            for col in cols:
                column = quote_ident(col)
                # DISTINCT is applied before stripping/stringifying, so two
                # raw values can still collapse to the same text; track what
                # was already emitted for this column.
                seen = set()
                for (val,) in conn.execute(
                    f"SELECT DISTINCT {column} FROM {table} "
                    f"WHERE {column} IS NOT NULL"
                ):
                    text = str(val).strip()
                    if not 0 < len(text) <= max_len or text in seen:
                        continue
                    seen.add(text)
                    # NOTE: hash() of a str is salted per process unless
                    # PYTHONHASHSEED is fixed, so these IDs are only stable
                    # within a single run.
                    doc_id = f"{table_name}-{col}-{hash(text)}"
                    docs.append((doc_id, text))
    finally:
        # Release the database handle even if a query fails.
        conn.close()
    return docs
def build_index_for_db(db_id, db_path, index_root="db_contents_index"):
    """
    Build (or open) a Whoosh index for a single database.

    - If an index already exists in index_root/db_id, it is opened and
      returned without touching the SQLite file.
    - Otherwise, a new index is created and populated from the SQLite file.

    Args:
        db_id: Name of the database; used as the index sub-directory name.
        db_path: Path to the SQLite file whose contents are indexed.
        index_root: Directory that holds one index directory per database.

    Returns:
        The opened or newly created Whoosh index object.
    """
    index_dir = os.path.join(index_root, db_id)
    os.makedirs(index_dir, exist_ok=True)

    # Reuse an existing index if present.
    if index.exists_in(index_dir):
        return index.open_dir(index_dir)

    # Read the database BEFORE creating the index: if extraction fails, no
    # empty index is left behind to be mistaken for a complete one by the
    # exists_in() check on the next call.
    docs = extract_contents_from_db(db_path)

    # Schema: unique document ID + stored, searchable text field.
    schema = Schema(
        id=ID(stored=True, unique=True),
        content=TEXT(stored=True),
    )
    ix = index.create_in(index_dir, schema)

    # Used as a context manager, the writer commits on success and cancels
    # (releasing the write lock) if adding a document raises.
    with ix.writer() as writer:
        for doc_id, text in docs:
            writer.add_document(id=doc_id, content=text)
    return ix
if __name__ == "__main__":
    DATABASE_ROOT = "databases"
    INDEX_ROOT = "db_contents_index"

    # Start from a clean slate: drop any previously built indexes, then
    # recreate the (empty) index root.
    if os.path.isdir(INDEX_ROOT):
        import shutil
        shutil.rmtree(INDEX_ROOT)
    os.makedirs(INDEX_ROOT, exist_ok=True)

    # Every entry of databases/ that contains "<entry>/<entry>.sqlite"
    # gets its own index; anything else is ignored.
    for entry in os.listdir(DATABASE_ROOT):
        sqlite_path = os.path.join(DATABASE_ROOT, entry, f"{entry}.sqlite")
        if not os.path.isfile(sqlite_path):
            continue
        print(f"Building Whoosh index for {entry}...")
        build_index_for_db(entry, sqlite_path, INDEX_ROOT)

    print("All indexes built successfully.")

db_contents_index/singer/{write.lock → MAIN_WRITELOCK} RENAMED Viewed

File without changes

db_contents_index/singer/MAIN_qq60yoh2am2v4iv7.seg ADDED Viewed

Binary file (19.9 kB). View file

db_contents_index/singer/_0.fdm DELETED Viewed

Binary file (157 Bytes)

db_contents_index/singer/_0.fdt DELETED Viewed

Binary file (1.96 kB)

db_contents_index/singer/_0.fdx DELETED Viewed

Binary file (64 Bytes)

db_contents_index/singer/_0.fnm DELETED Viewed

Binary file (343 Bytes)

db_contents_index/singer/_0.nvd DELETED Viewed

Binary file (126 Bytes)

db_contents_index/singer/_0.nvm DELETED Viewed

Binary file (103 Bytes)

db_contents_index/singer/_0.si DELETED Viewed

Binary file (520 Bytes)

db_contents_index/singer/_0.tvd DELETED Viewed

Binary file (518 Bytes)

db_contents_index/singer/_0.tvm DELETED Viewed

Binary file (162 Bytes)

db_contents_index/singer/_0.tvx DELETED Viewed

Binary file (69 Bytes)

db_contents_index/singer/_0_Lucene90_0.doc DELETED Viewed

Binary file (101 Bytes)

db_contents_index/singer/_0_Lucene90_0.dvd DELETED Viewed

Binary file (1.68 kB)

db_contents_index/singer/_0_Lucene90_0.dvm DELETED Viewed

Binary file (171 Bytes)

db_contents_index/singer/_0_Lucene90_0.pos DELETED Viewed

Binary file (160 Bytes)

db_contents_index/singer/_0_Lucene90_0.tim DELETED Viewed

Binary file (1.22 kB)

db_contents_index/singer/_0_Lucene90_0.tip DELETED Viewed

Binary file (107 Bytes)

db_contents_index/singer/_0_Lucene90_0.tmd DELETED Viewed

Binary file (269 Bytes)

db_contents_index/singer/_MAIN_1.toc ADDED Viewed

Binary file (1.63 kB). View file

db_contents_index/singer/segments_1 DELETED Viewed

Binary file (154 Bytes)

text2sql.py CHANGED Viewed

@@ -9,6 +9,7 @@ import sqlite3
 from tqdm import tqdm
 from utils.db_utils import get_db_schema
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from whoosh.index import create_in
 from whoosh.fields import Schema, TEXT
 from whoosh.qparser import QueryParser
@@ -115,18 +116,15 @@ class ChatBot():
         self.sic = SchemaItemClassifierInference("Roxanne-WANG/LangSQL")
         self.db_id2content_searcher = dict()
         for db_id in os.listdir("db_contents_index"):
-            schema = Schema(content=TEXT(stored=True))
             index_dir = os.path.join("db_contents_index", db_id)
-            if not os.path.exists(index_dir):
-                os.makedirs(index_dir)
-            ix = create_in(index_dir, schema)
-            writer = ix.writer()
-            with open(os.path.join(index_dir, f"{db_id}.json"), "r") as file:
-                data = json.load(file)
-                for item in data:
-                    writer.add_document(content=item['content'])
-            writer.commit()
-            self.db_id2content_searcher[db_id] = ix
         self.db_ids = sorted(os.listdir("databases"))
         self.db_id2schema = get_db_id2schema("databases", "data/tables.json")

 from tqdm import tqdm
 from utils.db_utils import get_db_schema
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from whoosh import index
 from whoosh.index import create_in
 from whoosh.fields import Schema, TEXT
 from whoosh.qparser import QueryParser
         self.sic = SchemaItemClassifierInference("Roxanne-WANG/LangSQL")
         self.db_id2content_searcher = dict()
         for db_id in os.listdir("db_contents_index"):
             index_dir = os.path.join("db_contents_index", db_id)
+            # Open existing Whoosh index directory
+            if index.exists_in(index_dir):
+                ix = index.open_dir(index_dir)
+                # keep a searcher around for querying
+                self.db_id2content_searcher[db_id] = ix.searcher()
+            else:
+                raise ValueError(f"No Whoosh index found for '{db_id}' at '{index_dir}'")
         self.db_ids = sorted(os.listdir("databases"))
         self.db_id2schema = get_db_id2schema("databases", "data/tables.json")