Commit 749f953
Parent(s): 4640e16
update utils
utils/db_utils.py CHANGED (+47, -5)
@@ -7,6 +7,8 @@ from utils.bridge_content_encoder import get_matched_entries
 from nltk.tokenize import word_tokenize
 from nltk import ngrams
 
+from whoosh.qparser import QueryParser
+
 def add_a_record(question, db_id):
     conn = sqlite3.connect('data/history/history.sqlite')
     cursor = conn.cursor()
@@ -97,15 +99,19 @@ def get_column_contents(column_name, table_name, cursor):
     return column_contents
 
 def get_matched_contents(question, searcher):
-    # coarse-grained matching between the input text and all contents in database
+    # Coarse-grained matching between the input text and all contents in database
     grams = obtain_n_grams(question, 4)
     hits = []
+
+    # Parse each n-gram query into a valid Whoosh query object
     for query in grams:
-        hits.extend(searcher.search(query, limit = 10))
+        query_parser = QueryParser("content", schema=searcher.schema)  # 'content' should match the field you are searching
+        parsed_query = query_parser.parse(query)  # Convert the query string into a Whoosh Query object
+        hits.extend(searcher.search(parsed_query, limit=10))  # Perform the search with the parsed query
 
     coarse_matched_contents = dict()
-    for i in range(len(hits)):
-        matched_result = json.loads(hits[i].raw)
+    for hit in hits:
+        matched_result = json.loads(hit.raw)
         # `tc_name` refers to column names like `table_name.column_name`, e.g., document_drafts.document_id
         tc_name = ".".join(matched_result["id"].split("-**-")[:2])
         if tc_name in coarse_matched_contents.keys():
@@ -116,7 +122,7 @@ def get_matched_contents(question, searcher):
 
     fine_matched_contents = dict()
     for tc_name, contents in coarse_matched_contents.items():
-        # fine-grained matching between the question and coarse matched contents
+        # Fine-grained matching between the question and coarse matched contents
         fm_contents = get_matched_entries(question, contents)
 
         if fm_contents is None:
@@ -132,6 +138,42 @@ def get_matched_contents(question, searcher):
 
     return fine_matched_contents
 
+# def get_matched_contents(question, searcher):
+#     # coarse-grained matching between the input text and all contents in database
+#     grams = obtain_n_grams(question, 4)
+#     hits = []
+#     for query in grams:
+#         hits.extend(searcher.search(query, limit = 10))
+
+#     coarse_matched_contents = dict()
+#     for i in range(len(hits)):
+#         matched_result = json.loads(hits[i].raw)
+#         # `tc_name` refers to column names like `table_name.column_name`, e.g., document_drafts.document_id
+#         tc_name = ".".join(matched_result["id"].split("-**-")[:2])
+#         if tc_name in coarse_matched_contents.keys():
+#             if matched_result["contents"] not in coarse_matched_contents[tc_name]:
+#                 coarse_matched_contents[tc_name].append(matched_result["contents"])
+#         else:
+#             coarse_matched_contents[tc_name] = [matched_result["contents"]]
+
+#     fine_matched_contents = dict()
+#     for tc_name, contents in coarse_matched_contents.items():
+#         # fine-grained matching between the question and coarse matched contents
+#         fm_contents = get_matched_entries(question, contents)
+
+#         if fm_contents is None:
+#             continue
+#         for _match_str, (field_value, _s_match_str, match_score, s_match_score, _match_size,) in fm_contents:
+#             if match_score < 0.9:
+#                 continue
+#             if tc_name in fine_matched_contents.keys():
+#                 if len(fine_matched_contents[tc_name]) < 25:
+#                     fine_matched_contents[tc_name].append(field_value.strip())
+#             else:
+#                 fine_matched_contents[tc_name] = [field_value.strip()]
+
+#     return fine_matched_contents
+
 def get_db_schema_sequence(schema):
     schema_sequence = "database schema :\n"
     for table in schema["schema_items"]:
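
The substantive change in get_matched_contents is the query path: the previous code passed each n-gram string straight to searcher.search, while the new code targets Whoosh, whose Searcher.search expects a parsed whoosh.query.Query object, hence the QueryParser("content", schema=searcher.schema) step. The sketch below shows that pattern end to end against a tiny stand-in index. The index construction is not part of this commit, so the index directory, the Schema with stored id/content fields, and the sample documents are assumptions made for illustration only; the "content" field name and the "-**-"-delimited id format are taken from the diff.

import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in
from whoosh.qparser import QueryParser

# Assumed schema: an "id" shaped like "table-**-column-**-rowid" plus the cell text,
# both stored so they can be read back from search hits.
schema = Schema(id=ID(stored=True), content=TEXT(stored=True))

os.makedirs("contents_index", exist_ok=True)
ix = create_in("contents_index", schema)

writer = ix.writer()
writer.add_document(id="document_drafts-**-document_id-**-0", content="1001")
writer.add_document(id="document_drafts-**-draft_details-**-1", content="initial project outline")
writer.commit()

with ix.searcher() as searcher:
    # Same pattern as the diff: parse the query string before searching,
    # because Whoosh's Searcher.search expects a Query object, not a str.
    parser = QueryParser("content", schema=searcher.schema)
    parsed_query = parser.parse("project outline")
    for hit in searcher.search(parsed_query, limit=10):
        # Stored fields come back via item access on the Hit object.
        print(hit["id"], hit["content"])

One Whoosh-specific detail: Hit objects expose stored fields through item access or hit.fields() rather than a Pyserini-style .raw attribute, so how the JSON payload read by json.loads(hit.raw) is attached to hits depends on how the Space builds its index.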
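
get_matched_contents also relies on obtain_n_grams(question, 4), which this diff does not show. Given the nltk imports visible in the context lines (word_tokenize and ngrams), a helper of that shape might look like the hypothetical sketch below; the repository's actual implementation may differ.

from nltk import ngrams
from nltk.tokenize import word_tokenize

def obtain_n_grams(text, max_n):
    # Hypothetical sketch: collect all 1- to max_n-grams of the tokenized
    # text as plain strings, which get_matched_contents then uses as queries.
    tokens = word_tokenize(text)
    all_grams = []
    for n in range(1, max_n + 1):
        for gram in ngrams(tokens, n):
            all_grams.append(" ".join(gram))
    return all_grams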
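
Finally, the tc_name derivation maps a hit id onto a table.column string, as the in-code comment's example suggests; a quick check (with the trailing row segment assumed for illustration):

matched_id = "document_drafts-**-document_id-**-42"  # trailing segment assumed for illustration
tc_name = ".".join(matched_id.split("-**-")[:2])
print(tc_name)  # -> document_drafts.document_id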