Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,6 +22,7 @@ from duckduckgo_search import DDGS
|
|
| 22 |
import requests
|
| 23 |
import tempfile
|
| 24 |
|
|
|
|
| 25 |
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
| 26 |
|
| 27 |
# create the length function
|
|
@@ -149,14 +150,15 @@ def add_files_to_zip(session_id):
|
|
| 149 |
arcname = os.path.relpath(file_path, session_id)
|
| 150 |
zipObj.write(file_path, arcname)
|
| 151 |
|
| 152 |
-
|
| 153 |
## Search files functions ##
|
| 154 |
|
| 155 |
def search_docs(topic, max_references):
|
|
|
|
| 156 |
doc_list = []
|
| 157 |
with DDGS() as ddgs:
|
| 158 |
i=0
|
| 159 |
for r in ddgs.text('{} filetype:pdf'.format(topic), region='wt-wt', safesearch='On', timelimit='n'):
|
|
|
|
| 160 |
if i>=max_references:
|
| 161 |
break
|
| 162 |
doc_list.append("TITLE : " + r['title'] + " -- BODY : " + r['body'] + " -- URL : " + r['href'])
|
|
@@ -164,7 +166,7 @@ def search_docs(topic, max_references):
|
|
| 164 |
return doc_list
|
| 165 |
|
| 166 |
|
| 167 |
-
def store_files(references):
|
| 168 |
url_list=[]
|
| 169 |
temp_files = []
|
| 170 |
for ref in references:
|
|
@@ -182,11 +184,13 @@ def store_files(references):
|
|
| 182 |
temp_file = tempfile.NamedTemporaryFile(delete=False,prefix=filename, suffix='.pdf')
|
| 183 |
temp_file.write(response.content)
|
| 184 |
temp_file.close()
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
return temp_files
|
| 188 |
-
|
| 189 |
-
|
| 190 |
## Summary functions ##
|
| 191 |
|
| 192 |
## Load each doc from the vector store
|
|
@@ -289,7 +293,7 @@ def embed_files(files,ui_session_id,progress=gr.Progress(),progress_step=0.05):
|
|
| 289 |
|
| 290 |
print("EMBEDDED, before embeddeding: ",session_id,len(db.index_to_docstore_id))
|
| 291 |
for file_id,file in enumerate(files):
|
| 292 |
-
print("ID : ", file_id,"FILE : ", file)
|
| 293 |
file_type = file.name.split('.')[-1].lower()
|
| 294 |
source = file.name.split('/')[-1]
|
| 295 |
print(f"current file: {source}")
|
|
@@ -330,19 +334,25 @@ def embed_files(files,ui_session_id,progress=gr.Progress(),progress_step=0.05):
|
|
| 330 |
progress(progress_step, desc = 'db zipped')
|
| 331 |
return f"{session_id}.zip",ui_session_id
|
| 332 |
|
| 333 |
-
def display_docs(docs):
|
| 334 |
-
output_str = ''
|
| 335 |
-
for i, doc in enumerate(docs):
|
| 336 |
-
source = doc.metadata['source'].split('/')[-1]
|
| 337 |
-
output_str += f"Ref: {i+1}\n{repr(doc.page_content)}\nSource: {source}\n\n"
|
| 338 |
-
return output_str
|
| 339 |
|
| 340 |
|
| 341 |
def add_to_db(references,ui_session_id):
|
| 342 |
files = store_files(references)
|
| 343 |
return embed_files(files,ui_session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
|
| 345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
def ask_gpt(query, apikey,history,ui_session_id):
|
| 347 |
session_id = f"PDFAISS-{ui_session_id}"
|
| 348 |
try:
|
|
@@ -368,7 +378,10 @@ with gr.Blocks() as demo:
|
|
| 368 |
gr.Markdown("Upload your documents and question them.")
|
| 369 |
with gr.Accordion("Open to enter your API key", open=False):
|
| 370 |
apikey_input = gr.Textbox(placeholder="Type here your OpenAI API key to use Summarization and Q&A", label="OpenAI API Key",type='password')
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
| 372 |
with gr.Accordion("Get files from the web", open=False):
|
| 373 |
with gr.Column():
|
| 374 |
topic_input = gr.Textbox(placeholder="Type your research", label="Research")
|
|
@@ -379,6 +392,8 @@ with gr.Blocks() as demo:
|
|
| 379 |
dd_documents.style(container=True)
|
| 380 |
with gr.Row():
|
| 381 |
btn_dl = gr.Button("Add these files to the Database")
|
|
|
|
|
|
|
| 382 |
tb_session_id = gr.Textbox(label='session id')
|
| 383 |
docs_input = gr.File(file_count="multiple", file_types=[".txt", ".pdf",".zip",".docx"])
|
| 384 |
db_output = gr.outputs.File(label="Download zipped database")
|
|
@@ -406,10 +421,12 @@ with gr.Blocks() as demo:
|
|
| 406 |
|
| 407 |
btn_search.click(search_docs, inputs=[topic_input, max_files], outputs=dd_documents)
|
| 408 |
btn_dl.click(add_to_db, inputs=[dd_documents,tb_session_id], outputs=[db_output,tb_session_id])
|
|
|
|
|
|
|
| 409 |
btn_generate_db.click(embed_files, inputs=[docs_input,tb_session_id], outputs=[db_output,tb_session_id])
|
| 410 |
btn_reset_db.click(reset_database,inputs=[tb_session_id],outputs=[db_output])
|
| 411 |
btn_summary.click(summarize_docs, inputs=[apikey_input,tb_session_id], outputs=summary_output)
|
| 412 |
btn_askGPT.click(ask_gpt, inputs=[query_input,apikey_input,history,tb_session_id], outputs=[answer_output,sources,history])
|
| 413 |
-
|
| 414 |
demo.queue(concurrency_count=10)
|
| 415 |
demo.launch(debug=False,share=False)
|
|
|
|
| 22 |
import requests
|
| 23 |
import tempfile
|
| 24 |
|
| 25 |
+
|
| 26 |
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
| 27 |
|
| 28 |
# create the length function
|
|
|
|
| 150 |
arcname = os.path.relpath(file_path, session_id)
|
| 151 |
zipObj.write(file_path, arcname)
|
| 152 |
|
|
|
|
| 153 |
## Search files functions ##
|
| 154 |
|
| 155 |
def search_docs(topic, max_references):
|
| 156 |
+
print(f"SEARCH PDF : {topic}")
|
| 157 |
doc_list = []
|
| 158 |
with DDGS() as ddgs:
|
| 159 |
i=0
|
| 160 |
for r in ddgs.text('{} filetype:pdf'.format(topic), region='wt-wt', safesearch='On', timelimit='n'):
|
| 161 |
+
#doc_list.append(str(r))
|
| 162 |
if i>=max_references:
|
| 163 |
break
|
| 164 |
doc_list.append("TITLE : " + r['title'] + " -- BODY : " + r['body'] + " -- URL : " + r['href'])
|
|
|
|
| 166 |
return doc_list
|
| 167 |
|
| 168 |
|
| 169 |
+
def store_files(references, ret_names=False):
|
| 170 |
url_list=[]
|
| 171 |
temp_files = []
|
| 172 |
for ref in references:
|
|
|
|
| 184 |
temp_file = tempfile.NamedTemporaryFile(delete=False,prefix=filename, suffix='.pdf')
|
| 185 |
temp_file.write(response.content)
|
| 186 |
temp_file.close()
|
| 187 |
+
if ret_names:
|
| 188 |
+
temp_files.append(temp_file.name)
|
| 189 |
+
else:
|
| 190 |
+
temp_files.append(temp_file)
|
| 191 |
|
| 192 |
return temp_files
|
| 193 |
+
|
|
|
|
| 194 |
## Summary functions ##
|
| 195 |
|
| 196 |
## Load each doc from the vector store
|
|
|
|
| 293 |
|
| 294 |
print("EMBEDDED, before embeddeding: ",session_id,len(db.index_to_docstore_id))
|
| 295 |
for file_id,file in enumerate(files):
|
| 296 |
+
print("ID : ", file_id, "FILE : ", file)
|
| 297 |
file_type = file.name.split('.')[-1].lower()
|
| 298 |
source = file.name.split('/')[-1]
|
| 299 |
print(f"current file: {source}")
|
|
|
|
| 334 |
progress(progress_step, desc = 'db zipped')
|
| 335 |
return f"{session_id}.zip",ui_session_id
|
| 336 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
|
| 339 |
def add_to_db(references,ui_session_id):
|
| 340 |
files = store_files(references)
|
| 341 |
return embed_files(files,ui_session_id)
|
| 342 |
+
|
| 343 |
+
def export_files(references):
|
| 344 |
+
files = store_files(references, ret_names=True)
|
| 345 |
+
#paths = [file.name for file in files]
|
| 346 |
+
return files
|
| 347 |
|
| 348 |
|
| 349 |
+
def display_docs(docs):
|
| 350 |
+
output_str = ''
|
| 351 |
+
for i, doc in enumerate(docs):
|
| 352 |
+
source = doc.metadata['source'].split('/')[-1]
|
| 353 |
+
output_str += f"Ref: {i+1}\n{repr(doc.page_content)}\nSource: {source}\n\n"
|
| 354 |
+
return output_str
|
| 355 |
+
|
| 356 |
def ask_gpt(query, apikey,history,ui_session_id):
|
| 357 |
session_id = f"PDFAISS-{ui_session_id}"
|
| 358 |
try:
|
|
|
|
| 378 |
gr.Markdown("Upload your documents and question them.")
|
| 379 |
with gr.Accordion("Open to enter your API key", open=False):
|
| 380 |
apikey_input = gr.Textbox(placeholder="Type here your OpenAI API key to use Summarization and Q&A", label="OpenAI API Key",type='password')
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
with gr.Tab("Upload PDF & TXT"):
|
| 385 |
with gr.Accordion("Get files from the web", open=False):
|
| 386 |
with gr.Column():
|
| 387 |
topic_input = gr.Textbox(placeholder="Type your research", label="Research")
|
|
|
|
| 392 |
dd_documents.style(container=True)
|
| 393 |
with gr.Row():
|
| 394 |
btn_dl = gr.Button("Add these files to the Database")
|
| 395 |
+
btn_export = gr.Button("Export selected files ⬇⬇")
|
| 396 |
+
|
| 397 |
tb_session_id = gr.Textbox(label='session id')
|
| 398 |
docs_input = gr.File(file_count="multiple", file_types=[".txt", ".pdf",".zip",".docx"])
|
| 399 |
db_output = gr.outputs.File(label="Download zipped database")
|
|
|
|
| 421 |
|
| 422 |
btn_search.click(search_docs, inputs=[topic_input, max_files], outputs=dd_documents)
|
| 423 |
btn_dl.click(add_to_db, inputs=[dd_documents,tb_session_id], outputs=[db_output,tb_session_id])
|
| 424 |
+
topic_input.submit(export_files, inputs=dd_documents, outputs=docs_input)
|
| 425 |
+
btn_export.click(export_files, inputs=dd_documents, outputs=docs_input)
|
| 426 |
btn_generate_db.click(embed_files, inputs=[docs_input,tb_session_id], outputs=[db_output,tb_session_id])
|
| 427 |
btn_reset_db.click(reset_database,inputs=[tb_session_id],outputs=[db_output])
|
| 428 |
btn_summary.click(summarize_docs, inputs=[apikey_input,tb_session_id], outputs=summary_output)
|
| 429 |
btn_askGPT.click(ask_gpt, inputs=[query_input,apikey_input,history,tb_session_id], outputs=[answer_output,sources,history])
|
| 430 |
+
#
|
| 431 |
demo.queue(concurrency_count=10)
|
| 432 |
demo.launch(debug=False,share=False)
|