Spaces:

Didier
/

Docs_QA_ColBERT_DSPy

Runtime error

App Files Files Community

Docs_QA_ColBERT_DSPy / app.py

Didier

Update app.py

91ff6bf verified 11 months ago

raw

history blame

5.35 kB

	""" app.py

	Question / answer over a collection of PDF documents using late interaction
	ColBERT model for retrieval and DSPy+Mistral for answer generation.

	:author: Didier Guillevic
	:date: 2024-12-22
	"""

	import gradio as gr

	import logging
	logger = logging.getLogger(__name__)
	logging.basicConfig(level=logging.INFO)

	import os
	import pdf_utils # utilities for pdf processing
	import colbert_utils # utilities to build a ColBERT retrieval model
	import dspy_utils # utilities for building a DSPy based retrieval generation model

	from tqdm.notebook import tqdm
	import warnings
	warnings.filterwarnings('ignore')


	# Module-level handle on the RAG model shared by the UI callbacks:
	# assigned by build_rag_model() and read by generate_response().
	# Stays None until a model has been built.
	dspy_rag_model = None

	def build_rag_model(files: list[str]) -> str:
	    """Build a retrieval augmented generation (RAG) model from PDF files.

	    Extracts the metadata and text of every given PDF file, builds a ColBERT
	    retrieval model over the extracted texts, wraps it into a DSPy based RAG
	    model and stores that model in the module-level ``dspy_rag_model``.

	    :param files: paths of the PDF files to index. ``None`` or an empty list
	        (nothing uploaded) is reported through the status message instead
	        of raising.
	    :returns: a human readable status message for the UI.
	    """
	    global dspy_rag_model

	    # Nothing was uploaded: bail out before iterating. The previously built
	    # model (if any) is left untouched.
	    if not files:
	        return "No files provided. Please upload at least one PDF file."

	    # Get the text from the pdf files (files without any text are skipped)
	    documents = []
	    metadatas = []
	    for pdf_file in files:
	        logger.info("Processing %s", pdf_file)
	        metadata = pdf_utils.get_metadata_info(pdf_file)
	        text = pdf_utils.get_text_from_pdf(pdf_file)
	        if text:
	            documents.append(text)
	            metadatas.append(metadata)

	    # Do not attempt to build an index over an empty collection.
	    if not documents:
	        return "No text could be extracted from the given files."

	    # Build the ColBERT retrieval model
	    colbert_base_model = 'antoinelouis/colbert-xm' # multilingual model
	    colbert_index_name = 'OECD_HNW' # for web app, generate unique name with uuid.uuid4()
	    retrieval_model = colbert_utils.build_colbert_model(
	        documents,
	        metadatas,
	        pretrained_model=colbert_base_model,
	        index_name=colbert_index_name
	    )

	    # Instantiate the DSPy based RAG model
	    dspy_rag_model = dspy_utils.DSPyRagModel(retrieval_model)

	    return "Done building RAG model."


	def generate_response(question: str) -> tuple:
	    """Generate a response to a given question using the RAG model.

	    :param question: the question to answer.
	    :returns: a 3-tuple ``(responses, references, snippets)`` as produced by
	        the RAG model. When no model has been built yet, the first element
	        is an explanatory message and the other two are empty lists.
	    """
	    # The module-level model is only read here, so no ``global`` statement
	    # is required.
	    if dspy_rag_model is None:
	        return "RAG model not built. Please build the model first.", [], []

	    # Generate response; ``k`` and ``method`` are forwarded unchanged to the
	    # DSPy based model.
	    responses, references, snippets = dspy_rag_model.generate_response(
	        question=question, k=5, method='chain_of_thought')

	    return responses, references, snippets


	with gr.Blocks() as demo:
	    # Page header: title, performance caveats and a short usage hint
	    gr.Markdown("""
	# Retrieval (ColBERT) + Generation (DSPy & Mistral)
	- Note:
	- building the retrieval model will be slow on free CPU (expect 5+ minutes).
	- first question & answer will be slow (2 minutes for model loading). Subsequent question approx. 10 seconds.
	- Usage: upload a few PDF files to index. Build the model. Ask questions.
	""")

	    # First row: the files to index next to the status of the build
	    with gr.Row():
	        upload_files = gr.File(
	            label="Upload PDF files to index",
	            file_count="multiple",
	            value=["OECD_Dividend_tax_fraud_2023-en.pdf"],
	            scale=5,
	        )
	        build_status = gr.Textbox(
	            label="Build status",
	            placeholder="",
	            scale=2,
	        )

	    # Triggers the indexing of the uploaded files
	    build_button = gr.Button(
	        "Build retrieval generation model",
	        variant="primary",
	    )

	    # Question entered by the user and the generated answer
	    question = gr.Textbox(
	        label="Question",
	        placeholder="What is dividend stripping?",
	    )
	    response = gr.Textbox(label="Response", placeholder="")

	    # Supporting material for the answer, collapsed by default
	    with gr.Accordion("References & snippets", open=False):
	        references = gr.HTML(label="References")
	        snippets = gr.HTML(label="Snippets")

	    # Triggers the generation of the answer
	    response_button = gr.Button("Submit", variant="primary")

	    # Example questions matching the PDF file provided by default;
	    # each example row holds a single value (the question).
	    with gr.Accordion("Sample questions", open=False):
	        gr.Examples(
	            [
	                [sample]
	                for sample in (
	                    "What is dividend stripping?",
	                    "What are the most common types of dividend stripping schemes?",
	                    "How do authorities detect dividend stripping?",
	                    "What are some indicators of potential dividend stripping?",
	                    "What are the consequences of dividend stripping?",
	                    "How can countries combat dividend stripping?",
	                    "What is the role of professional enablers in dividend stripping?",
	                    "How can countries address the role of professional enablers in dividend stripping?",
	                )
	            ],
	            inputs=[question],
	            outputs=[response, references, snippets],
	            fn=generate_response,
	            cache_examples=False,
	            label="Sample questions",
	        )

	    # Short description of what the app does and how to use it
	    with gr.Accordion("Documentation", open=False):
	        gr.Markdown("""
	- What
	- Retrieval augmented generation (RAG) model based on ColBERT and DSPy.
	- Retrieval base model: 'antoinelouis/colbert-xm' (multilingual model)
	- Generation framework: DSPy and Mistral.
	- How
	- Upload PDF files to index.
	- Build the retrieval generation model (might take a few minutes)
	- Ask a question about the content of those uploaded documents.
	""")

	    # Wire the two buttons to their callbacks
	    build_button.click(
	        fn=build_rag_model,
	        inputs=[upload_files],
	        outputs=[build_status],
	    )
	    response_button.click(
	        fn=generate_response,
	        inputs=[question],
	        outputs=[response, references, snippets],
	    )


	# Start the web app (the API page is not exposed)
	demo.launch(show_api=False)