import pandas as pd
import fitz  # PyMuPDF for PDF extraction
import spacy
from langchain.chains import ConversationalRetrievalChain
from langchain_community.chat_models import ChatOpenAI  # gpt-3.5-turbo is a chat model
from langchain_community.vectorstores import FAISS  # Updated import
from langchain_core.embeddings import Embeddings
import torch
from transformers import AutoTokenizer, AutoModel
import gradio as gr
# Load and preprocess PDF text
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
    return text
# Extract text from the PDF
pdf_text = extract_text_from_pdf('Getting Started with Ubuntu 16.04.pdf')  # Ensure this path is correct

# Convert the text to a DataFrame
df = pd.DataFrame({'text': [pdf_text]})
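
# Optional sketch (not part of the original flow): retrieval usually works better
# when the manual is split into overlapping chunks, so each FAISS entry is a focused
# passage rather than the whole book. This assumes langchain's
# RecursiveCharacterTextSplitter; the chunk sizes below are illustrative.
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# df = pd.DataFrame({'text': splitter.split_text(pdf_text)})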
# Load the custom embedding model, wrapped as a LangChain Embeddings implementation
# so FAISS can call it for both documents and queries
class CustomEmbeddingModel(Embeddings):
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_text(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            # Mean-pool the last hidden state into a single vector per text
            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings[0].numpy().tolist()

    def embed_documents(self, texts):
        return [self.embed_text(t) for t in texts]

    def embed_query(self, text):
        return self.embed_text(text)

embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
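
# Quick sanity check (illustrative; distilbert-base-uncased yields 768-dim vectors):
# vec = embedding_model.embed_query("ubuntu")
# assert len(vec) == 768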
# Load spaCy model for preprocessing
# Note: spaCy errors on texts longer than nlp.max_length (1,000,000 chars by
# default); raise it or chunk the text for very large PDFs
nlp = spacy.load("en_core_web_sm")  # Ensure the model is installed: python -m spacy download en_core_web_sm

def preprocess_text(text):
    doc = nlp(text)
    # Filter stop words via spaCy's built-in flag (no NLTK download required)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)
# Apply preprocessing and embedding
df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(embedding_model.embed_text)
# Create FAISS vector store from the precomputed embeddings
# from_embeddings takes (text, embedding) pairs plus an Embeddings object for queries
documents = df['text'].tolist()
embeddings = df['text_embeddings'].tolist()
vector_store = FAISS.from_embeddings(list(zip(documents, embeddings)), embedding_model)
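
# Optional sketch: persist the index so it isn't rebuilt on every restart.
# vector_store.save_local("faiss_index")
# vector_store = FAISS.load_local("faiss_index", embedding_model)
# (recent langchain_community versions also require allow_dangerous_deserialization=True)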
# Create LangChain model and chain
llm_model = ChatOpenAI(model_name="gpt-3.5-turbo")  # You can replace this with a different LLM if desired
retriever = vector_store.as_retriever()
chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
# Function to generate a response
def generate_response(prompt):
    # ConversationalRetrievalChain expects a question plus chat history and returns an "answer" key
    result = chain({"question": prompt, "chat_history": []})
    return result["answer"]
# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu..."),
    outputs=gr.Textbox(label="Response"),
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual."
)

if __name__ == "__main__":
    iface.launch()