Spaces:
Paused
Paused
| from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, UnstructuredURLLoader | |
| from langchain_community.vectorstores import Qdrant | |
| import os | |
| import requests | |
def process_file(file):
    """Write an uploaded file to disk and load it into documents.

    Args:
        file: Upload object exposing ``path`` (a string) and ``content``
            (written in binary mode, so it must be bytes-like).

    Returns:
        A new list with the documents returned by the loader:
        ``PyMuPDFLoader`` when ``file.path`` ends with ``.pdf``,
        ``TextLoader`` otherwise.
    """
    # save the file temporarily
    temp_file = "./" + file.path

    # The handle gets its own name: rebinding `file` here would hide the
    # upload object, so `.content` and `.path` would be looked up on the
    # open file handle instead and fail.
    with open(temp_file, "wb") as handle:
        handle.write(file.content)

    if file.path.endswith(".pdf"):
        loader = PyMuPDFLoader(temp_file)
    else:
        loader = TextLoader(temp_file)

    # Copy into a fresh list so the caller owns the returned container.
    return list(loader.load())
def load_documents_from_url(url):
    """Load documents from a URL on a best-effort basis.

    URLs ending in ``.pdf`` are handed to ``PyMuPDFLoader``. URLs containing
    ``youtube`` and resources whose ``Content-Type`` header mentions
    ``video`` are skipped. Everything else is loaded with
    ``UnstructuredURLLoader``.

    Args:
        url: Address of the resource to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when
        the URL is skipped or any step fails. Failures are printed and never
        raised to the caller.
    """
    try:
        # Check if it's a PDF
        if url.endswith(".pdf"):
            try:
                return PyMuPDFLoader(url).load()
            except Exception as e:
                print(f"Error loading PDF from {url}: {e}")
                return None

        # Ignore video content (flagged for now). This check only needs the
        # URL string, so it runs before any network request is made.
        if 'youtube' in url:
            return None

        # Fetch the headers and check for video pages
        try:
            # HEAD does not follow redirects unless asked to; follow them so
            # the Content-Type describes the final resource rather than an
            # intermediate redirect response.
            response = requests.head(url, timeout=10, allow_redirects=True)
            content_type = response.headers.get('Content-Type', '')
        except Exception as e:
            print(f"Error fetching headers from {url}: {e}")
            return None

        if 'video' in content_type:
            return None

        # Otherwise, treat it as an HTML page
        try:
            return UnstructuredURLLoader([url]).load()
        except Exception as e:
            print(f"Error loading HTML from {url}: {e}")
            return None
    except Exception as e:
        # Last-resort guard (e.g. `url` is not a string) so callers never
        # see an exception from this function.
        print(f"General error loading from {url}: {e}")
        return None
def add_to_qdrant(documents, embeddings, qdrant_client, collection_name):
    """Send documents to a Qdrant collection via ``Qdrant.from_documents``.

    Args:
        documents: Documents forwarded to ``Qdrant.from_documents``.
        embeddings: Embeddings object forwarded alongside the documents.
        qdrant_client: Object whose ``url`` attribute gives the server address.
        collection_name: Name of the target collection.

    Returns:
        None. The object returned by ``Qdrant.from_documents`` is discarded.
    """
    build_store = Qdrant.from_documents
    connection_options = {
        "url": qdrant_client.url,
        "prefer_grpc": True,
        "collection_name": collection_name,
    }
    build_store(documents, embeddings, **connection_options)