from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Qdrant

# from qdrant_client import QdrantClient
from nvda_ug_loader import NVDAUserGuideLoader
from config import DB_CONFIG, DB_E5_CONFIG

CHUNK_SIZE = 500  # characters per chunk for the text splitter


def _remove_prefix_path(p: str):
    # Strip the local download prefix so only the page's relative path remains.
    prefix = "data/rtdocs/nvdajp-book.readthedocs.io/"
    return p.removeprefix(prefix)


def get_documents(path: str):
    # Load the downloaded ReadTheDocs pages and attach category, source and URL metadata.
    loader = ReadTheDocsLoader(path, encoding="utf-8")
    docs = loader.load()
    base_url = "https://nvdajp-book.readthedocs.io/"
    category = "ja-book"
    for doc in docs:
        org_metadata = doc.metadata
        source = _remove_prefix_path(org_metadata["source"])
        add_meta = {
            "category": category,
            "source": source,
            "url": f"{base_url}{source}",
        }
        doc.metadata = org_metadata | add_meta
        yield doc


def get_text_chunk(docs):
    # Split the documents into CHUNK_SIZE-character chunks with no overlap.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    texts = text_splitter.split_documents(docs)
    return texts


def store(texts, mname):
    # Embed the chunks with the selected model and upload them to the matching Qdrant collection.
    if mname == "openai":
        embeddings = OpenAIEmbeddings()
        db_url, db_api_key, db_collection_name = DB_CONFIG
    elif mname == "e5":
        model_name = "intfloat/multilingual-e5-large"
        model_kwargs = {"device": "cuda"}
        encode_kwargs = {"normalize_embeddings": False}
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
        db_url, db_api_key, db_collection_name = DB_E5_CONFIG
    else:
        raise ValueError("Invalid mname")
    _ = Qdrant.from_documents(
        texts,
        embeddings,
        url=db_url,
        api_key=db_api_key,
        collection_name=db_collection_name,
    )


def rtd_main(path: str, mname: str):
    # Ingest a locally downloaded ReadTheDocs tree.
    docs = get_documents(path)
    texts = get_text_chunk(docs)
    store(texts, mname)


def nul_main(url: str, mname: str):
    # Ingest an NVDA user guide page; URLs on www.nvda.jp carry the Japanese guide.
    if "www.nvda.jp" in url:
        category = "ja-nvda-user-guide"
    else:
        category = "en-nvda-user-guide"
    loader = NVDAUserGuideLoader(url, category)
    docs = loader.load()
    texts = get_text_chunk(docs)
    store(texts, mname)


if __name__ == "__main__":
    """
    $ python store.py rtd "data/rtdocs/nvdajp-book.readthedocs.io/ja/latest" openai
    $ python store.py nul "https://www.nvaccess.org/files/nvda/documentation/userGuide.html" e5
    $ python store.py nul "https://www.nvda.jp/nvda2023.1jp/ja/userGuide.html" e5
    """
    import sys

    args = sys.argv
    if len(args) != 4:
        print("Usage: python store.py <rtd|nul> <path_or_url> <openai|e5>")
    else:
        type_ = args[1]
        path = args[2]
        mname = args[3]
        if type_ == "rtd":
            rtd_main(path, mname)
        elif type_ == "nul":
            nul_main(path, mname)
        else:
            print("Unknown type: expected 'rtd' or 'nul'")
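
For reference, a minimal sketch of the config.py this script imports. It is an assumption based only on how store() unpacks DB_CONFIG and DB_E5_CONFIG into (url, api_key, collection_name); the environment variable names and collection names below are placeholders, not the project's actual settings.

# config.py (hypothetical sketch; not the project's real configuration)
import os

_QDRANT_URL = os.environ.get("QDRANT_URL", "https://example-cluster.qdrant.example")  # placeholder
_QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")  # placeholder

# Each entry is unpacked in store() as (url, api_key, collection_name).
DB_CONFIG = (_QDRANT_URL, _QDRANT_API_KEY, "nvda-docs-openai")  # placeholder collection name
DB_E5_CONFIG = (_QDRANT_URL, _QDRANT_API_KEY, "nvda-docs-e5")  # placeholder collection name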