Spaces:
Paused
Paused
| from dataclasses import dataclass | |
| import re | |
| from typing import Iterator, List | |
| from langchain.docstore.document import Document | |
| from langchain.document_loaders.base import BaseLoader | |
| from bs4 import BeautifulSoup, Tag, ResultSet | |
| import requests | |
# Pattern for the heading tag names "h2" and "h3". NVDAUserGuideLoader.lazy_load
# passes it to soup.find_all() to locate the headers that start each section.
RE_HEADERS = re.compile(r"h[23]")
@dataclass
class Content:
    """One section of the parsed page: a header and the elements after it.

    ``_get_child_content`` builds instances positionally as
    ``Content(name, title, text, body)``. That call needs the generated
    ``__init__`` from ``@dataclass``; a plain class holding only
    annotations would raise ``TypeError`` on it.
    """

    # Value returned by _get_anchor_name() for the section header
    # ("" when no anchor name was found).
    name: str
    # Text of the header element itself.
    title: str
    # Text extracted from ``body`` by _get_bodys_text().
    text: str
    # The header followed by the elements collected for this section.
    body: list[Tag]
| def _get_anchor_name(header: Tag) -> str: | |
| for tag in header.previous_elements: | |
| if tag.name == "a": | |
| return tag.attrs.get("name", "") | |
| return "" | |
| def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]: | |
| has_anchor = False | |
| for tag in reversed(body): | |
| if not has_anchor: | |
| if tag.name == "a": | |
| has_anchor = True | |
| continue | |
| else: | |
| yield tag | |
def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Return what ``_reversed_remove_last_anchor`` yields, back in document order.

    The helper produces its elements back to front; they are collected
    into a list and reversed again so callers see the original ordering.
    """
    back_to_front = [*_reversed_remove_last_anchor(body)]
    return reversed(back_to_front)
| def _get_bodys_text(body: list[Tag]) -> str: | |
| text = "" | |
| for tag in body: | |
| text += tag.get_text() | |
| return text | |
def _get_child_content(header: Tag) -> Content:
    """Build the ``Content`` for the section that starts at *header*.

    The section body is *header* followed by the elements from
    ``header.next_elements`` up to, but not including, the next ``h2`` or
    ``h3``. The body is then passed through ``_remove_last_anchor`` and
    its text is extracted with ``_get_bodys_text``.
    """
    following = iter(header.next_elements)
    # The first element after the header is always discarded -- presumably
    # it is the header's own text node, which header.get_text() covers.
    next(following, None)

    section = [header]
    for element in following:
        if element.name in ("h2", "h3"):
            break
        section.append(element)

    trimmed_section = list(_remove_last_anchor(section))
    return Content(
        _get_anchor_name(header),
        header.get_text(),
        _get_bodys_text(trimmed_section),
        trimmed_section,
    )
def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
    """Lazily yield one ``Content`` per header, in the order given."""
    yield from (_get_child_content(section_header) for section_header in headers)
class NVDAUserGuideLoader(BaseLoader):
    """Load an HTML user guide as one ``Document`` per h2/h3 section.

    The page at ``url`` is downloaded and parsed, every ``h2``/``h3``
    header is located with ``RE_HEADERS``, and each header's section is
    turned into a ``Document`` whose metadata carries the category, the
    section's anchor name, a deep link to it and its title.

    Args:
        url: Address of the HTML page to load.
        category: Label stored under ``"category"`` in every document's
            metadata.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.
    """

    def __init__(self, url: str, category: str, timeout: float = 30.0) -> None:
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch(self) -> BeautifulSoup:
        """Download ``self.url`` and return it parsed with the lxml parser.

        Raises:
            requests.HTTPError: If the server answers with an error status.
                Failing here keeps an error page from being parsed as if
                it were the guide.
            requests.Timeout: If the server does not respond within
                ``self.timeout`` seconds.
        """
        res = requests.get(self.url, timeout=self.timeout)
        res.raise_for_status()
        return BeautifulSoup(res.content, 'lxml')

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per h2/h3 section of the fetched page."""
        soup = self.fetch()
        headers = soup.find_all(RE_HEADERS)
        for content in get_contents(headers):
            name = content.name
            metadata = {
                "category": self.category,
                "source": name,
                # Deep link to the section via its anchor name.
                "url": f"{self.url}#{name}",
                "title": content.title,
            }
            yield Document(page_content=content.text, metadata=metadata)

    def load(self) -> List[Document]:
        """Return every document produced by ``lazy_load`` as a list."""
        return list(self.lazy_load())
if __name__ == "__main__":
    # Manual smoke run: load the English NVDA user guide and print the
    # resulting documents.
    guide_url = "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
    documents = NVDAUserGuideLoader(guide_url, "en-nvda-user-guide").load()
    print(documents)