Spaces:
Sleeping
Sleeping
| from .asg_loader import DocumentLoading | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
class TextSplitting:
    """Load a PDF through ``DocumentLoading`` and split its text into chunks.

    Both public methods hand the loader output to a
    ``RecursiveCharacterTextSplitter`` and return the result of its
    ``create_documents`` call, or an empty list when the loader returned
    ``None``.
    """

    @staticmethod
    def _split(text, chunk_size, chunk_overlap):
        """Split *text* into chunks whose size is measured with ``len``.

        Args:
            text: Loader output to split.
                NOTE(review): assumed to be a single string -- confirm
                against ``asg_loader``.
            chunk_size: Maximum chunk length passed to the splitter.
            chunk_overlap: Overlap between consecutive chunks.

        Returns:
            The value returned by ``create_documents``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        # create_documents takes a list of texts; the loader output is
        # passed as its single entry.
        return text_splitter.create_documents([text])

    def mineru_recursive_splitter(self, file_path, survey_id, mode):
        """Load a PDF with ``DocumentLoading.load_pdf`` and split it.

        Uses a chunk size of 400 with an overlap of 30.

        Args:
            file_path: Forwarded unchanged to ``load_pdf``.
            survey_id: Forwarded unchanged to ``load_pdf``.
            mode: Forwarded unchanged to ``load_pdf``.

        Returns:
            The chunks produced by the splitter, or ``[]`` when ``load_pdf``
            returned ``None``.
        """
        docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
        if docs is None:
            # If loading failed, return an empty list; the caller is
            # expected to handle that case.
            return []
        return self._split(docs, chunk_size=400, chunk_overlap=30)

    def pypdf_recursive_splitter(self, file_path, survey_id):
        """Load a PDF with ``DocumentLoading.pypdf_loader`` and split it.

        Uses a chunk size of 300 with an overlap of 20.

        Args:
            file_path: Forwarded unchanged to ``pypdf_loader``.
            survey_id: Forwarded unchanged to ``pypdf_loader``.

        Returns:
            The chunks produced by the splitter, or ``[]`` when
            ``pypdf_loader`` returned ``None``.
        """
        docs = DocumentLoading().pypdf_loader(file_path, survey_id)
        if docs is None:
            # Same failure contract as mineru_recursive_splitter: without
            # this guard, None would be passed on to the splitter as text.
            return []
        return self._split(docs, chunk_size=300, chunk_overlap=20)