Spaces:
Runtime error
Runtime error
gamingflexer
committed on
Commit
·
59a1246
1
Parent(s):
424c175
Refactor
Browse files
- src/scrapper/extractor.py +2 -1
- src/scrapper/main.py +4 -1
src/scrapper/extractor.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from langchain_community.chat_models import ChatOpenAI
|
| 2 |
-
from langchain import PromptTemplate
|
|
|
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
import tiktoken
|
| 5 |
from typing import Union
|
|
|
|
| 1 |
from langchain_community.chat_models import ChatOpenAI
|
| 2 |
+
from langchain.prompts import PromptTemplate
|
| 3 |
+
from langchain.chains import LLMChain
|
| 4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
import tiktoken
|
| 6 |
from typing import Union
|
src/scrapper/main.py
CHANGED
|
@@ -3,6 +3,7 @@ from scrapper.arxiv import get_paper_id,Arxiv
|
|
| 3 |
from scrapper.extractor import get_google_scrape,init_extractor
|
| 4 |
from tqdm import tqdm
|
| 5 |
import os
|
|
|
|
| 6 |
|
| 7 |
class ArxivPaper:
|
| 8 |
|
|
@@ -33,6 +34,7 @@ class ArxivPaper:
|
|
| 33 |
|
| 34 |
def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
|
| 35 |
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
|
|
|
|
| 36 |
for i in tqdm(paper_ids):
|
| 37 |
paper = Arxiv(i)
|
| 38 |
paper.load()
|
|
@@ -41,4 +43,5 @@ class ArxivPaper:
|
|
| 41 |
extractor=self.extractor,
|
| 42 |
text_splitter=self.text_splitter,)
|
| 43 |
paper.chunker()
|
| 44 |
-
paper.save_chunks(include_metadata=True, path=path_author)
|
|
|
|
|
|
| 3 |
from scrapper.extractor import get_google_scrape,init_extractor
|
| 4 |
from tqdm import tqdm
|
| 5 |
import os
|
| 6 |
+
from config import OPENAI_API_KEY
|
| 7 |
|
| 8 |
class ArxivPaper:
|
| 9 |
|
|
|
|
| 34 |
|
| 35 |
def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
|
| 36 |
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
|
| 37 |
+
data = {}
|
| 38 |
for i in tqdm(paper_ids):
|
| 39 |
paper = Arxiv(i)
|
| 40 |
paper.load()
|
|
|
|
| 43 |
extractor=self.extractor,
|
| 44 |
text_splitter=self.text_splitter,)
|
| 45 |
paper.chunker()
|
| 46 |
+
paper.save_chunks(include_metadata=True, path=path_author)
|
| 47 |
+
|