Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,42 +3,47 @@ import requests
|
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
from transformers import pipeline
|
| 5 |
|
| 6 |
-
#
|
| 7 |
-
llm = pipeline("text2text-generation", model="
|
| 8 |
|
| 9 |
-
def
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
raw_text = soup.get_text(separator="\n")
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
1. Extract clear headings and structure the content.
|
| 18 |
-
2. Generate question-answer pairs based on the content.
|
| 19 |
-
3. Format everything in JSONL style for GPT2 training.
|
| 20 |
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
|
| 23 |
Content:
|
| 24 |
-
{
|
| 25 |
"""
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
except Exception as e:
|
| 30 |
-
|
| 31 |
|
| 32 |
demo = gr.Interface(
|
| 33 |
-
fn=
|
| 34 |
inputs=[
|
| 35 |
gr.Textbox(label="๐ Enter Webpage URL"),
|
| 36 |
gr.Textbox(label="๐ง Instruction", placeholder="e.g. Clean and format this for GPT2 training")
|
| 37 |
],
|
| 38 |
-
outputs=gr.Textbox(label="๐
|
| 39 |
-
title="๐ง
|
| 40 |
-
description="
|
| 41 |
)
|
| 42 |
|
| 43 |
if __name__ == "__main__":
|
| 44 |
-
demo.launch()
|
|
|
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
from transformers import pipeline
|
| 5 |
|
| 6 |
# Load fast, open-access model
# Module-level so the model is loaded once at startup, not per request.
llm = pipeline("text2text-generation", model="google/flan-t5-base")
|
| 8 |
|
| 9 |
def extract_text(url):
    """Fetch *url* and return its visible text, one DOM text node per line.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status (caller's error handler reports it).
    """
    response = requests.get(url, timeout=10)
    # Fail fast on 4xx/5xx instead of silently refining an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text(separator="\n")
|
|
|
|
| 13 |
|
| 14 |
def chunk_text(text, chunk_size=3000):
    """Split *text* into consecutive slices of at most *chunk_size* characters."""
    pieces = []
    offset = 0
    total = len(text)
    while offset < total:
        pieces.append(text[offset:offset + chunk_size])
        offset += chunk_size
    return pieces
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def refine_chunk(chunk, instruction):
    """Run the text2text model over one chunk, prefixed by the user instruction."""
    prompt = f"""
{instruction}

Content:
{chunk}
"""
    # Single generation call; take the first (and only) candidate's text.
    generations = llm(prompt, max_new_tokens=512)
    return generations[0]["generated_text"]
|
| 26 |
|
| 27 |
def streamed_pipeline(url, instruction):
    """Crawl *url*, chunk its text, and yield one refined markdown section per chunk.

    Generator so the Gradio UI can stream output as each chunk finishes.
    """
    try:
        sections = chunk_text(extract_text(url))
        for number, piece in enumerate(sections, start=1):
            refined = refine_chunk(piece, instruction)
            yield f"### Section {number}\n{refined}\n\n"
    except Exception as e:
        # UI boundary: surface the failure as streamed text instead of crashing.
        yield f"Error: {str(e)}"
|
| 36 |
|
| 37 |
# Gradio UI: URL + instruction inputs feeding the streaming generator above.
# NOTE(review): `gr` is not imported in the visible hunk (which starts at file
# line 3) — presumably `import gradio as gr` sits above; confirm.
# NOTE(review): the label/title strings below look encoding-mangled (likely
# emoji lost in transit) — verify against the original file before changing.
demo = gr.Interface(
    fn=streamed_pipeline,
    inputs=[
        gr.Textbox(label="๐ Enter Webpage URL"),
        gr.Textbox(label="๐ง Instruction", placeholder="e.g. Clean and format this for GPT2 training")
    ],
    outputs=gr.Textbox(label="๐ Streaming Output", lines=40, max_lines=80, interactive=False),
    title="๐ง Real-Time Chunked Refiner",
    description="Crawls full webpage, breaks into chunks, and streams refined output section-by-section using Flan-T5."
)
|
| 47 |
|
| 48 |
if __name__ == "__main__":
    # Start the Gradio server when run as a script (not on import).
    demo.launch()
|