ghosthets committed
Commit
bbb8747
·
verified ·
1 Parent(s): a9aa739

Update app.py

Files changed (1)
  1. app.py +27 -22
app.py CHANGED
@@ -3,42 +3,47 @@ import requests
 from bs4 import BeautifulSoup
 from transformers import pipeline
 
-# Use faster, open-access model
-llm = pipeline("text2text-generation", model="EleutherAI/gpt-neo-2.7B")
+# Load fast, open-access model
+llm = pipeline("text2text-generation", model="google/flan-t5-base")
 
-def refine_from_url(url, instruction):
-    try:
-        response = requests.get(url, timeout=10)
-        soup = BeautifulSoup(response.text, "html.parser")
-        raw_text = soup.get_text(separator="\n")
+def extract_text(url):
+    response = requests.get(url, timeout=10)
+    soup = BeautifulSoup(response.text, "html.parser")
+    return soup.get_text(separator="\n")
 
-        prompt = f"""
-You are a data refinement agent. Given the following webpage content, do the following:
-1. Extract clear headings and structure the content.
-2. Generate question-answer pairs based on the content.
-3. Format everything in JSONL style for GPT2 training.
+def chunk_text(text, chunk_size=3000):
+    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
-Instruction: {instruction}
+def refine_chunk(chunk, instruction):
+    prompt = f"""
+{instruction}
 
 Content:
-{raw_text[:3000]}
+{chunk}
 """
+    result = llm(prompt, max_new_tokens=512)[0]["generated_text"]
+    return result
 
-        output = llm(prompt, max_new_tokens=512)[0]["generated_text"]
-        return output
+def streamed_pipeline(url, instruction):
+    try:
+        raw_text = extract_text(url)
+        chunks = chunk_text(raw_text)
+        for i, chunk in enumerate(chunks):
+            result = refine_chunk(chunk, instruction)
+            yield f"### Section {i+1}\n{result}\n\n"
     except Exception as e:
-        return f"Error: {str(e)}"
+        yield f"Error: {str(e)}"
 
 demo = gr.Interface(
-    fn=refine_from_url,
+    fn=streamed_pipeline,
     inputs=[
         gr.Textbox(label="🔗 Enter Webpage URL"),
         gr.Textbox(label="🧠 Instruction", placeholder="e.g. Clean and format this for GPT2 training")
     ],
-    outputs=gr.Textbox(label="📄 Refined JSONL Output", lines=30, max_lines=60),
-    title="🧠 Link-Based Data Refiner + Q&A Generator",
-    description="Paste any webpage link. This app will crawl, refine, and generate question-answer pairs using Flan-T5."
+    outputs=gr.Textbox(label="📄 Streaming Output", lines=40, max_lines=80, interactive=False),
+    title="🧠 Real-Time Chunked Refiner",
+    description="Crawls full webpage, breaks into chunks, and streams refined output section-by-section using Flan-T5."
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
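For anyone who wants to exercise the new chunk-and-stream flow outside the Gradio UI, the following is a minimal sketch of the same loop; the target URL, the `Summarize:` prompt wording, and the smaller `max_new_tokens` value are illustrative assumptions, not part of the commit.

```python
# Minimal sketch of the committed chunk-and-stream flow, run without Gradio.
# The URL and max_new_tokens=64 below are placeholder assumptions.
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

llm = pipeline("text2text-generation", model="google/flan-t5-base")

def chunk_text(text, chunk_size=3000):
    # Fixed-width character windows, exactly as in the committed app.py.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

html = requests.get("https://example.com", timeout=10).text
raw_text = BeautifulSoup(html, "html.parser").get_text(separator="\n")

for i, chunk in enumerate(chunk_text(raw_text)):
    # Note: flan-t5-base was trained with 512-token inputs, so a
    # 3000-character chunk may exceed what the model reads reliably.
    out = llm(f"Summarize:\n\nContent:\n{chunk}", max_new_tokens=64)[0]["generated_text"]
    print(f"### Section {i + 1}\n{out}\n")
```

Since `chunk_text` splits on raw character counts, chunk boundaries can fall mid-sentence; splitting on paragraph breaks would keep each prompt more coherent.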
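A related usage note: `streamed_pipeline` is a generator, and in Gradio 3.x generator functions only stream partial outputs to the UI when the queue is enabled (Gradio 4 turns the queue on by default). If yielded sections do not appear incrementally, a hypothetical variant of the launch block is the usual fix:

```python
# Hypothetical launch variant: Gradio 3.x required an explicit queue
# for generator (streaming) functions; Gradio 4 queues by default,
# making this effectively a no-op there.
if __name__ == "__main__":
    demo.queue().launch()
```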