import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Use a fast, open-access instruction-tuned model (Flan-T5) for text2text generation
llm = pipeline("text2text-generation", model="google/flan-t5-base")
def refine_from_url(url, instruction):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # fail early on HTTP errors instead of refining an error page
        soup = BeautifulSoup(response.text, "html.parser")
        raw_text = soup.get_text(separator="\n")
        prompt = f"""
You are a data refinement agent. Given the following webpage content, do the following:
1. Extract clear headings and structure the content.
2. Generate 5 question-answer pairs based on the content.
3. Format everything in JSONL style for GPT-2 training.
Instruction: {instruction}
Content:
{raw_text[:3000]}
"""
        output = llm(prompt, max_new_tokens=512)[0]["generated_text"]
        return output
    except Exception as e:
        return f"Error: {str(e)}"
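
# A minimal post-processing sketch (an assumption, not part of the original app): the prompt
# asks the model for JSONL-style output, but generation is free-form text, so this helper
# keeps only the lines that parse as JSON objects before they are used as GPT-2 training data.
def extract_jsonl_records(output_text):
    import json  # local import keeps this optional helper self-contained
    records = []
    for line in output_text.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip lines the model did not format as valid JSON
        if isinstance(record, dict):  # keep only object-style records
            records.append(record)
    return records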
demo = gr.Interface(
    fn=refine_from_url,
    inputs=[
        gr.Textbox(label="Enter Webpage URL"),
        gr.Textbox(label="Instruction", placeholder="e.g. Clean and format this for GPT-2 training"),
    ],
    outputs=gr.Textbox(label="Refined JSONL Output", lines=30, max_lines=60),
    title="Link-Based Data Refiner + Q&A Generator",
    description="Paste any webpage link. The app crawls the page, refines the content, and generates question-answer pairs using Flan-T5.",
)
if __name__ == "__main__":
    demo.launch()
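
# Example of exercising the refiner directly, without the Gradio UI (a sketch; the URL
# and instruction below are arbitrary examples, and output quality depends on the model):
#
#   text = refine_from_url("https://example.com", "Clean and format this for GPT-2 training")
#   for record in extract_jsonl_records(text):
#       print(record)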