"""Gradio app that fetches a webpage and asks an LLM to refine it into Q&A data.

The page is downloaded, reduced to its visible text, embedded in a prompt
together with the user's instruction, and passed to a text2text-generation
pipeline. The model output is shown verbatim in the UI.
"""

from urllib.parse import urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Use faster, open-access model.
# NOTE: the "text2text-generation" task requires an encoder-decoder (seq2seq)
# checkpoint. The previously configured GPT-Neo checkpoint is a decoder-only
# causal LM and does not belong to that task; Flan-T5 does, and it is the
# model the UI description advertises.
MODEL_NAME = "google/flan-t5-base"

REQUEST_TIMEOUT_SECONDS = 10  # upper bound on how long a fetch may block
MAX_CONTENT_CHARS = 3000  # page text budget embedded in the prompt
MAX_NEW_TOKENS = 512  # generation length cap

llm = pipeline("text2text-generation", model=MODEL_NAME)


def _extract_visible_text(html):
    """Return the human-visible text of *html*, one non-empty line per row."""
    soup = BeautifulSoup(html, "html.parser")
    # Script/style payloads are not page content and would eat the budget.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    stripped = (line.strip() for line in soup.get_text(separator="\n").splitlines())
    return "\n".join(line for line in stripped if line)


def _build_prompt(instruction, content):
    """Assemble the model prompt from the user instruction and page text."""
    return f"""
You are a data refinement agent. Given the following webpage content, do the following:
1. Extract clear headings and structure the content.
2. Generate 5 question-answer pairs based on the content.
3. Format everything in JSONL style for GPT2 training.

Instruction: {instruction}

Content:
{content[:MAX_CONTENT_CHARS]}
"""


def refine_from_url(url, instruction):
    """Fetch *url*, extract its text, and return the model's refined output.

    Args:
        url: Address of the webpage to fetch (http or https).
        instruction: Free-form extra instruction inserted into the prompt.

    Returns:
        The generated text on success, or a string starting with ``"Error: "``
        describing what went wrong. Errors are returned rather than raised
        because the result is rendered directly in a Gradio textbox.
    """
    url = (url or "").strip()
    instruction = (instruction or "").strip()

    if not url:
        return "Error: Please enter a URL."

    # SECURITY NOTE: this fetches arbitrary user-supplied URLs from the host
    # running the app. Only http(s) is accepted here; deployments exposed to
    # untrusted users should additionally block internal/private addresses.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return "Error: URL must start with http:// or https://"

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Without this, 4xx/5xx error pages would be refined as if they
        # were the requested content.
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error: {e}"

    try:
        raw_text = _extract_visible_text(response.text)
        if not raw_text:
            return "Error: No readable text found at the given URL."

        prompt = _build_prompt(instruction, raw_text)
        # truncation=True keeps over-long prompts within the model's input
        # limit instead of overflowing it.
        result = llm(prompt, max_new_tokens=MAX_NEW_TOKENS, truncation=True)
        return result[0]["generated_text"]
    except Exception as e:  # UI boundary: surface any failure as text
        return f"Error: {e}"


demo = gr.Interface(
    fn=refine_from_url,
    inputs=[
        gr.Textbox(label="🔗 Enter Webpage URL"),
        gr.Textbox(
            label="🧠 Instruction",
            placeholder="e.g. Clean and format this for GPT2 training",
        ),
    ],
    outputs=gr.Textbox(label="📄 Refined JSONL Output", lines=30, max_lines=60),
    title="🧠 Link-Based Data Refiner + Q&A Generator",
    description="Paste any webpage link. This app will crawl, refine, and generate question-answer pairs using Flan-T5.",
)

if __name__ == "__main__":
    demo.launch()