datarefine2

Sleeping

datarefine2 / app.py

Update app.py

ea8b8be verified 3 months ago

1.44 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	from transformers import pipeline

	# GPT-Neo is a decoder-only (causal) language model, so it is loaded through
	# the "text-generation" task; "text2text-generation" is meant for
	# encoder-decoder models such as T5 or BART.
	# return_full_text=False keeps the echoed prompt out of "generated_text",
	# so callers receive only the newly generated continuation.
	llm = pipeline(
	    "text-generation",
	    model="EleutherAI/gpt-neo-2.7B",
	    return_full_text=False,
	)

	def refine_from_url(url, instruction):
	    """Fetch a web page and ask the language model to refine its text.

	    Args:
	        url: Address of the page to download; must be an http(s) URL.
	        instruction: Free-form user instruction inserted into the prompt.

	    Returns:
	        The text generated by ``llm``, or a string starting with
	        ``"Error: "`` describing why the page could not be processed.
	        Failures are reported through the return value rather than raised,
	        so the result can always be shown in the output box.
	    """
	    url = (url or "").strip()
	    # Only plain web URLs are fetched; this also turns an empty textbox into
	    # a readable message instead of a low-level requests error.
	    if not url.lower().startswith(("http://", "https://")):
	        return "Error: URL must start with http:// or https://"

	    try:
	        response = requests.get(url, timeout=10)
	        # Without this, a 404/500 error page would be refined as if it were
	        # the requested content.
	        response.raise_for_status()

	        soup = BeautifulSoup(response.text, "html.parser")
	        # Drop non-visible elements so the character budget below is spent
	        # on readable text rather than inline JavaScript or CSS.
	        for tag in soup.find_all(["script", "style", "noscript"]):
	            tag.decompose()

	        # Collapse the blank and whitespace-only lines get_text leaves behind.
	        stripped_lines = (
	            line.strip()
	            for line in soup.get_text(separator="\n").splitlines()
	        )
	        raw_text = "\n".join(line for line in stripped_lines if line)
	        if not raw_text:
	            return "Error: no readable text found at the given URL"

	        # Only the first 3000 characters of page text are sent to the model.
	        prompt = f"""
	You are a data refinement agent. Given the following webpage content, do the following:
	1. Extract clear headings and structure the content.
	2. Generate 5 question-answer pairs based on the content.
	3. Format everything in JSONL style for GPT2 training.

	Instruction: {instruction}

	Content:
	{raw_text[:3000]}
	"""

	        output = llm(prompt, max_new_tokens=512)[0]["generated_text"]
	        return output
	    except Exception as e:
	        # UI boundary: any fetch, parse or model failure is reported as text
	        # instead of propagating out of the handler.
	        return f"Error: {e}"

	# Gradio UI: two text inputs (page URL and free-form instruction) wired to
	# refine_from_url, with one large text box for the generated output.
	demo = gr.Interface(
	    fn=refine_from_url,
	    inputs=[
	        gr.Textbox(label="🔗 Enter Webpage URL"),
	        gr.Textbox(
	            label="🧠 Instruction",
	            placeholder="e.g. Clean and format this for GPT2 training",
	        ),
	    ],
	    outputs=gr.Textbox(label="📄 Refined JSONL Output", lines=30, max_lines=60),
	    title="🧠 Link-Based Data Refiner + Q&A Generator",
	    # Names the model the pipeline actually loads (EleutherAI/gpt-neo-2.7B).
	    description="Paste any webpage link. This app will crawl, refine, and generate question-answer pairs using GPT-Neo.",
	)

	# Start the web server only when executed as a script, not on import.
	if __name__ == "__main__":
	    demo.launch()