import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
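# Gradio app: fetch a webpage, clean its text, and generate Q&A pairs in JSONL style
# (summary of the UI title/description below)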

# Use a fast, open-access seq2seq model (Flan-T5, as referenced in the app description);
# the "base" size is a choice here, swap in a larger variant if quality matters more than speed
llm = pipeline("text2text-generation", model="google/flan-t5-base")

def refine_from_url(url, instruction):
    try:
        # Fetch the page and pull out its visible text
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        raw_text = soup.get_text(separator="\n")

        prompt = f"""
You are a data refinement agent. Given the following webpage content, do the following:
1. Extract clear headings and structure the content.
2. Generate 5 question-answer pairs based on the content.
3. Format everything in JSONL style for GPT2 training.

Instruction: {instruction}

Content:
{raw_text[:3000]}
"""

        # The text2text-generation pipeline returns a list of dicts with a "generated_text" field
        output = llm(prompt, max_new_tokens=512)[0]["generated_text"]
        return output
    except Exception as e:
        return f"Error: {str(e)}"

# Gradio UI: a URL and an instruction in, refined JSONL-style text out
demo = gr.Interface(
    fn=refine_from_url,
    inputs=[
        gr.Textbox(label="🔗 Enter Webpage URL"),
        gr.Textbox(label="🧠 Instruction", placeholder="e.g. Clean and format this for GPT2 training")
    ],
    outputs=gr.Textbox(label="📄 Refined JSONL Output", lines=30, max_lines=60),
    title="🧠 Link-Based Data Refiner + Q&A Generator",
    description="Paste any webpage link. This app will crawl, refine, and generate question-answer pairs using Flan-T5."
)

if __name__ == "__main__":
    demo.launch()