datarefine2

Sleeping

File size: 1,527 Bytes

6cd6d4c
76ac794
 
 
3f37aa1
bbb8747
 
3536a95
bbb8747
 
 
 
7d512b6
bbb8747
 
7d512b6
bbb8747
 
 
7d512b6
 
bbb8747
7d512b6
bbb8747
 
7d512b6
bbb8747
 
 
 
 
 
 
3f37aa1
bbb8747
6cd6d4c
 
bbb8747
76ac794
7d512b6
 
76ac794
bbb8747
 
 
6cd6d4c
3f37aa1
 
bbb8747

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Shared text2text-generation pipeline, built once at import time and reused
# for every chunk. Backed by the openly downloadable Flan-T5 base checkpoint.
llm = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
)

def extract_text(url):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch. It is passed to ``requests.get``
            unchanged.

    Returns:
        The text of the parsed HTML document, with the text fragments
        joined by newlines.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For any other failure raised by
            ``requests`` (the request uses a 10 second timeout).
    """
    # NOTE(review): the URL is fetched as given, with no allow-listing of
    # hosts or schemes -- confirm that is acceptable where this is deployed.
    response = requests.get(url, timeout=10)
    # Fail loudly on error responses instead of parsing the error page and
    # handing it downstream as if it were the requested content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text(separator="\n")

def chunk_text(text, chunk_size=3000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece. Must be positive.

    Returns:
        A list of substrings that concatenate back to *text*. Every piece
        except possibly the last has exactly *chunk_size* characters. An
        empty *text* yields an empty list.

    Raises:
        ValueError: If *chunk_size* is zero or negative.
    """
    # A negative step would make range() empty and silently drop all of the
    # text; a zero step raises an unrelated-looking range() error. Reject
    # both up front with a clear message.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def refine_chunk(chunk, instruction):
    """Run one chunk of text through the module-level ``llm`` pipeline.

    The prompt consists of the instruction, a blank line, a ``Content:``
    header and then the chunk itself.

    Args:
        chunk: The piece of page text to process.
        instruction: What the model is asked to do with the chunk.

    Returns:
        The ``"generated_text"`` field of the first item the pipeline returns.
    """
    # Same layout as a triple-quoted block: leading newline, instruction,
    # blank line, "Content:" header, chunk, trailing newline.
    prompt = f"\n{instruction}\n\nContent:\n{chunk}\n"
    generations = llm(prompt, max_new_tokens=512)
    first_generation = generations[0]
    return first_generation["generated_text"]

def streamed_pipeline(url, instruction):
    """Fetch a page, refine it chunk by chunk, and stream the growing result.

    This is a generator intended to be used as a Gradio event function.
    Gradio replaces the output component's value with each yielded item, so
    every yield carries all sections produced so far rather than only the
    newest one; otherwise earlier sections would vanish from the output as
    soon as the next one arrives.

    Args:
        url: Address of the page to process.
        instruction: Instruction passed along with every chunk.

    Yields:
        The accumulated output text: one ``### Section N`` block per chunk
        processed so far. If any step raises, the final item is the text
        accumulated so far followed by ``Error: <message>``.
    """
    sections = []
    try:
        raw_text = extract_text(url)
        for number, chunk in enumerate(chunk_text(raw_text), start=1):
            result = refine_chunk(chunk, instruction)
            sections.append(f"### Section {number}\n{result}\n\n")
            yield "".join(sections)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # crashing, and keep whatever sections were already produced.
        yield "".join(sections) + f"Error: {str(e)}"

# Gradio UI: two text inputs (page URL and instruction) wired to the
# streamed_pipeline generator, with a single read-only text output.
demo = gr.Interface(
    title="🧠 Real-Time Chunked Refiner",
    description=(
        "Crawls full webpage, breaks into chunks, and streams refined "
        "output section-by-section using Flan-T5."
    ),
    fn=streamed_pipeline,
    inputs=[
        gr.Textbox(label="🔗 Enter Webpage URL"),
        gr.Textbox(
            label="🧠 Instruction",
            placeholder="e.g. Clean and format this for GPT2 training",
        ),
    ],
    outputs=gr.Textbox(
        label="📄 Streaming Output",
        lines=40,
        max_lines=80,
        interactive=False,
    ),
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()