ghosthets committed
Commit
bbb8747
·
verified ·
1 Parent(s): a9aa739

Update app.py

Files changed (1)
  1. app.py +27 -22
app.py CHANGED
@@ -3,42 +3,47 @@ import requests
 from bs4 import BeautifulSoup
 from transformers import pipeline
 
-# Use faster, open-access model
-llm = pipeline("text2text-generation", model="EleutherAI/gpt-neo-2.7B")
+# Load fast, open-access model
+llm = pipeline("text2text-generation", model="google/flan-t5-base")
 
-def refine_from_url(url, instruction):
-    try:
-        response = requests.get(url, timeout=10)
-        soup = BeautifulSoup(response.text, "html.parser")
-        raw_text = soup.get_text(separator="\n")
+def extract_text(url):
+    response = requests.get(url, timeout=10)
+    soup = BeautifulSoup(response.text, "html.parser")
+    return soup.get_text(separator="\n")
 
-        prompt = f"""
-You are a data refinement agent. Given the following webpage content, do the following:
-1. Extract clear headings and structure the content.
-2. Generate question-answer pairs based on the content.
-3. Format everything in JSONL style for GPT2 training.
+def chunk_text(text, chunk_size=3000):
+    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
-Instruction: {instruction}
+def refine_chunk(chunk, instruction):
+    prompt = f"""
+{instruction}
 
 Content:
-{raw_text[:3000]}
+{chunk}
 """
+    result = llm(prompt, max_new_tokens=512)[0]["generated_text"]
+    return result
 
-        output = llm(prompt, max_new_tokens=512)[0]["generated_text"]
-        return output
+def streamed_pipeline(url, instruction):
+    try:
+        raw_text = extract_text(url)
+        chunks = chunk_text(raw_text)
+        for i, chunk in enumerate(chunks):
+            result = refine_chunk(chunk, instruction)
+            yield f"### Section {i+1}\n{result}\n\n"
     except Exception as e:
-        return f"Error: {str(e)}"
+        yield f"Error: {str(e)}"
 
 demo = gr.Interface(
-    fn=refine_from_url,
+    fn=streamed_pipeline,
     inputs=[
         gr.Textbox(label="🔗 Enter Webpage URL"),
         gr.Textbox(label="🧠 Instruction", placeholder="e.g. Clean and format this for GPT2 training")
     ],
-    outputs=gr.Textbox(label="📄 Refined JSONL Output", lines=30, max_lines=60),
-    title="🧠 Link-Based Data Refiner + Q&A Generator",
-    description="Paste any webpage link. This app will crawl, refine, and generate question-answer pairs using Flan-T5."
+    outputs=gr.Textbox(label="📄 Streaming Output", lines=40, max_lines=80, interactive=False),
+    title="🧠 Real-Time Chunked Refiner",
+    description="Crawls full webpage, breaks into chunks, and streams refined output section-by-section using Flan-T5."
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
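For anyone who wants to exercise the new chunk-and-stream flow outside the Gradio UI, the following is a minimal sketch of the same loop; the target URL, the `Summarize:` prompt wording, and the smaller `max_new_tokens` value are illustrative assumptions, not part of the commit.

```python
# Minimal sketch of the committed chunk-and-stream flow, run without Gradio.
# The URL and max_new_tokens=64 below are placeholder assumptions.
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

llm = pipeline("text2text-generation", model="google/flan-t5-base")

def chunk_text(text, chunk_size=3000):
    # Fixed-width character windows, exactly as in the committed app.py.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

html = requests.get("https://example.com", timeout=10).text
raw_text = BeautifulSoup(html, "html.parser").get_text(separator="\n")

for i, chunk in enumerate(chunk_text(raw_text)):
    # Note: flan-t5-base was trained with 512-token inputs, so a
    # 3000-character chunk may exceed what the model reads reliably.
    out = llm(f"Summarize:\n\nContent:\n{chunk}", max_new_tokens=64)[0]["generated_text"]
    print(f"### Section {i + 1}\n{out}\n")
```

Since `chunk_text` splits on raw character counts, chunk boundaries can fall mid-sentence; splitting on paragraph breaks would keep each prompt more coherent.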
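A related usage note: `streamed_pipeline` is a generator, and in Gradio 3.x generator functions only stream partial outputs to the UI when the queue is enabled (Gradio 4 turns the queue on by default). If yielded sections do not appear incrementally, a hypothetical variant of the launch block is the usual fix:

```python
# Hypothetical launch variant: Gradio 3.x required an explicit queue
# for generator (streaming) functions; Gradio 4 queues by default,
# making this effectively a no-op there.
if __name__ == "__main__":
    demo.queue().launch()
```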