ghosthets committed on
Commit
3536a95
·
verified ·
1 Parent(s): d6b29df

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import gradio as gr
4
+ import nltk
5
+ from huggingface_hub import InferenceApi
6
+
7
+ # Download sentence tokenizer
8
# Fetch the sentence-tokenizer data at startup. Older NLTK releases load the
# pickled "punkt" models, while recent releases (3.9 and later) load the
# "punkt_tab" resource instead, so download both to work with either.
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize
10
+
11
+ # — Hugging Face Inference API —
12
# Access token comes from the environment; it is None when HF_TOKEN is unset.
hf_token = os.environ.get("HF_TOKEN")
model_id = "google/flan-t5-small"  # Fast, free, hosted model

# Callable client for the hosted model, configured for text2text generation.
# NOTE(review): InferenceApi appears to be deprecated in recent
# huggingface_hub releases in favour of InferenceClient — confirm against the
# pinned version before upgrading.
inference = InferenceApi(repo_id=model_id, task="text2text-generation", token=hf_token)
20
+
21
+ # — Subject Extraction Function —
22
def extract_subject(sentence):
    """Return a short subject label for *sentence* from the hosted model.

    Sends an instruction prompt to the module-level ``inference`` client and
    normalises the generated text by removing surrounding whitespace and
    trailing periods.

    Args:
        sentence: The sentence to label.

    Returns:
        The generated subject; ``"Miscellaneous"`` when the response carries
        no usable ``generated_text`` (missing, non-string, or empty); or
        ``"Error"`` when the request itself raises.
    """
    import logging  # function-scope so the block needs no new file-level import

    prompt = f"Extract the main subject or category in 1–3 words. Only return the subject.\nSentence: {sentence}"
    try:
        response = inference(inputs=prompt, params={"max_new_tokens": 10})
    except Exception:
        # Best-effort: one failed request must not abort the whole grouping
        # run, but record the cause instead of discarding it silently.
        logging.getLogger(__name__).exception("Subject extraction failed")
        return "Error"

    # The response is accepted either as a list of result dicts (first entry
    # is used) or as a single result dict. An empty list has no result.
    if isinstance(response, list):
        result = response[0] if response else None
    else:
        result = response

    text = result.get("generated_text") if isinstance(result, dict) else None
    if not isinstance(text, str):
        return "Miscellaneous"

    # Drop trailing periods, then any whitespace that preceded them.
    subject = text.strip().rstrip(".").rstrip()
    # An empty generation would otherwise become a blank heading.
    return subject or "Miscellaneous"
34
+
35
+ # — Grouping Logic —
36
def refine_and_group(raw_text):
    """Split *raw_text* into sentences and bucket them by extracted subject.

    Each sentence produced by ``sent_tokenize`` is labelled with
    ``extract_subject``; sentences sharing a label are collected together.

    Returns:
        A list of ``{"heading": subject, "content": [sentences]}`` dicts, one
        per distinct subject, in order of first appearance.
    """
    by_subject = {}
    for line in sent_tokenize(raw_text):
        heading = extract_subject(line)
        if heading not in by_subject:
            by_subject[heading] = []
        by_subject[heading].append(line)

    grouped = []
    for heading, members in by_subject.items():
        grouped.append({"heading": heading, "content": members})
    return grouped
45
+
46
+ # — Gradio Interface —
47
def run_pipeline(raw_text):
    """Group the sentences of *raw_text* by subject and render them as JSON.

    Args:
        raw_text: The text pasted into the input box.

    Returns:
        A JSON string (2-space indent) of the list produced by
        ``refine_and_group``.
    """
    structured = refine_and_group(raw_text)
    # ensure_ascii=False keeps non-ASCII characters readable in the output
    # box instead of rendering them as \uXXXX escape sequences.
    return json.dumps(structured, indent=2, ensure_ascii=False)
50
+
51
# Single-textbox UI: pasted text goes through run_pipeline and the resulting
# JSON string is shown in a read-out textbox. The server starts on execution.
demo = gr.Interface(
    fn=run_pipeline,
    inputs=gr.Textbox(
        label="Paste Mixed Raw Data",
        placeholder=(
            "Paste a full paragraph or list of facts.\n"
            "Each sentence will be auto-tagged and grouped.\n"
            "Example:\n"
            "Bitcoin is a decentralized currency.\n"
            "Python is a programming language.\n"
            "The Taj Mahal was built by Shah Jahan."
        ),
        lines=25,
    ),
    outputs=gr.Textbox(
        label="Refined JSON Output",
        lines=25,
    ),
    # Title emoji restored from its mis-encoded form.
    title="📚 Document Categorizer (Text Output Only)",
    description=(
        "This app uses Flan-T5-Small to extract subjects from each sentence, "
        "groups them under headings, and shows the result as JSON text."
    ),
)
demo.launch()