app.py — newly added file (74 lines), recovered from a Hugging Face Spaces diff view.
import os
import json

import gradio as gr
import nltk
from huggingface_hub import InferenceApi

# Download the Punkt sentence-tokenizer models required by sent_tokenize.
# NOTE(review): recent NLTK releases also need the "punkt_tab" resource for
# sent_tokenize — confirm against the nltk version pinned for this Space.
nltk.download("punkt")
from nltk.tokenize import sent_tokenize
|
| 10 |
+
|
| 11 |
+
# β Hugging Face Inference API β
|
| 12 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 13 |
+
model_id = "google/flan-t5-small" # Fast, free, hosted model
|
| 14 |
+
|
| 15 |
+
inference = InferenceApi(
|
| 16 |
+
repo_id=model_id,
|
| 17 |
+
task="text2text-generation",
|
| 18 |
+
token=hf_token
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# β Subject Extraction Function β
|
| 22 |
+
def extract_subject(sentence):
|
| 23 |
+
prompt = f"Extract the main subject or category in 1β3 words. Only return the subject.\nSentence: {sentence}"
|
| 24 |
+
try:
|
| 25 |
+
response = inference(inputs=prompt, params={"max_new_tokens": 10})
|
| 26 |
+
if isinstance(response, list) and "generated_text" in response[0]:
|
| 27 |
+
return response[0]["generated_text"].strip().rstrip(".")
|
| 28 |
+
elif isinstance(response, dict) and "generated_text" in response:
|
| 29 |
+
return response["generated_text"].strip().rstrip(".")
|
| 30 |
+
else:
|
| 31 |
+
return "Miscellaneous"
|
| 32 |
+
except Exception:
|
| 33 |
+
return "Error"
|
| 34 |
+
|
# ── Grouping Logic ──
def refine_and_group(raw_text):
    """Split *raw_text* into sentences, tag each with its subject, and
    bucket the sentences under their shared subject headings.

    Returns a list of {"heading": str, "content": list[str]} dicts, one
    per distinct subject, in first-seen order.
    """
    buckets = {}
    for chunk in sent_tokenize(raw_text):
        label = extract_subject(chunk)
        if label not in buckets:
            buckets[label] = []
        buckets[label].append(chunk)

    sections = []
    for label, members in buckets.items():
        sections.append({"heading": label, "content": members})
    return sections
+
# β Gradio Interface β
|
| 47 |
+
def run_pipeline(raw_text):
|
| 48 |
+
structured = refine_and_group(raw_text)
|
| 49 |
+
return json.dumps(structured, indent=2)
|
| 50 |
+
|
# Build the UI. Bound to a module-level `demo` name per Gradio/Spaces
# convention so tooling that imports the module can find the app object.
demo = gr.Interface(
    fn=run_pipeline,
    inputs=gr.Textbox(
        label="Paste Mixed Raw Data",
        placeholder=(
            "Paste a full paragraph or list of facts.\n"
            "Each sentence will be auto-tagged and grouped.\n"
            "Example:\n"
            "Bitcoin is a decentralized currency.\n"
            "Python is a programming language.\n"
            "The Taj Mahal was built by Shah Jahan."
        ),
        lines=25
    ),
    outputs=gr.Textbox(
        label="Refined JSON Output",
        lines=25
    ),
    # Mojibake repaired: the title previously began with a garbled "π"
    # (residue of a lost emoji in the scraped source).
    title="Document Categorizer (Text Output Only)",
    description=(
        "This app uses Flan-T5-Small to extract subjects from each sentence, "
        "groups them under headings, and shows the result as JSON text."
    )
)

# Launched at import time on purpose: Hugging Face Spaces executes app.py
# directly, so the UI must come up without a __main__ guard.
demo.launch()