app.py — newly added file (74 lines), recovered from a Hugging Face Spaces diff view.
import os
import json

import gradio as gr
import nltk
from huggingface_hub import InferenceApi

# Download the Punkt sentence-tokenizer models required by sent_tokenize.
# NOTE(review): recent NLTK releases also need the "punkt_tab" resource for
# sent_tokenize — confirm against the nltk version pinned for this Space.
nltk.download("punkt")
from nltk.tokenize import sent_tokenize
|
| 10 |
+
|
| 11 |
+
# β Hugging Face Inference API β
|
| 12 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 13 |
+
model_id = "google/flan-t5-small" # Fast, free, hosted model
|
| 14 |
+
|
| 15 |
+
inference = InferenceApi(
|
| 16 |
+
repo_id=model_id,
|
| 17 |
+
task="text2text-generation",
|
| 18 |
+
token=hf_token
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# β Subject Extraction Function β
|
| 22 |
+
def extract_subject(sentence):
|
| 23 |
+
prompt = f"Extract the main subject or category in 1β3 words. Only return the subject.\nSentence: {sentence}"
|
| 24 |
+
try:
|
| 25 |
+
response = inference(inputs=prompt, params={"max_new_tokens": 10})
|
| 26 |
+
if isinstance(response, list) and "generated_text" in response[0]:
|
| 27 |
+
return response[0]["generated_text"].strip().rstrip(".")
|
| 28 |
+
elif isinstance(response, dict) and "generated_text" in response:
|
| 29 |
+
return response["generated_text"].strip().rstrip(".")
|
| 30 |
+
else:
|
| 31 |
+
return "Miscellaneous"
|
| 32 |
+
except Exception:
|
| 33 |
+
return "Error"
|
| 34 |
+
|
# ── Grouping Logic ──
def refine_and_group(raw_text):
    """Split *raw_text* into sentences, tag each with its subject, and
    bucket the sentences under their shared subject headings.

    Returns a list of {"heading": str, "content": list[str]} dicts, one
    per distinct subject, in first-seen order.
    """
    buckets = {}
    for chunk in sent_tokenize(raw_text):
        label = extract_subject(chunk)
        if label not in buckets:
            buckets[label] = []
        buckets[label].append(chunk)

    sections = []
    for label, members in buckets.items():
        sections.append({"heading": label, "content": members})
    return sections
+
# β Gradio Interface β
|
| 47 |
+
def run_pipeline(raw_text):
|
| 48 |
+
structured = refine_and_group(raw_text)
|
| 49 |
+
return json.dumps(structured, indent=2)
|
| 50 |
+
|
# Build the UI. Bound to a module-level `demo` name per Gradio/Spaces
# convention so tooling that imports the module can find the app object.
demo = gr.Interface(
    fn=run_pipeline,
    inputs=gr.Textbox(
        label="Paste Mixed Raw Data",
        placeholder=(
            "Paste a full paragraph or list of facts.\n"
            "Each sentence will be auto-tagged and grouped.\n"
            "Example:\n"
            "Bitcoin is a decentralized currency.\n"
            "Python is a programming language.\n"
            "The Taj Mahal was built by Shah Jahan."
        ),
        lines=25
    ),
    outputs=gr.Textbox(
        label="Refined JSON Output",
        lines=25
    ),
    # Mojibake repaired: the title previously began with a garbled "π"
    # (residue of a lost emoji in the scraped source).
    title="Document Categorizer (Text Output Only)",
    description=(
        "This app uses Flan-T5-Small to extract subjects from each sentence, "
        "groups them under headings, and shows the result as JSON text."
    )
)

# Launched at import time on purpose: Hugging Face Spaces executes app.py
# directly, so the UI must come up without a __main__ guard.
demo.launch()