Spaces:

bhardwaj08sarthak
/

STEM-Question-Generator

Running

File size: 15,443 Bytes

import os
import json
import gradio as gr
import spaces
from huggingface_hub import InferenceClient
from smolagents import CodeAgent, InferenceClientModel, tool
from huggingface_hub import login
from smolagents import TransformersModel
import os
# Authenticate with the Hub only when a token is configured. Calling
# login() with token=None falls back to an interactive prompt, which cannot
# be answered when the app runs headless.
token = os.getenv("HF_Token")
if token:
    login(token=token)
from level_classifier_tool import (
    classify_levels_phrases,
    HFEmbeddingBackend,
    build_phrase_index
)

# ------------------------ Taxonomy phrases ------------------------
# Cue verbs/phrases for each Bloom's level, keyed by level name.
# Keys are listed from "Remember" up to "Create", the same order that
# classify_and_score uses when it parses Bloom targets.
# Some cues appear under more than one level (e.g. "compare", "contrast",
# "select").
BLOOMS_PHRASES = {
    "Remember": [
        "define", "list", "recall", "identify", "state", "label", "name", "recognize", "find",
        "select", "match", "choose", "give", "write", "tell", "show"
    ],
    "Understand": [
        "classify", "interpret", "summarize", "explain", "estimate", "describe", "discuss",
        "predict", "paraphrase", "restate", "illustrate", "compare", "contrast", "report"
    ],
    "Apply": [
        "apply", "solve", "use", "demonstrate", "calculate", "implement", "perform",
        "execute", "carry out", "practice", "employ", "sketch"
    ],
    "Analyze": [
        "analyze", "differentiate", "organize", "structure", "break down", "distinguish",
        "dissect", "examine", "compare", "contrast", "attribute", "investigate"
    ],
    "Evaluate": [
        "evaluate", "judge", "critique", "assess", "defend", "argue", "select", "support",
        "appraise", "recommend", "conclude", "review"
    ],
    "Create": [
        "create", "design", "compose", "plan", "construct", "produce", "devise", "generate",
        "develop", "formulate", "invent", "build"
    ]
}

# Cue verbs/phrases for each Depth of Knowledge level, keyed "DOK1".."DOK4".
# Several cues are shared between levels (e.g. "compare", "construct",
# "formulate"), and some also appear in BLOOMS_PHRASES.
# NOTE(review): this table mixes spellings ("organise", "summarise",
# "generalise" next to "categorize", "analyze"), while BLOOMS_PHRASES uses
# "organize"/"summarize" -- confirm whether that is intentional.
DOK_PHRASES = {
    "DOK1": [
        "define", "list", "recall", "compute", "identify", "state", "label", "how many",
        "name", "recognize", "find", "determine", "select", "match", "choose", "give",
        "write", "tell", "show", "point out"
    ],
    "DOK2": [
        "classify", "interpret", "estimate", "organise", "summarise", "explain", "solve",
        "categorize", "group", "compare", "contrast", "distinguish", "make observations",
        "collect data", "display data", "arrange", "sort", "paraphrase", "restate", "predict",
        "approximate", "demonstrate", "illustrate", "describe", "analyze data"
    ],
    "DOK3": [
        "justify", "analyze", "generalise", "compare", "construct", "investigate",
        "support", "defend", "argue", "examine", "differentiate", "criticize", "debate",
        "test", "experiment", "hypothesize", "draw conclusions", "break down", "dissect",
        "probe", "explore", "develop", "formulate"
    ],
    "DOK4": [
        "design", "synthesize", "model", "prove", "evaluate system", "critique", "create",
        "compose", "plan", "invent", "devise", "generate", "build", "construct", "produce",
        "formulate", "improve", "revise", "assess", "appraise", "judge", "recommend",
        "predict outcome", "simulate"
    ]
}

# ------------------------ Prebuild embeddings once ------------------------
# These statements run at import time, so the embedding backend and both
# phrase indexes are created once per process. classify_and_score passes them
# to classify_levels_phrases as backend / prebuilt_bloom_index /
# prebuilt_dok_index on every call.
_backend = HFEmbeddingBackend(model_name="sentence-transformers/all-MiniLM-L6-v2")
_BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
_DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)

# ------------------------ Tool: classify and score ------------------------
@tool
def classify_and_score(
    question: str,
    target_bloom: str,
    target_dok: str,
    agg: str = "max"
) -> dict:
    """Classify a question against Bloom’s and DOK targets and return guidance.

    Args:
        question: The question text to evaluate for cognitive demand.
        target_bloom: Target Bloom’s level or range. Accepts exact (e.g., "Analyze")
            or plus form (e.g., "Apply+") meaning that level or higher.
        target_dok: Target DOK level or range. Accepts exact (e.g., "DOK3")
            or span (e.g., "DOK2-DOK3").
        agg: Aggregation method over phrase similarities within a level
            (choices: "mean", "max", "topk_mean").

    Returns:
        A dictionary with:
            ok: True if both Bloom’s and DOK match the targets.
            measured: Dict with best levels and per-level scores for Bloom’s and DOK
                (empty when the targets themselves are invalid).
            feedback: Brief guidance describing how to adjust the question to hit targets.
    """
    bloom_order = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
    dok_order = ["DOK1", "DOK2", "DOK3", "DOK4"]

    def _parse_target_bloom(t: str):
        # Returns the accepted levels in taxonomy order; "Level+" means that
        # level and every level above it.
        t = t.strip()
        is_open_ended = t.endswith("+")
        base = t[:-1].strip() if is_open_ended else t
        if base not in bloom_order:
            raise ValueError(f"Invalid Bloom target '{t}'")
        if is_open_ended:
            return bloom_order[bloom_order.index(base):]
        return [base]

    def _parse_target_dok(t: str):
        # Returns the accepted levels in DOK order; "DOKa-DOKb" is inclusive.
        t = t.strip()
        if "-" not in t:
            if t not in dok_order:
                raise ValueError(f"Invalid DOK target '{t}'")
            return [t]
        bounds = [part.strip() for part in t.split("-")]
        # Exactly two known endpoints are required; checking the count here
        # avoids an opaque tuple-unpacking error for inputs like "DOK1-DOK2-DOK3".
        if len(bounds) != 2 or any(b not in dok_order for b in bounds):
            raise ValueError(f"Invalid DOK range '{t}'")
        lo, hi = dok_order.index(bounds[0]), dok_order.index(bounds[1])
        if lo > hi:
            raise ValueError(f"Invalid DOK range '{t}'")
        return dok_order[lo:hi + 1]

    # Validate the targets first so a malformed target never pays for the
    # embedding-based classification below.
    try:
        bloom_targets = _parse_target_bloom(target_bloom)
        dok_targets = _parse_target_dok(target_dok)
    except (ValueError, AttributeError, TypeError) as e:
        # AttributeError/TypeError cover non-string targets (e.g. None).
        return {
            "ok": False,
            "measured": {},
            "feedback": (
                f"Invalid targets: {e}. Use Bloom in "
                "{Remember, Understand, Apply, Analyze, Evaluate, Create} "
                "and DOK in {DOK1..DOK4} or ranges like 'DOK2-DOK3'."
            ),
        }

    res = classify_levels_phrases(
        question,
        BLOOMS_PHRASES,
        DOK_PHRASES,
        backend=_backend,
        prebuilt_bloom_index=_BLOOM_INDEX,
        prebuilt_dok_index=_DOK_INDEX,
        agg=agg,
        return_phrase_matches=True
    )

    bloom_best = res["blooms"]["best_level"]
    dok_best = res["dok"]["best_level"]

    bloom_ok = bloom_best in bloom_targets
    dok_ok = dok_best in dok_targets

    top_bloom_phrases = res["blooms"].get("top_phrases", {})
    top_dok_phrases = res["dok"].get("top_phrases", {})

    # Targets are reported in taxonomy order (lowest to highest) so the
    # suggested direction of the shift is readable.
    feedback_parts = []
    if not bloom_ok:
        feedback_parts.append(
            f"Shift Bloom’s from {bloom_best} toward {bloom_targets}. "
            f"Top cues: {top_bloom_phrases.get(bloom_best, [])[:3]}"
        )
    if not dok_ok:
        feedback_parts.append(
            f"Shift DOK from {dok_best} toward {dok_targets}. "
            f"Top cues: {top_dok_phrases.get(dok_best, [])[:3]}"
        )

    return {
        "ok": bool(bloom_ok and dok_ok),
        "measured": {
            "bloom_best": bloom_best,
            "bloom_scores": res["blooms"]["scores"],
            "dok_best": dok_best,
            "dok_scores": res["dok"]["scores"],
        },
        "feedback": " ".join(feedback_parts) if feedback_parts else "On target.",
    }

# ------------------------ Backend selection + GPU-wrapped local loader ------------------------
# Single-slot cache: remembers the most recently loaded local model and the
# model id it was loaded for.
_LOCAL_MODEL_CACHE = {"model": None, "model_id": None}

@spaces.GPU(duration=30)  # request GPU only when loading/using local model
def get_local_model_gpu(model_id: str):
    """
    Return a smolagents TransformersModel for `model_id`, loading it on demand.

    The most recently loaded model is kept in _LOCAL_MODEL_CACHE and returned
    again while the same `model_id` is requested; asking for a different id
    loads a new model and replaces the cached one.

    Raises RuntimeError when smolagents does not provide TransformersModel.

    NOTE(review): if the `spaces.GPU` decorator runs this function in a
    separate worker process, writes to the module-level cache may not be
    visible to later calls -- confirm against the `spaces` documentation.
    """
    # Imported lazily so the dependency is only needed for the local backend.
    try:
        from smolagents import TransformersModel  # provided by smolagents
    except Exception as import_error:
        raise RuntimeError(
            "Local backend requires 'TransformersModel' from smolagents. "
            "Please ensure your smolagents version provides it."
        ) from import_error

    cached = _LOCAL_MODEL_CACHE["model"]
    if cached is not None and _LOCAL_MODEL_CACHE["model_id"] == model_id:
        return cached

    # device_map="auto" lets accelerate pick the best device(s).
    loaded = TransformersModel(model_id=model_id, device_map="auto")
    _LOCAL_MODEL_CACHE.update(model=loaded, model_id=model_id)
    return loaded

def make_agent(
    backend_choice: str,          # "Hosted API" | "Local GPU"
    hf_token: str,
    model_id: str,
    timeout: int,
    temperature: float,
    max_tokens: int
):
    """Build a CodeAgent wired to the selected inference backend.

    Args:
        backend_choice: "Local GPU" loads the model through
            get_local_model_gpu; any other value uses the hosted
            inference API.
        hf_token: Hugging Face token for the hosted backend. An empty
            string is passed on as None.
        model_id: Repository id of the model to run.
        timeout: Request timeout in seconds (hosted backend only).
        temperature: Sampling temperature (applied to the hosted backend).
        max_tokens: Completion token limit (applied to the hosted backend).

    Returns:
        A CodeAgent whose only tool is classify_and_score.
    """
    if backend_choice == "Local GPU":
        # This call is GPU-annotated; Spaces will allocate a GPU for it.
        # The generation settings are not pushed into the local model here
        # because the cached instance is shared between requests.
        model = get_local_model_gpu(model_id)
    else:
        # Configure the model wrapper directly so that it knows which model
        # to call, and pass the generation controls as model kwargs so the
        # UI sliders actually influence the requests.
        model = InferenceClientModel(
            model_id=model_id,
            token=(hf_token or None),
            timeout=timeout,
            temperature=temperature,
            max_tokens=max_tokens,
        )

    agent = CodeAgent(model=model, tools=[classify_and_score])
    # Kept for backward compatibility with any code that inspects it.
    agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}
    return agent

# ------------------------ Agent task template -----------------------------
# Prompt handed to the agent; run_pipeline fills it with TASK_TMPL.format(...).
# Placeholders: {subject}, {grade}, {topic}, {target_bloom}, {target_dok},
# {attempts}. The doubled braces around the JSON example are str.format
# escapes and render as single literal braces.
TASK_TMPL = '''You generate {subject} question candidates for {grade} on "{topic}".

After you propose a candidate, you MUST immediately call:
classify_and_score(
    question=<just the question text>,
    target_bloom="{target_bloom}",
    target_dok="{target_dok}",
    agg="max"
)

Use the returned dict:
- If ok == True: print ONLY compact JSON {{"question": "...", "answer": "...", "reasoning": "..."}} and finish.
- If ok == False: briefly explain the needed shift, revise the question, and call classify_and_score again.
Repeat up to {attempts} attempts.
Keep answers concise.
Additionally, when you call classify_and_score, pass the exact question text you propose.
If you output JSON, ensure it is valid JSON (no trailing commas, use double quotes).
'''

# ------------------------ Utility: robust JSON extractor ------------------
def extract_top_level_json(s: str) -> str:
    """Return the first substring of `s` that parses as a JSON object.

    Every "{" in `s` is tried in order as a possible start of an object, and
    the JSON decoder itself decides where the object ends. This means braces
    inside string values do not confuse the extraction, and a non-JSON
    "{...}" fragment earlier in the text does not hide a valid object that
    follows it.

    Args:
        s: Arbitrary text that may contain a JSON object.

    Returns:
        The exact source text of the first parseable JSON object, or "" when
        none is found.
    """
    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # reports the index just past its end.
            _, end = decoder.raw_decode(s, start)
        except ValueError:  # json.JSONDecodeError is a ValueError
            start = s.find("{", start + 1)
            continue
        return s[start:end]
    return ""

# ------------------------ Pipeline ---------------------------------------
def _result_to_text(result) -> str:
    """Render an agent result as text for the transcript and JSON extraction."""
    if result is None:
        return ""
    if isinstance(result, str):
        return result
    # The agent's final answer may be a dict or another object rather than
    # text; serialise it as JSON when possible so it can still be extracted.
    try:
        return json.dumps(result, ensure_ascii=False)
    except (TypeError, ValueError):
        return str(result)


def run_pipeline(
    backend_choice,
    hf_token,
    topic,
    grade,
    subject,
    target_bloom,
    target_dok,
    attempts,
    model_id,
    timeout,
    temperature,
    max_tokens
):
    """Run the question-generation agent once and collect its output.

    The parameters arrive from the UI widgets of the same names; numeric
    values are coerced with int()/float() before use.

    Returns:
        A (final_json, transcript) tuple of strings. `final_json` is the
        pretty-printed first JSON object found in the agent output, or ""
        when none is found. `transcript` is the agent output as text, or an
        error message when the backend could not be initialised or the agent
        run failed.
    """
    try:
        agent = make_agent(
            backend_choice=backend_choice,
            hf_token=(hf_token or "").strip(),
            model_id=model_id,
            timeout=int(timeout),
            temperature=float(temperature),
            max_tokens=int(max_tokens),
        )
    except Exception as e:
        err = f"ERROR initializing backend '{backend_choice}': {e}"
        return "", err

    task = TASK_TMPL.format(
        grade=grade,
        topic=topic,
        subject=subject,
        target_bloom=target_bloom,
        target_dok=target_dok,
        attempts=int(attempts)
    )

    # Boundary with the agent: any failure is reported in the transcript
    # instead of surfacing as an unhandled UI error.
    try:
        # Step budget of four agent steps per allowed attempt.
        result_text = _result_to_text(agent.run(task, max_steps=int(attempts) * 4))
    except Exception as e:
        result_text = f"ERROR while running the agent: {e}"

    final_json = ""
    candidate = extract_top_level_json(result_text)
    if candidate:
        try:
            # ensure_ascii=False keeps non-ASCII symbols readable in the UI.
            final_json = json.dumps(json.loads(candidate), indent=2, ensure_ascii=False)
        except ValueError:
            final_json = ""

    return final_json, result_text

# ------------------------ Gradio UI --------------------------------------
# Declarative Gradio layout. Components are created inside nested `with`
# containers, so the order of the statements below is part of the UI.
with gr.Blocks() as demo:
    gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
    gr.Markdown(
        "Use a **CodeAgent** that calls the scoring tool (`classify_and_score`) after each proposal, "
        "and revises until it hits your Bloom/DOK target."
    )

    # Backend selection plus the settings that are specific to it.
    with gr.Accordion("API / Backend Settings", open=True):
        backend_choice = gr.Radio(
            choices=["Hosted API", "Local GPU"],
            value="Hosted API",
            label="Inference Backend"
        )
        with gr.Row():
            hf_token = gr.Textbox(
                label="Hugging Face Token (required for private/hosted endpoints)",
                type="password",
                visible=True
            )
            model_id = gr.Textbox(
                value="swiss-ai/Apertus-70B-Instruct-2509",
                label="Model ID (repo id for Hosted, or local repo for GPU)"
            )
        timeout = gr.Slider(5, 120, value=30, step=1, label="Timeout (s, Hosted API only)")

    # What to generate: these values are substituted into TASK_TMPL.
    with gr.Row():
        topic = gr.Textbox(value="Fractions", label="Topic")
        grade = gr.Dropdown(
            choices=[
                "Grade 1", "Grade 2", "Grade 3", "Grade 4", "Grade 5", "Grade 6",
                "Grade 7", "Grade 8", "Grade 9",
                "Grade 10", "Grade 11", "Grade 12",
                "Under Graduate", "Post Graduate"
            ],
            value="Grade 7",
            label="Grade"
        )
        subject = gr.Textbox(value="Math", label="Subject")

    # Difficulty targets. Only exact Bloom levels are offered here, although
    # classify_and_score also accepts the "Level+" form.
    with gr.Row():
        target_bloom = gr.Dropdown(
            choices=["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"],
            value="Analyze",
            label="Target Bloom’s"
        )
        target_dok = gr.Dropdown(
            choices=["DOK1", "DOK2", "DOK3", "DOK4", "DOK1-DOK2", "DOK2-DOK3", "DOK3-DOK4"],
            value="DOK2-DOK3",
            label="Target Depth of Knowledge"
        )
        attempts = gr.Slider(1, 8, value=5, step=1, label="Max Attempts")

    with gr.Accordion("⚙️ Generation Controls", open=False):
        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
        max_tokens = gr.Slider(64, 1024, value=300, step=16, label="Max Tokens")

    # Static help text; the variable is not referenced again in this file.
    backend_tips = gr.Markdown(
        "*Hosted API:* uses Hugging Face Inference endpoints. Provide a token if needed.\n\n"
        "*Local GPU:* loads the model into the Space with `TransformersModel (device_map='auto')`. "
        "Ensure your Space has a GPU and enough VRAM for the selected model."
    )

    run_btn = gr.Button("Run Agent 🚀")

    # Outputs of run_pipeline: (final_json, transcript).
    final_json = gr.Code(label="Final Candidate (JSON if detected)", language="json")
    transcript = gr.Textbox(label="Agent Transcript", lines=18)

    def _toggle_backend_fields(choice):
        """Show the token and timeout fields only for the "Hosted API" backend.

        Returns one update per entry of `outputs` in the `.change(...)`
        registration below, in the same order.
        """
        return (
            gr.update(visible=(choice == "Hosted API")),  # hf_token
            gr.update(visible=True),                      # model_id always visible
            gr.update(visible=(choice == "Hosted API"))   # timeout slider
        )

    backend_choice.change(
        _toggle_backend_fields,
        inputs=[backend_choice],
        outputs=[hf_token, model_id, timeout]
    )

    # The order of `inputs` must match run_pipeline's positional parameters.
    run_btn.click(
        fn=run_pipeline,
        inputs=[
            backend_choice, hf_token, topic, grade, subject,
            target_bloom, target_dok, attempts, model_id,
            timeout, temperature, max_tokens
        ],
        outputs=[final_json, transcript]
    )

if __name__ == "__main__" or os.getenv("SYSTEM") == "spaces":
    # Optional warmup of the local model. `backend_choice` and `model_id` are
    # Gradio components at this point, so their configured defaults are read
    # through `.value`; the component object itself is not a model id.
    # The warmup is skipped when the default backend is the hosted API, so
    # startup does not load a model that the default configuration never uses.
    if backend_choice.value == "Local GPU":
        try:
            get_local_model_gpu(model_id.value)
        except Exception as e:
            # don't crash the app if warmup fails; logs will show details
            print("Warmup failed:", e)

    demo.launch()