import os import json import gradio as gr import spaces from huggingface_hub import InferenceClient from smolagents import CodeAgent, InferenceClientModel, tool from level_classifier_tool import ( classify_levels_phrases, HFEmbeddingBackend, build_phrase_index ) # ------------------------ Taxonomy phrases ------------------------ BLOOMS_PHRASES = { "Remember": [ "define", "list", "recall", "identify", "state", "label", "name", "recognize", "find", "select", "match", "choose", "give", "write", "tell", "show" ], "Understand": [ "classify", "interpret", "summarize", "explain", "estimate", "describe", "discuss", "predict", "paraphrase", "restate", "illustrate", "compare", "contrast", "report" ], "Apply": [ "apply", "solve", "use", "demonstrate", "calculate", "implement", "perform", "execute", "carry out", "practice", "employ", "sketch" ], "Analyze": [ "analyze", "differentiate", "organize", "structure", "break down", "distinguish", "dissect", "examine", "compare", "contrast", "attribute", "investigate" ], "Evaluate": [ "evaluate", "judge", "critique", "assess", "defend", "argue", "select", "support", "appraise", "recommend", "conclude", "review" ], "Create": [ "create", "design", "compose", "plan", "construct", "produce", "devise", "generate", "develop", "formulate", "invent", "build" ] } DOK_PHRASES = { "DOK1": [ "define", "list", "recall", "compute", "identify", "state", "label", "how many", "name", "recognize", "find", "determine", "select", "match", "choose", "give", "write", "tell", "show", "point out" ], "DOK2": [ "classify", "interpret", "estimate", "organise", "summarise", "explain", "solve", "categorize", "group", "compare", "contrast", "distinguish", "make observations", "collect data", "display data", "arrange", "sort", "paraphrase", "restate", "predict", "approximate", "demonstrate", "illustrate", "describe", "analyze data" ], "DOK3": [ "justify", "analyze", "generalise", "compare", "construct", "investigate", "support", "defend", "argue", "examine", "differentiate", "criticize", "debate", "test", "experiment", "hypothesize", "draw conclusions", "break down", "dissect", "probe", "explore", "develop", "formulate" ], "DOK4": [ "design", "synthesize", "model", "prove", "evaluate system", "critique", "create", "compose", "plan", "invent", "devise", "generate", "build", "construct", "produce", "formulate", "improve", "revise", "assess", "appraise", "judge", "recommend", "predict outcome", "simulate" ] } # ------------------------ Prebuild embeddings once ------------------------ _backend = HFEmbeddingBackend(model_name="sentence-transformers/all-MiniLM-L6-v2") _BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES) _DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES) # ------------------------ Tool: classify and score ------------------------ @tool def classify_and_score( question: str, target_bloom: str, target_dok: str, agg: str = "max" ) -> dict: """Classify a question against Bloom’s and DOK targets and return guidance.""" res = classify_levels_phrases( question, BLOOMS_PHRASES, DOK_PHRASES, backend=_backend, prebuilt_bloom_index=_BLOOM_INDEX, prebuilt_dok_index=_DOK_INDEX, agg=agg, return_phrase_matches=True ) def _parse_target_bloom(t: str): order = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"] if t.endswith("+"): base = t[:-1] if base not in order: raise ValueError(f"Invalid Bloom target '{t}'") return set(order[order.index(base):]) if t not in order: raise ValueError(f"Invalid Bloom target '{t}'") return {t} def _parse_target_dok(t: str): order = ["DOK1", "DOK2", "DOK3", "DOK4"] if "-" in t: lo, hi = t.split("-") if lo not in order or hi not in order or order.index(lo) > order.index(hi): raise ValueError(f"Invalid DOK range '{t}'") return set(order[order.index(lo):order.index(hi) + 1]) if t not in order: raise ValueError(f"Invalid DOK target '{t}'") return {t} try: bloom_target_set = _parse_target_bloom(target_bloom) dok_target_set = _parse_target_dok(target_dok) except Exception as e: return { "ok": False, "measured": {}, "feedback": ( f"Invalid targets: {e}. Use Bloom in " "{Remember, Understand, Apply, Analyze, Evaluate, Create} " "and DOK in {DOK1..DOK4} or ranges like 'DOK2-DOK3'." ), } bloom_best = res["blooms"]["best_level"] dok_best = res["dok"]["best_level"] bloom_ok = bloom_best in bloom_target_set dok_ok = dok_best in dok_target_set top_bloom_phrases = res["blooms"].get("top_phrases", {}) top_dok_phrases = res["dok"].get("top_phrases", {}) feedback_parts = [] if not bloom_ok: feedback_parts.append( f"Shift Bloom’s from {bloom_best} toward {sorted(list(bloom_target_set))}. " f"Top cues: {top_bloom_phrases.get(bloom_best, [])[:3]}" ) if not dok_ok: feedback_parts.append( f"Shift DOK from {dok_best} toward {sorted(list(dok_target_set))}. " f"Top cues: {top_dok_phrases.get(dok_best, [])[:3]}" ) return { "ok": bool(bloom_ok and dok_ok), "measured": { "bloom_best": bloom_best, "bloom_scores": res["blooms"]["scores"], "dok_best": dok_best, "dok_scores": res["dok"]["scores"], }, "feedback": " ".join(feedback_parts) if feedback_parts else "On target.", } # ------------------------ Backend selection + GPU-wrapped local loader ------------------------ _LOCAL_MODEL_CACHE = {"model": None, "model_id": None} @spaces.GPU(duration=30) # request GPU only when loading/using local model def get_local_model_gpu(model_id: str): """ Load and cache a local Transformers model for smolagents on GPU. Decorated so Spaces knows this task needs a GPU. """ # Import here to keep Hosted mode lightweight. try: from smolagents import TransformersModel # provided by smolagents except Exception as e: raise RuntimeError( "Local backend requires 'TransformersModel' from smolagents. " "Please ensure your smolagents version provides it." ) from e if ( _LOCAL_MODEL_CACHE["model"] is not None and _LOCAL_MODEL_CACHE["model_id"] == model_id ): return _LOCAL_MODEL_CACHE["model"] local_model = TransformersModel( model_id=model_id, device_map="auto" # lets accelerate pick the best device(s) ) _LOCAL_MODEL_CACHE["model"] = local_model _LOCAL_MODEL_CACHE["model_id"] = model_id return local_model def make_agent( backend_choice: str, # "Hosted API" | "Local GPU" hf_token: str, model_id: str, timeout: int, temperature: float, max_tokens: int ): if backend_choice == "Local GPU": # This call is GPU-annotated; Spaces will allocate a GPU for it. model = get_local_model_gpu(model_id) else: client = InferenceClient( model=model_id, timeout=timeout, token=(hf_token or None), ) model = InferenceClientModel(client=client) agent = CodeAgent(model=model, tools=[classify_and_score]) agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens} return agent # ------------------------ Agent task template ----------------------------- TASK_TMPL = '''You generate {subject} question candidates for {grade} on "{topic}". After you propose a candidate, you MUST immediately call: classify_and_score( question=, target_bloom="{target_bloom}", target_dok="{target_dok}", agg="max" ) Use the returned dict: - If ok == True: print ONLY compact JSON {{"question": "...", "answer": "...", "reasoning": "..."}} and finish. - If ok == False: briefly explain the needed shift, revise the question, and call classify_and_score again. Repeat up to {attempts} attempts. Keep answers concise. Additionally, when you call classify_and_score, pass the exact question text you propose. If you output JSON, ensure it is valid JSON (no trailing commas, use double quotes). ''' # ------------------------ Utility: robust JSON extractor ------------------ def extract_top_level_json(s: str) -> str: start = s.find("{") if start == -1: return "" depth = 0 for i in range(start, len(s)): ch = s[i] if ch == "{": depth += 1 elif ch == "}": depth -= 1 if depth == 0: candidate = s[start:i + 1] try: json.loads(candidate) # validate return candidate except Exception: return "" return "" # ------------------------ Pipeline --------------------------------------- def run_pipeline( backend_choice, hf_token, topic, grade, subject, target_bloom, target_dok, attempts, model_id, timeout, temperature, max_tokens ): try: agent = make_agent( backend_choice=backend_choice, hf_token=(hf_token or "").strip(), model_id=model_id, timeout=int(timeout), temperature=float(temperature), max_tokens=int(max_tokens), ) except Exception as e: err = f"ERROR initializing backend '{backend_choice}': {e}" return "", err task = TASK_TMPL.format( grade=grade, topic=topic, subject=subject, target_bloom=target_bloom, target_dok=target_dok, attempts=int(attempts) ) try: result_text = agent.run(task, max_steps=int(attempts) * 4) except Exception as e: result_text = f"ERROR while running the agent: {e}" final_json = "" candidate = extract_top_level_json(result_text or "") if candidate: try: final_json = json.dumps(json.loads(candidate), indent=2) except Exception: final_json = "" return final_json, result_text # ------------------------ Gradio UI -------------------------------------- with gr.Blocks() as demo: gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty") gr.Markdown( "Use a **CodeAgent** that calls the scoring tool (`classify_and_score`) after each proposal, " "and revises until it hits your Bloom/DOK target." ) with gr.Accordion("API / Backend Settings", open=True): backend_choice = gr.Radio( choices=["Hosted API", "Local GPU"], value="Hosted API", label="Inference Backend" ) with gr.Row(): hf_token = gr.Textbox( label="Hugging Face Token (required for private/hosted endpoints)", type="password", visible=True ) model_id = gr.Textbox( value="swiss-ai/Apertus-70B-Instruct-2509", label="Model ID (repo id for Hosted, or local repo for GPU)" ) timeout = gr.Slider(5, 120, value=30, step=1, label="Timeout (s, Hosted API only)") with gr.Row(): topic = gr.Textbox(value="Fractions", label="Topic") grade = gr.Dropdown( choices=[ "Grade 1", "Grade 2", "Grade 3", "Grade 4", "Grade 5", "Grade 6", "Grade 7", "Grade 8", "Grade 9", "Grade 10", "Grade 11", "Grade 12", "Under Graduate", "Post Graduate" ], value="Grade 7", label="Grade" ) subject = gr.Textbox(value="Math", label="Subject") with gr.Row(): target_bloom = gr.Dropdown( choices=["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"], value="Analyze", label="Target Bloom’s" ) target_dok = gr.Dropdown( choices=["DOK1", "DOK2", "DOK3", "DOK4", "DOK1-DOK2", "DOK2-DOK3", "DOK3-DOK4"], value="DOK2-DOK3", label="Target Depth of Knowledge" ) attempts = gr.Slider(1, 8, value=5, step=1, label="Max Attempts") with gr.Accordion("⚙️ Generation Controls", open=False): temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature") max_tokens = gr.Slider(64, 1024, value=300, step=16, label="Max Tokens") backend_tips = gr.Markdown( "*Hosted API:* uses Hugging Face Inference endpoints. Provide a token if needed.\n\n" "*Local GPU:* loads the model into the Space with `TransformersModel (device_map='auto')`. " "Ensure your Space has a GPU and enough VRAM for the selected model." ) run_btn = gr.Button("Run Agent 🚀") final_json = gr.Code(label="Final Candidate (JSON if detected)", language="json") transcript = gr.Textbox(label="Agent Transcript", lines=18) def _toggle_backend_fields(choice): return ( gr.update(visible=(choice == "Hosted API")), # hf_token gr.update(visible=True), # model_id always visible gr.update(visible=(choice == "Hosted API")) # timeout slider ) backend_choice.change( _toggle_backend_fields, inputs=[backend_choice], outputs=[hf_token, model_id, timeout] ) run_btn.click( fn=run_pipeline, inputs=[ backend_choice, hf_token, topic, grade, subject, target_bloom, target_dok, attempts, model_id, timeout, temperature, max_tokens ], outputs=[final_json, transcript] ) if __name__ == "__main__" or os.getenv("SYSTEM") == "spaces": demo.launch()