import os, sys, importlib

# Disable Spaces ZeroGPU by env and by calling its disable/unpatch if preloaded
os.environ["SPACES_ZERO_DISABLED"] = "1"

def _hard_disable_spaces_zero():
    # Hit common modules and try disable/unpatch/deactivate if present
    candidates = [
        "spaces.zero",
        "spaces.zero.torch.patching",
        "spaces.zero.torch",
        "spaces.zero.patch",
        "spaces.zero.patching",
    ]
    for modname in candidates:
        try:
            m = sys.modules.get(modname) or importlib.import_module(modname)
        except Exception:
            continue
        for attr in ("disable", "unpatch", "deactivate"):
            fn = getattr(m, attr, None)
            if callable(fn):
                try:
                    fn()
                except Exception:
                    pass

_hard_disable_spaces_zero()

# Force Transformers to use eager attention globally (affects all future loads)
os.environ["TRANSFORMERS_ATTENTION_IMPLEMENTATION"] = "eager"

# Prefer simple math SDP kernels (avoid vmap-heavy paths).
# Note: torch.backends.cuda.sdp_kernel() is a context manager, so calling it
# directly has no effect; use the global enable_*_sdp toggles instead.
try:
    import torch
    torch.backends.cuda.enable_math_sdp(True)
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
except Exception:
    pass

# ------------------------------------------------------------------------------

import json
import pickle
from pathlib import Path

import gradio as gr
from huggingface_hub import InferenceClient, hf_hub_download, login, snapshot_download
from langchain_community.embeddings import HuggingFaceEmbeddings
# from llama_index.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import StorageContext, load_index_from_storage
from smolagents import CodeAgent, InferenceClientModel, tool

login(os.getenv("HF_Token"))

# from all_datasets import *
from level_classifier_tool_2 import (
    classify_levels_phrases,
    HFEmbeddingBackend,
    build_phrase_index,
)
from task_temp import rag_temp, rag_cls_temp, cls_temp, gen_temp
from all_tools import classify_and_score, QuestionRetrieverTool, set_classifier_state, set_retrieval_index
from phrases import BLOOMS_PHRASES, DOK_PHRASES

# ------------------------ Prebuild embeddings once ------------------------
_backend = HFEmbeddingBackend(model_name="google/embeddinggemma-300m")

# Belt-and-suspenders: ensure eager attention even if the class wasn't patched
try:
    _backend.MODEL.config.attn_implementation = "eager"
except Exception:
    pass

_BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
_DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)

DATASET_REPO = "bhardwaj08sarthak/rag-index"  # your dataset repo id
PERSIST_SUBDIR = "index_store"                # folder inside the dataset

# Writable cache base (home or /tmp)
def _pick_writable_base() -> Path:
    for base in (Path.home(), Path("/tmp")):
        try:
            base.mkdir(parents=True, exist_ok=True)
            test = base / ".write_test"
            test.write_text("ok")
            test.unlink(missing_ok=True)
            return base
        except Exception:
            continue
    return Path.cwd()

WRITABLE_BASE = _pick_writable_base()
LOCAL_BASE = WRITABLE_BASE / "my_app_cache" / "index"
LOCAL_BASE.mkdir(parents=True, exist_ok=True)

# Download only the persisted index folder
snapshot_download(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    local_dir=str(LOCAL_BASE),
    local_dir_use_symlinks=False,
)

# Resolve the actual persist dir by finding docstore.json
def _resolve_persist_dir(base: Path, subdir: str) -> Path:
    # Common candidates
    candidates = [
        base / subdir,  # <LOCAL_BASE>/index_store
        base,           # sometimes files land directly under the local base
    ]
    for c in candidates:
        if (c / "docstore.json").exists():
            return c
    # Search anywhere under base for docstore.json
    matches = list(base.rglob("docstore.json"))
    if matches:
        return matches[0].parent
    # Nothing found: report what we actually downloaded
    tree = "\n".join(str(p.relative_to(base)) for p in base.rglob("*") if p.is_file())
    raise FileNotFoundError(
        f"Could not find 'docstore.json' under {base}. "
        f"Expected '{subdir}/docstore.json'. Downloaded files:\n{tree}"
    )

persist_dir = _resolve_persist_dir(Path(LOCAL_BASE), PERSIST_SUBDIR)

# Sanity-check typical LlamaIndex files (names may vary by version/vector store)
expected = ["docstore.json", "index_store.json", "vector_store.json"]
missing = [name for name in expected if not (persist_dir / name).exists()]
if missing:
    # Not fatal for every setup, but warn loudly so you know if the upload was incomplete
    print(f"[warn] Missing in {persist_dir}: {missing}. If loading fails, re-upload the full '{PERSIST_SUBDIR}' folder.")

# Pick a device that exists for embeddings
try:
    import torch
    _emb_device = "cuda" if torch.cuda.is_available() else "cpu"
except Exception:
    _emb_device = "cpu"

emb = HuggingFaceEmbeddings(
    model_name="google/embeddinggemma-300m",
    model_kwargs={"device": _emb_device},  # optionally add "attn_implementation": "eager"
    encode_kwargs={"normalize_embeddings": True},
)

# Finally load the index
storage_context = StorageContext.from_defaults(persist_dir=str(persist_dir))
index = load_index_from_storage(storage_context, embed_model=emb)

set_classifier_state(_backend, _BLOOM_INDEX, _DOK_INDEX)
set_retrieval_index(index)

# Datasets & GPU build code remains commented out...
# @spaces.GPU(15)
# def build_indexes_on_gpu(model="google/embeddinggemma-300m"):
#     device = "cuda"
#     emb = HuggingFaceEmbeddings(
#         model_name=model,
#         model_kwargs={"device": device, "attn_implementation": "eager"},
#         encode_kwargs={"normalize_embeddings": True},
#     )
#     idx = VectorStoreIndex.from_documents([Document(text=t) for t in texts], embed_model=emb)
#     return idx

TASK_TEMPLATES = {
    "rag_temp": rag_temp,
    "rag_cls_temp": rag_cls_temp,
    "cls_temp": cls_temp,
    "gen_temp": gen_temp,
}

# ------------------------ Agent setup with timeout ------------------------
def make_agent(hf_token: str, model_id: str, provider: str, timeout: int, temperature: float, max_tokens: int):
    client = InferenceClient(
        model=model_id,
        provider=provider,
        timeout=timeout,
        token=hf_token if hf_token else None,
    )
    # Note: generation params (temperature, max_tokens) are not bound to the model here;
    # smolagents' InferenceClientModel currently accepts the client only, so runtime params are passed in the task text.
    model = InferenceClientModel(model_id=model_id, client=client)
    agent = CodeAgent(model=model, tools=[classify_and_score, QuestionRetrieverTool])
    agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}  # attach for reference
    return agent

# ------------------------ Gradio glue ------------------------------------
def run_pipeline(
    hf_token, topic, grade, subject, target_bloom, target_dok, attempts,
    model_id, provider, timeout, temperature, max_tokens, task_type
):
    # Build the agent per run (or cache it if you prefer; see the optional caching sketch below)
    agent = make_agent(
        hf_token=hf_token.strip(),
        model_id=model_id,
        provider=provider,
        timeout=int(timeout),
        temperature=float(temperature),
        max_tokens=int(max_tokens),
    )
    template = TASK_TEMPLATES[task_type]
    task = template.format(
        grade=grade,
        topic=topic,
        subject=subject,
        target_bloom=target_bloom,
        target_dok=target_dok,
        attempts=int(attempts),
    )

    # The agent will internally call the tools
    try:
        result_text = agent.run(task, max_steps=int(attempts) * 4)
    except Exception as e:
        result_text = f"ERROR: {e}"

    # Try to extract the final JSON object from the transcript
    # (simple heuristic: take the span between the first '{' and the last '}')
    final_json = ""
    try:
        start = result_text.find("{")
        end = result_text.rfind("}")
        if start != -1 and end != -1 and end > start:
            candidate = result_text[start:end + 1]
            final_json = json.dumps(json.loads(candidate), indent=2)
    except Exception:
        final_json = ""

    return final_json, result_text

with gr.Blocks() as demo:
    gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
    gr.Markdown(
        "This app uses a **CodeAgent** that *calls the scoring tool* "
        "(`classify_and_score`) after each proposal, and revises until it hits the target."
    )
    with gr.Accordion("API Settings", open=False):
        hf_token = gr.Textbox(label="Hugging Face Token (required)", type="password")
        model_id = gr.Textbox(value="meta-llama/Llama-4-Scout-17B-16E-Instruct", label="Model ID")
        provider = gr.Textbox(value="novita", label="Provider")
        timeout = gr.Slider(5, 120, value=30, step=1, label="Timeout (s)")

    with gr.Row():
        topic = gr.Textbox(value="Fractions", label="Topic")
        grade = gr.Dropdown(
            choices=["Grade 1", "Grade 2", "Grade 3", "Grade 4", "Grade 5", "Grade 6", "Grade 7",
                     "Grade 8", "Grade 9", "Grade 10", "Grade 11", "Grade 12",
                     "Under Graduate", "Post Graduate"],
            value="Grade 7",
            label="Grade",
        )
        subject = gr.Textbox(value="Math", label="Subject")
        task_type = gr.Dropdown(
            choices=[("RAG Template", "rag_temp"),
                     ("RAG+CLS Template", "rag_cls_temp"),
                     ("Classification Template", "cls_temp"),
                     ("Generation Template", "gen_temp")],
            label="Task Type",
        )

    with gr.Row():
        target_bloom = gr.Dropdown(
            choices=["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create",
                     "Apply+", "Analyze+", "Evaluate+"],
            value="Analyze",
            label="Target Bloom’s",
        )
        target_dok = gr.Dropdown(
            choices=["DOK1", "DOK2", "DOK3", "DOK4", "DOK1-DOK2", "DOK2-DOK3", "DOK3-DOK4"],
            value="DOK2-DOK3",
            label="Target DOK",
        )
        attempts = gr.Slider(1, 8, value=5, step=1, label="Max Attempts")

    with gr.Accordion("Generation Controls", open=False):
        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
        max_tokens = gr.Slider(64, 1024, value=300, step=16, label="Max Tokens")

    run_btn = gr.Button("Run Agent")
    final_json = gr.Code(label="Final Candidate (JSON if detected)", language="json")
    transcript = gr.Textbox(label="Agent Transcript", lines=18)

    run_btn.click(
        fn=run_pipeline,
        inputs=[hf_token, topic, grade, subject, target_bloom, target_dok, attempts,
                model_id, provider, timeout, temperature, max_tokens, task_type],
        outputs=[final_json, transcript],
    )
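
# Optional: a minimal sketch of caching agents between runs instead of rebuilding
# one on every click (run_pipeline() above currently calls make_agent() each time).
# The _AGENT_CACHE dict and get_cached_agent() helper below are illustrative
# assumptions, not part of the app's current flow; wire them into run_pipeline()
# only if per-click agent construction becomes a bottleneck.
_AGENT_CACHE: dict = {}

def get_cached_agent(hf_token: str, model_id: str, provider: str, timeout: int,
                     temperature: float, max_tokens: int):
    # Key on everything that changes the agent's behaviour; the token itself is
    # not stored in the key, only whether one was supplied.
    key = (model_id, provider, int(timeout), float(temperature), int(max_tokens), bool(hf_token))
    if key not in _AGENT_CACHE:
        _AGENT_CACHE[key] = make_agent(hf_token, model_id, provider,
                                       int(timeout), float(temperature), int(max_tokens))
    return _AGENT_CACHE[key]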
if __name__ == "__main__":
    demo.launch(share=True)