feat(models): Added three new models
ServiceNow-AI/Apriel-1.5-15b-Thinker (15B, multimodal reasoning)
ServiceNow-AI/Apriel-5B-Instruct (5B, instruction-tuned)
ai21labs/AI21-Jamba-Reasoning-3B (3B, hybrid Transformer–Mamba with 256K context)

Reordered the entire MODELS dictionary from largest (~15B) to smallest (~135M).
Grouped models into logical size-based sections, with descriptions updated from official Hugging Face metadata.
Enhanced model descriptions with key capabilities, benchmarks, and usage context from the source model cards.
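Every entry in the reordered dictionary keeps the same two-field shape (repo_id, description), so callers only need the display-name key to resolve a checkpoint. A minimal sketch of that lookup, assuming the MODELS dict defined in app.py below; the helper and variable names here are illustrative and not part of this commit:

# Hypothetical helper (not in this commit): resolve a display name from MODELS
# to its Hugging Face repo_id, e.g. inside a dropdown callback.
def resolve_repo_id(model_name: str) -> str:
    try:
        return MODELS[model_name]["repo_id"]
    except KeyError as exc:
        raise ValueError(f"Unknown model: {model_name!r}") from exc

# Python dicts preserve insertion order, so iterating MODELS yields the new
# largest-to-smallest ordering directly, e.g. for UI choices:
dropdown_choices = list(MODELS.keys())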
app.py CHANGED
@@ -26,117 +26,189 @@ cancel_event = threading.Event()
(removed: the previous MODELS entries, old lines 29–140, which listed the earlier model catalogue ordered roughly smallest to largest; superseded by the regrouped dictionary below)
# Torch-Compatible Model Definitions with Adjusted Descriptions
# ------------------------------
MODELS = {
    # Models with 14B+ parameters
    "Apriel-1.5-15b-Thinker": {
        "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
        "description": "A 15B multimodal reasoning model from ServiceNow’s Apriel series. Achieves SOTA performance on text and image reasoning (52 on the Artificial Analysis index, 68 on Tau2 Bench Telecom, 62 on IFBench) despite undergoing only text SFT, with no image fine-tuning. Fits on a single GPU and competes with models 10× its size like DeepSeek R1 and Gemini-Flash."
    },
    "Qwen3-14B": {
        "repo_id": "Qwen/Qwen3-14B",
        "description": "Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."
    },
    "Qwen/Qwen3-14B-FP8": {
        "repo_id": "Qwen/Qwen3-14B-FP8",
        "description": "FP8-quantized version of Qwen3-14B for efficient inference."
    },

    # Models with ~5B parameters
    "Apriel-5B-Instruct": {
        "repo_id": "ServiceNow-AI/Apriel-5B-Instruct",
        "description": "A 5B-parameter instruction-tuned model from ServiceNow’s Apriel series, optimized for enterprise tasks and general-purpose instruction following."
    },

    # Models with 4B–4.3B parameters
    "Qwen3-4B": {
        "repo_id": "Qwen/Qwen3-4B",
        "description": "Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning."
    },
    "Gemma-3-4B-IT": {
        "repo_id": "unsloth/gemma-3-4b-it",
        "description": "Gemma-3-4B-IT"
    },
    "MiniCPM3-4B": {
        "repo_id": "openbmb/MiniCPM3-4B",
        "description": "MiniCPM3-4B"
    },
    "Gemma-3n-E4B": {
        "repo_id": "google/gemma-3n-E4B",
        "description": "Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)"
    },
    "Phi-4-mini-Reasoning": {
        "repo_id": "microsoft/Phi-4-mini-reasoning",
        "description": "Phi-4-mini-Reasoning (4.3B parameters)"
    },
    "Phi-4-mini-Instruct": {
        "repo_id": "microsoft/Phi-4-mini-instruct",
        "description": "Phi-4-mini-Instruct (4.3B parameters)"
    },
    "SmallThinker-4BA0.6B-Instruct": {
        "repo_id": "PowerInfer/SmallThinker-4BA0.6B-Instruct",
        "description": "SmallThinker 4 B backbone with 0.6 B activated parameters, instruction-tuned"
    },

    # Models with ~3B parameters
    "AI21-Jamba-Reasoning-3B": {
        "repo_id": "ai21labs/AI21-Jamba-Reasoning-3B",
        "description": "A compact 3B hybrid Transformer–Mamba reasoning model with 256K context length, strong intelligence benchmark scores (61% MMLU-Pro, 52% IFBench), and efficient inference suitable for edge and datacenter use. Outperforms Gemma-3 4B and Llama-3.2 3B despite its smaller size."
    },
    "Qwen2.5-Taiwan-3B-Reason-GRPO": {
        "repo_id": "benchang1110/Qwen2.5-Taiwan-3B-Reason-GRPO",
        "description": "Qwen2.5-Taiwan model with 3 B parameters, Reason-GRPO fine-tuned"
    },
    "Llama-3.2-Taiwan-3B-Instruct": {
        "repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct",
        "description": "Llama-3.2-Taiwan-3B-Instruct"
    },
    "Qwen2.5-3B-Instruct": {
        "repo_id": "Qwen/Qwen2.5-3B-Instruct",
        "description": "Qwen2.5-3B-Instruct"
    },
    "Qwen2.5-Omni-3B": {
        "repo_id": "Qwen/Qwen2.5-Omni-3B",
        "description": "Qwen2.5-Omni-3B"
    },

    # Models with 2.6B parameters
    "LFM2-2.6B": {
        "repo_id": "LiquidAI/LFM2-2.6B",
        "description": "The 2.6B-parameter model in the LFM2 series; it outperforms models in the 3B+ class and features a hybrid architecture for faster inference."
    },

    # Models with 1.7B–2B parameters
    "Qwen3-1.7B": {
        "repo_id": "Qwen/Qwen3-1.7B",
        "description": "Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. the 0.6 B variant, dual-mode inference, instruction following across 100+ languages."
    },
    "Gemma-3n-E2B": {
        "repo_id": "google/gemma-3n-E2B",
        "description": "Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)"
    },

    # Models with 1B–1.5B parameters
    "Nemotron-Research-Reasoning-Qwen-1.5B": {
        "repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
        "description": "Nemotron-Research-Reasoning-Qwen-1.5B"
    },
    "Falcon-H1-1.5B-Instruct": {
        "repo_id": "tiiuae/Falcon-H1-1.5B-Instruct",
        "description": "Falcon-H1 model with 1.5 B parameters, instruction-tuned"
    },
    "Qwen2.5-Taiwan-1.5B-Instruct": {
        "repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct",
        "description": "Qwen2.5-Taiwan-1.5B-Instruct"
    },
    "LFM2-1.2B": {
        "repo_id": "LiquidAI/LFM2-1.2B",
        "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks."
    },
    "Taiwan-ELM-1_1B-Instruct": {
        "repo_id": "liswei/Taiwan-ELM-1_1B-Instruct",
        "description": "Taiwan-ELM-1_1B-Instruct"
    },
    "Llama-3.2-Taiwan-1B": {
        "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
        "description": "Llama-3.2-Taiwan base model with 1 B parameters"
    },

    # Models with 700M–360M parameters
    "LFM2-700M": {
        "repo_id": "LiquidAI/LFM2-700M",
        "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions."
    },
    "Qwen3-0.6B": {
        "repo_id": "Qwen/Qwen3-0.6B",
        "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."
    },
    "Qwen3-0.6B-Taiwan": {
        "repo_id": "ShengweiPeng/Qwen3-0.6B-Taiwan",
        "description": "Qwen3-Taiwan model with 0.6 B parameters"
    },
    "Qwen2.5-0.5B-Taiwan-Instruct": {
        "repo_id": "ShengweiPeng/Qwen2.5-0.5B-Taiwan-Instruct",
        "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned"
    },
    "SmolLM2-360M-Instruct": {
        "repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
        "description": "Original SmolLM2-360M Instruct"
    },
    "SmolLM2-360M-Instruct-TaiwanChat": {
        "repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat",
        "description": "SmolLM2-360M Instruct fine-tuned on TaiwanChat"
    },
    "LFM2-350M": {
        "repo_id": "LiquidAI/LFM2-350M",
        "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3."
    },

    # Models with ~270M parameters
    "parser_model_ner_gemma_v0.1": {
        "repo_id": "myfi/parser_model_ner_gemma_v0.1",
        "description": "A lightweight named-entity-recognition (NER) parser fine-tuned from Google’s Gemma-3-270M model. The base Gemma-3-270M is a 270 M-parameter, hyper-efficient LLM designed for on-device inference, supporting >140 languages, a 128 k-token context window, and instruction-following capabilities. This variant is further trained on standard NER corpora (e.g., CoNLL-2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form). It is released under the Apache-2.0 license and can be used for fast, cost-effective entity extraction in low-resource environments."
    },
    "Gemma-3-Taiwan-270M-it": {
        "repo_id": "lianghsun/Gemma-3-Taiwan-270M-it",
        "description": "google/gemma-3-270m-it fine-tuned on a Taiwan Chinese dataset"
    },
    "gemma-3-270m-it": {
        "repo_id": "google/gemma-3-270m-it",
        "description": "Gemma-3-270M-IT is a compact, 270-million-parameter instruction-tuned model offering fast and efficient on-device text generation and comprehension."
    },
    "Taiwan-ELM-270M-Instruct": {
        "repo_id": "liswei/Taiwan-ELM-270M-Instruct",
        "description": "Taiwan-ELM-270M-Instruct"
    },

    # Models with ~135M parameters
    "SmolLM2-135M-multilingual-base": {
        "repo_id": "agentlans/SmolLM2-135M-multilingual-base",
        "description": "SmolLM2-135M-multilingual-base"
    },
    "SmolLM-135M-Taiwan-Instruct-v1.0": {
        "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks"
    },
    "SmolLM2_135M_Grpo_Gsm8k": {
        "repo_id": "prithivMLmods/SmolLM2_135M_Grpo_Gsm8k",
        "description": "SmolLM2_135M_Grpo_Gsm8k"
    },
    "SmolLM2-135M-Instruct": {
        "repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct",
        "description": "Original SmolLM2-135M Instruct"
    },
    "SmolLM2-135M-Instruct-TaiwanChat": {
        "repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat",
        "description": "SmolLM2-135M Instruct fine-tuned on TaiwanChat"
    },
}

# Global cache for pipelines to avoid re-loading.
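The closing comment refers to a global pipeline cache; a minimal sketch of that pattern with transformers, assuming a get_pipeline helper keyed by the MODELS entry (names and arguments here are illustrative, not necessarily the app's actual implementation):

from transformers import pipeline

PIPELINES = {}  # repo_id -> loaded pipeline, so each checkpoint is loaded only once

def get_pipeline(model_name: str):
    """Load a text-generation pipeline for a MODELS entry and cache it."""
    repo_id = MODELS[model_name]["repo_id"]
    if repo_id not in PIPELINES:
        PIPELINES[repo_id] = pipeline(
            "text-generation",
            model=repo_id,
            torch_dtype="auto",   # let transformers pick bf16/fp16/fp32
            device_map="auto",    # place weights on GPU when available
        )
    return PIPELINES[repo_id]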