Luigi committed
Commit 8eefe94 · 1 Parent(s): ac20174

add 5 models from liquid ai

Files changed (1): app.py (+67 -54)
app.py CHANGED
@@ -26,9 +26,17 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
-    # your existing entries …
-    "Qwen2.5-Taiwan-1.5B-Instruct": {"repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct", "description": "Qwen2.5-Taiwan-1.5B-Instruct"},
+    # Models with ~135M parameters
     "SmolLM2-135M-multilingual-base": {"repo_id": "agentlans/SmolLM2-135M-multilingual-base", "description": "SmolLM2-135M-multilingual-base"},
+    "SmolLM-135M-Taiwan-Instruct-v1.0": {
+        "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
+        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks"
+    },
+    "SmolLM2_135M_Grpo_Gsm8k":{"repo_id":"prithivMLmods/SmolLM2_135M_Grpo_Gsm8k", "description":"SmolLM2_135M_Grpo_Gsm8k"},
+    "SmolLM2-135M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct", "description": "Original SmolLM2‑135M Instruct"},
+    "SmolLM2-135M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat", "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat"},
+
+    # Models with ~270M parameters
     "parser_model_ner_gemma_v0.1": {
         "repo_id": "myfi/parser_model_ner_gemma_v0.1",
         "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments."
@@ -38,17 +46,18 @@ MODELS = {
         "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset"
     },
     "gemma-3-270m-it":{
-        "repo_id":"google/gemma-3-270m-it",
-        "description":"Gemma‑3‑270M‑IT is a compact, 270‑million‑parameter language model fine‑tuned for Italian, offering fast and efficient on‑device text generation and comprehension in the Italian language.",
-    },
-    "SmolLM-135M-Taiwan-Instruct-v1.0": {
-        "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
-        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks"
+        "repo_id":"google/gemma-3-270m-it",
+        "description":"Gemma‑3‑270M‑IT is a compact, 270‑million‑parameter language model fine‑tuned for Italian, offering fast and efficient on‑device text generation and comprehension in the Italian language.",
     },
-    "Llama-3.2-Taiwan-1B": {
-        "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
-        "description": "Llama-3.2-Taiwan base model with 1 B parameters"
+    "Taiwan-ELM-270M-Instruct": {"repo_id": "liswei/Taiwan-ELM-270M-Instruct", "description": "Taiwan-ELM-270M-Instruct"},
+
+    # Models with 350M-700M parameters
+    "LFM2-350M": {
+        "repo_id": "LiquidAI/LFM2-350M",
+        "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3."
     },
+    "SmolLM2-360M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat", "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat"},
+    "SmolLM2-360M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct", "description": "Original SmolLM2‑360M Instruct"},
     "Qwen2.5-0.5B-Taiwan-Instruct": {
         "repo_id": "ShengweiPeng/Qwen2.5-0.5B-Taiwan-Instruct",
         "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned"
@@ -57,73 +66,77 @@ MODELS = {
         "repo_id": "ShengweiPeng/Qwen3-0.6B-Taiwan",
         "description": "Qwen3-Taiwan model with 0.6 B parameters"
     },
-
-    "Qwen2.5-Taiwan-3B-Reason-GRPO": {
-        "repo_id": "benchang1110/Qwen2.5-Taiwan-3B-Reason-GRPO",
-        "description":"Qwen2.5-Taiwan model with 3 B parameters, Reason-GRPO fine-tuned"
+    "Qwen3-0.6B": {"repo_id":"Qwen/Qwen3-0.6B","description":"Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."},
+    "LFM2-700M": {
+        "repo_id": "LiquidAI/LFM2-700M",
+        "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions."
     },
+
+    # Models with 1B-2B parameters
     "Llama-3.2-Taiwan-1B": {
         "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
         "description":"Llama-3.2-Taiwan base model with 1 B parameters"
     },
-
-
-
-    # Gemma 3n “effective” variants (official Google repos)
+    "Taiwan-ELM-1_1B-Instruct": {"repo_id": "liswei/Taiwan-ELM-1_1B-Instruct", "description": "Taiwan-ELM-1_1B-Instruct"},
+    "LFM2-1.2B": {
+        "repo_id": "LiquidAI/LFM2-1.2B",
+        "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks."
+    },
+    "Qwen2.5-Taiwan-1.5B-Instruct": {"repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct", "description": "Qwen2.5-Taiwan-1.5B-Instruct"},
+    "Falcon-H1-1.5B-Instruct": {
+        "repo_id": "tiiuae/Falcon-H1-1.5B-Instruct",
+        "description":"Falcon‑H1 model with 1.5 B parameters, instruction‑tuned"
+    },
+    "Nemotron-Research-Reasoning-Qwen-1.5B": {"repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B", "description": "Nemotron-Research-Reasoning-Qwen-1.5B"},
+    "Qwen3-1.7B": {"repo_id":"Qwen/Qwen3-1.7B","description":"Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages."},
     "Gemma-3n-E2B": {
         "repo_id": "google/gemma-3n-E2B",
         "description":"Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)"
     },
+
+    # Models with 2.6B-4B parameters
+    "LFM2-2.6B": {
+        "repo_id": "LiquidAI/LFM2-2.6B",
+        "description": "The 2.6B parameter model in the LFM2 series, it outperforms models in the 3B+ class and features a hybrid architecture for faster inference."
+    },
+    "Qwen2.5-Taiwan-3B-Reason-GRPO": {
+        "repo_id": "benchang1110/Qwen2.5-Taiwan-3B-Reason-GRPO",
+        "description":"Qwen2.5-Taiwan model with 3 B parameters, Reason-GRPO fine-tuned"
+    },
+    "Llama-3.2-Taiwan-3B-Instruct": {"repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct", "description": "Llama-3.2-Taiwan-3B-Instruct"},
+    "Qwen2.5-3B-Instruct": {"repo_id": "Qwen/Qwen2.5-3B-Instruct", "description": "Qwen2.5-3B-Instruct"},
+    "Qwen2.5-Omni-3B": {"repo_id": "Qwen/Qwen2.5-Omni-3B", "description": "Qwen2.5-Omni-3B"},
+    "Phi-4-mini-Reasoning": {"repo_id": "microsoft/Phi-4-mini-reasoning", "description": "Phi-4-mini-Reasoning (4.3B parameters)"},
+    "Phi-4-mini-Instruct": {"repo_id": "microsoft/Phi-4-mini-instruct", "description": "Phi-4-mini-Instruct (4.3B parameters)"},
     "Gemma-3n-E4B": {
         "repo_id": "google/gemma-3n-E4B",
         "description":"Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)"
     },
-
-    # PowerInfer SmallThinker (instruction‑tuned)
     "SmallThinker-4BA0.6B-Instruct": {
         "repo_id": "PowerInfer/SmallThinker-4BA0.6B-Instruct",
         "description":"SmallThinker 4 B backbone with 0.6 B activated parameters, instruction‑tuned"
     },
-    # TIIUAE Falcon‑H1 (instruction‑tuned)
-    "Falcon-H1-1.5B-Instruct": {
-        "repo_id": "tiiuae/Falcon-H1-1.5B-Instruct",
-        "description":"Falcon‑H1 model with 1.5 B parameters, instruction‑tuned"
-    },
-    "Qwen/Qwen3-14B-FP8": {"repo_id": "Qwen/Qwen3-14B-FP8", "description": "Qwen/Qwen3-14B-FP8"},
-    #"Qwen/Qwen3-32B-FP8": {"repo_id": "Qwen/Qwen3-32B-FP8", "description": "Qwen/Qwen3-32B-FP8"},
-    "DeepSeek-R1-0528-Qwen3-8B": {"repo_id": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", "description": "DeepSeek-R1-0528-Qwen3-8B"},
-    "Nemotron-Research-Reasoning-Qwen-1.5B": {"repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B", "description": "Nemotron-Research-Reasoning-Qwen-1.5B"},
-    "Taiwan-ELM-1_1B-Instruct": {"repo_id": "liswei/Taiwan-ELM-1_1B-Instruct", "description": "Taiwan-ELM-1_1B-Instruct"},
-    "Taiwan-ELM-270M-Instruct": {"repo_id": "liswei/Taiwan-ELM-270M-Instruct", "description": "Taiwan-ELM-270M-Instruct"},
-    # "Granite-4.0-Tiny-Preview": {"repo_id": "ibm-granite/granite-4.0-tiny-preview", "description": "Granite-4.0-Tiny-Preview"},
-    "Qwen3-0.6B": {"repo_id":"Qwen/Qwen3-0.6B","description":"Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."},
-    "Qwen3-1.7B": {"repo_id":"Qwen/Qwen3-1.7B","description":"Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages."},
     "Qwen3-4B": {"repo_id":"Qwen/Qwen3-4B","description":"Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning."},
-    "Qwen3-8B": {"repo_id":"Qwen/Qwen3-8B","description":"Dense causal language model with 8.2 B total parameters (6.95 B non-embedding), 36 layers, 32 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), excels at multilingual instruction following & zero-shot tasks."},
-    "Qwen3-14B": {"repo_id":"Qwen/Qwen3-14B","description":"Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."},
-    # "Qwen3-32B": {"repo_id":"Qwen/Qwen3-32B","description":"Dense causal language model with 32.8 B total parameters (31.2 B non-embedding), 64 layers, 64 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), flagship variant delivering state-of-the-art reasoning & instruction following."},
-    # "Qwen3-30B-A3B": {"repo_id":"Qwen/Qwen3-30B-A3B","description":"Mixture-of-Experts model with 30.5 B total parameters (29.9 B non-embedding, 3.3 B activated per token), 48 layers, 128 experts (8 activated per token), 32 query heads & 4 KV heads, 32 768-token context (131 072 via YaRN), MoE routing for scalable specialized reasoning."},
-    # "Qwen3-235B-A22B":{"repo_id":"Qwen/Qwen3-235B-A22B","description":"Mixture-of-Experts model with 235 B total parameters (234 B non-embedding, 22 B activated per token), 94 layers, 128 experts (8 activated per token), 64 query heads & 4 KV heads, 32 768-token context (131 072 via YaRN), ultra-scale reasoning & agentic workflows."},
     "Gemma-3-4B-IT": {"repo_id": "unsloth/gemma-3-4b-it", "description": "Gemma-3-4B-IT"},
-    "SmolLM2_135M_Grpo_Gsm8k":{"repo_id":"prithivMLmods/SmolLM2_135M_Grpo_Gsm8k", "desscription":"SmolLM2_135M_Grpo_Gsm8k"},
-    "SmolLM2-135M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat", "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat"},
-    "SmolLM2-135M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct", "description": "Original SmolLM2‑135M Instruct"},
-    "SmolLM2-360M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat", "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat"},
-    "SmolLM2-360M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct", "description": "Original SmolLM2‑360M Instruct"},
-    "Llama-3.2-Taiwan-3B-Instruct": {"repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct", "description": "Llama-3.2-Taiwan-3B-Instruct"},
     "MiniCPM3-4B": {"repo_id": "openbmb/MiniCPM3-4B", "description": "MiniCPM3-4B"},
-    "Qwen2.5-3B-Instruct": {"repo_id": "Qwen/Qwen2.5-3B-Instruct", "description": "Qwen2.5-3B-Instruct"},
+
+    # Models with 7B-8.3B parameters
     "Qwen2.5-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-7B-Instruct", "description": "Qwen2.5-7B-Instruct"},
-    "Phi-4-mini-Reasoning": {"repo_id": "microsoft/Phi-4-mini-reasoning", "description": "Phi-4-mini-Reasoning"},
-    # "Phi-4-Reasoning": {"repo_id": "microsoft/Phi-4-reasoning", "description": "Phi-4-Reasoning"},
-    "Phi-4-mini-Instruct": {"repo_id": "microsoft/Phi-4-mini-instruct", "description": "Phi-4-mini-Instruct"},
-    "Meta-Llama-3.1-8B-Instruct": {"repo_id": "MaziyarPanahi/Meta-Llama-3.1-8B-Instruct", "description": "Meta-Llama-3.1-8B-Instruct"},
-    "DeepSeek-R1-Distill-Llama-8B": {"repo_id": "unsloth/DeepSeek-R1-Distill-Llama-8B", "description": "DeepSeek-R1-Distill-Llama-8B"},
-    "Mistral-7B-Instruct-v0.3": {"repo_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3", "description": "Mistral-7B-Instruct-v0.3"},
     "Qwen2.5-Coder-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-Coder-7B-Instruct", "description": "Qwen2.5-Coder-7B-Instruct"},
-    "Qwen2.5-Omni-3B": {"repo_id": "Qwen/Qwen2.5-Omni-3B", "description": "Qwen2.5-Omni-3B"},
     "MiMo-7B-RL": {"repo_id": "XiaomiMiMo/MiMo-7B-RL", "description": "MiMo-7B-RL"},
+    "Mistral-7B-Instruct-v0.3": {"repo_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3", "description": "Mistral-7B-Instruct-v0.3"},
+    "DeepSeek-R1-0528-Qwen3-8B": {"repo_id": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", "description": "DeepSeek-R1-0528-Qwen3-8B"},
+    "Meta-Llama-3.1-8B-Instruct": {"repo_id": "MaziyarPanahi/Meta-Llama-3.1-8B-Instruct", "description": "Meta-Llama-3.1-8B-Instruct"},
+    "DeepSeek-R1-Distill-Llama-8B": {"repo_id": "unsloth/DeepSeek-R1-Distill-Llama-8B", "description": "DeepSeek-R1-Distill-Llama-8B"},
+    "Qwen3-8B": {"repo_id":"Qwen/Qwen3-8B","description":"Dense causal language model with 8.2 B total parameters (6.95 B non-embedding), 36 layers, 32 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), excels at multilingual instruction following & zero-shot tasks."},
+    "LFM2-8B-A1B": {
+        "repo_id": "LiquidAI/LFM2-8B-A1B",
+        "description": "A Mixture-of-Experts (MoE) model with 8.3B total parameters (1.5B active) designed for on-device use, providing the quality of larger models with the speed of a 1.5B-class model."
+    },
 
+    # Models with 14B+ parameters
+    "Qwen/Qwen3-14B-FP8": {"repo_id": "Qwen/Qwen3-14B-FP8", "description": "Qwen/Qwen3-14B-FP8"},
+    "Qwen3-14B": {"repo_id":"Qwen/Qwen3-14B","description":"Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."},
 }
 
 # Global cache for pipelines to avoid re-loading.
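Editor's note: the trailing context line above mentions a global cache for pipelines. A minimal sketch of how a registry like MODELS is typically consumed together with such a cache follows, assuming the Hugging Face transformers pipeline API. The get_pipeline helper, the PIPELINES dict, the trimmed stand-in MODELS entry, and the dtype/device choices are illustrative assumptions, not the Space's actual code.

# Editorial sketch (not the committed app.py): look up a MODELS entry by name,
# load a text-generation pipeline for its repo_id, and cache it so re-selecting
# the same model does not reload the weights.
import torch
from transformers import pipeline

MODELS = {  # trimmed stand-in for the full registry shown in the diff above
    "LFM2-350M": {"repo_id": "LiquidAI/LFM2-350M", "description": "Compact 350M hybrid model"},
}

PIPELINES = {}  # global cache for pipelines to avoid re-loading, keyed by repo_id

def get_pipeline(model_name: str):
    """Return a cached text-generation pipeline for the chosen MODELS entry."""
    repo_id = MODELS[model_name]["repo_id"]
    if repo_id not in PIPELINES:
        PIPELINES[repo_id] = pipeline(
            "text-generation",
            model=repo_id,
            torch_dtype=torch.bfloat16,  # assumption; the app may pick its dtype differently
            device_map="auto",
        )
    return PIPELINES[repo_id]

if __name__ == "__main__":
    pipe = get_pipeline("LFM2-350M")
    print(pipe("Hello, world!", max_new_tokens=32)[0]["generated_text"])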