Add 5 models from Liquid AI
app.py CHANGED
@@ -26,9 +26,17 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
-    #
-    "Qwen2.5-Taiwan-1.5B-Instruct": {"repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct", "description": "Qwen2.5-Taiwan-1.5B-Instruct"},
+    # Models with ~135M parameters
     "SmolLM2-135M-multilingual-base": {"repo_id": "agentlans/SmolLM2-135M-multilingual-base", "description": "SmolLM2-135M-multilingual-base"},
+    "SmolLM-135M-Taiwan-Instruct-v1.0": {
+        "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
+        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks"
+    },
+    "SmolLM2_135M_Grpo_Gsm8k": {"repo_id": "prithivMLmods/SmolLM2_135M_Grpo_Gsm8k", "description": "SmolLM2_135M_Grpo_Gsm8k"},
+    "SmolLM2-135M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct", "description": "Original SmolLM2‑135M Instruct"},
+    "SmolLM2-135M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat", "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat"},
+
+    # Models with ~270M parameters
     "parser_model_ner_gemma_v0.1": {
         "repo_id": "myfi/parser_model_ner_gemma_v0.1",
         "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments."
@@ -38,17 +46,18 @@ MODELS = {
         "description": "google/gemma-3-270m-it fine-tuned on a Taiwan Chinese dataset"
     },
     "gemma-3-270m-it":{
-
-
-    },
-    "SmolLM-135M-Taiwan-Instruct-v1.0": {
-        "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
-        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks"
+        "repo_id": "google/gemma-3-270m-it",
+        "description": "Gemma‑3‑270M‑IT is Google’s compact, 270‑million‑parameter instruction‑tuned (IT) model, offering fast and efficient on‑device text generation and comprehension.",
     },
-    "
-
-
+    "Taiwan-ELM-270M-Instruct": {"repo_id": "liswei/Taiwan-ELM-270M-Instruct", "description": "Taiwan-ELM-270M-Instruct"},
+
+    # Models with 350M-700M parameters
+    "LFM2-350M": {
+        "repo_id": "LiquidAI/LFM2-350M",
+        "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3."
     },
+    "SmolLM2-360M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat", "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat"},
+    "SmolLM2-360M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct", "description": "Original SmolLM2‑360M Instruct"},
     "Qwen2.5-0.5B-Taiwan-Instruct": {
         "repo_id": "ShengweiPeng/Qwen2.5-0.5B-Taiwan-Instruct",
         "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned"
@@ -57,73 +66,77 @@
         "repo_id": "ShengweiPeng/Qwen3-0.6B-Taiwan",
         "description": "Qwen3-Taiwan model with 0.6 B parameters"
     },
-
-    "
-        "repo_id":
-        "description":"
+    "Qwen3-0.6B": {"repo_id":"Qwen/Qwen3-0.6B","description":"Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."},
+    "LFM2-700M": {
+        "repo_id": "LiquidAI/LFM2-700M",
+        "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions."
     },
+
+    # Models with 1B-2B parameters
     "Llama-3.2-Taiwan-1B": {
         "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
         "description":"Llama-3.2-Taiwan base model with 1 B parameters"
     },
-
-
-
-
+    "Taiwan-ELM-1_1B-Instruct": {"repo_id": "liswei/Taiwan-ELM-1_1B-Instruct", "description": "Taiwan-ELM-1_1B-Instruct"},
+    "LFM2-1.2B": {
+        "repo_id": "LiquidAI/LFM2-1.2B",
+        "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks."
+    },
+    "Qwen2.5-Taiwan-1.5B-Instruct": {"repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct", "description": "Qwen2.5-Taiwan-1.5B-Instruct"},
+    "Falcon-H1-1.5B-Instruct": {
+        "repo_id": "tiiuae/Falcon-H1-1.5B-Instruct",
+        "description":"Falcon‑H1 model with 1.5 B parameters, instruction‑tuned"
+    },
+    "Nemotron-Research-Reasoning-Qwen-1.5B": {"repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B", "description": "Nemotron-Research-Reasoning-Qwen-1.5B"},
+    "Qwen3-1.7B": {"repo_id":"Qwen/Qwen3-1.7B","description":"Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages."},
     "Gemma-3n-E2B": {
         "repo_id": "google/gemma-3n-E2B",
         "description":"Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)"
     },
+
+    # Models with 2.6B-4B parameters
+    "LFM2-2.6B": {
+        "repo_id": "LiquidAI/LFM2-2.6B",
+        "description": "The 2.6B parameter model in the LFM2 series; it outperforms models in the 3B+ class and features a hybrid architecture for faster inference."
+    },
+    "Qwen2.5-Taiwan-3B-Reason-GRPO": {
+        "repo_id": "benchang1110/Qwen2.5-Taiwan-3B-Reason-GRPO",
+        "description":"Qwen2.5-Taiwan model with 3 B parameters, Reason-GRPO fine-tuned"
+    },
+    "Llama-3.2-Taiwan-3B-Instruct": {"repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct", "description": "Llama-3.2-Taiwan-3B-Instruct"},
+    "Qwen2.5-3B-Instruct": {"repo_id": "Qwen/Qwen2.5-3B-Instruct", "description": "Qwen2.5-3B-Instruct"},
+    "Qwen2.5-Omni-3B": {"repo_id": "Qwen/Qwen2.5-Omni-3B", "description": "Qwen2.5-Omni-3B"},
+    "Phi-4-mini-Reasoning": {"repo_id": "microsoft/Phi-4-mini-reasoning", "description": "Phi-4-mini-Reasoning (3.8B parameters)"},
+    "Phi-4-mini-Instruct": {"repo_id": "microsoft/Phi-4-mini-instruct", "description": "Phi-4-mini-Instruct (3.8B parameters)"},
     "Gemma-3n-E4B": {
         "repo_id": "google/gemma-3n-E4B",
         "description":"Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)"
     },
-
-    # PowerInfer SmallThinker (instruction‑tuned)
     "SmallThinker-4BA0.6B-Instruct": {
         "repo_id": "PowerInfer/SmallThinker-4BA0.6B-Instruct",
         "description":"SmallThinker 4 B backbone with 0.6 B activated parameters, instruction‑tuned"
     },
-    # TIIUAE Falcon‑H1 (instruction‑tuned)
-    "Falcon-H1-1.5B-Instruct": {
-        "repo_id": "tiiuae/Falcon-H1-1.5B-Instruct",
-        "description":"Falcon‑H1 model with 1.5 B parameters, instruction‑tuned"
-    },
-    "Qwen/Qwen3-14B-FP8": {"repo_id": "Qwen/Qwen3-14B-FP8", "description": "Qwen/Qwen3-14B-FP8"},
-    #"Qwen/Qwen3-32B-FP8": {"repo_id": "Qwen/Qwen3-32B-FP8", "description": "Qwen/Qwen3-32B-FP8"},
-    "DeepSeek-R1-0528-Qwen3-8B": {"repo_id": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", "description": "DeepSeek-R1-0528-Qwen3-8B"},
-    "Nemotron-Research-Reasoning-Qwen-1.5B": {"repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B", "description": "Nemotron-Research-Reasoning-Qwen-1.5B"},
-    "Taiwan-ELM-1_1B-Instruct": {"repo_id": "liswei/Taiwan-ELM-1_1B-Instruct", "description": "Taiwan-ELM-1_1B-Instruct"},
-    "Taiwan-ELM-270M-Instruct": {"repo_id": "liswei/Taiwan-ELM-270M-Instruct", "description": "Taiwan-ELM-270M-Instruct"},
-    # "Granite-4.0-Tiny-Preview": {"repo_id": "ibm-granite/granite-4.0-tiny-preview", "description": "Granite-4.0-Tiny-Preview"},
-    "Qwen3-0.6B": {"repo_id":"Qwen/Qwen3-0.6B","description":"Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."},
-    "Qwen3-1.7B": {"repo_id":"Qwen/Qwen3-1.7B","description":"Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages."},
     "Qwen3-4B": {"repo_id":"Qwen/Qwen3-4B","description":"Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning."},
-    "Qwen3-8B": {"repo_id":"Qwen/Qwen3-8B","description":"Dense causal language model with 8.2 B total parameters (6.95 B non-embedding), 36 layers, 32 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), excels at multilingual instruction following & zero-shot tasks."},
-    "Qwen3-14B": {"repo_id":"Qwen/Qwen3-14B","description":"Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."},
-    # "Qwen3-32B": {"repo_id":"Qwen/Qwen3-32B","description":"Dense causal language model with 32.8 B total parameters (31.2 B non-embedding), 64 layers, 64 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), flagship variant delivering state-of-the-art reasoning & instruction following."},
-    # "Qwen3-30B-A3B": {"repo_id":"Qwen/Qwen3-30B-A3B","description":"Mixture-of-Experts model with 30.5 B total parameters (29.9 B non-embedding, 3.3 B activated per token), 48 layers, 128 experts (8 activated per token), 32 query heads & 4 KV heads, 32 768-token context (131 072 via YaRN), MoE routing for scalable specialized reasoning."},
-    # "Qwen3-235B-A22B":{"repo_id":"Qwen/Qwen3-235B-A22B","description":"Mixture-of-Experts model with 235 B total parameters (234 B non-embedding, 22 B activated per token), 94 layers, 128 experts (8 activated per token), 64 query heads & 4 KV heads, 32 768-token context (131 072 via YaRN), ultra-scale reasoning & agentic workflows."},
     "Gemma-3-4B-IT": {"repo_id": "unsloth/gemma-3-4b-it", "description": "Gemma-3-4B-IT"},
-    "SmolLM2_135M_Grpo_Gsm8k":{"repo_id":"prithivMLmods/SmolLM2_135M_Grpo_Gsm8k", "desscription":"SmolLM2_135M_Grpo_Gsm8k"},
-    "SmolLM2-135M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat", "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat"},
-    "SmolLM2-135M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct", "description": "Original SmolLM2‑135M Instruct"},
-    "SmolLM2-360M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat", "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat"},
-    "SmolLM2-360M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct", "description": "Original SmolLM2‑360M Instruct"},
-    "Llama-3.2-Taiwan-3B-Instruct": {"repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct", "description": "Llama-3.2-Taiwan-3B-Instruct"},
     "MiniCPM3-4B": {"repo_id": "openbmb/MiniCPM3-4B", "description": "MiniCPM3-4B"},
-
+
+    # Models with 7B-8.3B parameters
     "Qwen2.5-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-7B-Instruct", "description": "Qwen2.5-7B-Instruct"},
-    "Phi-4-mini-Reasoning": {"repo_id": "microsoft/Phi-4-mini-reasoning", "description": "Phi-4-mini-Reasoning"},
-    # "Phi-4-Reasoning": {"repo_id": "microsoft/Phi-4-reasoning", "description": "Phi-4-Reasoning"},
-    "Phi-4-mini-Instruct": {"repo_id": "microsoft/Phi-4-mini-instruct", "description": "Phi-4-mini-Instruct"},
-    "Meta-Llama-3.1-8B-Instruct": {"repo_id": "MaziyarPanahi/Meta-Llama-3.1-8B-Instruct", "description": "Meta-Llama-3.1-8B-Instruct"},
-    "DeepSeek-R1-Distill-Llama-8B": {"repo_id": "unsloth/DeepSeek-R1-Distill-Llama-8B", "description": "DeepSeek-R1-Distill-Llama-8B"},
-    "Mistral-7B-Instruct-v0.3": {"repo_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3", "description": "Mistral-7B-Instruct-v0.3"},
     "Qwen2.5-Coder-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-Coder-7B-Instruct", "description": "Qwen2.5-Coder-7B-Instruct"},
-    "Qwen2.5-Omni-3B": {"repo_id": "Qwen/Qwen2.5-Omni-3B", "description": "Qwen2.5-Omni-3B"},
     "MiMo-7B-RL": {"repo_id": "XiaomiMiMo/MiMo-7B-RL", "description": "MiMo-7B-RL"},
+    "Mistral-7B-Instruct-v0.3": {"repo_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3", "description": "Mistral-7B-Instruct-v0.3"},
+    "DeepSeek-R1-0528-Qwen3-8B": {"repo_id": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", "description": "DeepSeek-R1-0528-Qwen3-8B"},
+    "Meta-Llama-3.1-8B-Instruct": {"repo_id": "MaziyarPanahi/Meta-Llama-3.1-8B-Instruct", "description": "Meta-Llama-3.1-8B-Instruct"},
+    "DeepSeek-R1-Distill-Llama-8B": {"repo_id": "unsloth/DeepSeek-R1-Distill-Llama-8B", "description": "DeepSeek-R1-Distill-Llama-8B"},
+    "Qwen3-8B": {"repo_id":"Qwen/Qwen3-8B","description":"Dense causal language model with 8.2 B total parameters (6.95 B non-embedding), 36 layers, 32 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), excels at multilingual instruction following & zero-shot tasks."},
+    "LFM2-8B-A1B": {
+        "repo_id": "LiquidAI/LFM2-8B-A1B",
+        "description": "A Mixture-of-Experts (MoE) model with 8.3B total parameters (1.5B active) designed for on-device use, providing the quality of larger models with the speed of a 1.5B-class model."
+    },
 
+    # Models with 14B+ parameters
+    "Qwen/Qwen3-14B-FP8": {"repo_id": "Qwen/Qwen3-14B-FP8", "description": "Qwen/Qwen3-14B-FP8"},
+    "Qwen3-14B": {"repo_id":"Qwen/Qwen3-14B","description":"Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."},
 }
 
 # Global cache for pipelines to avoid re-loading.
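The commit only touches the MODELS registry; the loading code that consumes it sits outside these hunks, apart from the trailing "# Global cache for pipelines to avoid re-loading." comment. Below is a minimal sketch of how such a registry and cache are typically wired together. The names PIPELINES and load_pipeline are hypothetical and not part of this diff, and the newly added LiquidAI LFM2 repos may require a recent transformers release.

from transformers import pipeline

PIPELINES = {}  # hypothetical global cache, keyed by repo_id

def load_pipeline(model_name: str):
    """Return a cached text-generation pipeline for one MODELS entry."""
    repo_id = MODELS[model_name]["repo_id"]
    if repo_id not in PIPELINES:
        # First use: download the checkpoint and build the pipeline once.
        PIPELINES[repo_id] = pipeline(
            "text-generation",
            model=repo_id,
            device_map="auto",   # assumption: accelerate is installed for device placement
            torch_dtype="auto",  # let transformers pick a sensible dtype per checkpoint
        )
    return PIPELINES[repo_id]

# Example usage with one of the newly added Liquid AI entries:
# pipe = load_pipeline("LFM2-350M")
# print(pipe("Hello", max_new_tokens=16)[0]["generated_text"])

Because the cache is keyed by repo_id, repeated selections of the same model in the UI reuse the already-loaded weights instead of re-downloading them.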