Luigi committed
Commit fea2910 · verified · 1 Parent(s): 6073cc2

Add 4 20B+ models after enabling dynamic GPU duration
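The "dynamic GPU duration" mentioned in the commit message presumably refers to ZeroGPU's per-call duration support, where the spaces.GPU decorator can be given a callable (receiving the same arguments as the decorated function) instead of a fixed number of seconds, so the 20B+ checkpoints can request longer allocations. The snippet below is only a sketch of that pattern under that assumption; the estimate_duration helper and generate function are hypothetical names, not code from this commit.

import spaces  # ZeroGPU helper, available inside Hugging Face Spaces

def estimate_duration(model_name: str, prompt: str) -> int:
    # Hypothetical heuristic: give the larger (20B+) checkpoints a longer GPU slot.
    large = any(tag in model_name for tag in ("30B", "32B", "20b"))
    return 300 if large else 60

@spaces.GPU(duration=estimate_duration)  # duration may also be a plain int of seconds
def generate(model_name: str, prompt: str) -> str:
    # Load the selected MODELS entry and run inference while the GPU is attached.
    ...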

Files changed (1): app.py (+16 -32)
app.py CHANGED
@@ -27,38 +27,22 @@ cancel_event = threading.Event()
 # ------------------------------
 MODELS = {
     # ~30.5B total parameters (MoE: 3.3B activated)
-    # "Qwen3-30B-A3B-Instruct-2507-AWQ-4bit": {
-    #     "repo_id": "cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit",
-    #     "description": "4-bit AWQ quantized instruct-tuned MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision."
-    # },
-    # "Qwen3-30B-A3B-Thinking-2507-AWQ-4bit": {
-    #     "repo_id": "cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit",
-    #     "description": "4-bit AWQ quantized thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers."
-    # },
-
-    # # ~80B total parameters (MoE: 3B activated)
-    # "Qwen3-Next-80B-A3B-Instruct-AWQ-4bit": {
-    #     "repo_id": "cpatonn/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit",
-    #     "description": "4-bit AWQ quantized instruct model from the Qwen3-Next series. Features 80B total parameters (3B activated), hybrid Gated DeltaNet + Gated Attention architecture, 512 experts (10 activated + 1 shared), and native 262,144-token context (extendable to 1M tokens with YaRN). Delivers performance comparable to Qwen3-235B on many benchmarks while offering superior ultra-long-context efficiency. Supports only non-thinking mode. Note: May require re-quantization for stable inference (as of Sept 2025)."
-    # },
-    # "Qwen3-Next-80B-A3B-Thinking-AWQ-4bit": {
-    #     "repo_id": "cpatonn/Qwen3-Next-80B-A3B-Thinking-AWQ-4bit",
-    #     "description": "4-bit AWQ quantized thinking-mode variant of Qwen3-Next-80B-A3B. Combines 80B total parameters (3B activated), hybrid attention (Gated DeltaNet + Gated Attention), and 512-expert MoE (10 activated + 1 shared) for advanced reasoning over ultra-long contexts (natively 262K, extendable to 1M tokens). Designed for complex problem-solving with automatic reasoning trace generation. Quantized using AWQ; intended for high-end agentic and analytical workloads."
-    # },
-
-    # ~235B total parameters (MoE: 22B activated) — included for reference if added later
-    # "Qwen3-235B-A22B-Thinking": { ... },
-
-    # 32.8B total parameters
-    # "Qwen3-32B-AWQ": {
-    #     "repo_id": "Qwen/Qwen3-32B-AWQ",
-    #     "description": "4-bit AWQ quantized dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities."
-    # },
-
-    # "gpt-oss-20b-BF16": {
-    #     "repo_id": "unsloth/gpt-oss-20b-BF16",
-    #     "description": "A 20B-parameter open-source GPT-style language model quantized to INT4 using AutoRound, with FP8 key-value cache for efficient inference. Optimized for performance and memory efficiency on Intel hardware while maintaining strong language generation capabilities."
-    # },
  "Qwen3-4B-Instruct-2507": {
63
  "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
64
  "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks."
 
27
  # ------------------------------
28
  MODELS = {
29
  # ~30.5B total parameters (MoE: 3.3B activated)
30
+ "Qwen3-30B-A3B-Instruct-2507": {
31
+ "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
32
+ "description": "non-thinking-mode MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision."
33
+ },
34
+ "Qwen3-30B-A3B-Thinking-2507": {
35
+ "repo_id": "Qwen/Qwen3-30B-A3B-Thinking-2507",
36
+ "description": "thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers."
37
+ },
38
+ "Qwen3-32B-FP8": {
39
+ "repo_id": "Qwen/Qwen3-32B-FP8",
40
+ "description": "Dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities."
41
+ },
42
+ "gpt-oss-20b-BF16": {
43
+ "repo_id": "unsloth/gpt-oss-20b-BF16",
44
+ "description": "A 20B-parameter open-source GPT-style language model quantized to INT4 using AutoRound, with FP8 key-value cache for efficient inference. Optimized for performance and memory efficiency on Intel hardware while maintaining strong language generation capabilities."
45
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  "Qwen3-4B-Instruct-2507": {
47
  "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
48
  "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks."