Luigi committed
Commit 4af617b
1 Parent(s): f97cbfc

remove qwen3 30b-a3b and qwen3 next 80b-a3b

Files changed (1)
  1. app.py +20 -20
app.py CHANGED
@@ -27,32 +27,32 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
-    "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": {
-        "repo_id": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
-        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization for efficiency. Optimized for fast, stable instruction-following dialogue without 'thinking' traces, making it ideal for general chat and low-latency applications [[2]][[3]][[5]][[8]].",
-        "params_b": 80.0
-    },
-    "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": {
-        "repo_id": "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8",
-        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization. Specialized for complex reasoning, math, and coding tasks, this model outputs structured 'thinking' traces by default and is designed to be used with a reasoning parser [[10]][[11]][[14]][[18]].",
-        "params_b": 80.0
-    },
+    # "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": {
+    #     "repo_id": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
+    #     "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization for efficiency. Optimized for fast, stable instruction-following dialogue without 'thinking' traces, making it ideal for general chat and low-latency applications [[2]][[3]][[5]][[8]].",
+    #     "params_b": 80.0
+    # },
+    # "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": {
+    #     "repo_id": "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8",
+    #     "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization. Specialized for complex reasoning, math, and coding tasks, this model outputs structured 'thinking' traces by default and is designed to be used with a reasoning parser [[10]][[11]][[14]][[18]].",
+    #     "params_b": 80.0
+    # },
     "Qwen3-32B-FP8": {
         "repo_id": "Qwen/Qwen3-32B-FP8",
         "description": "Dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities.",
         "params_b": 32.8
     },
     # ~30.5B total parameters (MoE: 3.3B activated)
-    "Qwen3-30B-A3B-Instruct-2507": {
-        "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
-        "description": "non-thinking-mode MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision.",
-        "params_b": 30.5
-    },
-    "Qwen3-30B-A3B-Thinking-2507": {
-        "repo_id": "Qwen/Qwen3-30B-A3B-Thinking-2507",
-        "description": "thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers.",
-        "params_b": 30.5
-    },
+    # "Qwen3-30B-A3B-Instruct-2507": {
+    #     "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+    #     "description": "non-thinking-mode MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision.",
+    #     "params_b": 30.5
+    # },
+    # "Qwen3-30B-A3B-Thinking-2507": {
+    #     "repo_id": "Qwen/Qwen3-30B-A3B-Thinking-2507",
+    #     "description": "thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers.",
+    #     "params_b": 30.5
+    # },
     "gpt-oss-20b-BF16": {
         "repo_id": "unsloth/gpt-oss-20b-BF16",
         "description": "A 20B-parameter open-source GPT-style language model quantized to INT4 using AutoRound, with FP8 key-value cache for efficient inference. Optimized for performance and memory efficiency on Intel hardware while maintaining strong language generation capabilities.",