tclf90 committed on
Commit
15fbe96
·
verified ·
1 Parent(s): c8d6023

update awq_marlin.py for vllm==0.9.2

Browse files
Files changed (1) hide show
  1. awq_marlin.py +11 -6
awq_marlin.py CHANGED
@@ -1,4 +1,5 @@
1
  # SPDX-License-Identifier: Apache-2.0
 
2
 
3
  from typing import Any, Callable, Optional
4
 
@@ -9,7 +10,7 @@ import vllm.model_executor.layers.fused_moe # noqa
9
  from vllm import _custom_ops as ops
10
  from vllm.logger import init_logger
11
  from vllm.model_executor.layers.fused_moe.layer import (
12
- FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
13
  UnquantizedFusedMoEMethod)
14
  from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
15
  UnquantizedLinearMethod,
@@ -485,13 +486,16 @@ class AWQMoEMethod(FusedMoEMethodBase):
485
  e_score_correction_bias: Optional[torch.Tensor] = None,
486
  apply_router_weight_on_input: bool = False,
487
  activation: str = "silu",
 
 
 
 
488
  ) -> torch.Tensor:
489
- assert activation == "silu", "Only SiLU activation is supported."
490
-
491
- if apply_router_weight_on_input:
492
  raise NotImplementedError(
493
- "Apply router weight on input is not supported for"
494
- "fused Marlin MoE method.")
 
495
 
496
  topk_weights, topk_ids = FusedMoE.select_experts(
497
  hidden_states=x,
@@ -515,6 +519,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
515
  topk_weights,
516
  topk_ids,
517
  quant_type_id=self.quant_type.id,
 
518
  global_num_experts=global_num_experts,
519
  expert_map=expert_map,
520
  w1_zeros=layer.w13_qzeros,
 
1
  # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
 
4
  from typing import Any, Callable, Optional
5
 
 
10
  from vllm import _custom_ops as ops
11
  from vllm.logger import init_logger
12
  from vllm.model_executor.layers.fused_moe.layer import (
13
+ FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
14
  UnquantizedFusedMoEMethod)
15
  from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
16
  UnquantizedLinearMethod,
 
486
  e_score_correction_bias: Optional[torch.Tensor] = None,
487
  apply_router_weight_on_input: bool = False,
488
  activation: str = "silu",
489
+ enable_eplb: bool = False,
490
+ expert_load_view: Optional[torch.Tensor] = None,
491
+ logical_to_physical_map: Optional[torch.Tensor] = None,
492
+ logical_replica_count: Optional[torch.Tensor] = None,
493
  ) -> torch.Tensor:
494
+ if enable_eplb:
 
 
495
  raise NotImplementedError(
496
+ "EPLB not supported for `AWQMoEMethod` yet.")
497
+
498
+ assert activation == "silu", "Only SiLU activation is supported."
499
 
500
  topk_weights, topk_ids = FusedMoE.select_experts(
501
  hidden_states=x,
 
519
  topk_weights,
520
  topk_ids,
521
  quant_type_id=self.quant_type.id,
522
+ apply_router_weight_on_input=apply_router_weight_on_input,
523
  global_num_experts=global_num_experts,
524
  expert_map=expert_map,
525
  w1_zeros=layer.w13_qzeros,