update awq_marlin.py for vllm==0.9.2
awq_marlin.py  (+11 -6)
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
@@ -9,7 +10,7 @@ import vllm.model_executor.layers.fused_moe  # noqa
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
     UnquantizedFusedMoEMethod)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod,
@@ -485,13 +486,16 @@ class AWQMoEMethod(FusedMoEMethodBase):
         e_score_correction_bias: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        assert activation == "silu", "Only SiLU activation is supported."
-
-        if apply_router_weight_on_input:
+        if enable_eplb:
             raise NotImplementedError(
-                "Apply router weight on input is not supported for "
-                "fused Marlin MoE method.")
+                "EPLB not supported for `AWQMoEMethod` yet.")
+
+        assert activation == "silu", "Only SiLU activation is supported."
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -515,6 +519,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
             topk_weights,
             topk_ids,
             quant_type_id=self.quant_type.id,
+            apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
             expert_map=expert_map,
             w1_zeros=layer.w13_qzeros,