Commit 1afe830: BF16 + TP1 working
Parent commit: f01c04a

File tree: 5 files changed, +84 lines, -41 lines

examples/offline_inference.py

Lines changed: 1 addition & 1 deletion
@@ -19,4 +19,4 @@
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

vllm/config.py

Lines changed: 8 additions & 8 deletions
@@ -160,7 +160,7 @@ class ModelConfig:
         override default pooling config for the pooling model.
         logits_processor_pattern: Optional regex pattern specifying valid
             logits processor qualified names that can be passed with the
-            `logits_processors` extra completion argument. Defaults to None,
+            `logits_processors` extra completion argument. Defaults to None,
             which allows no processors.
         generation_config: Configuration parameter file for generation.
     """
@@ -363,7 +363,7 @@ def __init__(self,
     def maybe_pull_model_tokenizer_for_s3(self, model: str,
                                           tokenizer: str) -> None:
         """
-        Pull the model config or tokenizer to a temporary
+        Pull the model config or tokenizer to a temporary
        directory in case of S3.

        Args:
@@ -721,7 +721,7 @@ def get_hidden_size(self) -> int:
     def get_head_size(self) -> int:
         # TODO remove hard code
         if hasattr(self.hf_text_config, "model_type"
-                   ) and self.hf_text_config.model_type == 'deepseek_v2':
+                   ) and (self.hf_text_config.model_type in ('deepseek_v2', 'deepseek_v3')):
             # FlashAttention supports only head_size 32, 64, 128, 256,
             # we need to pad head_size 192 to 256
             return 256
@@ -874,14 +874,14 @@ def try_get_generation_config(self) -> Dict[str, Any]:

     def get_diff_sampling_param(self) -> Dict[str, Any]:
         """
-        This method returns a dictionary containing the parameters
-        that differ from the default sampling parameters, but only
-        if `generation_config` is set. If `generation_config` is not
+        This method returns a dictionary containing the parameters
+        that differ from the default sampling parameters, but only
+        if `generation_config` is set. If `generation_config` is not
        set, an empty dictionary is returned.

        Returns:
-            Dict[str, Any]: A dictionary with the differing sampling
-            parameters if `generation_config` is set, otherwise an
+            Dict[str, Any]: A dictionary with the differing sampling
+            parameters if `generation_config` is set, otherwise an
            empty dictionary.
        """
        if self.generation_config is None:
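
Note on the get_head_size() change: DeepSeek-V3 reuses the DeepSeek-V2 MLA attention layout, where the per-head query/key width is qk_nope_head_dim + qk_rope_head_dim = 128 + 64 = 192, a head size FlashAttention does not support, so it is padded to 256. A minimal sketch of the effective rule (padded_head_size and the explicit dim arguments are illustrative stand-ins for the config attributes, not vLLM API):

# Sketch only: mirrors the padded head-size rule from the diff above.
def padded_head_size(model_type: str, qk_nope_head_dim: int, qk_rope_head_dim: int) -> int:
    if model_type in ('deepseek_v2', 'deepseek_v3'):
        # FlashAttention supports head sizes 32/64/128/256, so 192 is padded to 256.
        return 256
    return qk_nope_head_dim + qk_rope_head_dim

assert padded_head_size('deepseek_v3', 128, 64) == 256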

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 6 additions & 2 deletions
@@ -421,14 +421,15 @@ def fused_topk(
     return topk_weights, topk_ids


-# This is used by the Deepseek-V2 model
+# This is used by the Deepseek-V2 and Deepseek-V3 model
 def grouped_topk(hidden_states: torch.Tensor,
                  gating_output: torch.Tensor,
                  topk: int,
                  renormalize: bool,
                  num_expert_group: int = 0,
                  topk_group: int = 0,
-                 scoring_func: str = "softmax"):
+                 scoring_func: str = "softmax",
+                 e_score_correction_bias: Optional[torch.Tensor] = None):

     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")
@@ -440,6 +441,9 @@ def grouped_topk(hidden_states: torch.Tensor,
     else:
         raise ValueError(f"Unsupported scoring function: {scoring_func}")

+    if e_score_correction_bias is not None:
+        scores.add_(e_score_correction_bias.unsqueeze(0))
+
     num_token = scores.shape[0]
     group_scores = scores.view(num_token, num_expert_group,
                                -1).max(dim=-1).values  # [n, n_group]
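
For context, the new e_score_correction_bias is the per-expert routing bias that DeepSeek-V3 uses with its "noaux_tc" top-k method: it is added to the gating scores before the group and expert selection. A simplified, unfused sketch of what grouped_topk computes after this change (pure PyTorch; argument names mirror the signature above, but this is not the vLLM kernel):

from typing import Optional
import torch

def grouped_topk_sketch(scores: torch.Tensor,  # [num_tokens, num_experts], post softmax/sigmoid
                        topk: int,
                        num_expert_group: int,
                        topk_group: int,
                        e_score_correction_bias: Optional[torch.Tensor] = None,
                        renormalize: bool = True):
    if e_score_correction_bias is not None:
        # Same effect as scores.add_(bias.unsqueeze(0)) in the diff, without mutating the input.
        scores = scores + e_score_correction_bias.unsqueeze(0)
    num_tokens = scores.shape[0]
    # Rate each expert group by its best expert, keep only the top `topk_group` groups.
    group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)
    score_mask = group_mask.unsqueeze(-1).expand(
        num_tokens, num_expert_group,
        scores.shape[1] // num_expert_group).reshape(num_tokens, -1)
    # Experts outside the selected groups can never win the final top-k.
    masked_scores = scores.masked_fill(score_mask == 0, float('-inf'))
    topk_weights, topk_ids = torch.topk(masked_scores, k=topk, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids

# Toy example: 2 tokens, 8 experts in 4 groups, pick 2 groups and then 2 experts.
scores = torch.rand(2, 8).softmax(dim=-1)
w, ids = grouped_topk_sketch(scores, topk=2, num_expert_group=4, topk_group=2,
                             e_score_correction_bias=torch.zeros(8))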

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 26 additions & 9 deletions
@@ -81,7 +81,9 @@ def apply(
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         return self.forward(x=x,
                             layer=layer,
@@ -91,7 +93,9 @@ def apply(
                             use_grouped_topk=use_grouped_topk,
                             topk_group=topk_group,
                             num_expert_group=num_expert_group,
-                            custom_routing_function=custom_routing_function)
+                            custom_routing_function=custom_routing_function,
+                            scoring_func=scoring_func,
+                            e_score_correction_bias=e_score_correction_bias)

     def forward_cuda(
         self,
@@ -103,7 +107,9 @@ def forward_cuda(
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -113,7 +119,9 @@ def forward_cuda(
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)

         return fused_experts(hidden_states=x,
                              w1=layer.w13_weight,
@@ -136,7 +144,8 @@ def forward_tpu(
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None
+        custom_routing_function: Optional[Callable] = None,
+        **kwargs,
     ) -> torch.Tensor:
         assert not use_grouped_topk
         assert num_expert_group is None
@@ -190,6 +199,7 @@ def __init__(
         prefix: str = "",
         custom_routing_function: Optional[Callable] = None,
         scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ):
         super().__init__()

@@ -210,9 +220,12 @@ def __init__(
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         self.scoring_func = scoring_func
+        self.e_score_correction_bias = e_score_correction_bias

         if self.scoring_func != "softmax" and not self.use_grouped_topk:
-            raise ValueError("Only softmax scoring function is supported for non-grouped topk.")
+            raise ValueError(
+                "Only softmax scoring function is supported for non-grouped topk."
+            )

         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = (
@@ -447,7 +460,8 @@ def select_experts(hidden_states: torch.Tensor,
                        topk_group: Optional[int] = None,
                        num_expert_group: Optional[int] = None,
                        custom_routing_function: Optional[Callable] = None,
-                       scoring_func: str = "softmax"):
+                       scoring_func: str = "softmax",
+                       e_score_correction_bias: Optional[torch.Tensor] = None):
         from vllm.model_executor.layers.fused_moe.fused_moe import (
             fused_topk, grouped_topk)

@@ -462,7 +476,8 @@ def select_experts(hidden_states: torch.Tensor,
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-                scoring_func=scoring_func)
+                scoring_func=scoring_func,
+                e_score_correction_bias=e_score_correction_bias)
         elif custom_routing_function is None:
             topk_weights, topk_ids = fused_topk(hidden_states=hidden_states,
                                                 gating_output=router_logits,
@@ -491,7 +506,9 @@ def forward(self, hidden_states: torch.Tensor,
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
-            custom_routing_function=self.custom_routing_function)
+            custom_routing_function=self.custom_routing_function,
+            scoring_func=self.scoring_func,
+            e_score_correction_bias=self.e_score_correction_bias)

         if self.reduce_results and self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
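
The two new arguments are threaded explicitly through apply() and forward_cuda() into FusedMoE.select_experts(), while forward_tpu() absorbs them via **kwargs since its asserts already rule out grouped top-k. Below is a condensed, self-contained sketch of the selection dispatch after this change, reusing grouped_topk_sketch from the previous note; the custom-routing call signature is simplified relative to vLLM's, and the plain top-k branch stands in for fused_topk.

from typing import Callable, Optional
import torch

def select_experts_sketch(hidden_states: torch.Tensor,
                          router_logits: torch.Tensor,
                          top_k: int,
                          use_grouped_topk: bool,
                          renormalize: bool,
                          topk_group: Optional[int] = None,
                          num_expert_group: Optional[int] = None,
                          custom_routing_function: Optional[Callable] = None,
                          scoring_func: str = "softmax",
                          e_score_correction_bias: Optional[torch.Tensor] = None):
    if use_grouped_topk:
        # DeepSeek-V2/V3 path: the new scoring_func / bias arguments only matter here.
        assert topk_group is not None and num_expert_group is not None
        scores = (router_logits.softmax(dim=-1) if scoring_func == "softmax"
                  else router_logits.sigmoid())
        return grouped_topk_sketch(scores, top_k, num_expert_group, topk_group,
                                   e_score_correction_bias=e_score_correction_bias,
                                   renormalize=renormalize)
    if custom_routing_function is not None:
        # Model-supplied routing takes precedence over the default path.
        return custom_routing_function(hidden_states, router_logits, top_k, renormalize)
    # Default path: plain softmax top-k over all experts.
    weights, ids = torch.topk(router_logits.softmax(dim=-1), k=top_k, dim=-1)
    if renormalize:
        weights = weights / weights.sum(dim=-1, keepdim=True)
    return weights, ids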

vllm/model_executor/models/deepseek_v3.py

Lines changed: 43 additions & 21 deletions
@@ -90,6 +90,28 @@ def forward(self, x):
         return x


+class MoEGate(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+    ):
+        super().__init__()
+        # TODO(simon): make this replicated linear
+        self.weight = nn.Parameter(
+            torch.empty(config.n_routed_experts, config.hidden_size))
+        if config.topk_method == "noaux_tc":
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty((config.n_routed_experts)))
+        else:
+            self.e_score_correction_bias = None
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.linear(hidden_states,
+                                          self.weight,
+                                          bias=None)
+
+
 class DeepseekV3MoE(nn.Module):

     def __init__(
@@ -112,24 +134,22 @@ def __init__(
             raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                              "Only silu is supported for now.")

-        self.experts = FusedMoE(num_experts=config.n_routed_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=config.norm_topk_prob,
-                                quant_config=quant_config,
-                                use_grouped_topk=True,
-                                num_expert_group=config.n_group,
-                                topk_group=config.topk_group,
-                                prefix=f"{prefix}.experts",
-                                scoring_func=config.scoring_func)
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.n_routed_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
+        self.gate = MoEGate(config)
+        self.experts = FusedMoE(
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func=config.scoring_func,
+            e_score_correction_bias=self.gate.e_score_correction_bias)
+
         if config.n_shared_experts is not None:
             intermediate_size = (config.moe_intermediate_size *
                                  config.n_shared_experts)
@@ -147,7 +167,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if self.n_shared_experts is not None:
             shared_output = self.shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
+        router_logits = self.gate(hidden_states)
         final_hidden_states = self.experts(
             hidden_states=hidden_states,
             router_logits=router_logits) * self.routed_scaling_factor
@@ -244,8 +264,7 @@ def __init__(
                                         bias=False,
                                         quant_config=quant_config,
                                         prefix=f"{prefix}.o_proj")
-        if rope_scaling:
-            rope_scaling["rope_type"] = 'deepseek_yarn'
+        rope_scaling["rope_type"] = 'deepseek_yarn'
         self.rotary_emb = get_rope(qk_rope_head_dim,
                                    rotary_dim=qk_rope_head_dim,
                                    max_position=max_position_embeddings,
@@ -624,6 +643,9 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 if is_pp_missing_parameter(name, self):
                     continue

+                if name not in params_dict:
+                    for key in params_dict:
+                        print(key)
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
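
The gate is now a plain nn.Module that owns the router weight and, when config.topk_method == "noaux_tc" (as in DeepSeek-V3), the e_score_correction_bias parameter that is handed to FusedMoE. Unlike the previous ReplicatedLinear it returns the raw logits tensor rather than an (output, bias) tuple, which is why the forward() call site drops the tuple unpacking. A standalone usage sketch with a toy stand-in config (SimpleNamespace replaces PretrainedConfig and the sizes are made up):

from types import SimpleNamespace
import torch
import torch.nn as nn
import torch.nn.functional as F

class MoEGateSketch(nn.Module):
    """Illustrative copy of the MoEGate added above."""

    def __init__(self, config):
        super().__init__()
        self.weight = nn.Parameter(
            torch.empty(config.n_routed_experts, config.hidden_size))
        # The bias is a learned parameter loaded from the checkpoint; FusedMoE
        # receives it as e_score_correction_bias for the grouped top-k routing.
        if config.topk_method == "noaux_tc":
            self.e_score_correction_bias = nn.Parameter(
                torch.empty(config.n_routed_experts))
        else:
            self.e_score_correction_bias = None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Raw router logits of shape [num_tokens, n_routed_experts]; no bias, no tuple.
        return F.linear(hidden_states, self.weight, bias=None)

# Toy config; real values come from the DeepSeek-V3 Hugging Face config.
cfg = SimpleNamespace(n_routed_experts=8, hidden_size=16, topk_method="noaux_tc")
gate = MoEGateSketch(cfg)
nn.init.normal_(gate.weight, std=0.02)
nn.init.zeros_(gate.e_score_correction_bias)
router_logits = gate(torch.randn(4, cfg.hidden_size))  # [4, 8]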
