@@ -111,7 +111,6 @@ class BertEncoder(nn.Module):
     def __init__(self,
                  vllm_config: VllmConfig,
                  bias: bool = True,
-                 gate_up_proj_bias: bool = True,
                  rotary_kwargs: Optional[dict] = None,
                  prefix: str = ""):
         super().__init__()
@@ -123,7 +122,6 @@ def __init__(self,
                       cache_config=cache_config,
                       quant_config=quant_config,
                       bias=bias,
-                      gate_up_proj_bias=gate_up_proj_bias,
                       rotary_kwargs=rotary_kwargs,
                       prefix=f"{prefix}.layer.{layer_idx}")
             for layer_idx in range(config.num_hidden_layers)
@@ -146,7 +144,6 @@ def __init__(self,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  bias: bool = True,
-                 gate_up_proj_bias: bool = True,
                  rotary_kwargs: Optional[dict] = None,
                  prefix: str = ""):
         super().__init__()
@@ -166,7 +163,7 @@ def __init__(self,
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
-                gate_up_proj_bias=gate_up_proj_bias,
+                bias=bias,
                 quant_config=quant_config,
                 prefix=f"{prefix}.intermediate")
         else:
@@ -350,15 +347,15 @@ def __init__(self,
                  hidden_size: int,
                  intermediate_size: int,
                  hidden_act: str,
-                 gate_up_proj_bias: bool = True,
+                 bias: bool = True,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = ""):
         super().__init__()
         self.act_fn = get_act_and_mul_fn(hidden_act)
         self.gate_up_proj = MergedColumnParallelLinear(
             hidden_size,
             [intermediate_size] * 2,
-            bias=gate_up_proj_bias,
+            bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.gate_up_proj",
         )
@@ -410,24 +407,18 @@ def __init__(self,
                  prefix: str = "",
                  embedding_class: type = BertEmbedding,
                  bias: bool = True,
-                 gate_up_proj_bias: bool = True,
                  rotary_kwargs: Optional[dict] = None,
                  add_pooling_layer: bool = False):
         super().__init__()
         """
         For BertModel, all linear layers have bias.
-        For NomicBertModel, all linear layers do not have bias,
-        the bias parameter intended to control all linear layers.
-        For GteModel, only up_gate_proj layer does not have bias,
-        so the gate_up_proj_bias parameter must be added.
-        see #16649
+        For NomicBertModel, all linear layers do not have bias.
         """

         config = vllm_config.model_config.hf_config
         self.embeddings = embedding_class(config)
         self.encoder = BertEncoder(vllm_config=vllm_config,
                                    bias=bias,
-                                   gate_up_proj_bias=gate_up_proj_bias,
                                    rotary_kwargs=rotary_kwargs,
                                    prefix=f"{prefix}.encoder")
         self.pooler = BertPooler(config) if add_pooling_layer else None
@@ -672,7 +663,6 @@ def _build_model(self,
         return BertModel(vllm_config=vllm_config,
                          prefix=prefix,
                          bias=False,
-                         gate_up_proj_bias=False,
                          rotary_kwargs=rotary_kwargs,
                          embedding_class=BertEmbedding)

@@ -694,6 +684,7 @@ def _build_model(self,

         assert config.__class__.__name__ == "GteConfig"
         assert config.position_embedding_type == "rope"
+        assert config.hidden_act == "gelu"

         config.position_embedding_type = "rotary"
         config.hidden_act = "gelu_and_mul"
@@ -706,11 +697,21 @@ def _build_model(self,
             "base": config.rope_theta,
         }

-        return BertModel(vllm_config=vllm_config,
-                         prefix=prefix,
-                         gate_up_proj_bias=False,
-                         rotary_kwargs=rotary_kwargs,
-                         embedding_class=BertEmbedding)
+        model = BertModel(vllm_config=vllm_config,
+                          prefix=prefix,
+                          rotary_kwargs=rotary_kwargs,
+                          embedding_class=BertEmbedding)
+
+        # In GteModel, only gate_up_proj has no bias.
+        for layer in model.encoder.layer:
+            layer.intermediate.gate_up_proj = MergedColumnParallelLinear(
+                config.hidden_size,
+                [config.intermediate_size] * 2,
+                bias=False,
+                quant_config=vllm_config.quant_config,
+                prefix=f"{prefix}.gate_up_proj",
+            )
+        return model

     def split_up_gate_proj(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         n = "mlp.up_gate_proj"