@@ -618,12 +618,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
            res = "glm4"
        if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
            # ref: https://huggingface.co/zai-org/GLM-4.5-Air, https://huggingface.co/zai-org/GLM-4.5
-            res = "gpt-2"
+            res = "glm4"
        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
            # ref: https://huggingface.co/LumiOpen/Viking-7B
            res = "viking"
@@ -3961,33 +3964,32 @@ class Glm4MoeModel(Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
-        self.block_count = self.hparams["num_hidden_layers"] + 1
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
+
    def set_vocab(self):
        from transformers import AutoTokenizer

-        tokenizer = AutoTokenizer.from_pretrained(
-            self.dir_model, trust_remote_code=True
-        )
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

-        # Set special tokens
-        special_vocab._set_special_token(
-            "eos", tokenizer.get_added_vocab()["<|endoftext|>"]
-        )
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-        special_vocab._set_special_token(
-            "unk", tokenizer.get_added_vocab()["<|endoftext|>"]
-        )
-        special_vocab._set_special_token(
-            "bos", tokenizer.get_added_vocab()["<|endoftext|>"]
-        )
+        # Special tokens
+        # Note: Using <|endoftext|> (151329) for eot causes endless generation
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
+
+        # Patch broken chat template
+        if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
+            special_vocab.chat_template = special_vocab.chat_template.replace(
+                """{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
+                """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")

        special_vocab.add_to_gguf(self.gguf_writer)
@@ -4001,10 +4003,9 @@ def set_gguf_parameters(self):
            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
        )

-        # MoE parameters
-        if (n_experts := self.hparams.get("n_routed_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        # Note: expert_used_count is already set by parent class using num_experts_per_tok
+        # MoE parameters - Use only routed expert count (shared experts handled separately)
+        if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
@@ -4023,8 +4024,11 @@ def set_gguf_parameters(self):
        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)

+        # NextN/MTP prediction layers
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
    _experts: list[dict[str, Tensor]] | None = None
-    _shared_experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(
        self, data_torch: Tensor, name: str, bid: int | None
@@ -4035,21 +4039,17 @@ def modify_tensors(
        name = name.replace("language_model.", "")  # for multimodal variants

        # Handle main token embedding (but not layer-specific NextN embeddings)
-        if name == "model.embed_tokens.weight":
+        if name == "model.embed_tokens.weight" and ".layers." not in name:
            return [(self.map_tensor_name("token_embd.weight"), data_torch)]

        # Handle routed experts
-        if name.find("mlp.experts") != -1 and "shared_experts" not in name:
+        if name.find("mlp.experts") != -1:
            n_experts = self.hparams["n_routed_experts"]
            assert bid is not None

            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

-            # Extend experts array if needed (for models where actual layers > num_hidden_layers)
-            while len(self._experts) <= bid:
-                self._experts.append({})
-
            self._experts[bid][name] = data_torch

            if len(self._experts[bid]) >= n_experts * 3:
@@ -4065,95 +4065,21 @@ def modify_tensors(
                        del self._experts[bid][ename]

                    data_torch = torch.stack(datas, dim=0)
-                    # Generate GGUF tensor names for merged experts
-                    if w_name == "down_proj":
-                        new_name = f"blk.{bid}.ffn_down_exps.weight"
-                    elif w_name == "gate_proj":
-                        new_name = f"blk.{bid}.ffn_gate_exps.weight"
-                    elif w_name == "up_proj":
-                        new_name = f"blk.{bid}.ffn_up_exps.weight"
-                    else:
-                        merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-                        new_name = self.map_tensor_name(merged_name)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
                    tensors.append((new_name, data_torch))
                return tensors
            else:
                return []

-        # Handle expert gating input (routing gate)
-        if ".mlp.gate.e_score_correction_bias" in name:
-            new_name = name.replace("model.layers.", "blk.").replace(
-                ".mlp.gate.e_score_correction_bias", ".ffn_gate_inp.bias"  # *NOTE* this is ".exp_probs_b" in mainline PR
-            )
-            return [(new_name, data_torch)]
-        elif ".mlp.gate.weight" in name:
-            new_name = name.replace("model.layers.", "blk.").replace(
-                ".mlp.gate.weight", ".ffn_gate_inp.weight"
-            )
-            return [(new_name, data_torch)]
-
-        # Handle shared expert tensors
-        if ".mlp.shared_experts." in name:
-            new_name = name.replace("model.layers.", "blk.").replace(".mlp.shared_experts.", ".ffn_")
-            if "gate_proj" in new_name:
-                new_name = new_name.replace("gate_proj", "gate_shexp")
-            elif "down_proj" in new_name:
-                new_name = new_name.replace("down_proj", "down_shexp")
-            elif "up_proj" in new_name:
-                new_name = new_name.replace("up_proj", "up_shexp")
-            return [(new_name, data_torch)]
-
-        # Handle regular dense FFN layers (for hybrid dense/MoE architecture)
-        if ".mlp." in name and "experts" not in name and "_shexp" not in name:
-            if "gate_proj" in name:
-                new_name = name.replace("model.layers.", "blk.").replace(
-                    ".mlp.gate_proj.weight", ".ffn_gate.weight"
-                )
-            elif "up_proj" in name:
-                new_name = name.replace("model.layers.", "blk.").replace(
-                    ".mlp.up_proj.weight", ".ffn_up.weight"
-                )
-            elif "down_proj" in name:
-                new_name = name.replace("model.layers.", "blk.").replace(
-                    ".mlp.down_proj.weight", ".ffn_down.weight"
-                )
-            else:
-                new_name = name
-            return [(self.map_tensor_name(new_name), data_torch)]
-
-        # Handle special NextN tensors - preserve for future MTP support - See https://github.com/ggml-org/llama.cpp/pull/13236
-        if (
-            ".embed_tokens." in name
-            or ".shared_head." in name
-            or ".eh_proj." in name
-            or ".enorm." in name
-            or ".hnorm." in name
-        ):
-            new_name = name.replace("model.layers.", "blk.").replace("model.", "").replace(".weight", "")
-            # logger.debug(f"Skipping MTP tensor: {new_name}")
-            return [(new_name, data_torch)]
-
-        # GLM tensor mapping - handle directly without map_tensor_name
-        if ".input_layernorm." in name:
-            new_name = name.replace("model.layers.", "blk.").replace(".input_layernorm.", ".attn_norm.")
-            return [(new_name, data_torch)]
-        elif ".post_attention_layernorm." in name:
-            new_name = name.replace("model.layers.", "blk.").replace(".post_attention_layernorm.", ".ffn_norm.")
-            return [(new_name, data_torch)]
-        elif ".self_attn." in name:
-            # Map GLM self_attn to standard attention naming
-            new_name = name.replace("model.layers.", "blk.").replace(".self_attn.", ".attn_")
-            if "q_proj" in new_name:
-                new_name = new_name.replace("q_proj", "q")
-            elif "k_proj" in new_name:
-                new_name = new_name.replace("k_proj", "k")
-            elif "v_proj" in new_name:
-                new_name = new_name.replace("v_proj", "v")
-            elif "o_proj" in new_name:
-                new_name = new_name.replace("o_proj", "output")
-            return [(new_name, data_torch)]
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")

-        return super().modify_tensors(data_torch, name, bid)
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]

    def prepare_tensors(self):
        super().prepare_tensors()