@@ -888,6 +888,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
             # ref: https://huggingface.co/JetBrains/Mellum-4b-base
             res = "mellum"
+        if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
+            # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
+            res = "llada-moe"
 
         if res is None:
             logger.warning("\n")
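
For context: `get_vocab_base_pre()` derives `chkhsh` by hashing the tokenization of a fixed probe string, so entries like the one above are normally regenerated with `convert_hf_to_gguf_update.py` rather than written by hand. A minimal sketch of the mechanism, assuming the `transformers` package and access to the model repo (the probe text below is a placeholder, not the converter's actual `chktxt`):

from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("inclusionAI/LLaDA-MoE-7B-A1B-Base", trust_remote_code=True)
chktxt = "example probe text"  # placeholder; the converter uses its own fixed chktxt
chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
print(chkhsh)  # compared against the table of known pre-tokenizer hashes
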
@@ -8239,6 +8242,76 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
+class LLaDAMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LLADA_MOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+        if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
+
+        # number of experts used per token (top-k)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
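+        # LLaDA-MoE is a masked-diffusion LM: masked tokens are predicted with
+        # bidirectional attention, hence the non-causal/non-shifted flags and
+        # the [MASK] token id recorded below.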
+        self.gguf_writer.add_mask_token_id(156895)
+        self.gguf_writer.add_causal_attention(False)
+        self.gguf_writer.add_diffusion_shift_logits(False)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
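+                    # stack the n_experts 2D matrices into one 3D tensor with
+                    # a leading expert dimension: (n_experts, rows, cols)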
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
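
The merge loop above is easier to see on toy data. A self-contained sketch with made-up sizes (LLaDA-MoE's real dimensions will differ):

import torch

# pretend we collected one gate_proj weight per expert, as modify_tensors does
n_experts, n_embd, n_ff_exp = 4, 8, 16
per_expert = [torch.randn(n_ff_exp, n_embd) for _ in range(n_experts)]
merged = torch.stack(per_expert, dim=0)  # same call as in the loop above
print(merged.shape)  # torch.Size([4, 16, 8]) -> (n_experts, n_ff_exp, n_embd)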
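
Once this lands, conversion should go through the converter's standard entry point, e.g. `python convert_hf_to_gguf.py /path/to/LLaDA-MoE-7B-A1B-Base --outfile llada-moe.gguf --outtype f16` (output filename illustrative).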