@@ -1847,6 +1847,124 @@ class StarCoder2Model(Model):
    model_arch = gguf.MODEL_ARCH.STARCODER2


+@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+class MambaModel(Model):
+    model_arch = gguf.MODEL_ARCH.MAMBA
+
+    def set_vocab(self):
+        vocab_size = self.hparams["vocab_size"]
+        # Round vocab size to next multiple of 8
+        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
+        # pad using ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
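+        # e.g. a vocab_size of 50277 is padded up to 50280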
+        self.hparams["vocab_size"] = vocab_size
+
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # Use the GPT-NeoX tokenizer when no tokenizer files are present
+            tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
+            print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+            neox_reader = gguf.GGUFReader(tokenizer_path, "r")
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
+            self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
+            self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
+            self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
+            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
+            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
+            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
+            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
+
+    def set_gguf_parameters(self):
+        d_model = self.find_hparam(["hidden_size", "d_model"])
+        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
+        # ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+        dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
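+        # e.g. d_model == 2560 gives dt_rank == ceil(2560 / 16) == 160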
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+
+        # Fail early for models which don't have a block expansion factor of 2
+        assert d_inner == 2 * d_model
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
+        self.gguf_writer.add_embedding_length(d_model)
+        self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams["n_layer"]
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        tok_embd = None
+        tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
+        output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
+
+        for name, data_torch in self.get_tensors():
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
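+            # the checkpoint stores A_log; write A = -exp(A_log), which is what the code below computes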
+            if name.endswith(".A_log"):
+                print("A_log --> A ==> " + new_name)
+                data_torch = -torch.exp(data_torch)
+
+            # assuming token_embd.weight is seen before output.weight
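+            # if the output head is identical to the token embeddings (tied weights), skip the duplicate tensor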
+            if tok_embd is not None and new_name == output_name:
+                if torch.equal(tok_embd, data_torch):
+                    print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
+                    continue
+            if new_name == tok_embd_name:
+                tok_embd = data_torch
+
+            data = data_torch.squeeze().numpy()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert big float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
###### CONVERSION LOGIC ######