@@ -7950,6 +7950,119 @@ def set_vocab(self):
            self.gguf_writer.add_chat_template(chat_template)


+@ModelBase.register("GptOssForCausalLM")
+class GptOssModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GPT_OSS
+
+    def transform_nibble_layout(self, tensor):
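+        # rearrange the packed FP4 nibbles from the checkpoint's layout into the byte order ggml's MXFP4 type expects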
+        assert tensor.dtype == torch.uint8
+        assert tensor.shape[-1] == 16
+        # swap nibbles
+        t_lo = tensor & 0x0F
+        t_hi = tensor & 0xF0
+        t_swapped = (t_lo << 4) | (t_hi >> 4)
+        tensor = t_swapped
+        # transform aaaa...bbbb... to abababab...
+        blk_a, blk_b = tensor.chunk(2, dim=-1)
+        # get a_
+        blk_a0 = (blk_a & 0xF0).view(-1, 1)
+        blk_a1 = (blk_a << 4).view(-1, 1)
+        blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape)
+        # get _b
+        blk_b0 = (blk_b >> 4).view(-1, 1)
+        blk_b1 = (blk_b & 0x0F).view(-1, 1)
+        blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape)
+        # swap once more
+        out = blk_a | blk_b
+        out_h = out & 0xF0
+        out_l = out & 0x0F
+        out = (out_h >> 4) | (out_l << 4)
+        return out
+
+    def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
+        assert blocks.dtype == torch.uint8
+        assert scales.dtype == torch.uint8
+        scales = scales.unsqueeze(-1)
+        assert len(blocks.shape) == 4
+        assert len(scales.shape) == 4
+        blocks = self.transform_nibble_layout(blocks)
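+        # each MXFP4 block stores one E8M0 scale byte followed by 16 bytes holding 32 packed 4-bit values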
+        new_data = torch.concat((scales, blocks), dim=-1)
+        new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32]
+        logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4")
+        # flatten last dim
+        new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3])
+        new_data = new_data.numpy()
+        self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
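+        # pair each experts *_blocks tensor with its matching *_scales tensor and repack the pair as one MXFP4 tensor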
+        blocks0: Tensor = torch.zeros(1)
+        blocks1: Tensor = torch.zeros(1)
+        found_mxfp4_tensors = False
+        # we assume that tensors are loaded in the correct order
+        for name, data_torch in self.get_tensors():
+            if "mlp.experts.down_proj_blocks" in name:
+                blocks0 = data_torch
+            elif "mlp.experts.down_proj_scales" in name:
+                new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
+                self.repack_mxfp4(new_name, blocks0, data_torch)
+                found_mxfp4_tensors = True
+            elif "mlp.experts.gate_up_proj_blocks" in name:
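+                # gate and up rows are interleaved in gate_up_proj: even rows are gate, odd rows are up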
+                blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
+            elif "mlp.experts.gate_up_proj_scales" in name:
+                scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :]
+                new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight"))
+                new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
+                self.repack_mxfp4(new_name_gate, blocks0, scales0)
+                self.repack_mxfp4(new_name_up, blocks1, scales1)
+                found_mxfp4_tensors = True
+        if not found_mxfp4_tensors:
+            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using an MXFP4 model.")
+        return []
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
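+        # attention sink tensors are stored without a ".weight" suffix in the checkpoint; add it so the name maps correctly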
+ if "sinks" in name :
8026
+ name += ".weight"
8027
+
8028
+ # correct naming for down_proj
8029
+ if "down_proj" in name :
8030
+ if name .endswith ("_bias" ):
8031
+ name = name .replace ("down_proj_bias" , "down_proj.bias" )
8032
+ else :
8033
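+                # quantized down_proj blocks/scales are repacked in generate_extra_tensors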
+                return []
+
+        # split the gate_up into gate and up
+        if "gate_up_proj" in name:
+            if name.endswith("_bias"):
+                name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
+                name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
+                gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2]
+                return [
+                    (self.map_tensor_name(name_gate), gate_proj_bias),
+                    (self.map_tensor_name(name_up), up_proj_bias)
+                ]
+            else:
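+                # quantized gate_up_proj blocks/scales are repacked in generate_extra_tensors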
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
+        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+        self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
+
+
@ModelBase.register("Lfm2ForCausalLM")
@ModelBase.register("LFM2ForCausalLM")
class LFM2Model(TextModel):
@@ -8089,6 +8202,7 @@ class LazyTorchTensor(gguf.LazyBase):
    _dtype_map: dict[torch.dtype, type] = {
        torch.float16: np.float16,
        torch.float32: np.float32,
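+        # uint8 is needed for the raw MXFP4 blocks/scales read by GptOssModel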
+        torch.uint8: np.uint8,
    }

    # used for safetensors slices