freqs_cis : -1 # torch.complex64 (2048, 64)
tok_embeddings.weight : 1 # torch.float32 (32000, 4096)
+ tok_embeddings.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wo.weight : 1 # torch.int8 (4096, 4096)
layers.*.attention.wo.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wq.weight : 0 # torch.int8 (4096, 4096)
@@ -15,9 +16,13 @@ layers.*.attention.wk.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wv.weight : 0 # torch.int8 (4096, 4096)
layers.*.attention.wv.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.feed_forward.w1.weight : 0 # torch.float32 (11008, 4096)
+ layers.*.feed_forward.w1.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.feed_forward.w2.weight : 1 # torch.float32 (4096, 11008)
+ layers.*.feed_forward.w2.weight_scaler : 0 # torch.bfloat16 (11008,)
layers.*.feed_forward.w3.weight : 0 # torch.float32 (11008, 4096)
+ layers.*.feed_forward.w3.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention_norm.weight : -1 # torch.float32 (4096,)
layers.*.ffn_norm.weight : -1 # torch.float32 (4096,)
norm.weight : -1 # torch.float32 (4096,)
output.weight : 0 # torch.float32 (32000, 4096)
+ output.weight_scaler : 0 # torch.float32 (4096,)
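Each added `weight_scaler` entry pairs a quantized weight with a per-channel scale whose length matches one axis of that weight (e.g. an int8 `(4096, 4096)` attention projection with a bfloat16 `(4096,)` scaler). The sketch below is not code from this PR; it only illustrates how such a pair can arise under symmetric per-channel int8 quantization, and the function name and scheme are assumptions for illustration.

```python
import torch

def quantize_per_channel(w: torch.Tensor, axis: int = 1):
    """Return an int8 copy of `w` plus a bfloat16 per-channel scale (illustrative helper)."""
    # Largest absolute value per channel, reducing over `axis`.
    max_abs = w.abs().amax(dim=axis, keepdim=True)
    scaler = (max_abs / 127.0).clamp(min=1e-8)
    q = torch.round(w / scaler).clamp(-128, 127).to(torch.int8)
    return q, scaler.squeeze(axis).to(torch.bfloat16)

# A (4096, 4096) projection yields an int8 (4096, 4096) weight and a
# (4096,) bfloat16 weight_scaler, matching the attention entries above.
w = torch.randn(4096, 4096)
q, s = quantize_per_channel(w)
assert q.dtype == torch.int8 and s.shape == (4096,)
```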