Skip to content

Commit 4608090

Browse files
authored
【Hackathon 9th No.54、57】 add unit tests for per_token_quant and per_token_quant_padding (#3746)
1 parent 7baf1b5 commit 4608090

File tree

1 file changed

+171
-0
lines changed

1 file changed

+171
-0
lines changed
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import unittest
2+
3+
import numpy as np
4+
import paddle
5+
import paddle.nn.functional as F
6+
7+
from fastdeploy.model_executor.ops.gpu import per_token_quant, per_token_quant_padding
8+
9+
# Fix Paddle's global RNG seed so the random inputs drawn with paddle.randn
# in the test cases below are reproducible from run to run.
paddle.seed(2024)
10+
11+
12+
def per_token_quant_paddle(input_tensor, block_size):
    """Pure-Paddle reference for block-wise per-token FP8 quantization.

    Each row of ``input_tensor`` is split into consecutive groups of
    ``block_size`` elements along the last axis (zero-padded on the right
    when the hidden size is not a multiple of ``block_size``). Every group
    gets its own scale, chosen so that the group's largest magnitude maps
    to 448.0 after division.

    Args:
        input_tensor: 2-D tensor indexed as ``[token_num, hidden_size]``.
        block_size: number of consecutive hidden elements sharing one scale.

    Returns:
        A tuple ``(quanted_x, quanted_scale)``:
        ``quanted_x`` is the ``float8_e4m3fn`` tensor with the input's
        ``[token_num, hidden_size]`` shape (padding columns removed), and
        ``quanted_scale`` is the float32 tensor of per-group scales with
        shape ``[token_num, ceil(hidden_size / block_size)]``.
    """
    fp8_max = 448.0
    min_abs = 1e-10  # floor for the group maximum; avoids dividing by zero

    token_num, hidden_size = input_tensor.shape[0], input_tensor.shape[1]

    # According to https://github.com/PaddlePaddle/FastDeploy/pull/3659
    # the hidden dimension is zero-padded up to a multiple of block_size.
    pad_cols = -hidden_size % block_size
    if pad_cols > 0:
        source = F.pad(input_tensor, pad=[0, pad_cols], mode="constant", value=0.0)
    else:
        source = input_tensor

    padded_hidden = hidden_size + pad_cols
    num_groups = padded_hidden // block_size

    # View every row as [num_groups, block_size] and compute in float32.
    grouped = source.reshape([token_num, num_groups, block_size]).astype("float32")

    group_max = grouped.abs().max(axis=-1, keepdim=True).clip(min=min_abs)
    scale = group_max / fp8_max

    quantized = (grouped / scale).to(paddle.float8_e4m3fn)
    # Flatten the groups back into rows, then cut the padding columns off.
    quanted_x = quantized.reshape([token_num, padded_hidden])[:, :hidden_size]

    return quanted_x, scale.squeeze(axis=-1)
46+
47+
48+
def per_token_quant_padding_paddle(input_tensor, block_size, dtype):
    """Reference for ``per_token_quant_padding`` built on the plain reference.

    The quantized values come straight from ``per_token_quant_paddle``; only
    the scale tensor differs: its row count is rounded up to a multiple of 4
    and the extra rows are filled with zeros.

    Args:
        input_tensor: 2-D tensor indexed as ``[token_num, hidden_size]``.
        block_size: number of consecutive hidden elements sharing one scale.
        dtype: accepted for call-site compatibility; not used by this
            reference implementation.

    Returns:
        A tuple ``(quanted_x, padded_scale)`` where ``padded_scale`` is a
        float32 tensor of shape ``[padded_token_num, num_groups]``.
    """
    quanted_x, scale = per_token_quant_paddle(input_tensor, block_size)

    rows = input_tensor.shape[0]
    alignment = 4
    # Round the row count up to the next multiple of the alignment.
    aligned_rows = rows + (-rows % alignment)

    padded_scale = paddle.zeros([aligned_rows, scale.shape[1]], dtype="float32")
    padded_scale[:rows, :] = scale

    return quanted_x, padded_scale
61+
62+
63+
class TestPerTokenQuant(unittest.TestCase):
    """Check the ``per_token_quant`` GPU op against the Paddle reference.

    Base configuration: float16 input of shape [4, 500] with block size 128,
    i.e. a hidden size that is not a multiple of the block size. Subclasses
    override ``setUp`` to vary dtype and shape.
    """

    def get_input(self, shape, dtype):
        """Return a random tensor drawn with ``paddle.randn``."""
        return paddle.randn(shape=shape, dtype=dtype)

    def setUp(self) -> None:
        self.dtype = paddle.float16
        self.token_num = 4
        self.hidden_size = 500
        self.block_size = 128
        self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype)

    def test_per_token_quant(self):
        """Scales must match closely; quantized values within 0.1% on average."""
        paddle_output, paddle_output_scale = per_token_quant_paddle(self.input_tensor, self.block_size)
        output, output_scale = per_token_quant(self.input_tensor, self.block_size)

        np.testing.assert_allclose(paddle_output_scale.numpy(), output_scale.numpy(), rtol=1e-6)

        # Mean absolute error relative to the mean magnitude of the reference.
        output_rel_diff = paddle.mean(
            paddle.abs(output.to(paddle.float32) - paddle_output.to(paddle.float32))
        ) / paddle.mean(paddle.abs(paddle_output.to(paddle.float32)))

        # Use a unittest assertion instead of a bare ``assert``: it is not
        # stripped under ``python -O`` and reports both values on failure.
        self.assertLess(output_rel_diff.item(), 0.001)
85+
86+
87+
class TestPerTokenQuantCase1(TestPerTokenQuant):
    """float16 input whose hidden size (6 * 128) is a multiple of the block size."""

    def setUp(self) -> None:
        self.dtype = paddle.float16
        self.token_num, self.hidden_size, self.block_size = 4, 6 * 128, 128
        tensor_shape = [self.token_num, self.hidden_size]
        self.input_tensor = self.get_input(tensor_shape, self.dtype)
94+
95+
96+
class TestPerTokenQuantCase2(TestPerTokenQuant):
    """bfloat16 input whose hidden size (500) is not a multiple of the block size."""

    def setUp(self) -> None:
        self.dtype = paddle.bfloat16
        self.token_num, self.hidden_size, self.block_size = 4, 500, 128
        tensor_shape = [self.token_num, self.hidden_size]
        self.input_tensor = self.get_input(tensor_shape, self.dtype)
103+
104+
105+
class TestPerTokenQuantCase3(TestPerTokenQuant):
    """bfloat16 input whose hidden size (6 * 128) is a multiple of the block size."""

    def setUp(self) -> None:
        self.dtype = paddle.bfloat16
        self.token_num, self.hidden_size, self.block_size = 4, 6 * 128, 128
        tensor_shape = [self.token_num, self.hidden_size]
        self.input_tensor = self.get_input(tensor_shape, self.dtype)
112+
113+
114+
class TestPerTokenQuantPadding(TestPerTokenQuant):
    """Check the ``per_token_quant_padding`` GPU op against the Paddle reference.

    Base configuration: float16 input of shape [6, 512] with block size 128.
    Because this class derives from ``TestPerTokenQuant``, the inherited
    ``test_per_token_quant`` also runs with this configuration.
    """

    def setUp(self) -> None:
        self.dtype = paddle.float16
        self.token_num = 6
        self.hidden_size = 128 * 4
        self.block_size = 128
        self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype)

    def test_per_token_quant_padding(self):
        """Scale shape must match; real-token scales and values must agree."""
        paddle_output, paddle_output_scale = per_token_quant_padding_paddle(
            self.input_tensor, self.block_size, self.dtype
        )
        output, output_scale = per_token_quant_padding(self.input_tensor, self.block_size)

        self.assertEqual(paddle_output_scale.shape, output_scale.shape)
        # Only the first ``token_num`` rows are compared; any rows beyond
        # that are alignment padding.
        np.testing.assert_allclose(
            paddle_output_scale[0 : self.token_num].numpy(),
            output_scale[0 : self.token_num].numpy(),
            rtol=1e-5,
            atol=1e-5,
        )

        # Mean absolute error relative to the mean magnitude of the
        # reference; the small constant keeps the denominator non-zero.
        output_rel_diff = paddle.mean(
            paddle.abs(output.to(paddle.float32) - paddle_output.to(paddle.float32))
        ) / paddle.mean(paddle.abs(paddle_output.to(paddle.float32)) + 1e-9)

        # Use a unittest assertion instead of a bare ``assert``: it is not
        # stripped under ``python -O`` and reports both values on failure.
        self.assertLess(output_rel_diff.item(), 0.001)
141+
142+
143+
class TestPerTokenQuantPaddingCase1(TestPerTokenQuantPadding):
    """float16 input with 8 tokens and hidden size 4 * 128."""

    def setUp(self) -> None:
        self.dtype = paddle.float16
        self.token_num, self.hidden_size, self.block_size = 8, 4 * 128, 128
        tensor_shape = [self.token_num, self.hidden_size]
        self.input_tensor = self.get_input(tensor_shape, self.dtype)
150+
151+
152+
class TestPerTokenQuantPaddingCase2(TestPerTokenQuantPadding):
    """bfloat16 input with 6 tokens and hidden size 4 * 128."""

    def setUp(self) -> None:
        self.dtype = paddle.bfloat16
        self.token_num, self.hidden_size, self.block_size = 6, 4 * 128, 128
        tensor_shape = [self.token_num, self.hidden_size]
        self.input_tensor = self.get_input(tensor_shape, self.dtype)
159+
160+
161+
class TestPerTokenQuantPaddingCase3(TestPerTokenQuantPadding):
    """bfloat16 input with 8 tokens and hidden size 4 * 128."""

    def setUp(self) -> None:
        self.dtype = paddle.bfloat16
        self.token_num, self.hidden_size, self.block_size = 8, 4 * 128, 128
        tensor_shape = [self.token_num, self.hidden_size]
        self.input_tensor = self.get_input(tensor_shape, self.dtype)
168+
169+
170+
# Run every test case in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)