1
1
// @lint-ignore-every LICENSELINT
2
2
//
3
- // SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
3
+ // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates
4
4
5
5
//
6
6
// SPDX-License-Identifier: Apache-2.0
@@ -15,15 +15,15 @@ namespace kleidiai {
15
15
void NOINLINE gemmkernel_1x1_Neon_fp16_fA0fB0fC0 (GemmParamsFP16* gp) {
16
16
#ifdef __aarch64__
17
17
__asm__ __volatile__ (
18
- " ldr w20 , [%x[gp], %[offsetof_beta]]\n "
18
+ " ldr s16 , [%x[gp], %[offsetof_beta]]\n "
19
19
" mov x25, #0x1\n "
20
20
" fmov v29.8h, #1.0\n "
21
21
" ldr x24, [%x[gp], %[offsetof_b_block_cols]]\n "
22
22
" ldr x23, [%x[gp], %[offsetof_B]]\n "
23
23
" ldr x22, [%x[gp], %[offsetof_C]]\n "
24
- " bic x20, x20, #0x80000000\n "
25
- " cmp x20, #0x0\n "
24
+ " fcmp s16, #0.0\n "
26
25
" csel x25, XZR, x25, EQ\n "
26
+ " csel x25, XZR, x25, VS\n "
27
27
" 1:" // Height 1: Column loop
28
28
" tbz x25, #0, 2f\n "
29
29
" ldr q30, [x22, #0x0]\n "
@@ -177,15 +177,15 @@ void NOINLINE gemmkernel_1x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
177
177
void NOINLINE gemmkernel_2x1_Neon_fp16_fA0fB0fC0 (GemmParamsFP16* gp) {
178
178
#ifdef __aarch64__
179
179
__asm__ __volatile__ (
180
- " ldr w20 , [%x[gp], %[offsetof_beta]]\n "
180
+ " ldr s16 , [%x[gp], %[offsetof_beta]]\n "
181
181
" mov x26, #0x1\n "
182
182
" fmov v27.8h, #1.0\n "
183
183
" ldr x25, [%x[gp], %[offsetof_b_block_cols]]\n "
184
184
" ldr x24, [%x[gp], %[offsetof_B]]\n "
185
185
" ldr x23, [%x[gp], %[offsetof_C]]\n "
186
- " bic x20, x20, #0x80000000\n "
187
- " cmp x20, #0x0\n "
186
+ " fcmp s16, #0.0\n "
188
187
" csel x26, XZR, x26, EQ\n "
188
+ " csel x26, XZR, x26, VS\n "
189
189
" 1:" // Height 2: Column loop
190
190
" tbz x26, #0, 2f\n "
191
191
" ldr q28, [x23, #0x0]\n "
@@ -384,15 +384,15 @@ void NOINLINE gemmkernel_2x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
384
384
void NOINLINE gemmkernel_3x1_Neon_fp16_fA0fB0fC0 (GemmParamsFP16* gp) {
385
385
#ifdef __aarch64__
386
386
__asm__ __volatile__ (
387
- " ldr w20 , [%x[gp], %[offsetof_beta]]\n "
387
+ " ldr s16 , [%x[gp], %[offsetof_beta]]\n "
388
388
" mov x27, #0x1\n "
389
389
" fmov v25.8h, #1.0\n "
390
390
" ldr x26, [%x[gp], %[offsetof_b_block_cols]]\n "
391
391
" ldr x25, [%x[gp], %[offsetof_B]]\n "
392
392
" ldr x24, [%x[gp], %[offsetof_C]]\n "
393
- " bic x20, x20, #0x80000000\n "
394
- " cmp x20, #0x0\n "
393
+ " fcmp s16, #0.0\n "
395
394
" csel x27, XZR, x27, EQ\n "
395
+ " csel x27, XZR, x27, VS\n "
396
396
" 1:" // Height 3: Column loop
397
397
" tbz x27, #0, 2f\n "
398
398
" ldr q26, [x24, #0x0]\n "
@@ -632,15 +632,15 @@ void NOINLINE gemmkernel_3x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
632
632
void NOINLINE gemmkernel_4x1_Neon_fp16_fA0fB0fC0 (GemmParamsFP16* gp) {
633
633
#ifdef __aarch64__
634
634
__asm__ __volatile__ (
635
- " ldr w20 , [%x[gp], %[offsetof_beta]]\n "
635
+ " ldr s16 , [%x[gp], %[offsetof_beta]]\n "
636
636
" mov x28, #0x1\n "
637
637
" fmov v23.8h, #1.0\n "
638
638
" ldr x27, [%x[gp], %[offsetof_b_block_cols]]\n "
639
639
" ldr x26, [%x[gp], %[offsetof_B]]\n "
640
640
" ldr x25, [%x[gp], %[offsetof_C]]\n "
641
- " bic x20, x20, #0x80000000\n "
642
- " cmp x20, #0x0\n "
641
+ " fcmp s16, #0.0\n "
643
642
" csel x28, XZR, x28, EQ\n "
643
+ " csel x28, XZR, x28, VS\n "
644
644
" 1:" // Height 4: Column loop
645
645
" tbz x28, #0, 2f\n "
646
646
" ldr q24, [x25, #0x0]\n "
@@ -921,15 +921,15 @@ void NOINLINE gemmkernel_4x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
921
921
void NOINLINE gemmkernel_5x1_Neon_fp16_fA0fB0fC0 (GemmParamsFP16* gp) {
922
922
#ifdef __aarch64__
923
923
__asm__ __volatile__ (
924
- " ldr w20 , [%x[gp], %[offsetof_beta]]\n "
924
+ " ldr s16 , [%x[gp], %[offsetof_beta]]\n "
925
925
" mov x9, #0x1\n "
926
926
" fmov v21.8h, #1.0\n "
927
927
" ldr x28, [%x[gp], %[offsetof_b_block_cols]]\n "
928
928
" ldr x27, [%x[gp], %[offsetof_B]]\n "
929
929
" ldr x26, [%x[gp], %[offsetof_C]]\n "
930
- " bic x20, x20, #0x80000000\n "
931
- " cmp x20, #0x0\n "
930
+ " fcmp s16, #0.0\n "
932
931
" csel x9, XZR, x9, EQ\n "
932
+ " csel x9, XZR, x9, VS\n "
933
933
" 1:" // Height 5: Column loop
934
934
" tbz x9, #0, 2f\n "
935
935
" ldr q22, [x26, #0x0]\n "
@@ -1251,15 +1251,15 @@ void NOINLINE gemmkernel_5x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
1251
1251
void NOINLINE gemmkernel_6x1_Neon_fp16_fA0fB0fC0 (GemmParamsFP16* gp) {
1252
1252
#ifdef __aarch64__
1253
1253
__asm__ __volatile__ (
1254
- " ldr w20 , [%x[gp], %[offsetof_beta]]\n "
1254
+ " ldr s16 , [%x[gp], %[offsetof_beta]]\n "
1255
1255
" mov x10, #0x1\n "
1256
1256
" fmov v19.8h, #1.0\n "
1257
1257
" ldr x9, [%x[gp], %[offsetof_b_block_cols]]\n "
1258
1258
" ldr x28, [%x[gp], %[offsetof_B]]\n "
1259
1259
" ldr x27, [%x[gp], %[offsetof_C]]\n "
1260
- " bic x20, x20, #0x80000000\n "
1261
- " cmp x20, #0x0\n "
1260
+ " fcmp s16, #0.0\n "
1262
1261
" csel x10, XZR, x10, EQ\n "
1262
+ " csel x10, XZR, x10, VS\n "
1263
1263
" 1:" // Height 6: Column loop
1264
1264
" tbz x10, #0, 2f\n "
1265
1265
" ldr q20, [x27, #0x0]\n "
@@ -1620,15 +1620,15 @@ void NOINLINE gemmkernel_6x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
1620
1620
void NOINLINE gemmkernel_7x1_Neon_fp16_fA0fB0fC0 (GemmParamsFP16* gp) {
1621
1621
#ifdef __aarch64__
1622
1622
__asm__ __volatile__ (
1623
- " ldr w20 , [%x[gp], %[offsetof_beta]]\n "
1623
+ " ldr s16 , [%x[gp], %[offsetof_beta]]\n "
1624
1624
" mov x11, #0x1\n "
1625
1625
" fmov v17.8h, #1.0\n "
1626
1626
" ldr x10, [%x[gp], %[offsetof_b_block_cols]]\n "
1627
1627
" ldr x9, [%x[gp], %[offsetof_B]]\n "
1628
1628
" ldr x28, [%x[gp], %[offsetof_C]]\n "
1629
- " bic x20, x20, #0x80000000\n "
1630
- " cmp x20, #0x0\n "
1629
+ " fcmp s16, #0.0\n "
1631
1630
" csel x11, XZR, x11, EQ\n "
1631
+ " csel x11, XZR, x11, VS\n "
1632
1632
" 1:" // Height 7: Column loop
1633
1633
" tbz x11, #0, 2f\n "
1634
1634
" ldr q18, [x28, #0x0]\n "
@@ -2027,15 +2027,15 @@ void NOINLINE gemmkernel_7x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
2027
2027
void NOINLINE gemmkernel_8x1_Neon_fp16_fA0fB0fC0 (GemmParamsFP16* gp) {
2028
2028
#ifdef __aarch64__
2029
2029
__asm__ __volatile__ (
2030
- " ldr w20 , [%x[gp], %[offsetof_beta]]\n "
2030
+ " ldr s16 , [%x[gp], %[offsetof_beta]]\n "
2031
2031
" mov x12, #0x1\n "
2032
2032
" fmov v15.8h, #1.0\n "
2033
2033
" ldr x11, [%x[gp], %[offsetof_b_block_cols]]\n "
2034
2034
" ldr x10, [%x[gp], %[offsetof_B]]\n "
2035
2035
" ldr x9, [%x[gp], %[offsetof_C]]\n "
2036
- " bic x20, x20, #0x80000000\n "
2037
- " cmp x20, #0x0\n "
2036
+ " fcmp s16, #0.0\n "
2038
2037
" csel x12, XZR, x12, EQ\n "
2038
+ " csel x12, XZR, x12, VS\n "
2039
2039
" 1:" // Height 8: Column loop
2040
2040
" tbz x12, #0, 2f\n "
2041
2041
" ldr q16, [x9, #0x0]\n "
0 commit comments