Fix KleidiAI FP16 (#3769)

Nicoshev · facebook-github-bot · commit 27d6cbe02a1a · 2025-03-11T10:00:19.000-07:00
Summary: X-link: facebookresearch/FBGEMM#849. FP32Test was failing when KleidiAI was enabled. It turns out that FbgemmFPCommon.h and FbgemmPackMatrixB.h were not conditioned to handle using KleidiAI for FP16 but not for FP32. The PackedGemmMatrixFP16 constructors were moved to a .cc file that is compiled with the rest of fbgemm. This ensures the KleidiAI flag is set when compiling such code. Previously, consumers of the library would include FbgemmPackMatrixB.h from their .cpp files. Because the KleidiAI flag was not set when compiling their code, the intended case for float16 was never run. Additionally, we have ingested a change in KleidiAI's inline assembly, which treats NaN beta values as 0. Reviewed By: psaab. Differential Revision: D70606808
diff --git a/defs.bzl b/defs.bzl
@@ -26,6 +26,7 @@ def get_fbgemm_generic_srcs(with_base = False):
         "src/FbgemmI64.cc",
         "src/FbgemmSparseDense.cc",
         "src/FbgemmI8Spmdm.cc",
+        "src/FbgemmPackMatrixB.cc",
         # "src/fp32/FbgemmFP32.cc",
         "src/GenerateKernelDirectConvU8S8S32ACC32.cc",
         "src/GenerateKernel.cc",
diff --git a/include/fbgemm/FbgemmFPCommon.h b/include/fbgemm/FbgemmFPCommon.h
@@ -38,6 +38,18 @@ struct GemmParams {
   float* C;
   uint64_t ldc;
   uint64_t b_block_cols;
+  uint64_t b_block_size;
+};
+
+template <>
+struct GemmParams<float16> {
+  uint64_t k;
+  float* A;
+  const float16* B;
+  float beta;
+  float* C;
+  uint64_t ldc;
+  uint64_t b_block_cols;
 #ifdef FBGEMM_ENABLE_KLEIDIAI
   uint64_t lda;
 #else
@@ -163,10 +175,15 @@ void cblas_gemm_compute(
           assert(kernel_nrows * kb < static_cast<int64_t>(scratchpad->size()));
           if (m != 1) {
 #ifdef FBGEMM_ENABLE_KLEIDIAI
-            gp.A = const_cast<float*>(&A[m2 * k + k_ind]);
-#else
-            PackA(kernel_nrows, kb, &A[m2 * k + k_ind], k, scratchpad->data());
-            gp.A = scratchpad->data();
+            if constexpr (std::is_same<T, float16>::value) {
+              gp.A = const_cast<float*>(&A[m2 * k + k_ind]);
+            } else {
+#endif
+              PackA(
+                  kernel_nrows, kb, &A[m2 * k + k_ind], k, scratchpad->data());
+              gp.A = scratchpad->data();
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+            }
 #endif
           } else {
             // When m == 1, it is actually vector matrix multiplication. We
@@ -184,11 +201,14 @@ void cblas_gemm_compute(
           gp.ldc = ldc * sizeof(C[0]);
           gp.b_block_cols = nbcol;
 #ifdef FBGEMM_ENABLE_KLEIDIAI
-          gp.lda = k * sizeof(A[0]);
-#else
-          gp.b_block_size = gp.k * Bp.blockColSize() * sizeof(gp.B[0]);
+          if constexpr (std::is_same<T, float16>::value) {
+            gp.lda = k * sizeof(A[0]);
+          } else {
+#endif
+            gp.b_block_size = gp.k * Bp.blockColSize() * sizeof(gp.B[0]);
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+          }
 #endif
-
           if ((n % Bp.blockColSize()) == 0) {
             int64_t jb_begin, jb_end;
             fbgemmPartition1D(
diff --git a/include/fbgemm/FbgemmPackMatrixB.h b/include/fbgemm/FbgemmPackMatrixB.h
@@ -62,15 +62,12 @@ class PackedGemmMatrixB {
       const float alpha,
       const float* smat,
       const int brow = 512)
-      : nrow_(nrow),
-        ncol_(ncol),
-        brow_(brow),
-#ifdef FBGEMM_ENABLE_KLEIDIAI
-        kernel_ncol_blocks_(1)
-#else
-        kernel_ncol_blocks_(2)
+      : nrow_(nrow), ncol_(ncol), brow_(brow), kernel_ncol_blocks_(2) {
+#if defined(FBGEMM_ENABLE_KLEIDIAI)
+    if (std::is_same<T, float16>::value) {
+      kernel_ncol_blocks_ = 1;
+    }
 #endif
-  {
     initializeParam();
     initializeMemory();
     // copy source matrix into packed matrix
@@ -95,6 +92,11 @@ class PackedGemmMatrixB {
         nbcol_(nbcol),
         size_(size),
         kernel_ncol_blocks_(2) {
+#if defined(FBGEMM_ENABLE_KLEIDIAI)
+    if (std::is_same<T, float16>::value) {
+      kernel_ncol_blocks_ = 1;
+    }
+#endif
     initializeMemory();
   }
 
@@ -297,4 +299,30 @@ class PackedGemmMatrixB {
   bool pmat_passed_in{false};
 };
 
+#ifndef FBGEMM_STATIC
+
+template <>
+FBGEMM_API
+PackedGemmMatrixB<float16, TypeConverter<float16>>::PackedGemmMatrixB(
+    const matrix_op_t trans,
+    const int nrow,
+    const int ncol,
+    const float alpha,
+    const float* smat,
+    const int brow);
+
+template <>
+FBGEMM_API
+PackedGemmMatrixB<float16, TypeConverter<float16>>::PackedGemmMatrixB(
+    const int nrow,
+    const int ncol,
+    const int brow,
+    const int last_brow,
+    const int bcol,
+    const int nbrow,
+    const int nbcol,
+    const uint64_t size);
+
+#endif
+
 } // namespace fbgemm
diff --git a/src/FbgemmPackMatrixB.cc b/src/FbgemmPackMatrixB.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "fbgemm/FbgemmFP16.h"
+#define FBGEMM_EXPORTS
+
+namespace fbgemm {
+
+// takes smat input matrix in row-major format;
+// packs it into gemm-friendly blocked format;
+// allocates space and sets up all the internal variables;
+// also premultiplies by alpha during packing.
+// brow_ contains tile size along k dimension
+// and also is # of fmas updates into int16 container
+// before flushing into fp32.
+// the smaller the brow_, the higher overhead
+// of flushing is.
+// kernel_ncol_blocks is the number of column blocks (in the size of 8 fp16,
+// or 128 bit, or 1 xmm register size) in the kernel. Because the batch size
+// can be dynamic and we need to prepack the weight matrix B, the internal
+// packing layout of the weight matrix and kernel_ncol_blocks have to be
+// fixed. We can choose kernel_ncol_blocks = 1 (with kernels of 1x1~14x1
+// register layouts), 2 (with kernels of 1x2~6x2 register layout), or 3 (with
+// kernels of 1x3~4x3 register layout).
+
+#ifndef FBGEMM_STATIC
+
+template <>
+FBGEMM_API
+PackedGemmMatrixB<float16, TypeConverter<float16>>::PackedGemmMatrixB(
+    const matrix_op_t trans,
+    const int nrow,
+    const int ncol,
+    const float alpha,
+    const float* smat,
+    const int brow)
+    : nrow_(nrow), ncol_(ncol), brow_(brow), kernel_ncol_blocks_(2) {
+#if defined(FBGEMM_ENABLE_KLEIDIAI)
+  kernel_ncol_blocks_ = 1;
+#endif
+  initializeParam();
+  initializeMemory();
+  // copy source matrix into packed matrix
+  this->PackedGemmMatrixB<float16, TypeConverter<float16>>::packFromSrc(
+      trans, alpha, smat);
+}
+
+template <>
+FBGEMM_API
+PackedGemmMatrixB<float16, TypeConverter<float16>>::PackedGemmMatrixB(
+    const int nrow,
+    const int ncol,
+    const int brow,
+    const int last_brow,
+    const int bcol,
+    const int nbrow,
+    const int nbcol,
+    const uint64_t size)
+    : nrow_(nrow),
+      ncol_(ncol),
+      brow_(brow),
+      last_brow_(last_brow),
+      bcol_(bcol),
+      nbrow_(nbrow),
+      nbcol_(nbcol),
+      size_(size),
+      kernel_ncol_blocks_(2) {
+#if defined(FBGEMM_ENABLE_KLEIDIAI)
+  kernel_ncol_blocks_ = 1;
+#endif
+  initializeMemory();
+}
+
+#endif
+
+} // namespace fbgemm
diff --git a/src/KleidiAIFP16UKernelsNeon.cc b/src/KleidiAIFP16UKernelsNeon.cc
@@ -1,6 +1,6 @@
 // @lint-ignore-every LICENSELINT
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates
 // <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
@@ -15,15 +15,15 @@ namespace kleidiai {
 void NOINLINE gemmkernel_1x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 #ifdef __aarch64__
   __asm__ __volatile__(
-      "ldr w20, [%x[gp], %[offsetof_beta]]\n"
+      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
       "mov x25, #0x1\n"
       "fmov v29.8h, #1.0\n"
       "ldr x24, [%x[gp], %[offsetof_b_block_cols]]\n"
       "ldr x23, [%x[gp], %[offsetof_B]]\n"
       "ldr x22, [%x[gp], %[offsetof_C]]\n"
-      "bic x20, x20, #0x80000000\n"
-      "cmp x20, #0x0\n"
+      "fcmp s16, #0.0\n"
       "csel x25, XZR, x25, EQ\n"
+      "csel x25, XZR, x25, VS\n"
       "1:" // Height 1: Column loop
       "tbz x25, #0, 2f\n"
       "ldr q30, [x22, #0x0]\n"
@@ -177,15 +177,15 @@ void NOINLINE gemmkernel_1x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 void NOINLINE gemmkernel_2x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 #ifdef __aarch64__
   __asm__ __volatile__(
-      "ldr w20, [%x[gp], %[offsetof_beta]]\n"
+      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
       "mov x26, #0x1\n"
       "fmov v27.8h, #1.0\n"
       "ldr x25, [%x[gp], %[offsetof_b_block_cols]]\n"
       "ldr x24, [%x[gp], %[offsetof_B]]\n"
       "ldr x23, [%x[gp], %[offsetof_C]]\n"
-      "bic x20, x20, #0x80000000\n"
-      "cmp x20, #0x0\n"
+      "fcmp s16, #0.0\n"
       "csel x26, XZR, x26, EQ\n"
+      "csel x26, XZR, x26, VS\n"
       "1:" // Height 2: Column loop
       "tbz x26, #0, 2f\n"
       "ldr q28, [x23, #0x0]\n"
@@ -384,15 +384,15 @@ void NOINLINE gemmkernel_2x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 void NOINLINE gemmkernel_3x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 #ifdef __aarch64__
   __asm__ __volatile__(
-      "ldr w20, [%x[gp], %[offsetof_beta]]\n"
+      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
       "mov x27, #0x1\n"
       "fmov v25.8h, #1.0\n"
       "ldr x26, [%x[gp], %[offsetof_b_block_cols]]\n"
       "ldr x25, [%x[gp], %[offsetof_B]]\n"
       "ldr x24, [%x[gp], %[offsetof_C]]\n"
-      "bic x20, x20, #0x80000000\n"
-      "cmp x20, #0x0\n"
+      "fcmp s16, #0.0\n"
       "csel x27, XZR, x27, EQ\n"
+      "csel x27, XZR, x27, VS\n"
       "1:" // Height 3: Column loop
       "tbz x27, #0, 2f\n"
       "ldr q26, [x24, #0x0]\n"
@@ -632,15 +632,15 @@ void NOINLINE gemmkernel_3x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 void NOINLINE gemmkernel_4x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 #ifdef __aarch64__
   __asm__ __volatile__(
-      "ldr w20, [%x[gp], %[offsetof_beta]]\n"
+      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
       "mov x28, #0x1\n"
       "fmov v23.8h, #1.0\n"
       "ldr x27, [%x[gp], %[offsetof_b_block_cols]]\n"
       "ldr x26, [%x[gp], %[offsetof_B]]\n"
       "ldr x25, [%x[gp], %[offsetof_C]]\n"
-      "bic x20, x20, #0x80000000\n"
-      "cmp x20, #0x0\n"
+      "fcmp s16, #0.0\n"
       "csel x28, XZR, x28, EQ\n"
+      "csel x28, XZR, x28, VS\n"
       "1:" // Height 4: Column loop
       "tbz x28, #0, 2f\n"
       "ldr q24, [x25, #0x0]\n"
@@ -921,15 +921,15 @@ void NOINLINE gemmkernel_4x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 void NOINLINE gemmkernel_5x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 #ifdef __aarch64__
   __asm__ __volatile__(
-      "ldr w20, [%x[gp], %[offsetof_beta]]\n"
+      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
       "mov x9, #0x1\n"
       "fmov v21.8h, #1.0\n"
       "ldr x28, [%x[gp], %[offsetof_b_block_cols]]\n"
       "ldr x27, [%x[gp], %[offsetof_B]]\n"
       "ldr x26, [%x[gp], %[offsetof_C]]\n"
-      "bic x20, x20, #0x80000000\n"
-      "cmp x20, #0x0\n"
+      "fcmp s16, #0.0\n"
       "csel x9, XZR, x9, EQ\n"
+      "csel x9, XZR, x9, VS\n"
       "1:" // Height 5: Column loop
       "tbz x9, #0, 2f\n"
       "ldr q22, [x26, #0x0]\n"
@@ -1251,15 +1251,15 @@ void NOINLINE gemmkernel_5x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 void NOINLINE gemmkernel_6x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 #ifdef __aarch64__
   __asm__ __volatile__(
-      "ldr w20, [%x[gp], %[offsetof_beta]]\n"
+      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
       "mov x10, #0x1\n"
       "fmov v19.8h, #1.0\n"
       "ldr x9, [%x[gp], %[offsetof_b_block_cols]]\n"
       "ldr x28, [%x[gp], %[offsetof_B]]\n"
       "ldr x27, [%x[gp], %[offsetof_C]]\n"
-      "bic x20, x20, #0x80000000\n"
-      "cmp x20, #0x0\n"
+      "fcmp s16, #0.0\n"
       "csel x10, XZR, x10, EQ\n"
+      "csel x10, XZR, x10, VS\n"
       "1:" // Height 6: Column loop
       "tbz x10, #0, 2f\n"
       "ldr q20, [x27, #0x0]\n"
@@ -1620,15 +1620,15 @@ void NOINLINE gemmkernel_6x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 void NOINLINE gemmkernel_7x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 #ifdef __aarch64__
   __asm__ __volatile__(
-      "ldr w20, [%x[gp], %[offsetof_beta]]\n"
+      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
       "mov x11, #0x1\n"
       "fmov v17.8h, #1.0\n"
       "ldr x10, [%x[gp], %[offsetof_b_block_cols]]\n"
       "ldr x9, [%x[gp], %[offsetof_B]]\n"
       "ldr x28, [%x[gp], %[offsetof_C]]\n"
-      "bic x20, x20, #0x80000000\n"
-      "cmp x20, #0x0\n"
+      "fcmp s16, #0.0\n"
       "csel x11, XZR, x11, EQ\n"
+      "csel x11, XZR, x11, VS\n"
       "1:" // Height 7: Column loop
       "tbz x11, #0, 2f\n"
       "ldr q18, [x28, #0x0]\n"
@@ -2027,15 +2027,15 @@ void NOINLINE gemmkernel_7x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 void NOINLINE gemmkernel_8x1_Neon_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
 #ifdef __aarch64__
   __asm__ __volatile__(
-      "ldr w20, [%x[gp], %[offsetof_beta]]\n"
+      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
       "mov x12, #0x1\n"
       "fmov v15.8h, #1.0\n"
       "ldr x11, [%x[gp], %[offsetof_b_block_cols]]\n"
       "ldr x10, [%x[gp], %[offsetof_B]]\n"
       "ldr x9, [%x[gp], %[offsetof_C]]\n"
-      "bic x20, x20, #0x80000000\n"
-      "cmp x20, #0x0\n"
+      "fcmp s16, #0.0\n"
       "csel x12, XZR, x12, EQ\n"
+      "csel x12, XZR, x12, VS\n"
       "1:" // Height 8: Column loop
       "tbz x12, #0, 2f\n"
       "ldr q16, [x9, #0x0]\n"