Skip to content

Commit e875f09

Browse files
committed
Support for SME1 based sgemm_direct kernel for cblas_sgemm level 3 API
* Added ARMV9SME target * Added SGEMM_DIRECT kernel based on SME1
1 parent 18014b0 commit e875f09

22 files changed

+596
-22
lines changed

Makefile.arm64

+5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve
3030
endif
3131
endif
3232

33+
ifeq ($(CORE), ARMV9SME)
34+
CCOMMON_OPT += -march=armv9-a+sve2+sme
35+
FCOMMON_OPT += -march=armv9-a+sve2
36+
endif
37+
3338
ifeq ($(CORE), CORTEXA53)
3439
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
3540
ifneq ($(F_COMPILER), NAG)

Makefile.system

+8
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64)
420420
export MACOSX_DEPLOYMENT_TARGET=11.0
421421
ifeq ($(C_COMPILER), GCC)
422422
export NO_SVE = 1
423+
export NO_SME = 1
423424
endif
424425
else
425426
export MACOSX_DEPLOYMENT_TARGET=10.8
@@ -709,6 +710,9 @@ DYNAMIC_CORE += NEOVERSEN2
709710
DYNAMIC_CORE += ARMV8SVE
710711
DYNAMIC_CORE += A64FX
711712
endif
713+
ifneq ($(NO_SME), 1)
714+
DYNAMIC_CORE += ARMV9SME
715+
endif
712716
DYNAMIC_CORE += THUNDERX
713717
DYNAMIC_CORE += THUNDERX2T99
714718
DYNAMIC_CORE += TSV110
@@ -1474,6 +1478,10 @@ ifeq ($(NO_SVE), 1)
14741478
CCOMMON_OPT += -DNO_SVE
14751479
endif
14761480

1481+
ifeq ($(NO_SME), 1)
1482+
CCOMMON_OPT += -DNO_SME
1483+
endif
1484+
14771485
ifdef SMP
14781486
CCOMMON_OPT += -DSMP_SERVER
14791487

TargetList.txt

+1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ THUNDERX3T110
111111
VORTEX
112112
A64FX
113113
ARMV8SVE
114+
ARMV9SME
114115
FT2000
115116

116117
9.System Z:

cmake/arch.cmake

+15-3
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,21 @@ endif ()
4444

4545
if (DYNAMIC_ARCH)
4646
if (ARM64)
47-
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
48-
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
49-
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
47+
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
48+
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
49+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10
50+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
51+
endif ()
52+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14
53+
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
54+
endif()
55+
elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
56+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
57+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
58+
endif ()
59+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19
60+
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
61+
endif()
5062
endif ()
5163
if (DYNAMIC_LIST)
5264
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

cmake/cc.cmake

+6
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE)
238238
endif ()
239239
endif ()
240240

241+
# ARMV9SME core: for non-DYNAMIC_ARCH builds, append the SME-enabling
# -march flag to the common C compiler options.
if (${CORE} STREQUAL ARMV9SME)
242+
if (NOT DYNAMIC_ARCH)
243+
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme")
244+
endif ()
245+
endif ()
246+
241247
if (${CORE} STREQUAL CORTEXA510)
242248
if (NOT DYNAMIC_ARCH)
243249
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")

cmake/prebuild.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,7 @@ endif ()
10141014
set(ZGEMM_UNROLL_M 4)
10151015
set(ZGEMM_UNROLL_N 4)
10161016
set(SYMV_P 16)
1017-
elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
1017+
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") # if() operators are case-sensitive: OR, not "or"; ARMV9SME shares this branch's settings
10181018
file(APPEND ${TARGET_CONF_TEMP}
10191019
"#define L1_CODE_SIZE\t65536\n"
10201020
"#define L1_CODE_LINESIZE\t64\n"

cmake/system.cmake

+15-10
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,9 @@ if (${TARGET} STREQUAL NEOVERSEV1)
310310
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
311311
endif()
312312
endif()
313+
# ARMV9SME target: kernel sources are compiled with the SME-enabling -march flag and -O3.
if (${TARGET} STREQUAL ARMV9SME)
314+
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3")
315+
endif()
313316
if (${TARGET} STREQUAL A64FX)
314317
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
315318
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")
@@ -382,6 +385,8 @@ if (NEED_PIC)
382385
if (NOT NOFORTRAN)
383386
if (${F_COMPILER} STREQUAL "SUN")
384387
set(FCOMMON_OPT "${FCOMMON_OPT} -pic")
388+
elseif (${F_COMPILER} STREQUAL "NAGFOR")
389+
set(FCOMMON_OPT "${FCOMMON_OPT} -PIC")
385390
else ()
386391
set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC")
387392
endif ()
@@ -640,17 +645,17 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
640645
endif ()
641646

642647
if (CMAKE_Fortran_COMPILER)
643-
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
644-
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
645-
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
646-
message(STATUS "removing fortran flags")
647-
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
648+
if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
649+
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
650+
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
651+
message(STATUS "removing fortran flags")
652+
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
653+
endif ()
654+
foreach (FILTER_FLAG ${FILTER_FLAGS})
655+
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
656+
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
657+
endforeach ()
648658
endif ()
649-
foreach (FILTER_FLAG ${FILTER_FLAGS})
650-
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
651-
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
652-
endforeach ()
653-
endif ()
654659
endif ()
655660

656661
if ("${F_COMPILER}" STREQUAL "GFORTRAN")

common.h

+1
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ void gotoblas_profile_init(void);
696696
void gotoblas_profile_quit(void);
697697

698698
int support_avx512(void);
699+
int support_sme1(void);
699700

700701
#ifdef USE_OPENMP
701702

common_arm64.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
175175
#define HUGE_PAGESIZE ( 4 << 20)
176176

177177
#ifndef BUFFERSIZE
178-
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
178+
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME)
179179
#define BUFFER_SIZE (32 << 22)
180180
#else
181181
#define BUFFER_SIZE (32 << 20)

common_param.h

+5
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,11 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
221221
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
222222
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
223223
#endif
224+
#ifdef ARCH_ARM64
225+
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
226+
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
227+
#endif
228+
224229

225230
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
226231
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);

common_s.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -213,9 +213,9 @@
213213
#ifdef ARCH_X86_64
214214
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
215215
#define SGEMM_DIRECT gotoblas -> sgemm_direct
216-
#else
216+
#elif defined(ARCH_ARM64) /* test definedness, as the other ARCH_ARM64 checks do; a bare #elif would evaluate the macro's value */
217217
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
218-
#define SGEMM_DIRECT sgemm_direct
218+
#define SGEMM_DIRECT gotoblas -> sgemm_direct
219219
#endif
220220

221221
#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy

driver/others/dynamic_arm64.c

+31
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE;
115115
#else
116116
#define gotoblas_ARMV8SVE gotoblas_ARMV8
117117
#endif
118+
#ifdef DYN_ARMV9SME
119+
extern gotoblas_t gotoblas_ARMV9SME;
120+
#else
121+
#define gotoblas_ARMV9SME gotoblas_ARMV8
122+
#endif
118123
#ifdef DYN_CORTEX_A55
119124
extern gotoblas_t gotoblas_CORTEXA55;
120125
#else
@@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX;
148153
#define gotoblas_ARMV8SVE gotoblas_ARMV8
149154
#define gotoblas_A64FX gotoblas_ARMV8
150155
#endif
156+
157+
#ifndef NO_SME
158+
extern gotoblas_t gotoblas_ARMV9SME;
159+
#else
160+
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
161+
#endif
162+
151163
extern gotoblas_t gotoblas_THUNDERX3T110;
152164
#endif
153165
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
@@ -393,6 +405,13 @@ static gotoblas_t *get_coretype(void) {
393405
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
394406
openblas_warning(1, coremsg);
395407
}
408+
409+
#if !defined(NO_SME) && defined(HWCAP2_SME)
410+
if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) {
411+
return &gotoblas_ARMV9SME;
412+
}
413+
#endif
414+
396415
#ifndef NO_SVE
397416
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
398417
return &gotoblas_ARMV8SVE;
@@ -443,3 +462,15 @@ void gotoblas_dynamic_init(void) {
443462
void gotoblas_dynamic_quit(void) {
444463
gotoblas = NULL;
445464
}
465+
466+
/* Runtime probe for SME: returns 1 when AT_HWCAP2 carries HWCAP2_SME, otherwise 0.
 * Always returns 0 when not built for Linux/Android or when the system headers
 * do not define HWCAP2_SME. */
int support_sme1(void) {
467+
int ret = 0;
468+
469+
#if (defined OS_LINUX || defined OS_ANDROID) && defined(HWCAP2_SME) /* same HWCAP2_SME guard as get_coretype() */
470+
/* A single query suffices; the result is normalised to 0/1. */
471+
if (getauxval(AT_HWCAP2) & HWCAP2_SME) {
472+
ret = 1;
473+
}
474+
#endif
475+
return ret;
476+
}

getarch.c

+13
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12891289
#define CORENAME "ARMV8SVE"
12901290
#endif
12911291

1292+
#ifdef FORCE_ARMV9SME
1293+
#define FORCE
1294+
#define ARCHITECTURE "ARM64"
1295+
#define SUBARCHITECTURE "ARMV9SME"
1296+
#define SUBDIRNAME "arm64"
1297+
#define ARCHCONFIG "-DARMV9SME " \
1298+
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
1299+
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
1300+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
1301+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9"
1302+
#define LIBNAME "armv9sme"
1303+
#define CORENAME "ARMV9SME"
1304+
#endif
12921305

12931306
#ifdef FORCE_ARMV8
12941307
#define FORCE

interface/gemm.c

+18-3
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@
4545
#include "functable.h"
4646
#endif
4747

48+
#if (defined OS_LINUX || defined OS_ANDROID) && defined(ARCH_ARM64) /* <asm/hwcap.h> is arch-specific (absent on x86 Linux); only the ARM64 SME path needs it */
49+
#include <sys/auxv.h>
50+
#include <asm/hwcap.h>
51+
#endif
52+
4853
#ifndef COMPLEX
4954
#define SMP_THRESHOLD_MIN 65536.0
5055
#ifdef XDOUBLE
@@ -85,6 +90,7 @@
8590
#define GEMM_MULTITHREAD_THRESHOLD 4
8691
#endif
8792

93+
8894
static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = {
8995
#ifndef GEMM3M
9096
GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN,
@@ -347,17 +353,26 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
347353
int nodes;
348354
#endif
349355

356+
350357
PRINT_DEBUG_CNAME;
351358

352359
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
353-
#ifdef DYNAMIC_ARCH
360+
#if defined(DYNAMIC_ARCH) && defined(ARCH_X86_64) /* ARCH_X86_64 is the spelling used by the SGEMM_DIRECT macros; "ARCH_x86" presumably matched nothing and disabled this path */
354361
if (support_avx512() )
355-
#endif
362+
356363
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
357364
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
358365
return;
359366
}
360-
367+
#endif
368+
#if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64)
369+
if (support_sme1()){
370+
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
371+
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
372+
return;
373+
}
374+
}
375+
#endif
361376
#endif
362377

363378
#ifndef COMPLEX

kernel/Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@ ifdef NO_AVX2
2424
AVX2OPT=
2525
endif
2626

27+
2728
ifdef TARGET_CORE
29+
ifeq ($(TARGET_CORE), ARMV9SME)
30+
override CFLAGS += -march=armv9-a+sve2+sme
31+
endif
2832
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
2933
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
3034
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))

kernel/Makefile.L3

+31-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ endif
2424

2525
ifeq ($(ARCH), arm64)
2626
USE_TRMM = 1
27+
USE_DIRECT_SGEMM = 1
2728
endif
2829

2930
ifeq ($(ARCH), riscv64)
@@ -95,9 +96,16 @@ endif
9596

9697
ifdef USE_DIRECT_SGEMM
9798
ifndef SGEMMDIRECTKERNEL
99+
ifeq ($(ARCH), x86_64)
98100
SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c
99101
SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
100102
endif
103+
ifeq ($(ARCH), arm64)
104+
ifdef HAVE_SME
105+
SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c
106+
endif
107+
endif
108+
endif
101109
endif
102110

103111
ifeq ($(BUILD_BFLOAT16), 1)
@@ -128,9 +136,19 @@ SKERNELOBJS += \
128136
$(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ)
129137

130138
ifdef USE_DIRECT_SGEMM
139+
ifeq ($(ARCH), x86_64)
131140
SKERNELOBJS += \
132141
sgemm_direct$(TSUFFIX).$(SUFFIX) \
133-
sgemm_direct_performant$(TSUFFIX).$(SUFFIX)
142+
sgemm_direct_performant$(TSUFFIX).$(SUFFIX)
143+
endif
144+
ifeq ($(ARCH), arm64)
145+
ifdef HAVE_SME
146+
SKERNELOBJS += \
147+
sgemm_direct.$(SUFFIX) \
148+
sgemm_direct_sme1.$(SUFFIX) \
149+
sgemm_direct_sme1_preprocess.$(SUFFIX)
150+
endif
151+
endif
134152
endif
135153
endif
136154

@@ -809,11 +827,23 @@ else
809827
endif
810828

811829
ifdef USE_DIRECT_SGEMM
830+
ifeq ($(ARCH), x86_64)
812831
$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT)
813832
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
814833
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
815834
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
816835
endif
836+
# arm64 + SME: build rules for the SME1 direct-SGEMM objects.
# Each object lists its source as a prerequisite so it is rebuilt when the source changes.
ifeq ($(ARCH), arm64)
837+
ifdef HAVE_SME
838+
$(KDIR)sgemm_direct_sme1.$(SUFFIX) : $(KERNELDIR)/sgemm_direct_sme1.S
839+
	$(CC) $(CFLAGS) -c $< -UDOUBLE -UCOMPLEX -o $@
840+
$(KDIR)sgemm_direct_sme1_preprocess.$(SUFFIX) : $(KERNELDIR)/sgemm_direct_sme1_preprocess.S
841+
	$(CC) $(CFLAGS) -c $< -UDOUBLE -UCOMPLEX -o $@
842+
$(KDIR)sgemm_direct.$(SUFFIX) : $(KERNELDIR)/sgemm_direct_arm64_sme1.c
843+
	$(CC) $(CFLAGS) -c $< -UDOUBLE -UCOMPLEX -o $@
844+
endif
845+
endif
846+
endif
817847

818848
ifeq ($(BUILD_BFLOAT16), 1)
819849

kernel/arm64/KERNEL.ARMV9SME

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
include $(KERNELDIR)/KERNEL.ARMV8SVE
2+
3+

0 commit comments

Comments
 (0)