Add Moorethreads MUSA support #6697

Closed · wants to merge 3 commits
45 changes: 45 additions & 0 deletions CMakeLists.txt
@@ -102,6 +102,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_MUSA "llama: use MUSA" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
@@ -574,6 +575,49 @@ if (LLAMA_HIPBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
endif()

if (LLAMA_MUSA)
set(MUSA_ARCH "21" CACHE STRING "MUSA architecture")

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

find_package(MUSA REQUIRED)

message(STATUS "MUSA found")

enable_language(MUSA)

set(GGML_HEADERS_MUSA ggml-cuda.h)

file(GLOB GGML_SOURCES_MUSA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_MUSA "ggml-cuda.cu")

add_compile_definitions(GGML_USE_MUSA GGML_USE_CUDA)

if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()

if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()

if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()

add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE MUSA)

if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for MUSA")
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC MUSA::musa MUSA::mublas MUSA::musart)
endif()

if (LLAMA_SYCL)
if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
@@ -1160,6 +1204,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_MUSA} ${GGML_HEADERS_MUSA}
)

target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
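For reference, a minimal configure/build sketch using the new option (values are illustrative; the MUSA SDK is assumed to be installed in its default location, /usr/local/musa):

    cmake -B build -DLLAMA_MUSA=ON -DMUSA_ARCH=21   # MUSA_ARCH defaults to 21
    cmake --build build --config Release -j
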
32 changes: 32 additions & 0 deletions Makefile
@@ -565,6 +565,38 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com

endif # LLAMA_HIPBLAS

ifdef LLAMA_MUSA
MUSA_PATH ?= /usr/local/musa
MUSA_ARCH ?= 21
MCC ?= $(CCACHE) $(MUSA_PATH)/bin/mcc
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
MK_LDFLAGS += -lmublas -lmusa -lmusart
MUSAFLAGS += --cuda-gpu-arch=mp_$(MUSA_ARCH)
MUSAFLAGS += -Wno-unknown-warning-option -Wno-gnu-anonymous-struct -Wno-nested-anon-types -Wno-invalid-noreturn
MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
ifdef LLAMA_CUDA_FORCE_DMMV
MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_NO_PEER_COPY
MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))

ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -c -o $@ $<

ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -c -o $@ $<

endif # LLAMA_MUSA

ifdef LLAMA_METAL
MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
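A corresponding Makefile invocation might look like the following (values are illustrative; MUSA_PATH and MUSA_ARCH default to /usr/local/musa and 21, so both overrides can be omitted):

    make clean
    make LLAMA_MUSA=1 MUSA_PATH=/usr/local/musa MUSA_ARCH=21 -j$(nproc)
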
11 changes: 11 additions & 0 deletions cmake/CMakeDetermineMUSACompiler.cmake
@@ -0,0 +1,11 @@

set(CMAKE_MUSA_ARCHITECTURES "mp_${MUSA_ARCH}")
set(CMAKE_MUSA_COMPILER "${MUSA_MCC}")
set(CMAKE_MUSA_COMPILER_ID "Clang")
set(CMAKE_MUSA_COMPILER_ARG1 "")
set(CMAKE_MUSA_COMPILER_ENV_VAR "MCC")

configure_file(
${CMAKE_CURRENT_LIST_DIR}/CMakeMUSACompiler.cmake.in
${CMAKE_PLATFORM_INFO_DIR}/CMakeMUSACompiler.cmake
)
6 changes: 6 additions & 0 deletions cmake/CMakeMUSACompiler.cmake.in
@@ -0,0 +1,6 @@
set(CMAKE_MUSA_COMPILER "@CMAKE_MUSA_COMPILER@")
set(CMAKE_MUSA_COMPILER_ARG1 "@CMAKE_MUSA_COMPILER_ARG1@")
set(CMAKE_MUSA_COMPILER_LOADED 1)
set(CMAKE_MUSA_SOURCE_FILE_EXTENSIONS mu;cu)
set(CMAKE_MUSA_OUTPUT_EXTENSION .o)
set(CMAKE_MUSA_COMPILER_ENV_VAR "MUSA")
26 changes: 26 additions & 0 deletions cmake/CMakeMUSAInformation.cmake
@@ -0,0 +1,26 @@

# reuse the Clang/C++ compiler machinery for the MUSA language

include(CMakeLanguageInformation)
include(CMakeCommonLanguageInclude)

include(Compiler/Clang)

__compiler_clang(MUSA)
__compiler_clang_cxx_standards(MUSA)

set(CMAKE_INCLUDE_FLAG_MUSA "-I")

set(CMAKE_MUSA_RUNTIME_LIBRARY_DEFAULT "SHARED")
set(CMAKE_MUSA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "")
set(CMAKE_MUSA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED "")

# runtime library lists (kept empty, mirroring the HIP language module)
set(CMAKE_MUSA_RUNTIME_LIBRARIES_STATIC "")
set(CMAKE_MUSA_RUNTIME_LIBRARIES_SHARED "")

# compile a C++ file into an object file
if(NOT CMAKE_MUSA_COMPILE_OBJECT)
set(CMAKE_MUSA_COMPILE_OBJECT
"<CMAKE_MUSA_COMPILER> -x musa --cuda-gpu-arch=${CMAKE_MUSA_ARCHITECTURES} -fPIC <DEFINES> <INCLUDES> <FLAGS> -o <OBJECT> -c <SOURCE>")
endif()
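With the defaults above, CMAKE_MUSA_COMPILE_OBJECT expands to roughly the following command per MUSA source file (the defines, include path and optimization flag shown are illustrative):

    /usr/local/musa/bin/mcc -x musa --cuda-gpu-arch=mp_21 -fPIC \
        -DGGML_USE_MUSA -DGGML_USE_CUDA -I. -O3 \
        -o ggml-cuda.o -c ggml-cuda.cu
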
1 change: 1 addition & 0 deletions cmake/CMakeTestMUSACompiler.cmake
@@ -0,0 +1 @@
# do nothing, make cmake happy
101 changes: 101 additions & 0 deletions cmake/FindMUSA.cmake
@@ -0,0 +1,101 @@
# Locate the MooreThreads MUSA toolkit (compiler, headers and libraries)

include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake)
include(${CMAKE_ROOT}/Modules/CMakeFindDependencyMacro.cmake)

if(DEFINED ENV{MUSA_HOME})
set(MUSA_HOME $ENV{MUSA_HOME})
else()
set(MUSA_HOME /usr/local/musa)
endif()

set(MUSA_MCC ${MUSA_HOME}/bin/mcc)

if (DEFINED ENV{MUSA_ARCH})
set(MUSA_ARCH $ENV{MUSA_ARCH})
elseif(NOT MUSA_ARCH)
set(MUSA_ARCH "21")
endif()

if(NOT MUSA_INCLUDE_DIR)
set(MUSA_INCLUDE_DIR ${MUSA_HOME}/include)
endif()

if(NOT MUSA_LIBRARY_DIR)
set(MUSA_LIBRARY_DIR ${MUSA_HOME}/lib)
endif()

if(NOT MUSA_LIBRARIES)
find_library(
MUSA_MUSA_LIBRARY
NAMES libmusa.so
PATHS ${MUSA_LIBRARY_DIR}
)

find_library(
MUSA_MUBLAS_LIBRARY
NAMES libmublas.so
PATHS ${MUSA_LIBRARY_DIR}
)

find_library(
MUSA_MUSART_LIBRARY
NAMES libmusart.so
PATHS ${MUSA_LIBRARY_DIR}
)

if(MUSA_MUSA_LIBRARY AND MUSA_MUBLAS_LIBRARY AND MUSA_MUSART_LIBRARY)
set(MUSA_LIBRARIES "${MUSA_MUSA_LIBRARY};${MUSA_MUBLAS_LIBRARY};${MUSA_MUSART_LIBRARY}")
set(MUSA_MUSA_LIBRARY "${MUSA_MUSA_LIBRARY}" CACHE STRING "")
set(MUSA_MUBLAS_LIBRARY "${MUSA_MUBLAS_LIBRARY}" CACHE STRING "")
set(MUSA_MUSART_LIBRARY "${MUSA_MUSART_LIBRARY}" CACHE STRING "")
endif()
endif()

if(MUSA_LIBRARIES)
if(NOT TARGET MUSA::musa)
add_library(MUSA::musa SHARED IMPORTED)
set_target_properties(MUSA::musa PROPERTIES
IMPORTED_LOCATION ${MUSA_MUSA_LIBRARY}
INTERFACE_INCLUDE_DIRECTORIES ${MUSA_INCLUDE_DIR}
)
endif()

if(NOT TARGET MUSA::mublas)
add_library(MUSA::mublas SHARED IMPORTED)
set_target_properties(MUSA::mublas PROPERTIES
IMPORTED_LOCATION ${MUSA_MUBLAS_LIBRARY}
INTERFACE_INCLUDE_DIRECTORIES ${MUSA_INCLUDE_DIR}
)
endif()

if(NOT TARGET MUSA::musart)
add_library(MUSA::musart SHARED IMPORTED)
set_target_properties(MUSA::musart PROPERTIES
IMPORTED_LOCATION ${MUSA_MUSART_LIBRARY}
INTERFACE_INCLUDE_DIRECTORIES ${MUSA_INCLUDE_DIR}
)
endif()

set(MUSA_INCLUDE_DIR ${MUSA_INCLUDE_DIR} CACHE STRING "")
set(MUSA_LIBRARY_DIR ${MUSA_LIBRARY_DIR} CACHE STRING "")
set(MUSA_LIBRARIES ${MUSA_LIBRARIES} CACHE STRING "")
endif()

find_package_handle_standard_args(
MUSA
REQUIRED_VARS
MUSA_ARCH
MUSA_MCC
MUSA_INCLUDE_DIR
MUSA_LIBRARIES
MUSA_LIBRARY_DIR
)
mark_as_advanced(
MUSA_INCLUDE_DIR
MUSA_LIBRARIES
MUSA_LIBRARY_DIR
CMAKE_MUSA_ARCHITECTURES
CMAKE_MUSA_COMPILER
)
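FindMUSA.cmake also honors the MUSA_HOME and MUSA_ARCH environment variables, so a non-default SDK install can be picked up at configure time, e.g. (illustrative values):

    MUSA_HOME=/opt/musa MUSA_ARCH=22 cmake -B build -DLLAMA_MUSA=ON
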
13 changes: 11 additions & 2 deletions ggml-common.h
@@ -17,6 +17,15 @@ typedef half2 ggml_half2;

#define GGML_COMMON_AGGR

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_MUSA)
#include <mublas.h>

typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
#include <cuda_fp16.h>
@@ -73,7 +82,7 @@ typedef sycl::half2 ggml_half2;
#define K_SCALE_SIZE 12
#endif // GGML_QKK_64

#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL) || defined(GGML_COMMON_DECL_MUSA)
// QR = QK / number of values before dequantization
// QI = number of 32 bit integers before dequantization

@@ -439,7 +448,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
#include <cstdint>

#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
20 changes: 10 additions & 10 deletions ggml-cuda.cu
@@ -112,7 +112,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;

#if !defined(GGML_USE_HIPBLAS)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -124,7 +124,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
}
#endif // !defined(GGML_USE_HIPBLAS)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
info.devices[id].vmm = !!device_vmm;

cudaDeviceProp prop;
@@ -257,7 +257,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
};

// pool with virtual memory
#if !defined(GGML_USE_HIPBLAS)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -351,10 +351,10 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
}
};
#endif // !defined(GGML_USE_HIPBLAS)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)

std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_USE_HIPBLAS)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
}
@@ -1596,7 +1596,7 @@ static void ggml_cuda_op_mul_mat(
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
#if !defined(GGML_USE_HIPBLAS)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = ctx.device;
@@ -1793,7 +1793,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;

#if 0
#if defined(GGML_USE_MUSA)
// use cublasGemmEx
{
for (int i13 = 0; i13 < ne13; ++i13) {
@@ -1802,10 +1802,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_co
int i02 = i12 / r2;

CUBLAS_CHECK(
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
alpha, (const char *) src0_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
(const char *) src1_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));