
Commit b83bc10 (parent: 7527f1e)

trying to fix cuda builds (+1 squashed commits)
Squashed commits: [80615722c] wip

5 files changed (+21, -3 lines)

CMakeLists.txt (3 additions, 0 deletions)
@@ -135,12 +135,15 @@ if (LLAMA_CUBLAS)
     if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
         set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-virtual") # lowest CUDA 13 standard
+        add_compile_definitions("KCPP_BACKUP_CUDA_ARCH_REF=75,80,86")
     elseif(CUDAToolkit_VERSION VERSION_GREATER 12)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
         set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+        add_compile_definitions("KCPP_BACKUP_CUDA_ARCH_REF=50,61,70,75,80")
     else()
         add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=750) #will cause issues with ggml_cuda_highest_compiled_arch if removed
         set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+        add_compile_definitions("KCPP_BACKUP_CUDA_ARCH_REF=35,50,61,70,75")
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
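
The new KCPP_BACKUP_CUDA_ARCH_REF definition mirrors each CMAKE_CUDA_ARCHITECTURES list as a plain comma-separated macro value, which common.cuh (below) substitutes for __CUDA_ARCH_LIST__ when the compiler does not provide one. A minimal sketch of how a translation unit can consume such a macro; the array and fallback here are illustrative, not from the repo:

    // With the CMake lines above, every TU is compiled with e.g.
    // -DKCPP_BACKUP_CUDA_ARCH_REF=75,80,86, so the macro expands straight
    // into a brace initializer.
    #include <cstdio>

    #ifndef KCPP_BACKUP_CUDA_ARCH_REF
    #define KCPP_BACKUP_CUDA_ARCH_REF 75,80,86 // stand-in for the CUDA 13 branch
    #endif

    static constexpr int kcpp_backup_archs[] = { KCPP_BACKUP_CUDA_ARCH_REF };

    int main() {
        for (int arch : kcpp_backup_archs) {
            printf("backup arch: %d\n", arch);
        }
        return 0;
    }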

Makefile (6 additions, 3 deletions)
@@ -228,21 +228,24 @@ NVCCFLAGS += -Wno-deprecated-gpu-targets \
     -gencode arch=compute_61,code=compute_61 \
     -gencode arch=compute_70,code=compute_70 \
     -gencode arch=compute_75,code=compute_75 \
-    -DKCPP_LIMIT_CUDA_MAX_ARCH=750
+    -DKCPP_LIMIT_CUDA_MAX_ARCH=750 \
+    -DKCPP_BACKUP_CUDA_ARCH_REF="35,50,61,70,75"
 
 else ifdef LLAMA_ARCHES_CU12
 NVCCFLAGS += -Wno-deprecated-gpu-targets \
     -gencode arch=compute_50,code=compute_50 \
     -gencode arch=compute_61,code=compute_61 \
     -gencode arch=compute_70,code=compute_70 \
     -gencode arch=compute_75,code=compute_75 \
-    -gencode arch=compute_80,code=compute_80
+    -gencode arch=compute_80,code=compute_80 \
+    -DKCPP_BACKUP_CUDA_ARCH_REF="50,61,70,75,80"
 
 else ifdef LLAMA_ARCHES_CU13
 NVCCFLAGS += -Wno-deprecated-gpu-targets \
     -gencode arch=compute_75,code=compute_75 \
     -gencode arch=compute_80,code=compute_80 \
-    -gencode arch=compute_86,code=compute_86
+    -gencode arch=compute_86,code=compute_86 \
+    -DKCPP_BACKUP_CUDA_ARCH_REF="75,80,86"
 
 else
 NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=all
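
The three deletions here exist only to put a trailing backslash back on what used to be the last flag of each branch, so the appended -D flag stays inside the same NVCCFLAGS line continuation. For a work-in-progress build fix like this, a quick probe can confirm the define actually reached the compiler; a hedged sketch (the stringize helpers are illustrative, not from the repo, and are variadic because the macro value contains commas):

    #include <cstdio>

    #define KCPP_STR2(...) #__VA_ARGS__
    #define KCPP_STR(...)  KCPP_STR2(__VA_ARGS__)

    int main() {
    #ifdef KCPP_BACKUP_CUDA_ARCH_REF
        printf("KCPP_BACKUP_CUDA_ARCH_REF = %s\n", KCPP_STR(KCPP_BACKUP_CUDA_ARCH_REF));
    #else
        printf("KCPP_BACKUP_CUDA_ARCH_REF is not defined\n");
    #endif
        return 0;
    }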

ggml/src/ggml-cuda/common.cuh (6 additions, 0 deletions)
@@ -95,6 +95,12 @@
 # define GGML_CUDA_USE_CUB
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
 
+#ifndef __CUDA_ARCH_LIST__
+#ifdef KCPP_BACKUP_CUDA_ARCH_REF
+#define __CUDA_ARCH_LIST__ KCPP_BACKUP_CUDA_ARCH_REF
+#endif
+#endif
+
 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
     return false;
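
nvcc normally predefines __CUDA_ARCH_LIST__ as a comma-separated list of the architecture values being compiled; the added block substitutes the build system's backup list whenever that predefine is absent, so the ggml_cuda_has_arch_impl machinery in the surrounding context still receives an argument pack to expand. A standalone sketch of that pattern, assuming the simplified names below (not the exact ggml source):

    #include <cstdio>

    #ifndef __CUDA_ARCH_LIST__
    #define __CUDA_ARCH_LIST__ 75,80,86 // stand-in mirroring KCPP_BACKUP_CUDA_ARCH_REF
    #endif

    constexpr bool has_arch_impl(int) {
        return false; // pack exhausted: arch was not in the list
    }

    template <class... Archs>
    constexpr bool has_arch_impl(int arch, int first, Archs... rest) {
        return arch == first || has_arch_impl(arch, rest...);
    }

    constexpr bool has_arch(int arch) {
        // The macro expands in place and becomes the argument pack 75, 80, 86.
        return has_arch_impl(arch, __CUDA_ARCH_LIST__);
    }

    int main() {
        printf("has 80: %d\n", has_arch(80)); // 1
        printf("has 61: %d\n", has_arch(61)); // 0
        return 0;
    }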

ggml/src/ggml-cuda/fattn.cu (4 additions, 0 deletions)
@@ -310,6 +310,10 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         {
             return BEST_FATTN_KERNEL_WMMA_F16;
         }
+        else
+        {
+            return BEST_FATTN_KERNEL_NONE;
+        }
     }
 
     return BEST_FATTN_KERNEL_MMA_F16;
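
The added else keeps kernel selection from silently falling through to the MMA kernel on a path where the surrounding branch has already ruled it out. A simplified sketch of the resulting control-flow shape; the enum values match fattn.cu, but the parameters and conditions are illustrative stand-ins for the real device checks:

    #include <cstdio>

    enum best_fattn_kernel {
        BEST_FATTN_KERNEL_NONE,
        BEST_FATTN_KERNEL_WMMA_F16,
        BEST_FATTN_KERNEL_MMA_F16,
    };

    static best_fattn_kernel pick_kernel(bool mma_usable, bool wmma_usable) {
        if (!mma_usable) {
            if (wmma_usable) {
                return BEST_FATTN_KERNEL_WMMA_F16;
            }
            else {
                // The added branch: report "no kernel" explicitly instead of
                // falling through to the MMA kernel below.
                return BEST_FATTN_KERNEL_NONE;
            }
        }
        return BEST_FATTN_KERNEL_MMA_F16;
    }

    int main() {
        printf("%d\n", pick_kernel(false, false)); // 0 (NONE), not MMA_F16
        return 0;
    }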

gpttype_adapter.cpp (2 additions, 0 deletions)
@@ -2258,6 +2258,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     std::vector<llama_model_kv_override> kvos; //ensure it keeps in scope until model is created
     std::vector<llama_model_tensor_buft_override> tenos; //ensure it keeps in scope until model is created
     std::vector<std::string> temp_tensor_names; //store temp tensor names to have mem references.
+    temp_tensor_names.reserve(32); //very important, prevents vector from reallocating
+    tenos.reserve(32);
     if(inputs.moe_experts>0)
     {
         printf("\nOverriding number of experts to %d\n",inputs.moe_experts);
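
As the in-line comment says, the reserve calls keep the vectors from reallocating while entries are appended: tenos presumably stores raw char pointers into strings owned by temp_tensor_names, and a reallocation of temp_tensor_names can invalidate those pointers, since short strings live inside the std::string object itself (small-string optimization) and move with it. A minimal sketch of the failure mode being avoided, with an illustrative struct standing in for llama_model_tensor_buft_override; note that reserve(32) only guards the first 32 appends:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct tensor_override { const char * pattern; }; // illustrative stand-in

    int main() {
        std::vector<std::string> names;
        std::vector<tensor_override> overrides;
        names.reserve(32);     // storage stays put for up to 32 appends
        overrides.reserve(32);
        for (int i = 0; i < 8; i++) {
            names.push_back("blk." + std::to_string(i) + ".ffn_up");
            overrides.push_back({ names.back().c_str() }); // stays valid thanks to reserve
        }
        for (const auto & o : overrides) {
            printf("override pattern: %s\n", o.pattern);
        }
        return 0;
    }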
