
Commit b83bc10 (parent: 7527f1e)

trying to fix cuda builds (+1 squashed commits)
Squashed commits: [80615722c] wip

5 files changed (+21, -3 lines)

CMakeLists.txt (3 additions, 0 deletions)
@@ -135,12 +135,15 @@ if (LLAMA_CUBLAS)
     if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
         set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-virtual") # lowest CUDA 13 standard
+        add_compile_definitions("KCPP_BACKUP_CUDA_ARCH_REF=75,80,86")
     elseif(CUDAToolkit_VERSION VERSION_GREATER 12)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
         set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+        add_compile_definitions("KCPP_BACKUP_CUDA_ARCH_REF=50,61,70,75,80")
     else()
         add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=750) #will cause issues with ggml_cuda_highest_compiled_arch if removed
         set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+        add_compile_definitions("KCPP_BACKUP_CUDA_ARCH_REF=35,50,61,70,75")
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
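
The new KCPP_BACKUP_CUDA_ARCH_REF definition mirrors each CMAKE_CUDA_ARCHITECTURES list as a plain comma-separated macro value, which common.cuh (below) substitutes for __CUDA_ARCH_LIST__ when the compiler does not provide one. A minimal sketch of how a translation unit can consume such a macro; the array and fallback here are illustrative, not from the repo:

    // With the CMake lines above, every TU is compiled with e.g.
    // -DKCPP_BACKUP_CUDA_ARCH_REF=75,80,86, so the macro expands straight
    // into a brace initializer.
    #include <cstdio>

    #ifndef KCPP_BACKUP_CUDA_ARCH_REF
    #define KCPP_BACKUP_CUDA_ARCH_REF 75,80,86 // stand-in for the CUDA 13 branch
    #endif

    static constexpr int kcpp_backup_archs[] = { KCPP_BACKUP_CUDA_ARCH_REF };

    int main() {
        for (int arch : kcpp_backup_archs) {
            printf("backup arch: %d\n", arch);
        }
        return 0;
    }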

Makefile (6 additions, 3 deletions)
@@ -228,21 +228,24 @@ NVCCFLAGS += -Wno-deprecated-gpu-targets \
     -gencode arch=compute_61,code=compute_61 \
     -gencode arch=compute_70,code=compute_70 \
     -gencode arch=compute_75,code=compute_75 \
-    -DKCPP_LIMIT_CUDA_MAX_ARCH=750
+    -DKCPP_LIMIT_CUDA_MAX_ARCH=750 \
+    -DKCPP_BACKUP_CUDA_ARCH_REF="35,50,61,70,75"
 
 else ifdef LLAMA_ARCHES_CU12
 NVCCFLAGS += -Wno-deprecated-gpu-targets \
     -gencode arch=compute_50,code=compute_50 \
     -gencode arch=compute_61,code=compute_61 \
     -gencode arch=compute_70,code=compute_70 \
     -gencode arch=compute_75,code=compute_75 \
-    -gencode arch=compute_80,code=compute_80
+    -gencode arch=compute_80,code=compute_80 \
+    -DKCPP_BACKUP_CUDA_ARCH_REF="50,61,70,75,80"
 
 else ifdef LLAMA_ARCHES_CU13
 NVCCFLAGS += -Wno-deprecated-gpu-targets \
     -gencode arch=compute_75,code=compute_75 \
     -gencode arch=compute_80,code=compute_80 \
-    -gencode arch=compute_86,code=compute_86
+    -gencode arch=compute_86,code=compute_86 \
+    -DKCPP_BACKUP_CUDA_ARCH_REF="75,80,86"
 
 else
 NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=all
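
The three deletions here exist only to put a trailing backslash back on what used to be the last flag of each branch, so the appended -D flag stays inside the same NVCCFLAGS line continuation. For a work-in-progress build fix like this, a quick probe can confirm the define actually reached the compiler; a hedged sketch (the stringize helpers are illustrative, not from the repo, and are variadic because the macro value contains commas):

    #include <cstdio>

    #define KCPP_STR2(...) #__VA_ARGS__
    #define KCPP_STR(...)  KCPP_STR2(__VA_ARGS__)

    int main() {
    #ifdef KCPP_BACKUP_CUDA_ARCH_REF
        printf("KCPP_BACKUP_CUDA_ARCH_REF = %s\n", KCPP_STR(KCPP_BACKUP_CUDA_ARCH_REF));
    #else
        printf("KCPP_BACKUP_CUDA_ARCH_REF is not defined\n");
    #endif
        return 0;
    }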

ggml/src/ggml-cuda/common.cuh (6 additions, 0 deletions)
@@ -95,6 +95,12 @@
 # define GGML_CUDA_USE_CUB
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
 
+#ifndef __CUDA_ARCH_LIST__
+#ifdef KCPP_BACKUP_CUDA_ARCH_REF
+#define __CUDA_ARCH_LIST__ KCPP_BACKUP_CUDA_ARCH_REF
+#endif
+#endif
+
 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
     return false;
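
nvcc normally predefines __CUDA_ARCH_LIST__ as a comma-separated list of the architecture values being compiled; the added block substitutes the build system's backup list whenever that predefine is absent, so the ggml_cuda_has_arch_impl machinery in the surrounding context still receives an argument pack to expand. A standalone sketch of that pattern, assuming the simplified names below (not the exact ggml source):

    #include <cstdio>

    #ifndef __CUDA_ARCH_LIST__
    #define __CUDA_ARCH_LIST__ 75,80,86 // stand-in mirroring KCPP_BACKUP_CUDA_ARCH_REF
    #endif

    constexpr bool has_arch_impl(int) {
        return false; // pack exhausted: arch was not in the list
    }

    template <class... Archs>
    constexpr bool has_arch_impl(int arch, int first, Archs... rest) {
        return arch == first || has_arch_impl(arch, rest...);
    }

    constexpr bool has_arch(int arch) {
        // The macro expands in place and becomes the argument pack 75, 80, 86.
        return has_arch_impl(arch, __CUDA_ARCH_LIST__);
    }

    int main() {
        printf("has 80: %d\n", has_arch(80)); // 1
        printf("has 61: %d\n", has_arch(61)); // 0
        return 0;
    }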

ggml/src/ggml-cuda/fattn.cu (4 additions, 0 deletions)
@@ -310,6 +310,10 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         {
             return BEST_FATTN_KERNEL_WMMA_F16;
         }
+        else
+        {
+            return BEST_FATTN_KERNEL_NONE;
+        }
     }
 
     return BEST_FATTN_KERNEL_MMA_F16;
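
The added else keeps kernel selection from silently falling through to the MMA kernel on a path where the surrounding branch has already ruled it out. A simplified sketch of the resulting control-flow shape; the enum values match fattn.cu, but the parameters and conditions are illustrative stand-ins for the real device checks:

    #include <cstdio>

    enum best_fattn_kernel {
        BEST_FATTN_KERNEL_NONE,
        BEST_FATTN_KERNEL_WMMA_F16,
        BEST_FATTN_KERNEL_MMA_F16,
    };

    static best_fattn_kernel pick_kernel(bool mma_usable, bool wmma_usable) {
        if (!mma_usable) {
            if (wmma_usable) {
                return BEST_FATTN_KERNEL_WMMA_F16;
            }
            else {
                // The added branch: report "no kernel" explicitly instead of
                // falling through to the MMA kernel below.
                return BEST_FATTN_KERNEL_NONE;
            }
        }
        return BEST_FATTN_KERNEL_MMA_F16;
    }

    int main() {
        printf("%d\n", pick_kernel(false, false)); // 0 (NONE), not MMA_F16
        return 0;
    }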

gpttype_adapter.cpp (2 additions, 0 deletions)
@@ -2258,6 +2258,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     std::vector<llama_model_kv_override> kvos; //ensure it keeps in scope until model is created
     std::vector<llama_model_tensor_buft_override> tenos; //ensure it keeps in scope until model is created
     std::vector<std::string> temp_tensor_names; //store temp tensor names to have mem references.
+    temp_tensor_names.reserve(32); //very important, prevents vector from reallocating
+    tenos.reserve(32);
     if(inputs.moe_experts>0)
     {
         printf("\nOverriding number of experts to %d\n",inputs.moe_experts);
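
As the in-line comment says, the reserve calls keep the vectors from reallocating while entries are appended: tenos presumably stores raw char pointers into strings owned by temp_tensor_names, and a reallocation of temp_tensor_names can invalidate those pointers, since short strings live inside the std::string object itself (small-string optimization) and move with it. A minimal sketch of the failure mode being avoided, with an illustrative struct standing in for llama_model_tensor_buft_override; note that reserve(32) only guards the first 32 appends:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct tensor_override { const char * pattern; }; // illustrative stand-in

    int main() {
        std::vector<std::string> names;
        std::vector<tensor_override> overrides;
        names.reserve(32);     // storage stays put for up to 32 appends
        overrides.reserve(32);
        for (int i = 0; i < 8; i++) {
            names.push_back("blk." + std::to_string(i) + ".ffn_up");
            overrides.push_back({ names.back().c_str() }); // stays valid thanks to reserve
        }
        for (const auto & o : overrides) {
            printf("override pattern: %s\n", o.pattern);
        }
        return 0;
    }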
