@@ -5085,10 +5085,10 @@ static struct ggml_cgraph * llama_build_graph(
         { OFFLOAD_FUNC_NOP, "CPU" },
         { OFFLOAD_FUNC_OUT, "CPU" },
 #ifdef GGML_USE_CUBLAS
-        { OFFLOAD_FUNC,     "GPU (CUDA)" },
-        { OFFLOAD_FUNC_KQ,  "GPU (CUDA) KQ" },
-        { OFFLOAD_FUNC_V,   "GPU (CUDA) V" },
-        { OFFLOAD_FUNC_NR,  "GPU (CUDA) NR" },
+        { OFFLOAD_FUNC,      "GPU (CUDA)" },
+        { OFFLOAD_FUNC_KQ,   "GPU (CUDA) KQ" },
+        { OFFLOAD_FUNC_V,    "GPU (CUDA) V" },
+        { OFFLOAD_FUNC_NR,   "GPU (CUDA) NR" },
         { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
 #else
         { OFFLOAD_FUNC,     "CPU" },
@@ -5103,11 +5103,11 @@ static struct ggml_cgraph * llama_build_graph(
         llm_offload_func_e func_e = k_offload_func_trie.find(name);
 
         if (func_e == OFFLOAD_FUNC_NOP) {
-            // if a tensor hasn't been offloaded, we warn the user
-            if (worst_case) {
-                LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
-                        cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
-            }
+            // // if a tensor hasn't been offloaded, we warn the user
+            // if (worst_case) {
+            //     LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
+            //             cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
+            // }
 
             return;
         }
@@ -5214,6 +5214,30 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+#ifdef GGML_USE_CUBLAS
+    // TODO: tmp code to help find tensors that haven't been offloaded
+    if (worst_case) {
+        for (int i = 0; i < result->n_nodes; ++i) {
+            struct ggml_tensor * cur = result->nodes[i];
+
+            if (cur->view_src != nullptr) {
+                continue;
+            }
+
+            // check the global map for what offload function to use for this tensor
+            llm_offload_func_e func_e = k_offload_func_trie.find(cur->name);
+
+            if (func_e == OFFLOAD_FUNC_NOP && cur->backend == GGML_BACKEND_CPU) {
+                // if a tensor hasn't been offloaded, we warn the user
+                if (worst_case) {
+                    LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
+                            cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
+                }
+            }
+        }
+    }
+#endif
+
     return result;
 }
 
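In short, the change above silences the per-tensor warning inside the build callback and instead scans the finished graph once, warning only about non-view tensors that both the name lookup and the final graph leave on the CPU. The following is a minimal standalone sketch of that post-build scan pattern, not llama.cpp code: the types Node, Backend, OffloadFunc and the helper offload_lookup are invented stand-ins for ggml_tensor, its backend field, and k_offload_func_trie.

// Standalone sketch of the "scan the built graph and warn about tensors left on the CPU"
// pattern added in the hunk above. All names here are illustrative stand-ins, not ggml APIs.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

enum class Backend     { CPU, GPU };
enum class OffloadFunc { NOP, GPU_FUNC };

struct Node {
    std::string name;
    Backend     backend;
    bool        is_view;   // stand-in for cur->view_src != nullptr
};

// stand-in for k_offload_func_trie: map a tensor name to an offload decision
static OffloadFunc offload_lookup(const std::unordered_map<std::string, OffloadFunc> & table,
                                  const std::string & name) {
    const auto it = table.find(name);
    return it == table.end() ? OffloadFunc::NOP : it->second;
}

// one pass over the finished graph, after all callbacks have run
static void warn_not_offloaded(const std::vector<Node> & nodes,
                               const std::unordered_map<std::string, OffloadFunc> & table) {
    for (const Node & cur : nodes) {
        if (cur.is_view) {
            continue; // views share data with their source tensor, skip them
        }
        // warn only when the lookup says "no offload function" AND the node really stayed on the CPU
        if (offload_lookup(table, cur.name) == OffloadFunc::NOP && cur.backend == Backend::CPU) {
            fprintf(stderr, "%32s: not offloaded\n", cur.name.c_str());
        }
    }
}

int main() {
    const std::unordered_map<std::string, OffloadFunc> table = {
        { "kq",   OffloadFunc::GPU_FUNC },
        { "norm", OffloadFunc::NOP      },
    };
    const std::vector<Node> nodes = {
        { "kq",   Backend::GPU, false },
        { "norm", Backend::CPU, false },  // this one triggers the warning
    };
    warn_not_offloaded(nodes, table);
    return 0;
}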