@@ -1224,32 +1224,18 @@ static void llama_model_load_internal(
 
 #ifdef GGML_USE_CUBLAS
     const int max_backend_supported_layers = hparams.n_layer + 3;
-#if defined(GGML_USE_HIPBLAS)
-    const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3;
-#else
     const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
-#endif
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
-#if defined(GGML_USE_HIPBLAS)
-            fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
-#else
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-#endif
         } else {
             fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
     if (n_gpu_layers > (int) hparams.n_layer + 2) {
         if (low_vram) {
-#if defined(GGML_USE_HIPBLAS)
-            fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
-#else
             fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-#endif
         } else {
             fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
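
A minimal standalone sketch of the KV-cache offload decision as it stands after this diff (the same rule now applies whether or not GGML_USE_HIPBLAS is defined): `n_layer`, `n_gpu_layers`, `low_vram`, and `kv_self_size` are placeholder inputs here; the real code takes them from the model hyperparameters and `MEM_REQ_KV_SELF()`.

```cpp
#include <cstdio>
#include <cstdint>

int main() {
    const int      n_layer      = 32;          // hypothetical model depth
    const int      n_gpu_layers = n_layer + 3;  // request full offload incl. K and V cache
    const bool     low_vram     = false;
    const uint64_t kv_self_size = 1024ull * 1024ull * 1024ull; // hypothetical KV cache size

    uint64_t vram_kv_cache = 0;

    // V cache: offloaded once one layer beyond n_layer is requested, unless low_vram is set.
    if (n_gpu_layers > n_layer + 1) {
        if (low_vram) {
            printf("cannot offload v cache to GPU due to low VRAM option\n");
        } else {
            printf("offloading v cache to GPU\n");
            vram_kv_cache += kv_self_size / 2;
        }
    }
    // K cache: offloaded once two layers beyond n_layer are requested, unless low_vram is set.
    if (n_gpu_layers > n_layer + 2) {
        if (low_vram) {
            printf("cannot offload k cache to GPU due to low VRAM option\n");
        } else {
            printf("offloading k cache to GPU\n");
            vram_kv_cache += kv_self_size / 2;
        }
    }
    printf("KV cache VRAM: %llu MB\n", (unsigned long long) (vram_kv_cache / (1024 * 1024)));
    return 0;
}
```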