@@ -2469,7 +2469,9 @@ struct llama_model_loader {
         }

         if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
         }

         return true;
     }
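For the new `return` in this hunk to type-check, the progress callback must itself return `bool`. A minimal sketch of the assumed declaration (per `llama.h` in this patch series; the comment wording is mine, not quoted from the header):

```cpp
// Assumed public typedef (illustration only, not part of this hunk):
// the callback receives progress in [0, 1]; returning false asks the
// loader to abort and free its allocations.
typedef bool (*llama_progress_callback)(float progress, void * ctx);
```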
@@ -3060,8 +3062,6 @@ static bool llm_load_tensors(
         void * progress_callback_user_data) {
     model.t_start_us = ggml_time_us();

-    bool ok = true; // if false, model load was cancelled
-
     auto & ctx = model.ctx;
     auto & hparams = model.hparams;

@@ -3729,19 +3729,16 @@ static bool llm_load_tensors(
         model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
     }

-    ok = ok && ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
-    if (progress_callback) {
-        // Even though the model is done loading, we still honor
-        // cancellation since we need to free allocations.
-        ok = ok && progress_callback(1.0f, progress_callback_user_data);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+        return false;
     }

     model.mapping = std::move(ml.mapping);

     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
-    return ok;
+    return true;
 }

 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
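To see the cancellation path end to end, here is a hedged usage sketch against the public C API. `llama_model_default_params`, `llama_load_model_from_file`, and the `progress_callback` fields come from `llama.h`; the model path and the 50% cutoff are made up for illustration. Once the callback returns `false`, the loader frees its allocations and the load call reports failure:

```cpp
#include "llama.h"

#include <cstdio>

// Hypothetical callback for illustration: report progress, then cancel
// the load by returning false once it reaches the halfway mark.
static bool cancel_at_half(float progress, void * user_data) {
    (void) user_data;
    fprintf(stderr, "loading: %3.0f%%\n", progress * 100.0f);
    return progress < 0.5f; // false => abort the load
}

int main(void) {
    llama_backend_init(false);

    llama_model_params params = llama_model_default_params();
    params.progress_callback           = cancel_at_half;
    params.progress_callback_user_data = NULL;

    // "model.gguf" is a placeholder path. On cancellation the loader frees
    // its allocations and returns NULL, just as it does on error.
    llama_model * model = llama_load_model_from_file("model.gguf", params);
    if (model == NULL) {
        fprintf(stderr, "model load failed or was cancelled\n");
    } else {
        llama_free_model(model);
    }

    llama_backend_free();
    return 0;
}
```

Note that the internal -2 cancellation code mentioned above is not surfaced through `llama_load_model_from_file`; callers observe cancellation as a NULL model, indistinguishable from an ordinary load error.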