@@ -59,6 +59,7 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 }
 
@@ -184,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        // default device selection
+
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;
-        // use all available devices
+
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {
@@ -194,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                     // skip CPU backends since they are handled separately
                     break;
 
-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                    __func__,
+                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                    props.device_id ? props.device_id : "unknown id",
+                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
-        // add RPC servers at the front of the list
-        if (!rpc_servers.empty()) {
-            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add RPC servers at the front of the list to minimize network transfers
+        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }
 
@@ -227,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }
 
     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                props.device_id ? props.device_id : "unknown id",
+                props.memory_free/1024/1024);
     }
 
     const int status = llama_model_load(path_model, splits, *model, params);
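
For reference, a minimal standalone sketch of the default device ordering the patch ends up with: RPC servers are placed first to minimize network transfers, discrete GPUs follow, and integrated GPUs are used only when no other device was found. This is not part of the patch itself; plain strings stand in for `ggml_backend_dev_t`, and the helper name `select_default_devices` plus the device names in `main` are made up for illustration.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Hypothetical stand-in for the real device handles; the ordering mirrors the
// patched default-selection branch of llama_model_load_from_file_impl above.
static std::vector<std::string> select_default_devices(
        const std::vector<std::string> & rpc_servers,
        const std::vector<std::string> & gpus,
        const std::vector<std::string> & igpus) {
    std::vector<std::string> devices;
    // RPC servers go at the front of the list to minimize network transfers
    devices.insert(devices.begin(), rpc_servers.begin(), rpc_servers.end());
    // then the (deduplicated) discrete GPUs
    devices.insert(devices.end(), gpus.begin(), gpus.end());
    // integrated GPUs are used only if no other devices were found
    if (devices.empty()) {
        devices.insert(devices.end(), igpus.begin(), igpus.end());
    }
    return devices;
}

int main() {
    // made-up device names, for illustration only
    for (const auto & dev : select_default_devices({"RPC[host:50052]"}, {"CUDA0"}, {"Vulkan0 (iGPU)"})) {
        printf("using %s\n", dev.c_str()); // the iGPU is skipped because other devices exist
    }
}
```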