Skip to content

Commit 360d653

Browse files
authored
ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type (#15797)
* ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type
* ggml-backend : add device id to device props
* llama : only use iGPU devices if there are no GPU devices
* llama : do not use multiple devices from different backends with the same device id
1 parent 0e6ff00 commit 360d653

File tree

7 files changed

+81
-14
lines changed

7 files changed

+81
-14
lines changed

ggml/include/ggml-backend.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ extern "C" {
132132
GGML_BACKEND_DEVICE_TYPE_CPU,
133133
// GPU device using dedicated memory
134134
GGML_BACKEND_DEVICE_TYPE_GPU,
135+
// integrated GPU device using host memory
136+
GGML_BACKEND_DEVICE_TYPE_IGPU,
135137
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
136138
GGML_BACKEND_DEVICE_TYPE_ACCEL
137139
};
@@ -150,11 +152,21 @@ extern "C" {
150152

151153
// all the device properties
152154
struct ggml_backend_dev_props {
155+
// device name
153156
const char * name;
157+
// device description
154158
const char * description;
159+
// device free memory in bytes
155160
size_t memory_free;
161+
// device total memory in bytes
156162
size_t memory_total;
163+
// device type
157164
enum ggml_backend_dev_type type;
165+
// device id
166+
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
167+
// if the id is unknown, this should be NULL
168+
const char * device_id;
169+
// device capabilities
158170
struct ggml_backend_dev_caps caps;
159171
};
160172

ggml/src/ggml-backend-impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
extern "C" {
99
#endif
1010

11-
#define GGML_BACKEND_API_VERSION 1
11+
#define GGML_BACKEND_API_VERSION 2
1212

1313
//
1414
// Backend buffer type

ggml/src/ggml-backend-reg.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -400,9 +400,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
400400

401401
ggml_backend_t ggml_backend_init_best(void) {
402402
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
403-
if (!dev) {
404-
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
405-
}
403+
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
404+
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
406405
if (!dev) {
407406
return nullptr;
408407
}

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3210,6 +3210,7 @@ struct ggml_backend_cuda_device_context {
32103210
int device;
32113211
std::string name;
32123212
std::string description;
3213+
std::string pci_bus_id;
32133214
};
32143215

32153216
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3234,9 +3235,12 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
32343235
}
32353236

32363237
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
3238+
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3239+
32373240
props->name = ggml_backend_cuda_device_get_name(dev);
32383241
props->description = ggml_backend_cuda_device_get_description(dev);
32393242
props->type = ggml_backend_cuda_device_get_type(dev);
3243+
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
32403244
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
32413245

32423246
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -3804,6 +3808,10 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
38043808
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
38053809
dev_ctx->description = prop.name;
38063810

3811+
char pci_bus_id[16] = {};
3812+
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
3813+
dev_ctx->pci_bus_id = pci_bus_id;
3814+
38073815
ggml_backend_dev_t dev = new ggml_backend_device {
38083816
/* .iface = */ ggml_backend_cuda_device_interface,
38093817
/* .reg = */ &reg,

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12113,13 +12113,15 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
1211312113

1211412114
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
1211512115
UNUSED(dev);
12116+
// TODO: return GGML_BACKEND_DEVICE_TYPE_IGPU for integrated GPUs
1211612117
return GGML_BACKEND_DEVICE_TYPE_GPU;
1211712118
}
1211812119

1211912120
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
1212012121
props->name = ggml_backend_vk_device_get_name(dev);
1212112122
props->description = ggml_backend_vk_device_get_description(dev);
1212212123
props->type = ggml_backend_vk_device_get_type(dev);
12124+
// TODO: set props->device_id to PCI bus id
1212312125
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
1212412126
props->caps = {
1212512127
/* .async = */ false,

src/llama.cpp

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ bool llama_supports_mlock(void) {
5959

6060
bool llama_supports_gpu_offload(void) {
6161
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
62+
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
6263
llama_supports_rpc();
6364
}
6465

@@ -184,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
184185
model->devices.push_back(*dev);
185186
}
186187
} else {
188+
// default device selection
189+
190+
// build list of available devices
191+
std::vector<ggml_backend_dev_t> gpus;
192+
std::vector<ggml_backend_dev_t> igpus;
187193
std::vector<ggml_backend_dev_t> rpc_servers;
188-
// use all available devices
194+
189195
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
190196
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
191197
switch (ggml_backend_dev_type(dev)) {
@@ -194,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
194200
// skip CPU backends since they are handled separately
195201
break;
196202

197-
case GGML_BACKEND_DEVICE_TYPE_GPU:
203+
case GGML_BACKEND_DEVICE_TYPE_GPU: {
198204
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
199205
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
200206
rpc_servers.push_back(dev);
201207
} else {
202-
model->devices.push_back(dev);
208+
// check if there is already a GPU with the same device id
209+
ggml_backend_dev_props props;
210+
ggml_backend_dev_get_props(dev, &props);
211+
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
212+
ggml_backend_dev_props d_props;
213+
ggml_backend_dev_get_props(d, &d_props);
214+
if (props.device_id && d_props.device_id) {
215+
return strcmp(props.device_id, d_props.device_id) == 0;
216+
}
217+
return false;
218+
});
219+
220+
if (it != gpus.end()) {
221+
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
222+
__func__,
223+
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
224+
props.device_id ? props.device_id : "unknown id",
225+
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
226+
} else {
227+
gpus.push_back(dev);
228+
}
203229
}
204230
break;
231+
}
232+
233+
case GGML_BACKEND_DEVICE_TYPE_IGPU:
234+
igpus.push_back(dev);
235+
break;
205236
}
206237
}
207-
// add RPC servers at the front of the list
208-
if (!rpc_servers.empty()) {
209-
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
238+
239+
// add RPC servers at the front of the list to minimize network transfers
240+
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
241+
242+
// add GPUs
243+
model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
244+
245+
// add integrated GPUs only if no other devices were found
246+
if (model->devices.empty()) {
247+
model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
210248
}
211249
}
212250

@@ -227,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
227265
}
228266

229267
for (auto * dev : model->devices) {
230-
size_t free, total; // NOLINT
231-
ggml_backend_dev_memory(dev, &free, &total);
232-
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
268+
ggml_backend_dev_props props;
269+
ggml_backend_dev_get_props(dev, &props);
270+
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
271+
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
272+
props.device_id ? props.device_id : "unknown id",
273+
props.memory_free/1024/1024);
233274
}
234275

235276
const int status = llama_model_load(path_model, splits, *model, params);

tools/llama-bench/llama-bench.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ static std::string get_gpu_info() {
128128
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
129129
auto * dev = ggml_backend_dev_get(i);
130130
auto dev_type = ggml_backend_dev_type(dev);
131-
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
131+
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU || dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
132132
gpu_list.push_back(ggml_backend_dev_description(dev));
133133
}
134134
}
@@ -945,6 +945,7 @@ struct cmd_params_instance {
945945
exit(1);
946946
}
947947
}
948+
// FIXME: use llama.cpp device selection logic
948949
// add local GPU devices if any
949950
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
950951
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -957,6 +958,10 @@ struct cmd_params_instance {
957958
case GGML_BACKEND_DEVICE_TYPE_GPU:
958959
devices.push_back(dev);
959960
break;
961+
962+
case GGML_BACKEND_DEVICE_TYPE_IGPU:
963+
// iGPUs are not used when there are RPC servers
964+
break;
960965
}
961966
}
962967
devices.push_back(nullptr);

0 commit comments

Comments
 (0)