1
0
mirror of https://github.com/ggerganov/llama.cpp.git synced 2025-04-20 13:36:08 +00:00

model : do not repack if a GPU device is present ()

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-03-21 16:14:29 +02:00 committed by GitHub
parent 960e726077
commit af04481e6b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -271,19 +271,32 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
}
}
// add extra buffer types
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(cpu_dev, *extra_bufts);
++extra_bufts;
bool has_gpu_device = false;
for (auto * dev : devices) {
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
has_gpu_device = true;
break;
}
}
// add extra buffer types, only if no GPU device is present
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
if (!has_gpu_device) {
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(cpu_dev, *extra_bufts);
++extra_bufts;
}
}
} else {
LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
}
// add a host buffer type
// storing the tensors in a host buffer is useful when the processing of large batches
// is offloaded to a GPU device, since it reduces the time spent on data transfers