
Commit d4ef64e

NeoZhangJianyu authored and hazelnutcloud committed
Revert "[SYCL] fix error when set main gpu to non-zero (ggml-org#5901)" (ggml-org#5918)
This reverts commit ceca1ae.
1 parent fefe748 commit d4ef64e
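
For orientation, here is a minimal sketch (not part of this commit) of how a caller ends up enumerating SYCL backends once the revert is applied, following the llama.cpp hunk further down. The wrapper function init_all_sycl_backends and the error message are illustrative only; ggml_sycl_get_gpu_list, ggml_backend_sycl_get_device_count and ggml_backend_sycl_init are the entry points that appear in the diff.

// Illustrative sketch only: mirrors the restored llama.cpp loop, where `i` is a
// backend index and `id_list[i]` is the underlying SYCL device id.
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-sycl.h"

static bool init_all_sycl_backends() { // hypothetical helper, not in the codebase
    int id_list[GGML_SYCL_MAX_DEVICES];
    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);

    for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
        const int device_id = id_list[i];                   // device id (may be non-contiguous)
        ggml_backend_t backend = ggml_backend_sycl_init(i); // backends are created by index
        if (backend == nullptr) {
            fprintf(stderr, "failed to initialize SYCL%d (index %d) backend\n", device_id, i);
            return false;
        }
        // ... hand `backend` over to the context, as llama_new_context_with_model does
    }
    return true;
}

The distinction between the two numbers is why ggml_backend_sycl_get_device_index(device_id), which this revert keeps, maps a device id back to such an index.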

File tree: ggml-sycl.cpp, ggml-sycl.h, llama.cpp

3 files changed: +64 −107 lines

ggml-sycl.cpp

Lines changed: 60 additions & 94 deletions
@@ -3559,31 +3559,12 @@ class sycl_gpu_mgr {
     int work_group_size = 0;
     std::string gpus_list = "";
 
-    /*
-    Use all GPU with same top max compute units
-    */
     sycl_gpu_mgr() {
         detect_sycl_gpu_list_with_max_cu();
         get_allow_gpus();
         create_context_with_gpus();
     }
 
-    /*
-    Use the assigned GPU as only one
-    */
-    sycl_gpu_mgr(int main_gpu_id) {
-        sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
-        dpct::device_info prop;
-        dpct::get_device_info(prop, device);
-        gpus.push_back(main_gpu_id);
-        devices.push_back(device);
-        work_group_size = prop.get_max_work_group_size();
-        max_compute_units = prop.get_max_compute_units();
-
-        get_allow_gpus();
-        create_context_with_gpus();
-    }
-
     void create_context_with_gpus() {
         sycl::context ctx = sycl::context(devices);
         assert(gpus.size() > 0);
@@ -3599,7 +3580,7 @@ class sycl_gpu_mgr {
             gpus_list += std::to_string(gpus[i]);
             gpus_list += ",";
         }
-        if (gpus_list.length() > 1) {
+        if (gpus_list.length() > 2) {
            gpus_list.pop_back();
        }
    }
@@ -3648,8 +3629,8 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        printf("miss to get device index by id=%d\n", id);
-        GGML_ASSERT(false);
+        assert(false);
+        return -1;
     }
 
     int get_next_index(int id) {
@@ -3658,7 +3639,8 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        GGML_ASSERT(false);
+        assert(false);
+        return -1;
     }
 };
 
@@ -3667,7 +3649,6 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
-static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
@@ -13244,7 +13225,7 @@ void ggml_backend_sycl_print_sycl_devices() {
 }
 
 void print_gpu_device_list() {
-    fprintf(stderr, "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n",
+    fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
             g_sycl_gpu_mgr->get_gpu_count(),
             g_sycl_gpu_mgr->gpus_list.c_str(),
             g_sycl_gpu_mgr->max_compute_units);
@@ -13283,15 +13264,6 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
-
-        /* NOT REMOVE, keep it for next optimize for XMX.
-        #if defined(SYCL_USE_XMX)
-            fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-        #else
-            fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-        #endif
-        */
-
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                         dpct::dev_mgr::instance().device_count()) != 0) {
             initialized = true;
@@ -13300,61 +13272,68 @@ void ggml_init_sycl() try {
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         ggml_backend_sycl_print_sycl_devices();
-        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-        print_gpu_device_list();
-        initialized = true;
-        g_sycl_loaded = true;
-    }
 
+        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
 
+        g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
 
-    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
+        print_gpu_device_list();
 
-    int64_t total_vram = 0;
+        int64_t total_vram = 0;
 
+        /* NOT REMOVE, keep it for next optimize for XMX.
+        #if defined(SYCL_USE_XMX)
+            fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+        #else
+            fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+        #endif
+        */
+        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+            g_device_caps[id].vmm = 0;
+            g_device_caps[id].device_id = -1;
+            g_device_caps[id].cc = 0;
+            g_tensor_split[id] = 0;
+            g_default_tensor_split[id] = 0;
+        }
 
-    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-        g_device_caps[id].vmm = 0;
-        g_device_caps[id].device_id = -1;
-        g_device_caps[id].cc = 0;
-        g_tensor_split[id] = 0;
-        g_default_tensor_split[id] = 0;
-    }
+        for (int i = 0; i < g_device_count; ++i) {
+            int device_id = g_sycl_gpu_mgr->gpus[i];
+            g_device_caps[i].vmm = 0;
 
-    for (int i = 0; i < g_device_count; ++i) {
-        int device_id = g_sycl_gpu_mgr->gpus[i];
-        g_device_caps[i].vmm = 0;
+            dpct::device_info prop;
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+                prop, dpct::dev_mgr::instance().get_device(device_id))));
 
-        dpct::device_info prop;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(device_id))));
+            g_default_tensor_split[i] = total_vram;
+            total_vram += prop.get_global_mem_size();
 
-        g_default_tensor_split[i] = total_vram;
-        total_vram += prop.get_global_mem_size();
+            g_device_caps[i].cc =
+                100 * prop.get_major_version() + 10 * prop.get_minor_version();
+        }
 
-        g_device_caps[i].cc =
-            100 * prop.get_major_version() + 10 * prop.get_minor_version();
-    }
+        for (int i = 0; i < g_device_count; ++i) {
+            g_default_tensor_split[i] /= total_vram;
+        }
 
-    for (int i = 0; i < g_device_count; ++i) {
-        g_default_tensor_split[i] /= total_vram;
-    }
+        for (int i = 0; i < g_device_count; ++i) {
+            SYCL_CHECK(ggml_sycl_set_device(i));
 
-    for (int i = 0; i < g_device_count; ++i) {
-        SYCL_CHECK(ggml_sycl_set_device(i));
+            // create sycl streams
+            for (int is = 0; is < MAX_STREAMS; ++is) {
+                SYCL_CHECK(CHECK_TRY_ERROR(
+                    g_syclStreams[i][is] =
+                        dpct::get_current_device().create_queue(
+                            g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+            }
 
-        // create sycl streams
-        for (int is = 0; is < MAX_STREAMS; ++is) {
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                g_syclStreams[i][is] =
-                    dpct::get_current_device().create_queue(
-                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+            const dpct::queue_ptr stream = g_syclStreams[i][0];
+            // create sycl handle
+            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
         }
 
-        const dpct::queue_ptr stream = g_syclStreams[i][0];
-        // create sycl handle
-        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+        initialized = true;
+        g_sycl_loaded = true;
     }
 }
 catch (sycl::exception const &exc) {
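
As a side note on the block restored above: the default tensor split is a prefix sum of each device's global memory size, normalized by the total, so the range assigned to each device is proportional to its VRAM. A standalone sketch of that arithmetic with made-up memory sizes (the real code stores the values in g_default_tensor_split):

// Standalone illustration of the default tensor-split computation in
// ggml_init_sycl(); the VRAM figures below are invented for the example.
#include <cstdint>
#include <cstdio>

int main() {
    const int     device_count = 3;
    const int64_t vram[3]      = {16ll << 30, 8ll << 30, 8ll << 30}; // hypothetical sizes

    float   split[3]   = {};
    int64_t total_vram = 0;
    for (int i = 0; i < device_count; ++i) {
        split[i] = (float) total_vram;   // prefix sum: where device i starts
        total_vram += vram[i];
    }
    for (int i = 0; i < device_count; ++i) {
        split[i] /= (float) total_vram;  // normalize to fractions of the total
    }
    // Prints "0.00 0.50 0.75": device 0 owns [0, 0.5), device 1 [0.5, 0.75), device 2 [0.75, 1).
    for (int i = 0; i < device_count; ++i) {
        printf("%.2f ", split[i]);
    }
    printf("\n");
    return 0;
}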
@@ -16753,24 +16732,22 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
-    if (device_index>=g_device_count or device_index<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
-            device_index, g_device_count-1);
-        GGML_ASSERT(device_index<g_device_count);
-    }
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    if (!g_ggml_backend_sycl_buffer_type_initialized) {
+    static bool ggml_backend_sycl_buffer_type_initialized = false;
+
+    if (!ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < g_device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
             };
         }
-        g_ggml_backend_sycl_buffer_type_initialized = true;
+        ggml_backend_sycl_buffer_type_initialized = true;
     }
-    return &ggml_backend_sycl_buffer_types[device_index];
+
+    return &ggml_backend_sycl_buffer_types[device];
 }
 
 // sycl split buffer type
@@ -17519,17 +17496,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
-GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
-    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
-    printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
-    if (g_sycl_gpu_mgr) {
-        delete g_sycl_gpu_mgr;
-    }
-    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
-    ggml_init_sycl();
-    g_ggml_backend_sycl_buffer_type_initialized = false;
-}
-
 extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
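
One consequence of the two buffer-type hunks above: the "initialized" flag moves from the removed global g_ggml_backend_sycl_buffer_type_initialized back into a function-local static, so the table is built once on first use and nothing outside the function can reset it (the deleted ggml_backend_sycl_set_single_device relied on clearing that global to force a rebuild). A simplified, hypothetical sketch of the restored idiom, with stand-in types and names:

// Hypothetical, simplified version of the build-once pattern used by
// ggml_backend_sycl_buffer_type() after the revert.
#include <string>

struct buffer_type_desc {
    std::string name; // stand-in for the real ggml buffer type context
};

static buffer_type_desc * get_buffer_type(int device, int device_count) {
    static buffer_type_desc table[16]; // stand-in for GGML_SYCL_MAX_DEVICES entries
    static bool initialized = false;   // function-local after the revert

    if (!initialized) {
        for (int i = 0; i < device_count; i++) {
            table[i].name = "SYCL" + std::to_string(i);
        }
        initialized = true;            // no external code can clear this any more
    }
    return &table[device];
}

Keeping the flag local is consistent with removing ggml_backend_sycl_set_single_device in this revert: once the table is cached, the GPU list is assumed not to change.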

ggml-sycl.h

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
-GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu);
 
 #ifdef __cplusplus
 }

llama.cpp

Lines changed: 4 additions & 12 deletions
@@ -3750,14 +3750,6 @@ static bool llm_load_tensors(
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
-#ifdef GGML_USE_SYCL
-    if (split_mode == LLAMA_SPLIT_MODE_NONE) {
-        ggml_backend_sycl_set_single_device(main_gpu);
-        //SYCL use device index (0, 1, 2), instead if device id.
-        main_gpu = ggml_backend_sycl_get_device_index(main_gpu);
-    }
-#endif
-
     const int64_t n_layer = hparams.n_layer;
     const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
 
@@ -12268,13 +12260,13 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         } else {
             // LLAMA_SPLIT_LAYER requires a backend for each GPU
-
+            int id_list[GGML_SYCL_MAX_DEVICES];
+            ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
             for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                int device_id = id_list[i];
                 ggml_backend_t backend = ggml_backend_sycl_init(i);
                 if (backend == nullptr) {
-                    int id_list[GGML_SYCL_MAX_DEVICES];
-                    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, id_list[i], i);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
                     llama_free(ctx);
                     return nullptr;
                 }
