update readme-sycl.md, rm debug code
NeoZhangJianyu committed Jun 29, 2024
1 parent 8215a77 commit 99937ef
Showing 5 changed files with 124 additions and 59 deletions.
81 changes: 63 additions & 18 deletions README-sycl.md
@@ -296,15 +296,25 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
An example of such a log on a system with one *Intel CPU* and two *Intel GPUs* can look like the following:
```
found 6 SYCL devices:
-| | | |Compute |Max compute|Max work|Max sub| |
-|ID| Device Type| Name|capability|units |group |group |Global mem size|
-|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
-| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
-| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
-| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
-| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
-| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
-| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
+Part1:
+|ID| Ver| Device Type| Name|Global mem size|
+|--|----|-------------------|---------------------------------------|---------------|
+| 0| 1.3| [level_zero:gpu:0]| Intel Data Center GPU Flex 170| 16225M|
+| 1| 1.3| [level_zero:gpu:1]| Intel Data Center GPU Flex 170| 16225M|
+| 2| 3.0| [opencl:gpu:0]| Intel Data Center GPU Flex 170| 16225M|
+| 3| 3.0| [opencl:gpu:1]| Intel Data Center GPU Flex 170| 16225M|
+| 4| 3.0| [opencl:cpu:0]| Intel Xeon Gold 6346 CPU @ 3.10GHz| 540700M|
+| 5| 1.2| [opencl:acc:0]| Intel FPGA Emulation Device| 540700M|
+Part2:
+|ID|Max compute units|Max work group|Max subgroup| Driver version|
+|--|-----------------|--------------|------------|----------------------------------|
+| 0| 512| 1024| 32| 1.3.27642|
+| 1| 512| 1024| 32| 1.3.27642|
+| 2| 512| 1024| 32| 23.43.27642.40|
+| 3| 512| 1024| 32| 23.43.27642.40|
+| 4| 64| 8192| 64|2024.17.5.0.08_160000.xmain-hotfix|
+| 5| 64| 67108864| 64|2024.17.5.0.08_160000.xmain-hotfix|
```

| Attribute | Note |
@@ -469,15 +479,24 @@ build\bin\ls-sycl-device.exe
The output of this command on a system with one *Intel CPU* and two *Intel GPUs* would look like the following:
```
found 6 SYCL devices:
-| | | |Compute |Max compute|Max work|Max sub| |
-|ID| Device Type| Name|capability|units |group |group |Global mem size|
-|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
-| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
-| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
-| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
-| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
-| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
-| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
+Part1:
+|ID| Ver| Device Type| Name|Global mem size|
+|--|----|-------------------|---------------------------------------|---------------|
+| 0| 1.3| [level_zero:gpu:0]| Intel Data Center GPU Flex 170| 16225M|
+| 1| 1.3| [level_zero:gpu:1]| Intel Data Center GPU Flex 170| 16225M|
+| 2| 3.0| [opencl:gpu:0]| Intel Data Center GPU Flex 170| 16225M|
+| 3| 3.0| [opencl:gpu:1]| Intel Data Center GPU Flex 170| 16225M|
+| 4| 3.0| [opencl:cpu:0]| Intel Xeon Gold 6346 CPU @ 3.10GHz| 540700M|
+| 5| 1.2| [opencl:acc:0]| Intel FPGA Emulation Device| 540700M|
+Part2:
+|ID|Max compute units|Max work group|Max subgroup| Driver version|
+|--|-----------------|--------------|------------|----------------------------------|
+| 0| 512| 1024| 32| 1.3.27642|
+| 1| 512| 1024| 32| 1.3.27642|
+| 2| 512| 1024| 32| 23.43.27642.40|
+| 3| 512| 1024| 32| 23.43.27642.40|
+| 4| 64| 8192| 64|2024.17.5.0.08_160000.xmain-hotfix|
+| 5| 64| 67108864| 64|2024.17.5.0.08_160000.xmain-hotfix|
```

@@ -548,6 +567,32 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| GGML_SYCL_VISIBLE_DEVICES | id1,id2,... | Works like `CUDA_VISIBLE_DEVICES`: defines the list of SYCL device IDs that are visible to the backend, e.g. "0", "0,2", "2,1" |
| ONEAPI_DEVICE_SELECTOR | Refer to [oneapi-device-selector](https://intel.github.io/llvm-docs/EnvironmentVariables.html#oneapi-device-selector) | Limits the choice of devices available when the SYCL application runs |
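
For example, the variables in the table above can be combined at launch time. The following is a minimal sketch; the `llama-cli` binary name and the model path are assumptions, so substitute your own build output and model file:

```
# Enable SYCL debug logging and SYSMAN-based free-memory queries,
# then run with layer split mode across the chosen devices.
export GGML_SYCL_DEBUG=1
export ZES_ENABLE_SYSMAN=1
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 33 -sm layer
```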

##### Choosing SYCL Devices at Runtime

At runtime, SYCL can expose one physical device as two logical devices, one through each backend: Level-Zero and OpenCL. The SYCL device list therefore shows more devices than are physically present. Avoid running code on both logical views of the same physical device at the same time; the sketch below shows one way to rule this out.

The SYCL backend supports a dGPU, an iGPU, or both in the same machine.
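
For example, you can keep only the Level-Zero view of each GPU. This is a minimal sketch; the `ls-sycl-device` path assumes the build layout used earlier in this guide:

```
# Expose only Level-Zero logical devices, so each physical GPU
# appears once instead of twice (Level-Zero + OpenCL).
export ONEAPI_DEVICE_SELECTOR="level_zero:*"

# Re-query the device list; the opencl:gpu duplicates should be gone.
./build/bin/ls-sycl-device
```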

##### SYCL Backend Rules

|Mode|Explanation|Example|Recommended Cases|Note|
|-|-|-|-|-|
|Normal|Use the most powerful devices. Default mode, no special setting needed.<br>The SYCL backend detects and chooses the **Level-Zero** devices with the highest `Max compute units`.||Most common use cases.||
|Advanced|Lets the user choose one or more SYCL devices, which may be Level-Zero, OpenCL, or both.<br>Set the device list with the environment variable **GGML_SYCL_VISIBLE_DEVICES**, which works like `CUDA_VISIBLE_DEVICES`.<br>The SYCL backend chooses all devices listed in it (see the sketch after this table).|`set/export GGML_SYCL_VISIBLE_DEVICES=1`<br>`set/export GGML_SYCL_VISIBLE_DEVICES=0,1`<br>`set/export GGML_SYCL_VISIBLE_DEVICES=2,1`|Use the iGPU, or both GPUs, in a dGPU + iGPU environment.<br>Use one dGPU in a multi-dGPU environment.<br>Use one or more OpenCL devices.|There is a known issue with OpenCL devices; a fix is in progress.|
|Developer|Lets a SYCL developer choose one or more SYCL devices with the environment variable **ONEAPI_DEVICE_SELECTOR**, which has a flexible grammar.<br>Refer to [oneapi-device-selector](https://intel.github.io/llvm-docs/EnvironmentVariables.html#oneapi-device-selector).|`set/export ONEAPI_DEVICE_SELECTOR=level_zero:1`<br>`set/export ONEAPI_DEVICE_SELECTOR=opencl:*`<br>`set/export ONEAPI_DEVICE_SELECTOR=opencl:gpu;level_zero:gpu`|Supersedes the Advanced mode, and as a lower-level mechanism it also affects the **Normal** and **Advanced** modes.<br>The flexible grammar supports more complex device environments.|There is a known issue with OpenCL devices; a fix is in progress.|
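
A minimal sketch of an Advanced-mode session follows; the binary name and model path are assumptions:

```
# Advanced mode: expose only SYCL devices 0 and 1 to the backend.
export GGML_SYCL_VISIBLE_DEVICES=0,1

# The startup log should now report only the two chosen devices.
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Hello" -ngl 33
```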

##### Parameters of llama.cpp

The device-selection parameters of llama.cpp work together with the SYCL backend rules above to decide the final set of devices: the user can run on one, or on all, of the devices chosen by those rules.

|Device|Values|Note|
|-|-|-|
|Single Device|`--split-mode none` and `--main-gpu id`|The value of `main-gpu` must appear in the chosen-device list printed during llama.cpp startup, e.g. `detect 2 SYCL level-zero GPUs:[0,1]` means `main-gpu` can be set to `0` or `1`. See the sketch after this table.|
|Multiple Devices|`--split-mode layer`|Default.|
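
Both rows of this table correspond to command lines like the following sketch (short flag forms `-sm`/`-mg`/`-ngl` are used; the binary name and model path are assumptions):

```
# Single device: no layer split, pin all work to main GPU 0.
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 33 -sm none -mg 0

# Multiple devices (default): split layers across all chosen devices.
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 33 -sm layer
```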


## Known Issues

5 changes: 2 additions & 3 deletions ggml-sycl.cpp
@@ -2555,11 +2555,11 @@ static inline int get_work_group_size(const sycl::device& device) {

inline void check_allow_device_id(const int device_id) {
if (ggml_sycl_info().device_count<1) {
fprintf(stderr, "%s: not detect any SYCL devices, please check GPU driver or unset ONEAPI_DEVICE_SELECTOR!\n", __func__);
fprintf(stderr, "%s: not detect any SYCL devices, check GPU driver or unset GGML_SYCL_VISIBLE_DEVICES and ONEAPI_DEVICE_SELECTOR\n", __func__);
exit(1);
}
if (!ggml_sycl_info().is_allowed_device(device_id)) {
fprintf(stderr, "%s: device_id:%d is out of range [%s]. To use any SYCL devices, please set/export ONEAPI_DEVICE_SELECTOR\n",
fprintf(stderr, "%s: device_id:%d is out of range [%s]. To use any SYCL devices, set/export GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n",
__func__, device_id, ggml_sycl_info().devices_list());
exit_with_stack_print();
}
@@ -5893,7 +5893,6 @@ GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,

GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
printf("zjy ggml_backend_sycl_set_tensor_async sycl_ctx->device=%d stream=%p\n", sycl_ctx->device, stream);

SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
(char *)tensor->data + offset, data, size).wait()));
3 changes: 0 additions & 3 deletions ggml-sycl.h
@@ -36,9 +36,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index);
GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);

-// GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
-// GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();

// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
45 changes: 34 additions & 11 deletions ggml-sycl/common.cpp
@@ -87,7 +87,7 @@ static std::vector<int> get_sycl_visible_devices() {
return device_ids;
}

-void print_device_detail(int id, sycl::device &device, std::string device_type) {
+void print_device_detail_part1(int id, sycl::device &device, std::string device_type) {

dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(
@@ -105,29 +105,52 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)

auto global_mem_size = prop.get_global_mem_size()/1000000;

fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
name.c_str(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
fprintf(stderr, "|%2d|%4s|%19s|%39s|%14luM|\n", id, version.c_str(), device_type.c_str(),
name.c_str(), global_mem_size);
}

+void print_device_detail_part2(int id, sycl::device &device, std::string device_type) {
+
+dpct::device_info prop;
+SYCL_CHECK(CHECK_TRY_ERROR(
+dpct::get_device_info(prop, device)));
+
+fprintf(stderr, "|%2d|%17d|%14d|%12d|%34s|\n", id,
+prop.get_max_compute_units(),
+prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
+device.get_info<sycl::info::device::driver_version>().c_str());
+}

void ggml_backend_sycl_print_sycl_devices() {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
int device_count = dpct::dev_mgr::instance().device_count();
std::map<std::string, size_t> DeviceNums;
fprintf(stderr, "found %d SYCL devices:\n", device_count);
fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
fprintf(stderr, "Part1:\n");
fprintf(stderr, "|ID| Ver| Device Type| Name|Global mem size|\n");
fprintf(stderr, "|--|----|-------------------|---------------------------------------|---------------|\n");
for (int id = 0; id < device_count; ++id) {
sycl::device device = dpct::dev_mgr::instance().get_device(id);
sycl::backend backend = device.get_backend();
std::string backend_type = get_device_backend_and_type(device);
int type_id=DeviceNums[backend_type]++;
std::stringstream device_type;
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
-print_device_detail(id, device, device_type.str());
+print_device_detail_part1(id, device, device_type.str());
}

+std::map<std::string, size_t> DeviceNums2;
+fprintf(stderr, "Part2:\n");
+fprintf(stderr, "|ID|Max compute units|Max work group|Max subgroup| Driver version|\n");
+fprintf(stderr, "|--|-----------------|--------------|------------|----------------------------------|\n");
+for (int id = 0; id < device_count; ++id) {
+sycl::device device = dpct::dev_mgr::instance().get_device(id);
+sycl::backend backend = device.get_backend();
+std::string backend_type = get_device_backend_and_type(device);
+int type_id=DeviceNums2[backend_type]++;
+std::stringstream device_type;
+device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
+print_device_detail_part2(id, device, device_type.str());
+}
}

@@ -174,7 +197,7 @@ static ggml_sycl_device_info ggml_sycl_init() try {
info.refresh_device();

if (info.device_count == 0) {
fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n",
fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n",
__func__);
return info;
}