From 3f1e5e7b701b636ef7290319cfd071b545900bbc Mon Sep 17 00:00:00 2001 From: Boian Petkantchin Date: Thu, 10 Aug 2023 16:25:16 -0700 Subject: [PATCH] Relax NCCL version constraints Instead of requiring an exact NCCL version match, relax the constraint to the standard ABI versioning rules, namely found_version >= major.minor && found_version < major + 1, where major and minor are taken from the NCCL headers we build against. --- runtime/src/iree/hal/drivers/cuda/cuda_device.c | 8 ++++---- .../src/iree/hal/drivers/cuda/dynamic_symbols.c | 15 ++++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c index ed2f8d35db85..b2a4ce9579d7 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c +++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c @@ -363,10 +363,10 @@ static iree_status_t iree_hal_cuda_device_create_channel( if (!device->context_wrapper.syms->nccl_library) { return iree_make_status( IREE_STATUS_UNAVAILABLE, - "NCCL runtime library (%d.%d.%d) not available; ensure installed and " - "the shared library is on your PATH/LD_LIBRARY_PATH " - "(nccl.dll/libnccl.so)", - NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH); + "NCCL runtime library version %d.%d and greater not available; " + " ensure installed and the shared library (nccl.dll/libnccl.so) " + "is on your PATH/LD_LIBRARY_PATH.", + NCCL_MAJOR, NCCL_MINOR); } // Today we only allow a single logical device per channel. 
diff --git a/runtime/src/iree/hal/drivers/cuda/dynamic_symbols.c b/runtime/src/iree/hal/drivers/cuda/dynamic_symbols.c index 436ce82cda5b..5e3a62240284 100644 --- a/runtime/src/iree/hal/drivers/cuda/dynamic_symbols.c +++ b/runtime/src/iree/hal/drivers/cuda/dynamic_symbols.c @@ -139,11 +139,12 @@ static iree_status_t iree_hal_cuda_nccl_check_version( minor = (nccl_version % 10000) / 100; } patch = nccl_version % 100; - if (major != NCCL_MAJOR || minor != NCCL_MINOR || patch != NCCL_PATCH) { + int required_minimum_version = NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, 0); + if (major != NCCL_MAJOR || nccl_version < required_minimum_version) { return iree_make_status( IREE_STATUS_UNAVAILABLE, - "NCCL version is %d.%d.%d, but %d.%d.%d is required", major, minor, - patch, NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH); + "NCCL version is %d.%d.%d, but >=%d.%d and <%d is required", major, + minor, patch, NCCL_MAJOR, NCCL_MINOR, NCCL_MAJOR + 1); } return iree_ok_status(); @@ -174,10 +175,10 @@ iree_status_t iree_hal_cuda_nccl_dynamic_symbols_initialize( iree_status_ignore(status); status = iree_make_status( IREE_STATUS_UNAVAILABLE, - "NCCL runtime library (%d.%d.%d) not available; ensure installed and " - "the shared library is on your PATH/LD_LIBRARY_PATH " - "(nccl.dll/libnccl.so)", - NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH); + "NCCL runtime library version %d.%d and greater not available; " + " ensure installed and the shared library (nccl.dll/libnccl.so) " + "is on your PATH/LD_LIBRARY_PATH.", + NCCL_MAJOR, NCCL_MINOR); } if (iree_status_is_ok(status)) {