diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 47bc7ac526..6a3693c882 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -583,7 +583,14 @@ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev,
 
   cuda_enter(ctx);
 
-  ctx->err = cuMemAlloc(&ptr, size);
+  /*
+   * Request a managed allocation first.  If that request fails for any
+   * reason, retry with a plain cuMemAlloc() so the error check below only
+   * trips when both allocation paths have failed.
+   */
+  ctx->err = cuMemAllocManaged(&ptr, size, CU_MEM_ATTACH_GLOBAL);
+  if (ctx->err != CUDA_SUCCESS)
+    ctx->err = cuMemAlloc(&ptr, size);
   if (ctx->err != CUDA_SUCCESS) {
     cuda_exit(ctx);
     return GA_IMPL_ERROR;
diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn
index 5bfc890fba..7b0f67a080 100644
--- a/src/loaders/libcuda.fn
+++ b/src/loaders/libcuda.fn
@@ -30,6 +30,7 @@ DEF_PROC_V2(cuMemGetInfo, (size_t *free, size_t *total));
 DEF_PROC_V2(cuMemAlloc, (CUdeviceptr *dptr, size_t bytesize));
 DEF_PROC_V2(cuMemFree, (CUdeviceptr dptr));
 DEF_PROC_V2(cuMemAllocHost, (void **pp, size_t bytesize));
+DEF_PROC(cuMemAllocManaged, (CUdeviceptr *dptr, size_t bytesize, unsigned int flags));
 DEF_PROC(cuMemFreeHost, (void *p));
 
 DEF_PROC_V2(cuMemcpyHtoDAsync, (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream));
diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h
index 3a6bf35a8a..dd80a0e48f 100644
--- a/src/loaders/libcuda.h
+++ b/src/loaders/libcuda.h
@@ -32,6 +32,7 @@ typedef enum CUctx_flags_enum CUctx_flags;
 typedef enum CUipcMem_flags_enum CUipcMem_flags;
 typedef enum CUjit_option_enum CUjit_option;
 typedef enum CUjitInputType_enum CUjitInputType;
+typedef enum CUmemAttach_flags_enum CUmemAttach_flags;
 
 #define CU_IPC_HANDLE_SIZE 64
 
@@ -187,6 +188,13 @@ enum CUipcMem_flags_enum {
   CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1
 };
 
+/* Attach flags passed as the `flags` argument of cuMemAllocManaged(). */
+enum CUmemAttach_flags_enum {
+  CU_MEM_ATTACH_GLOBAL = 0x1,
+  CU_MEM_ATTACH_HOST = 0x2,
+  CU_MEM_ATTACH_SINGLE = 0x4
+};
+
 enum CUjit_option_enum {
   CU_JIT_MAX_REGISTERS = 0,
   CU_JIT_THREADS_PER_BLOCK,