From e4e20846f7927125cb6e20096fa47c2ca7423859 Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Thu, 29 Aug 2024 09:22:18 -0400
Subject: [PATCH] fix: bad rebase of NNPACK Ext

---
 ext/NNlibNNPACK_jllExt/NNPACK.jl             |  55 --------
 ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl |  15 ---
 ext/NNlibNNPACK_jllExt/error.jl              |  83 ------------
 ext/NNlibNNPACK_jllExt/impl.jl               |  50 -------
 ext/NNlibNNPACK_jllExt/interface.jl          |  44 ------
 ext/NNlibNNPACK_jllExt/libnnpack.jl          | 135 -------------------
 ext/NNlibNNPACK_jllExt/libnnpack_types.jl    |  85 ------------
 ext/NNlibNNPACK_jllExt/performance.jl        |  31 -----
 8 files changed, 498 deletions(-)
 delete mode 100644 ext/NNlibNNPACK_jllExt/NNPACK.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/error.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/impl.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/interface.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/libnnpack.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/libnnpack_types.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/performance.jl

diff --git a/ext/NNlibNNPACK_jllExt/NNPACK.jl b/ext/NNlibNNPACK_jllExt/NNPACK.jl
deleted file mode 100644
index 685415a7e..000000000
--- a/ext/NNlibNNPACK_jllExt/NNPACK.jl
+++ /dev/null
@@ -1,55 +0,0 @@
-using NNPACK_jll
-
-include("libnnpack_types.jl")
-include("error.jl")
-include("libnnpack.jl")
-include("performance.jl")
-include("interface.jl")
-
-
-const shared_threadpool_dict = Dict{UInt64, Base.RefValue}()
-
-"""
-    is_nnpack_available()
-
-Checks if the current hardware is supported by NNPACK.
-"""
-function is_nnpack_available()
-    status = nnp_initialize()
-    if status == nnp_status_unsupported_hardware
-        return false
-    else
-        return true
-    end
-end
-
-"""
-    allocate_threadpool()
-
-Allocates several threadpool based on the upper limit on the number of threads for the machine.
-Allows NNPACK to intelligently choose which threadpool to use for getting the best
-performance.
-"""
-function allocate_threadpool()
-    global NNPACK_CPU_THREADS = NNPACK_CPU_THREADS > 8 ? UInt64(8) : UInt64(exp2(floor(log2(NNPACK_CPU_THREADS))))
-    for i in 0:Int(log2(NNPACK_CPU_THREADS))
-        threads = UInt64(2^i)
-        push!(shared_threadpool_dict, threads => Ref(pthreadpool_create(threads)))
-    end
-end
-
-@init begin
-    status = nnp_initialize()
-    if status == nnp_status_unsupported_hardware
-        @warn "Hardware is unsupported by NNPACK so falling back to default NNlib"
-    end
-    try
-        global NNPACK_CPU_THREADS = parse(UInt64, ENV["NNPACK_CPU_THREADS"])
-    catch
-        # Sys.CPU_THREADS should be a better default if we are tuning the benchmark suite on
-        # a particular machine. However, we fix the runtime threadpool here to have a max of
-        # 4 threads so anything above will be ignored anyways
-        global NNPACK_CPU_THREADS = UInt64(4)
-    end
-    allocate_threadpool()
-end
diff --git a/ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl b/ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl
deleted file mode 100644
index 1c9566841..000000000
--- a/ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl
+++ /dev/null
@@ -1,15 +0,0 @@
-module NNlibNNPACK_jllExt
-
-using NNlib: NNlib
-using NNPACK_jll, Pkg
-
-if isdefined(NNPACK_jll, :libnnpack)
-    include("NNPACK.jl")
-else
-    @warn "NNPACK not available for your platform: " *
-          "$(Pkg.BinaryPlatforms.platform_name(Pkg.BinaryPlatforms.platform_key_abi()))" *
-          "($(Pkg.BinaryPlatforms.triplet(Pkg.BinaryPlatforms.platform_key_abi())))
-          You will be able to use only the default Julia NNlib backend"
-end
-
-end
diff --git a/ext/NNlibNNPACK_jllExt/error.jl b/ext/NNlibNNPACK_jllExt/error.jl
deleted file mode 100644
index 83522c37d..000000000
--- a/ext/NNlibNNPACK_jllExt/error.jl
+++ /dev/null
@@ -1,83 +0,0 @@
-struct NNPACKError <: Exception
-    code::nnp_status
-    msg::AbstractString
-end
-
-Base.show(io::IO, err::NNPACKError) = print(io, "NNPACKError(code $(err.code), $(err.msg))")
-
-function NNPACKError(status::nnp_status)
-    msg = "NNPACK STATUS SUCCESS"
-    if status == nnp_status_invalid_batch_size
-        msg = "NNPACK STATUS INVALID BATCH SIZE"
-    elseif status == nnp_status_invalid_channels
-        msg = "NNPACK STATUS INVALID CHANNELS"
-    elseif status == nnp_status_invalid_input_channels
-        msg = "NNPACK STATUS INVALID INPUT CHANNELS"
-    elseif status == nnp_status_invalid_output_channels
-        msg = "NNPACK STATUS INVALID OUTPUT CHANNELS"
-    elseif status == nnp_status_invalid_input_size
-        msg = "NNPACK STATUS INVALID INPUT SIZE"
-    elseif status == nnp_status_invalid_input_stride
-        msg = "NNPACK STATUS INVALID INPUT STRIDE"
-    elseif status == nnp_status_invalid_input_padding
-        msg = "NNPACK STATUS INVALID INPUT PADDING"
-    elseif status == nnp_status_invalid_kernel_size
-        msg = "NNPACK STATUS INVALID KERNEL SIZE"
-    elseif status == nnp_status_invalid_pooling_size
-        msg = "NNPACK STATUS INVALID POOLING SIZE"
-    elseif status == nnp_status_invalid_pooling_stride
-        msg = "NNPACK STATUS INVALID POOLING STRIDE"
-    elseif status == nnp_status_invalid_algorithm
-        msg = "NNPACK STATUS INVALID ALGORITHM"
-    elseif status == nnp_status_invalid_transform_strategy
-        msg = "NNPACK STATUS INVALID TRANSFORM STRATEGY"
-    elseif status == nnp_status_invalid_output_subsampling
-        msg = "NNPACK STATUS INVALID OUTPUT SUBSAMPLING"
-    elseif status == nnp_status_invalid_activation
-        msg = "NNPACK STATUS INVALID ACTIVATION"
-    elseif status == nnp_status_invalid_activation_parameters
-        msg = "NNPACK STATUS INVALID ACTIVATION PARAMETERS"
-    elseif status == nnp_status_unsupported_input_size
-        msg = "NNPACK STATUS UNSUPPORTED INPUT SIZE"
-    elseif status == nnp_status_unsupported_input_stride
-        msg = "NNPACK STATUS UNSUPPORTED INPUT STRIDE"
-    elseif status == nnp_status_unsupported_input_padding
-        msg = "NNPACK STATUS UNSUPPORTED INPUT PADDING"
-    elseif status == nnp_status_unsupported_kernel_size
-        msg = "NNPACK STATUS UNSUPPORTED KERNEL SIZE"
-    elseif status == nnp_status_unsupported_pooling_size
-        msg = "NNPACK STATUS UNSUPPORTED POOLING SIZE"
-    elseif status == nnp_status_unsupported_pooling_stride
-        msg = "NNPACK STATUS UNSUPPORTED POOLING STRIDE"
-    elseif status == nnp_status_unsupported_algorithm
-        msg = "NNPACK STATUS UNSUPPORTED ALGORITHM"
-    elseif status == nnp_status_unsupported_transform_strategy
-        msg = "NNPACK STATUS UNSUPPORTED TRANSFORM STRATEGY"
-    elseif status == nnp_status_unsupported_activation
-        msg = "NNPACK STATUS UNSUPPORTED ACTIVATION"
-    elseif status == nnp_status_unsupported_activation_parameters
-        msg = "NNPACK STATUS UNSUPPORTED ACTIVATION PARAMETERS"
-    elseif status == nnp_status_uninitialized
-        msg = "NNPACK STATUS UNINITIALIZED"
-    elseif status == nnp_status_unsupported_hardware
-        msg = "NNPACK STATUS UNSUPPORTED HARDWARE"
-    elseif status == nnp_status_out_of_memory
-        msg = "NNPACK STATUS OUT OF MEMORY"
-    elseif status == nnp_status_insufficient_buffer
-        msg = "NNPACK STATUS INSUFFICIENT BUFFER"
-    elseif status == nnp_status_misaligned_buffer
-        msg = "NNPACK STATUS MISALIGNED BUFFER"
-    end
-    NNPACKError(status, msg)
-end
-
-macro nnpack_check(nnp_func)
-    quote
-        local err::nnp_status
-        err = $(esc(nnp_func))
-        if err != nnp_status_success
-            throw(NNPACKError(err))
-        end
-        err
-    end
-end
diff --git a/ext/NNlibNNPACK_jllExt/impl.jl b/ext/NNlibNNPACK_jllExt/impl.jl
deleted file mode 100644
index 3309404e1..000000000
--- a/ext/NNlibNNPACK_jllExt/impl.jl
+++ /dev/null
@@ -1,50 +0,0 @@
-function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}}
-    check_dims(size(x), size(y), pdims)
-    threadpool = select_threadpool(pdims, size(y, 4))
-    nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims),
-                           stride = stride(pdims), threadpool = threadpool)
-end
-
-function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims;
-                      b::A2 = zeros(Float32, size(x, 3)),
-                      algo = UInt32(0)) where {A1<:Array{Float32, 4},
-                                               A2<:Array{Float32, 1}}
-    check_dims(size(x), size(w), size(y), cdims)
-    threadpool = select_threadpool(cdims, size(y, 4))
-
-    if flipkernel(cdims) == 0
-        w = flipweight(w)
-    end
-
-    nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims),
-                           stride = stride(cdims), threadpool = threadpool)
-end
-
-function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims;
-                            algo = UInt32(0)) where{A<:Array{Float32, 4}}
-    check_dims(size(dx), size(w), size(dy), cdims)
-    threadpool = select_threadpool(cdims, size(dy, 4))
-
-    if flipkernel(cdims) == 0
-        w = flipweight(w)
-    end
-
-    nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims),
-                                   stride = stride(cdims), threadpool = threadpool)
-end
-
-function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims;
-                              algo = UInt32(0)) where{A<:Array{Float32, 4}}
-    check_dims(size(x), size(dw), size(dy), cdims)
-    threadpool = select_threadpool(cdims, size(dy, 4))
-
-    nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims),
-                                    stride = stride(cdims), threadpool = threadpool)
-
-    if flipkernel(cdims) == 0
-        dw .= flipweight(dw)
-    end
-
-    dw
-end
-
diff --git a/ext/NNlibNNPACK_jllExt/interface.jl b/ext/NNlibNNPACK_jllExt/interface.jl
deleted file mode 100644
index 6c4d86930..000000000
--- a/ext/NNlibNNPACK_jllExt/interface.jl
+++ /dev/null
@@ -1,44 +0,0 @@
-include("impl.jl")
-
-## NNPACK supports only Float32
-for (front_name, backend) in (
-        :conv => :_nnpack,
-        :∇conv_data => :_nnpack,
-        :∇conv_filter => :_nnpack,
-    )
-    @eval begin
-        function NNlib.$(Symbol("$(front_name)$(backend)!"))(
-                out::Array{T1,4}, in1::Array{T2,4}, in2::Array{T3,4},
-                cdims::ConvDims; kwargs...) where {T1, T2, T3}
-            @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1
-            # Output must of the same type as in the function signature
-            T1.($(Symbol("$(front_name)$(backend)!"))(Float32.(out), Float32.(in1),
-                Float32.(in2), cdims; kwargs...))
-        end
-    end
-end
-
-function maxpool_nnpack!(y::Array{T1, 4}, x::Array{T2, 4}, pdims::PoolDims;
-                         kwargs...) where {T1, T2}
-    @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1
-    # We want the output to be of the same type as desired
-    T1.(maxpool_nnpack!(Float32.(y), Float32.(x), pdims; kwargs...))
-end
-
-"""
-    nnpack_supported_operation(cdims::ConvDims)
-    nnpack_supported_operation(pdims::PoolDims)
-
-Returns `true` if nnpack supports the convolution/pooling operation for the given parameters.
-"""
-function nnpack_supported_operation(pdims::PoolDims{2, K, S, P, (1, 1)}) where {K, S, P}
-    val = input_size(pdims)[1:2] .+ (P[1] + P[2], P[3] + P[4]) .- K
-    return val .% S == (0, 0) ? true : false
-end
-
-function nnpack_supported_operation(cdims::ConvDims{2, K, (1, 1), P, (1, 1)}) where {K, S, P}
-    return true
-end
-
-# Return false for everything else
-nnpack_supported_operation(dims) = false
diff --git a/ext/NNlibNNPACK_jllExt/libnnpack.jl b/ext/NNlibNNPACK_jllExt/libnnpack.jl
deleted file mode 100644
index 2f3996c32..000000000
--- a/ext/NNlibNNPACK_jllExt/libnnpack.jl
+++ /dev/null
@@ -1,135 +0,0 @@
-#NOTE: We do the error handling of nnp_initialize while loading NNPACK
-function nnp_initialize()
-    ccall((:nnp_initialize, libnnpack), nnp_status, (),)
-end
-
-function nnp_deinitialize()
-    @nnpack_check ccall((:nnp_deinitialize, libnnpack), nnp_status, (),)
-end
-
-function pthreadpool_create(n = 0)
-    ccall((:pthreadpool_create, libnnpack), Ptr{Cvoid}, (Csize_t,), n)
-end
-
-function nnp_relu_output(batch_size, channels, input, output, negative_slope, threadpool)
-    @nnpack_check ccall((:nnp_relu_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, input, output, negative_slope, threadpool)
-end
-
-function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N}
-    # Investigate why the channel and batch dims need to specified like this
-    nnp_relu_output(prod(size(x)[N-1:N]), prod(size(x)[1:N-2]), x, y, negative_slope, threadpool)
-    y
-end
-
-function nnp_relu_input_gradient(batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
-    @nnpack_check ccall((:nnp_relu_input_gradient, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
-end
-
-function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N}
-    # Investigate why the channel and batch dims need to specified like this
-    nnp_relu_input_gradient(Csize_t(prod(size(x)[N-1:N])), prod(size(x)[1:N-2]), dy, x, dx, negative_slope, threadpool)
-    dx
-end
-
-function nnp_softmax_output(batch_size, channels, input, output, threadpool)
-    @nnpack_check ccall((:nnp_softmax_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input, output, threadpool)
-end
-
-function nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = C_NULL)
-    nnp_softmax_output(ndims(x) == 2 ? size(x, 2) : 1, size(x, 1), x, y, threadpool)
-    y
-end
-
-#FIXME: Output of fully connected not consistent with `kernel * input`
-#NOTE: This most likely due to nnpack being row major. Investigate this.
-
-function nnp_fully_connected_output(batch_size, input_channels, output_channels, input, kernel, output, threadpool, profile)
-    @nnpack_check ccall((:nnp_fully_connected_output, libnnpack), nnp_status, (Csize_t, Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t, Ptr{Cvoid}), batch_size, input_channels, output_channels, input, kernel, output, threadpool, C_NULL)
-end
-
-function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y::Array{Float32,2}; profile = nothing, threadpool = C_NULL)
-    profile = profile == nothing ? nnp_profile() : profile
-    nnp_fully_connected_output(size(x, 2), size(x, 1), size(w, 1), x, w, y, threadpool, profile)
-    y
-end
-
-function nnp_fully_connected_inference_f16f32(input_channels, output_channels, input, kernel, output, threadpool)
-    @nnpack_check ccall((:nnp_fully_connected_inference_f16f32, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cvoid}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
-end
-
-nnp_fully_connected_inference_f16f32(x::Array{Float32, 1}, w::Array{Float16,2}, y::Array{Float32, 1}; threadpool = C_NULL) =
-    nnp_fully_connected_inference(reshape(x, size(x), 1), w, reshape(y, size(y), 1), threadpool = threadpool)
-
-function nnp_fully_connected_inference_f16f32(x::Array{Float32, 2}, w::Array{Float16,2}, y::Array{Float32, 2}; threadpool = C_NULL)
-    nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool)
-    y
-end
-
-function nnp_fully_connected_inference(input_channels, output_channels, input, kernel, output, threadpool)
-    @nnpack_check ccall((:nnp_fully_connected_inference, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
-end
-
-nnp_fully_connected_inference(x::Array{Float32, 1}, w::Array{Float32,2}; threadpool = C_NULL) =
-    nnp_fully_connected_inference(reshape(x, size(x), 1), w, threadpool = threadpool)
-
-function nnp_fully_connected_inference(x::Array{Float32, 2}, w::Array{Float32, 2}, y::Array{Float32, 2}; threadpool = C_NULL)
-    nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool)
-    y
-end
-
-function nnp_max_pooling_output(batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
-    @nnpack_check ccall((:nnp_max_pooling_output, libnnpack), nnp_status, (Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
-end
-
-function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = C_NULL)
-    input_size = nnp_size(Csize_t.((size(x, 1), size(x, 2)))...)
-    pooling_size = nnp_size(Csize_t.(kernel)...)
-    input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
-    pooling_stride = nnp_size(Csize_t.(stride)...)
-    nnp_max_pooling_output(size(x, 4), size(x, 3), input_size, input_padding, pooling_size, pooling_stride, x, y, threadpool)
-    y
-end
-
-#TODO: Add wrapper for convolution inference
-
-function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @nnpack_check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
-end
-
-function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,4}, w::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
-    input_size = nnp_size(Csize_t.((size(dx,1), size(dx,2)))...)
-    kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
-    input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
-    profile = profile == nothing ? nnp_profile() : profile
-    workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
-    nnp_convolution_input_gradient(UInt32(algo), size(dx,4), size(dx,3), size(w,4), input_size, input_padding, kernel_size, dy, w, dx, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
-    dx
-end
-
-function nnp_convolution_kernel_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @nnpack_check ccall((:nnp_convolution_kernel_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
-end
-
-function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,4}, dy::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
-    input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
-    kernel_size = nnp_size(Csize_t.((size(dw,1),size(dw,2)))...)
-    input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
-    profile = profile == nothing ? nnp_profile() : profile
-    workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
-    nnp_convolution_kernel_gradient(UInt32(algo), size(x,4), size(x,3), size(dw,4), input_size, input_padding, kernel_size, x, dy, dw, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
-    dw
-end
-
-function nnp_convolution_output(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @nnpack_check ccall((:nnp_convolution_output, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
-end
-
-function nnp_convolution_output(y::Array{Float32,4}, x::Array{Float32,4}, w::Array{Float32,4}, b::Array{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
-    input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
-    kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
-    input_padding = nnp_padding(Csize_t(padding[3]), Csize_t(padding[2]), Csize_t(padding[4]), Csize_t(padding[1]))
-    profile = profile == nothing ? nnp_profile() : profile
-    workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
-    nnp_convolution_output(UInt32(algo), size(x,4), size(x,3), size(w,4), input_size, input_padding, kernel_size, x, w, b, y, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
-    y
-end
diff --git a/ext/NNlibNNPACK_jllExt/libnnpack_types.jl b/ext/NNlibNNPACK_jllExt/libnnpack_types.jl
deleted file mode 100644
index 6e7b23c16..000000000
--- a/ext/NNlibNNPACK_jllExt/libnnpack_types.jl
+++ /dev/null
@@ -1,85 +0,0 @@
-const nnp_status = UInt32
-const nnp_status_success = (UInt32)(0)
-const nnp_status_invalid_batch_size = (UInt32)(2)
-const nnp_status_invalid_channels = (UInt32)(3)
-const nnp_status_invalid_input_channels = (UInt32)(4)
-const nnp_status_invalid_output_channels = (UInt32)(5)
-const nnp_status_invalid_input_size = (UInt32)(10)
-const nnp_status_invalid_input_stride = (UInt32)(11)
-const nnp_status_invalid_input_padding = (UInt32)(12)
-const nnp_status_invalid_kernel_size = (UInt32)(13)
-const nnp_status_invalid_pooling_size = (UInt32)(14)
-const nnp_status_invalid_pooling_stride = (UInt32)(15)
-const nnp_status_invalid_algorithm = (UInt32)(16)
-const nnp_status_invalid_transform_strategy = (UInt32)(17)
-const nnp_status_invalid_output_subsampling = (UInt32)(13)
-const nnp_status_invalid_activation = (UInt32)(14)
-const nnp_status_invalid_activation_parameters = (UInt32)(15)
-const nnp_status_unsupported_input_size = (UInt32)(20)
-const nnp_status_unsupported_input_stride = (UInt32)(21)
-const nnp_status_unsupported_input_padding = (UInt32)(22)
-const nnp_status_unsupported_kernel_size = (UInt32)(23)
-const nnp_status_unsupported_pooling_size = (UInt32)(24)
-const nnp_status_unsupported_pooling_stride = (UInt32)(25)
-const nnp_status_unsupported_algorithm = (UInt32)(26)
-const nnp_status_unsupported_transform_strategy = (UInt32)(57)
-const nnp_status_unsupported_activation = (UInt32)(28)
-const nnp_status_unsupported_activation_parameters = (UInt32)(29)
-const nnp_status_uninitialized = (UInt32)(50)
-const nnp_status_unsupported_hardware = (UInt32)(51)
-const nnp_status_out_of_memory = (UInt32)(52)
-const nnp_status_insufficient_buffer = (UInt32)(53)
-const nnp_status_misaligned_buffer = (UInt32)(54)
-
-const nnp_activation = UInt32
-const nnp_activation_identity = (UInt32)(0)
-const nnp_activation_relu = (UInt32)(1)
-
-const nnp_convolution_algorithm = UInt32
-const nnp_convolution_algorithm_auto = (UInt32)(0)
-const nnp_convolution_algorithm_ft8x8 = (UInt32)(1)
-const nnp_convolution_algorithm_ft16x16 = (UInt32)(2)
-const nnp_convolution_algorithm_wt8x8 = (UInt32)(3)
-const nnp_convolution_algorithm_implicit_gemm = (UInt32)(4)
-const nnp_convolution_algorithm_direct = (UInt32)(5)
-const nnp_convolution_algorithm_wt8x8_fp16 = (UInt32)(6)
-
-const nnp_convolution_transform_strategy = UInt32
-const nnp_convolution_transform_strategy_compute = (UInt32)(1)
-const nnp_convolution_transform_strategy_precompute = (UInt32)(2)
-const nnp_convolution_transform_strategy_reuse = (UInt32)(3)
-
-const pthreadpool_t = Ptr{Nothing}
-
-mutable struct nnp_size
-    width::Csize_t
-    height::Csize_t
-    nnp_size() = new(Csize_t(0), Csize_t(0))
-    nnp_size(w, h) = new(Csize_t(w), Csize_t(h))
-end
-
-Base.unsafe_convert(::Type{Ptr{nnp_size}}, a::nnp_size) = Ptr{a}
-
-mutable struct nnp_padding
-    top::Csize_t
-    right::Csize_t
-    bottom::Csize_t
-    left::Csize_t
-    nnp_padding() = new(Csize_t(0), Csize_t(0), Csize_t(0), Csize_t(0))
-    nnp_padding(val) = new(Csize_t(val), Csize_t(val), Csize_t(val), Csize_t(val))
-    nnp_padding(t, r, b, l) = new(Csize_t(t), Csize_t(r), Csize_t(b), Csize_t(l))
-end
-
-Base.unsafe_convert(::Type{Ptr{nnp_padding}}, a::nnp_padding) = Ptr{a}
-
-mutable struct nnp_profile
-    total::Cdouble
-    input_transform::Cdouble
-    kernel_transform::Cdouble
-    output_transform::Cdouble
-    block_multiplication::Cdouble
-    nnp_profile() = new(Cdouble(0.0), Cdouble(0.0), Cdouble(0.0), Cdouble(0.0), Cdouble(0.0))
-    nnp_profile(t, it, kt, ot, bm) = new(Cdouble(t), Cdouble(it), Cdouble(kt), Cdouble(ot), Cdouble(bm))
-end
-
-Base.unsafe_convert(::Type{Ptr{nnp_profile}}, a::nnp_profile) = Ptr{a}
diff --git a/ext/NNlibNNPACK_jllExt/performance.jl b/ext/NNlibNNPACK_jllExt/performance.jl
deleted file mode 100644
index 24abdb411..000000000
--- a/ext/NNlibNNPACK_jllExt/performance.jl
+++ /dev/null
@@ -1,31 +0,0 @@
-function select_threadpool(cdims::DenseConvDims, batch_size::Int)
-    inp_size = input_size(cdims)[1]
-    if batch_size >= 32
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif batch_size >= 16 && inp_size >= 64
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif inp_size <= 32
-        return C_NULL
-    elseif inp_size >= 128
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif inp_size * batch_size >= 256
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    end
-    return C_NULL
-end
-
-function select_threadpool(pdims::PoolDims, batch_size::Int)
-    inp_size = input_size(pdims)[1]
-    if batch_size >= 32
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif batch_size >= 16 && inp_size >= 64
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif inp_size <= 32
-        return C_NULL
-    elseif inp_size >= 128
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif inp_size * batch_size >= 256
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    end
-    return C_NULL
-end