From e4e20846f7927125cb6e20096fa47c2ca7423859 Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Thu, 29 Aug 2024 09:22:18 -0400
Subject: [PATCH] fix: bad rebase of NNPACK Ext

---
 ext/NNlibNNPACK_jllExt/NNPACK.jl             |  55 --------
 ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl |  15 ---
 ext/NNlibNNPACK_jllExt/error.jl              |  83 ------------
 ext/NNlibNNPACK_jllExt/impl.jl               |  50 -------
 ext/NNlibNNPACK_jllExt/interface.jl          |  44 ------
 ext/NNlibNNPACK_jllExt/libnnpack.jl          | 135 -------------------
 ext/NNlibNNPACK_jllExt/libnnpack_types.jl    |  85 ------------
 ext/NNlibNNPACK_jllExt/performance.jl        |  31 -----
 8 files changed, 498 deletions(-)
 delete mode 100644 ext/NNlibNNPACK_jllExt/NNPACK.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/error.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/impl.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/interface.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/libnnpack.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/libnnpack_types.jl
 delete mode 100644 ext/NNlibNNPACK_jllExt/performance.jl

diff --git a/ext/NNlibNNPACK_jllExt/NNPACK.jl b/ext/NNlibNNPACK_jllExt/NNPACK.jl
deleted file mode 100644
index 685415a7e..000000000
--- a/ext/NNlibNNPACK_jllExt/NNPACK.jl
+++ /dev/null
@@ -1,55 +0,0 @@
-using NNPACK_jll
-
-include("libnnpack_types.jl")
-include("error.jl")
-include("libnnpack.jl")
-include("performance.jl")
-include("interface.jl")
-
-
-const shared_threadpool_dict = Dict{UInt64, Base.RefValue}()
-
-"""
-    is_nnpack_available()
-
-Checks if the current hardware is supported by NNPACK.
-"""
-function is_nnpack_available()
-    status = nnp_initialize()
-    if status == nnp_status_unsupported_hardware
-        return false
-    else
-        return true
-    end
-end
-
-"""
-    allocate_threadpool()
-
-Allocates several threadpool based on the upper limit on the number of threads for the machine.
-Allows NNPACK to intelligently choose which threadpool to use for getting the best
-performance.
-"""
-function allocate_threadpool()
-    global NNPACK_CPU_THREADS = NNPACK_CPU_THREADS > 8 ? UInt64(8) : UInt64(exp2(floor(log2(NNPACK_CPU_THREADS))))
-    for i in 0:Int(log2(NNPACK_CPU_THREADS))
-        threads = UInt64(2^i)
-        push!(shared_threadpool_dict, threads => Ref(pthreadpool_create(threads)))
-    end
-end
-
-@init begin
-    status = nnp_initialize()
-    if status == nnp_status_unsupported_hardware
-        @warn "Hardware is unsupported by NNPACK so falling back to default NNlib"
-    end
-    try
-        global NNPACK_CPU_THREADS = parse(UInt64, ENV["NNPACK_CPU_THREADS"])
-    catch
-        # Sys.CPU_THREADS should be a better default if we are tuning the benchmark suite on
-        # a particular machine. However, we fix the runtime threadpool here to have a max of
-        # 4 threads so anything above will be ignored anyways
-        global NNPACK_CPU_THREADS = UInt64(4)
-    end
-    allocate_threadpool()
-end
diff --git a/ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl b/ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl
deleted file mode 100644
index 1c9566841..000000000
--- a/ext/NNlibNNPACK_jllExt/NNlibNNPACK_jllExt.jl
+++ /dev/null
@@ -1,15 +0,0 @@
-module NNlibNNPACK_jllExt
-
-using NNlib: NNlib
-using NNPACK_jll, Pkg
-
-if isdefined(NNPACK_jll, :libnnpack)
-    include("NNPACK.jl")
-else
-    @warn "NNPACK not available for your platform: " *
-          "$(Pkg.BinaryPlatforms.platform_name(Pkg.BinaryPlatforms.platform_key_abi()))" *
-          "($(Pkg.BinaryPlatforms.triplet(Pkg.BinaryPlatforms.platform_key_abi())))
-          You will be able to use only the default Julia NNlib backend"
-end
-
-end
diff --git a/ext/NNlibNNPACK_jllExt/error.jl b/ext/NNlibNNPACK_jllExt/error.jl
deleted file mode 100644
index 83522c37d..000000000
--- a/ext/NNlibNNPACK_jllExt/error.jl
+++ /dev/null
@@ -1,83 +0,0 @@
-struct NNPACKError <: Exception
-    code::nnp_status
-    msg::AbstractString
-end
-
-Base.show(io::IO, err::NNPACKError) = print(io, "NNPACKError(code $(err.code), $(err.msg))")
-
-function NNPACKError(status::nnp_status)
-    msg = "NNPACK STATUS SUCCESS"
-    if status == nnp_status_invalid_batch_size
-        msg = "NNPACK STATUS INVALID BATCH SIZE"
-    elseif status == nnp_status_invalid_channels
-        msg = "NNPACK STATUS INVALID CHANNELS"
-    elseif status == nnp_status_invalid_input_channels
-        msg = "NNPACK STATUS INVALID INPUT CHANNELS"
-    elseif status == nnp_status_invalid_output_channels
-        msg = "NNPACK STATUS INVALID OUTPUT CHANNELS"
-    elseif status == nnp_status_invalid_input_size
-        msg = "NNPACK STATUS INVALID INPUT SIZE"
-    elseif status == nnp_status_invalid_input_stride
-        msg = "NNPACK STATUS INVALID INPUT STRIDE"
-    elseif status == nnp_status_invalid_input_padding
-        msg = "NNPACK STATUS INVALID INPUT PADDING"
-    elseif status == nnp_status_invalid_kernel_size
-        msg = "NNPACK STATUS INVALID KERNEL SIZE"
-    elseif status == nnp_status_invalid_pooling_size
-        msg = "NNPACK STATUS INVALID POOLING SIZE"
-    elseif status == nnp_status_invalid_pooling_stride
-        msg = "NNPACK STATUS INVALID POOLING STRIDE"
-    elseif status == nnp_status_invalid_algorithm
-        msg = "NNPACK STATUS INVALID ALGORITHM"
-    elseif status == nnp_status_invalid_transform_strategy
-        msg = "NNPACK STATUS INVALID TRANSFORM STRATEGY"
-    elseif status == nnp_status_invalid_output_subsampling
-        msg = "NNPACK STATUS INVALID OUTPUT SUBSAMPLING"
-    elseif status == nnp_status_invalid_activation
-        msg = "NNPACK STATUS INVALID ACTIVATION"
-    elseif status == nnp_status_invalid_activation_parameters
-        msg = "NNPACK STATUS INVALID ACTIVATION PARAMETERS"
-    elseif status == nnp_status_unsupported_input_size
-        msg = "NNPACK STATUS UNSUPPORTED INPUT SIZE"
-    elseif status == nnp_status_unsupported_input_stride
-        msg = "NNPACK STATUS UNSUPPORTED INPUT STRIDE"
-    elseif status == nnp_status_unsupported_input_padding
-        msg = "NNPACK STATUS UNSUPPORTED INPUT PADDING"
-    elseif status == nnp_status_unsupported_kernel_size
-        msg = "NNPACK STATUS UNSUPPORTED KERNEL SIZE"
-    elseif status == nnp_status_unsupported_pooling_size
-        msg = "NNPACK STATUS UNSUPPORTED POOLING SIZE"
-    elseif status == nnp_status_unsupported_pooling_stride
-        msg = "NNPACK STATUS UNSUPPORTED POOLING STRIDE"
-    elseif status == nnp_status_unsupported_algorithm
-        msg = "NNPACK STATUS UNSUPPORTED ALGORITHM"
-    elseif status == nnp_status_unsupported_transform_strategy
-        msg = "NNPACK STATUS UNSUPPORTED TRANSFORM STRATEGY"
-    elseif status == nnp_status_unsupported_activation
-        msg = "NNPACK STATUS UNSUPPORTED ACTIVATION"
-    elseif status == nnp_status_unsupported_activation_parameters
-        msg = "NNPACK STATUS UNSUPPORTED ACTIVATION PARAMETERS"
-    elseif status == nnp_status_uninitialized
-        msg = "NNPACK STATUS UNINITIALIZED"
-    elseif status == nnp_status_unsupported_hardware
-        msg = "NNPACK STATUS UNSUPPORTED HARDWARE"
-    elseif status == nnp_status_out_of_memory
-        msg = "NNPACK STATUS OUT OF MEMORY"
-    elseif status == nnp_status_insufficient_buffer
-        msg = "NNPACK STATUS INSUFFICIENT BUFFER"
-    elseif status == nnp_status_misaligned_buffer
-        msg = "NNPACK STATUS MISALIGNED BUFFER"
-    end
-    NNPACKError(status, msg)
-end
-
-macro nnpack_check(nnp_func)
-    quote
-        local err::nnp_status
-        err = $(esc(nnp_func))
-        if err != nnp_status_success
-            throw(NNPACKError(err))
-        end
-        err
-    end
-end
diff --git a/ext/NNlibNNPACK_jllExt/impl.jl b/ext/NNlibNNPACK_jllExt/impl.jl
deleted file mode 100644
index 3309404e1..000000000
--- a/ext/NNlibNNPACK_jllExt/impl.jl
+++ /dev/null
@@ -1,50 +0,0 @@
-function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}}
-    check_dims(size(x), size(y), pdims)
-    threadpool = select_threadpool(pdims, size(y, 4))
-    nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims),
-                           stride = stride(pdims), threadpool = threadpool)
-end
-
-function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims;
-                      b::A2 = zeros(Float32, size(x, 3)),
-                      algo = UInt32(0)) where {A1<:Array{Float32, 4},
-                                               A2<:Array{Float32, 1}}
-    check_dims(size(x), size(w), size(y), cdims)
-    threadpool = select_threadpool(cdims, size(y, 4))
-
-    if flipkernel(cdims) == 0
-        w = flipweight(w)
-    end
-
-    nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims),
-                           stride = stride(cdims), threadpool = threadpool)
-end
-
-function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims;
-                            algo = UInt32(0)) where{A<:Array{Float32, 4}}
-    check_dims(size(dx), size(w), size(dy), cdims)
-    threadpool = select_threadpool(cdims, size(dy, 4))
-
-    if flipkernel(cdims) == 0
-        w = flipweight(w)
-    end
-
-    nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims),
-                                   stride = stride(cdims), threadpool = threadpool)
-end
-
-function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims;
-                              algo = UInt32(0)) where{A<:Array{Float32, 4}}
-    check_dims(size(x), size(dw), size(dy), cdims)
-    threadpool = select_threadpool(cdims, size(dy, 4))
-
-    nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims),
-                                    stride = stride(cdims), threadpool = threadpool)
-
-    if flipkernel(cdims) == 0
-        dw .= flipweight(dw)
-    end
-
-    dw
-end
-
diff --git a/ext/NNlibNNPACK_jllExt/interface.jl b/ext/NNlibNNPACK_jllExt/interface.jl
deleted file mode 100644
index 6c4d86930..000000000
--- a/ext/NNlibNNPACK_jllExt/interface.jl
+++ /dev/null
@@ -1,44 +0,0 @@
-include("impl.jl")
-
-## NNPACK supports only Float32
-for (front_name, backend) in (
-        :conv => :_nnpack,
-        :∇conv_data => :_nnpack,
-        :∇conv_filter => :_nnpack,
-    )
-    @eval begin
-        function NNlib.$(Symbol("$(front_name)$(backend)!"))(
-                out::Array{T1,4}, in1::Array{T2,4}, in2::Array{T3,4},
-                cdims::ConvDims; kwargs...) where {T1, T2, T3}
-            @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1
-            # Output must of the same type as in the function signature
-            T1.($(Symbol("$(front_name)$(backend)!"))(Float32.(out), Float32.(in1),
-                Float32.(in2), cdims; kwargs...))
-        end
-    end
-end
-
-function maxpool_nnpack!(y::Array{T1, 4}, x::Array{T2, 4}, pdims::PoolDims;
-                         kwargs...) where {T1, T2}
-    @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1
-    # We want the output to be of the same type as desired
-    T1.(maxpool_nnpack!(Float32.(y), Float32.(x), pdims; kwargs...))
-end
-
-"""
-    nnpack_supported_operation(cdims::ConvDims)
-    nnpack_supported_operation(pdims::PoolDims)
-
-Returns `true` if nnpack supports the convolution/pooling operation for the given parameters.
-"""
-function nnpack_supported_operation(pdims::PoolDims{2, K, S, P, (1, 1)}) where {K, S, P}
-    val = input_size(pdims)[1:2] .+ (P[1] + P[2], P[3] + P[4]) .- K
-    return val .% S == (0, 0) ? true : false
-end
-
-function nnpack_supported_operation(cdims::ConvDims{2, K, (1, 1), P, (1, 1)}) where {K, S, P}
-    return true
-end
-
-# Return false for everything else
-nnpack_supported_operation(dims) = false
diff --git a/ext/NNlibNNPACK_jllExt/libnnpack.jl b/ext/NNlibNNPACK_jllExt/libnnpack.jl
deleted file mode 100644
index 2f3996c32..000000000
--- a/ext/NNlibNNPACK_jllExt/libnnpack.jl
+++ /dev/null
@@ -1,135 +0,0 @@
-#NOTE: We do the error handling of nnp_initialize while loading NNPACK
-function nnp_initialize()
-    ccall((:nnp_initialize, libnnpack), nnp_status, (),)
-end
-
-function nnp_deinitialize()
-    @nnpack_check ccall((:nnp_deinitialize, libnnpack), nnp_status, (),)
-end
-
-function pthreadpool_create(n = 0)
-    ccall((:pthreadpool_create, libnnpack), Ptr{Cvoid}, (Csize_t,), n)
-end
-
-function nnp_relu_output(batch_size, channels, input, output, negative_slope, threadpool)
-    @nnpack_check ccall((:nnp_relu_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, input, output, negative_slope, threadpool)
-end
-
-function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N}
-    # Investigate why the channel and batch dims need to specified like this
-    nnp_relu_output(prod(size(x)[N-1:N]), prod(size(x)[1:N-2]), x, y, negative_slope, threadpool)
-    y
-end
-
-function nnp_relu_input_gradient(batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
-    @nnpack_check ccall((:nnp_relu_input_gradient, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
-end
-
-function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N}
-    # Investigate why the channel and batch dims need to specified like this
-    nnp_relu_input_gradient(Csize_t(prod(size(x)[N-1:N])), prod(size(x)[1:N-2]), dy, x, dx, negative_slope, threadpool)
-    dx
-end
-
-function nnp_softmax_output(batch_size, channels, input, output, threadpool)
-    @nnpack_check ccall((:nnp_softmax_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input, output, threadpool)
-end
-
-function nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = C_NULL)
-    nnp_softmax_output(ndims(x) == 2 ? size(x, 2) : 1, size(x, 1), x, y, threadpool)
-    y
-end
-
-#FIXME: Output of fully connected not consistent with `kernel * input`
-#NOTE: This most likely due to nnpack being row major. Investigate this.
-
-function nnp_fully_connected_output(batch_size, input_channels, output_channels, input, kernel, output, threadpool, profile)
-    @nnpack_check ccall((:nnp_fully_connected_output, libnnpack), nnp_status, (Csize_t, Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t, Ptr{Cvoid}), batch_size, input_channels, output_channels, input, kernel, output, threadpool, C_NULL)
-end
-
-function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y::Array{Float32,2}; profile = nothing, threadpool = C_NULL)
-    profile = profile == nothing ? nnp_profile() : profile
-    nnp_fully_connected_output(size(x, 2), size(x, 1), size(w, 1), x, w, y, threadpool, profile)
-    y
-end
-
-function nnp_fully_connected_inference_f16f32(input_channels, output_channels, input, kernel, output, threadpool)
-    @nnpack_check ccall((:nnp_fully_connected_inference_f16f32, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cvoid}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
-end
-
-nnp_fully_connected_inference_f16f32(x::Array{Float32, 1}, w::Array{Float16,2}, y::Array{Float32, 1}; threadpool = C_NULL) =
-    nnp_fully_connected_inference(reshape(x, size(x), 1), w, reshape(y, size(y), 1), threadpool = threadpool)
-
-function nnp_fully_connected_inference_f16f32(x::Array{Float32, 2}, w::Array{Float16,2}, y::Array{Float32, 2}; threadpool = C_NULL)
-    nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool)
-    y
-end
-
-function nnp_fully_connected_inference(input_channels, output_channels, input, kernel, output, threadpool)
-    @nnpack_check ccall((:nnp_fully_connected_inference, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
-end
-
-nnp_fully_connected_inference(x::Array{Float32, 1}, w::Array{Float32,2}; threadpool = C_NULL) =
-    nnp_fully_connected_inference(reshape(x, size(x), 1), w, threadpool = threadpool)
-
-function nnp_fully_connected_inference(x::Array{Float32, 2}, w::Array{Float32, 2}, y::Array{Float32, 2}; threadpool = C_NULL)
-    nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool)
-    y
-end
-
-function nnp_max_pooling_output(batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
-    @nnpack_check ccall((:nnp_max_pooling_output, libnnpack), nnp_status, (Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
-end
-
-function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = C_NULL)
-    input_size = nnp_size(Csize_t.((size(x, 1), size(x, 2)))...)
-    pooling_size = nnp_size(Csize_t.(kernel)...)
-    input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
-    pooling_stride = nnp_size(Csize_t.(stride)...)
-    nnp_max_pooling_output(size(x, 4), size(x, 3), input_size, input_padding, pooling_size, pooling_stride, x, y, threadpool)
-    y
-end
-
-#TODO: Add wrapper for convolution inference
-
-function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @nnpack_check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
-end
-
-function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,4}, w::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
-    input_size = nnp_size(Csize_t.((size(dx,1), size(dx,2)))...)
-    kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
-    input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
-    profile = profile == nothing ? nnp_profile() : profile
-    workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
-    nnp_convolution_input_gradient(UInt32(algo), size(dx,4), size(dx,3), size(w,4), input_size, input_padding, kernel_size, dy, w, dx, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
-    dx
-end
-
-function nnp_convolution_kernel_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @nnpack_check ccall((:nnp_convolution_kernel_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
-end
-
-function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,4}, dy::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
-    input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
-    kernel_size = nnp_size(Csize_t.((size(dw,1),size(dw,2)))...)
-    input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
-    profile = profile == nothing ? nnp_profile() : profile
-    workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
-    nnp_convolution_kernel_gradient(UInt32(algo), size(x,4), size(x,3), size(dw,4), input_size, input_padding, kernel_size, x, dy, dw, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
-    dw
-end
-
-function nnp_convolution_output(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @nnpack_check ccall((:nnp_convolution_output, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
-end
-
-function nnp_convolution_output(y::Array{Float32,4}, x::Array{Float32,4}, w::Array{Float32,4}, b::Array{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
-    input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
-    kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
-    input_padding = nnp_padding(Csize_t(padding[3]), Csize_t(padding[2]), Csize_t(padding[4]), Csize_t(padding[1]))
-    profile = profile == nothing ? nnp_profile() : profile
-    workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
-    nnp_convolution_output(UInt32(algo), size(x,4), size(x,3), size(w,4), input_size, input_padding, kernel_size, x, w, b, y, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
-    y
-end
diff --git a/ext/NNlibNNPACK_jllExt/libnnpack_types.jl b/ext/NNlibNNPACK_jllExt/libnnpack_types.jl
deleted file mode 100644
index 6e7b23c16..000000000
--- a/ext/NNlibNNPACK_jllExt/libnnpack_types.jl
+++ /dev/null
@@ -1,85 +0,0 @@
-const nnp_status = UInt32
-const nnp_status_success = (UInt32)(0)
-const nnp_status_invalid_batch_size = (UInt32)(2)
-const nnp_status_invalid_channels = (UInt32)(3)
-const nnp_status_invalid_input_channels = (UInt32)(4)
-const nnp_status_invalid_output_channels = (UInt32)(5)
-const nnp_status_invalid_input_size = (UInt32)(10)
-const nnp_status_invalid_input_stride = (UInt32)(11)
-const nnp_status_invalid_input_padding = (UInt32)(12)
-const nnp_status_invalid_kernel_size = (UInt32)(13)
-const nnp_status_invalid_pooling_size = (UInt32)(14)
-const nnp_status_invalid_pooling_stride = (UInt32)(15)
-const nnp_status_invalid_algorithm = (UInt32)(16)
-const nnp_status_invalid_transform_strategy = (UInt32)(17)
-const nnp_status_invalid_output_subsampling = (UInt32)(13)
-const nnp_status_invalid_activation = (UInt32)(14)
-const nnp_status_invalid_activation_parameters = (UInt32)(15)
-const nnp_status_unsupported_input_size = (UInt32)(20)
-const nnp_status_unsupported_input_stride = (UInt32)(21)
-const nnp_status_unsupported_input_padding = (UInt32)(22)
-const nnp_status_unsupported_kernel_size = (UInt32)(23)
-const nnp_status_unsupported_pooling_size = (UInt32)(24)
-const nnp_status_unsupported_pooling_stride = (UInt32)(25)
-const nnp_status_unsupported_algorithm = (UInt32)(26)
-const nnp_status_unsupported_transform_strategy = (UInt32)(57)
-const nnp_status_unsupported_activation = (UInt32)(28)
-const nnp_status_unsupported_activation_parameters = (UInt32)(29)
-const nnp_status_uninitialized = (UInt32)(50)
-const nnp_status_unsupported_hardware = (UInt32)(51)
-const nnp_status_out_of_memory = (UInt32)(52)
-const nnp_status_insufficient_buffer = (UInt32)(53)
-const nnp_status_misaligned_buffer = (UInt32)(54)
-
-const nnp_activation = UInt32
-const nnp_activation_identity = (UInt32)(0)
-const nnp_activation_relu = (UInt32)(1)
-
-const nnp_convolution_algorithm = UInt32
-const nnp_convolution_algorithm_auto = (UInt32)(0)
-const nnp_convolution_algorithm_ft8x8 = (UInt32)(1)
-const nnp_convolution_algorithm_ft16x16 = (UInt32)(2)
-const nnp_convolution_algorithm_wt8x8 = (UInt32)(3)
-const nnp_convolution_algorithm_implicit_gemm = (UInt32)(4)
-const nnp_convolution_algorithm_direct = (UInt32)(5)
-const nnp_convolution_algorithm_wt8x8_fp16 = (UInt32)(6)
-
-const nnp_convolution_transform_strategy = UInt32
-const nnp_convolution_transform_strategy_compute = (UInt32)(1)
-const nnp_convolution_transform_strategy_precompute = (UInt32)(2)
-const nnp_convolution_transform_strategy_reuse = (UInt32)(3)
-
-const pthreadpool_t = Ptr{Nothing}
-
-mutable struct nnp_size
-    width::Csize_t
-    height::Csize_t
-    nnp_size() = new(Csize_t(0), Csize_t(0))
-    nnp_size(w, h) = new(Csize_t(w), Csize_t(h))
-end
-
-Base.unsafe_convert(::Type{Ptr{nnp_size}}, a::nnp_size) = Ptr{a}
-
-mutable struct nnp_padding
-    top::Csize_t
-    right::Csize_t
-    bottom::Csize_t
-    left::Csize_t
-    nnp_padding() = new(Csize_t(0), Csize_t(0), Csize_t(0), Csize_t(0))
-    nnp_padding(val) = new(Csize_t(val), Csize_t(val), Csize_t(val), Csize_t(val))
-    nnp_padding(t, r, b, l) = new(Csize_t(t), Csize_t(r), Csize_t(b), Csize_t(l))
-end
-
-Base.unsafe_convert(::Type{Ptr{nnp_padding}}, a::nnp_padding) = Ptr{a}
-
-mutable struct nnp_profile
-    total::Cdouble
-    input_transform::Cdouble
-    kernel_transform::Cdouble
-    output_transform::Cdouble
-    block_multiplication::Cdouble
-    nnp_profile() = new(Cdouble(0.0), Cdouble(0.0), Cdouble(0.0), Cdouble(0.0), Cdouble(0.0))
-    nnp_profile(t, it, kt, ot, bm) = new(Cdouble(t), Cdouble(it), Cdouble(kt), Cdouble(ot), Cdouble(bm))
-end
-
-Base.unsafe_convert(::Type{Ptr{nnp_profile}}, a::nnp_profile) = Ptr{a}
diff --git a/ext/NNlibNNPACK_jllExt/performance.jl b/ext/NNlibNNPACK_jllExt/performance.jl
deleted file mode 100644
index 24abdb411..000000000
--- a/ext/NNlibNNPACK_jllExt/performance.jl
+++ /dev/null
@@ -1,31 +0,0 @@
-function select_threadpool(cdims::DenseConvDims, batch_size::Int)
-    inp_size = input_size(cdims)[1]
-    if batch_size >= 32
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif batch_size >= 16 && inp_size >= 64
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif inp_size <= 32
-        return C_NULL
-    elseif inp_size >= 128
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif inp_size * batch_size >= 256
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    end
-    return C_NULL
-end
-
-function select_threadpool(pdims::PoolDims, batch_size::Int)
-    inp_size = input_size(pdims)[1]
-    if batch_size >= 32
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif batch_size >= 16 && inp_size >= 64
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif inp_size <= 32
-        return C_NULL
-    elseif inp_size >= 128
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    elseif inp_size * batch_size >= 256
-        return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][]
-    end
-    return C_NULL
-end