cuDNN-based DNN ops #815

Open · wants to merge 4 commits into base: main
4 changes: 4 additions & 0 deletions src/runtime/local/context/CUDAContext.cpp
@@ -61,6 +61,10 @@ void CUDAContext::init() {
CHECK_CUDNN(cudnnCreateActivationDescriptor(&activation_desc));
CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
CHECK_CUDNN(cudnnCreateFilterDescriptor(&filter_desc));

CHECK_CUDNN(cudnnCreateTensorDescriptor(&dy_tensor_desc));
CHECK_CUDNN(cudnnCreateTensorDescriptor(&bn_scale_bias_tensor_desc));

CHECK_CUSOLVER(cusolverDnCreate(&cusolver_handle));

CHECK_CUDART(cudaStreamCreateWithFlags(&cusolver_stream, cudaStreamNonBlocking));
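
The hunk above only adds the create calls for the two new tensor descriptors. As a hedged reminder (the teardown code is not part of this diff), each create needs a matching cudnnDestroyTensorDescriptor in the context's cleanup path, mirroring the existing descriptors; the exact placement below is an assumption:

// Assumed counterpart to the create calls above; the location in CUDAContext's
// teardown routine is not shown in this diff.
CHECK_CUDNN(cudnnDestroyTensorDescriptor(dy_tensor_desc));
CHECK_CUDNN(cudnnDestroyTensorDescriptor(bn_scale_bias_tensor_desc));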
2 changes: 2 additions & 0 deletions src/runtime/local/context/CUDAContext.h
@@ -104,6 +104,8 @@ class CUDAContext final : public IContext {
cudnnConvolutionDescriptor_t conv_desc{};
cudnnBatchNormMode_t bn_mode = CUDNN_BATCHNORM_SPATIAL;

cudnnTensorDescriptor_t dy_tensor_desc{}, bn_scale_bias_tensor_desc{};

// A block size of 256 works well in many cases.
// Putting it here to avoid hard coding things elsewhere.
const uint32_t default_block_size = 256;
60 changes: 60 additions & 0 deletions src/runtime/local/kernels/CUDA/BatchNorm.cpp
@@ -16,6 +16,7 @@

#include "BatchNorm.h"
#include <runtime/local/datastructures/AllocationDescriptorCUDA.h>
#include <iostream>

namespace CUDA::BatchNorm {
template<typename DTRes, typename DTArg>
@@ -52,7 +53,66 @@ namespace CUDA::BatchNorm {
d_gamma, d_beta, d_ema_mean, d_ema_var, eps));
}

template<typename DTRes, typename DTArg>
void Backward<DTRes, DTArg>::apply(DTRes *&dX, DTRes *&dGamma, DTRes *&dBeta,
const DTArg *mean, const DTArg *invVar,
const DTArg *in, const DTArg *dout,
const DTArg *gamma, const typename DTArg::VT eps, DCTX(dctx))
{
const size_t deviceID = 0; //ToDo: multi device support
auto ctx = CUDAContext::get(dctx, deviceID);
AllocationDescriptorCUDA alloc_desc(dctx, deviceID);
using VT = typename DTRes::VT;
const size_t N = in->getNumRows();
const size_t CHW = in->getNumCols();
const size_t C = gamma->getNumRows();
const size_t HW = CHW / C;
// deriving H from sqrt(HW) assumes square feature maps (H == W)
auto H = static_cast<size_t>(std::sqrt(HW));

VT alphaDataDiff = 1.0;
VT betaDataDiff = 0.0;
VT alphaParamDiff = 1.0;
VT betaParamDiff = 0.0;

const VT* d_mean = mean->getValues(&alloc_desc);
const VT* d_invVar = invVar->getValues(&alloc_desc);
const VT* d_in = in->getValues(&alloc_desc);
const VT* d_gamma = gamma->getValues(&alloc_desc);
const VT* d_dout = dout->getValues(&alloc_desc);

CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->src_tensor_desc, ctx->tensor_format, ctx->getCUDNNDataType<VT>(), N, C, H, H));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->dy_tensor_desc, ctx->tensor_format, ctx->getCUDNNDataType<VT>(), N, C, H, H));

CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->dst_tensor_desc, ctx->tensor_format, ctx->getCUDNNDataType<VT>(), N, C, H, H));
CHECK_CUDNN(cudnnDeriveBNTensorDescriptor(ctx->bn_scale_bias_tensor_desc, ctx->src_tensor_desc, ctx->bn_mode));

if (dX == nullptr)
dX = DataObjectFactory::create<DenseMatrix<VT>>(N, CHW, false, &alloc_desc);
if (dGamma == nullptr)
dGamma = DataObjectFactory::create<DenseMatrix<VT>>(C, 1, false, &alloc_desc);
if (dBeta == nullptr)
dBeta = DataObjectFactory::create<DenseMatrix<VT>>(C, 1, false, &alloc_desc);

VT* d_dX = dX->getValues(&alloc_desc);
VT* d_dGamma = dGamma->getValues(&alloc_desc);
VT* d_dBeta = dBeta->getValues(&alloc_desc);

CHECK_CUDNN(cudnnBatchNormalizationBackward(ctx->getCUDNNHandle(),
ctx->bn_mode,
&alphaDataDiff, &betaDataDiff, &alphaParamDiff, &betaParamDiff,
ctx->src_tensor_desc, d_in,
ctx->dy_tensor_desc, d_dout,
ctx->dst_tensor_desc, d_dX,
ctx->bn_scale_bias_tensor_desc, d_gamma, d_dGamma, d_dBeta,
eps,
d_mean, d_invVar));

}

template struct Forward<DenseMatrix<float>, DenseMatrix<float>>;
template struct Forward<DenseMatrix<double>, DenseMatrix<double>>;

template struct Backward<DenseMatrix<float>, DenseMatrix<float>>;
template struct Backward<DenseMatrix<double>, DenseMatrix<double>>;
}
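
For orientation, the quantities returned by cudnnBatchNormalizationBackward in CUDNN_BATCHNORM_SPATIAL mode can be written as plain per-channel reductions over the N*H*W elements of each channel. The following CPU sketch is illustrative only (not part of the PR) and uses the same assumptions as the kernel above: row-major N x (C*H*W) layout and invVar = 1/sqrt(var + eps).

#include <cstddef>
#include <vector>

// Illustrative CPU reference for spatial batch-norm backward (not part of this PR).
// x, dout: N x (C*HW) row-major; mean, invVar, gamma: one value per channel.
void batchNormBackwardRef(const std::vector<double> &x, const std::vector<double> &dout,
                          const std::vector<double> &mean, const std::vector<double> &invVar,
                          const std::vector<double> &gamma,
                          size_t N, size_t C, size_t HW,
                          std::vector<double> &dX, std::vector<double> &dGamma,
                          std::vector<double> &dBeta) {
    dX.assign(N * C * HW, 0.0);
    dGamma.assign(C, 0.0);
    dBeta.assign(C, 0.0);
    const double M = static_cast<double>(N * HW); // elements per channel
    for (size_t c = 0; c < C; ++c) {
        // dBeta_c = sum(dout), dGamma_c = sum(dout * xhat), with xhat = (x - mean_c) * invVar_c
        for (size_t n = 0; n < N; ++n)
            for (size_t i = 0; i < HW; ++i) {
                const size_t idx = n * C * HW + c * HW + i;
                const double xhat = (x[idx] - mean[c]) * invVar[c];
                dBeta[c] += dout[idx];
                dGamma[c] += dout[idx] * xhat;
            }
        // dX = gamma_c * invVar_c / M * (M * dout - dBeta_c - xhat * dGamma_c)
        for (size_t n = 0; n < N; ++n)
            for (size_t i = 0; i < HW; ++i) {
                const size_t idx = n * C * HW + c * HW + i;
                const double xhat = (x[idx] - mean[c]) * invVar[c];
                dX[idx] = gamma[c] * invVar[c] / M * (M * dout[idx] - dBeta[c] - xhat * dGamma[c]);
            }
    }
}

This is consistent with the expected values in the new test below: with gamma = 1, dBeta per channel is the plain sum of dout (20, 52, 84), and dGamma per channel is close to 8.9442 (= 10 / sqrt(1.25)).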

9 changes: 9 additions & 0 deletions src/runtime/local/kernels/CUDA/BatchNorm.h
@@ -23,9 +23,18 @@
#include "HostUtils.h"

namespace CUDA::BatchNorm {

template<typename DTRes, typename DTArg>
struct Forward {
static void apply(DTRes *&res, const DTArg *data, const DTArg *gamma, const DTArg *beta, const DTArg *ema_mean,
const DTArg *ema_var, typename DTArg::VT eps, DCTX(dctx));
};

template<typename DTRes, typename DTArg>
struct Backward {
static void apply(DTRes *&dX, DTRes *&dGamma, DTRes *&dBeta,
const DTArg *mean, const DTArg *invVar,
const DTArg *in, const DTArg *dout,
const DTArg *gamma, const typename DTArg::VT eps, DCTX(dctx));
};
}
70 changes: 70 additions & 0 deletions src/runtime/local/kernels/CUDA/Pooling.cpp
@@ -61,10 +61,80 @@ namespace CUDA::NN::Pooling {
d_input, &blend_beta, ctx->dst_tensor_desc, d_res));
}

template<template<typename> class OP, typename DTRes, typename DTArg>
void Backward<OP, DTRes, DTArg>::apply(DTRes *&res,
const DTArg *input, const DTArg *output, const DTArg *dOut,
const size_t batch_size, const size_t num_channels,
const size_t img_h, const size_t img_w,
const size_t pool_h, const size_t pool_w,
const size_t stride_h, const size_t stride_w,
const size_t pad_h, const size_t pad_w,
DCTX(dctx))
{
const size_t deviceID = 0; //ToDo: multi device support
auto ctx = CUDAContext::get(dctx, deviceID);
AllocationDescriptorCUDA alloc_desc(dctx, deviceID);

using VT = typename DTRes::VT;
const VT blend_alpha = 1;
const VT blend_beta = 0;

CHECK_CUDNN(cudnnSetPooling2dDescriptor(ctx->pooling_desc,
OP<VT>::isMAX() ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING,
CUDNN_PROPAGATE_NAN,
pool_h, pool_w,
pad_h, pad_w,
stride_h, stride_w));

const VT* d_input = input->getValues(&alloc_desc);
CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->src_tensor_desc,
ctx->tensor_format,
ctx->getCUDNNDataType<VT>(),
batch_size, num_channels, img_h, img_w));

const int tensorDims = 4;
int tensorOutputDimA[tensorDims];
CHECK_CUDNN(cudnnGetPoolingNdForwardOutputDim(ctx->pooling_desc, ctx->src_tensor_desc, tensorDims,
tensorOutputDimA));

int n = tensorOutputDimA[0]; int c = tensorOutputDimA[1];
int h = tensorOutputDimA[2]; int w = tensorOutputDimA[3];

const VT* d_output = output->getValues(&alloc_desc);
CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->dst_tensor_desc,
ctx->tensor_format,
ctx->getCUDNNDataType<VT>(),
n, c, h, w));

const VT* d_dOut = dOut->getValues(&alloc_desc);
CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->dy_tensor_desc,
ctx->tensor_format,
ctx->getCUDNNDataType<VT>(),
n, c, h, w));

if (res == nullptr) {
res = DataObjectFactory::create<DTRes>(batch_size, num_channels * img_h * img_w, false, &alloc_desc);
}
VT* d_res = res->getValues(&alloc_desc);

CHECK_CUDNN(cudnnPoolingBackward(ctx->getCUDNNHandle(),
ctx->pooling_desc, &blend_alpha,
ctx->dst_tensor_desc, d_output,
ctx->dy_tensor_desc, d_dOut,
ctx->src_tensor_desc, d_input,
&blend_beta, ctx->src_tensor_desc, d_res));
}

template struct Forward<::NN::Pooling::AVG, DenseMatrix<float>, DenseMatrix<float>>;
template struct Forward<::NN::Pooling::AVG, DenseMatrix<double>, DenseMatrix<double>>;

template struct Forward<::NN::Pooling::MAX, DenseMatrix<float>, DenseMatrix<float>>;
template struct Forward<::NN::Pooling::MAX, DenseMatrix<double>, DenseMatrix<double>>;

template struct Backward<::NN::Pooling::AVG, DenseMatrix<float>, DenseMatrix<float>>;
template struct Backward<::NN::Pooling::AVG, DenseMatrix<double>, DenseMatrix<double>>;

template struct Backward<::NN::Pooling::MAX, DenseMatrix<float>, DenseMatrix<float>>;
template struct Backward<::NN::Pooling::MAX, DenseMatrix<double>, DenseMatrix<double>>;
}
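
There is no dedicated test for the pooling backward kernel in this diff; the following is a hedged usage sketch in the style of the existing DNN tests (it reuses genGivenVals, DenseMatrix, and setupContextAndLogger from the test utilities shown elsewhere in this PR; shapes and values are illustrative assumptions):

// Illustrative only: max-pooling backward on a 1x1x4x4 input, 2x2 window, stride 2
// (not a test contained in this PR).
auto dctx = setupContextAndLogger();
using DT = DenseMatrix<float>;
auto in   = genGivenVals<DT>(1, {  1,  2,  3,  4,
                                   5,  6,  7,  8,
                                   9, 10, 11, 12,
                                  13, 14, 15, 16 });
auto out  = genGivenVals<DT>(1, { 6, 8, 14, 16 });  // forward max-pool result
auto dOut = genGivenVals<DT>(1, { 1, 1, 1, 1 });    // upstream gradient
DT* dIn = nullptr;
CUDA::NN::Pooling::Backward<::NN::Pooling::MAX, DT, DT>::apply(
        dIn, in, out, dOut,
        /*batch_size=*/1, /*num_channels=*/1, /*img_h=*/4, /*img_w=*/4,
        /*pool_h=*/2, /*pool_w=*/2, /*stride_h=*/2, /*stride_w=*/2,
        /*pad_h=*/0, /*pad_w=*/0, dctx.get());
// dIn is 1 x 16; the gradient is routed only to the max positions (6, 8, 14, 16).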

12 changes: 12 additions & 0 deletions src/runtime/local/kernels/CUDA/Pooling.h
@@ -35,4 +35,16 @@ namespace CUDA::NN::Pooling {
size_t pool_h, size_t pool_w, size_t stride_h, size_t stride_w,
size_t pad_h, size_t pad_w, DCTX(dctx));
};

template<template<typename> class OP, typename DTRes, typename DTArg>
struct Backward {
static void apply(DTRes *&res,
const DTArg *input, const DTArg *output, const DTArg *dOut,
const size_t batch_size, const size_t num_channels,
const size_t img_h, const size_t img_w,
const size_t pool_h, const size_t pool_w,
const size_t stride_h, const size_t stride_w,
const size_t pad_h, const size_t pad_w,
DCTX(dctx));
};
}
172 changes: 172 additions & 0 deletions test/runtime/local/kernels/DNNBatchNorm2DBackwardTest.cpp
@@ -0,0 +1,172 @@
/*
* Copyright 2021 The DAPHNE Consortium
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifdef USE_CUDA
#include "run_tests.h"

#include <api/cli/DaphneUserConfig.h>
#include <runtime/local/datagen/GenGivenVals.h>
#include <runtime/local/datastructures/DenseMatrix.h>
#include "runtime/local/kernels/CUDA/BatchNorm.h"
#include <runtime/local/kernels/CheckEq.h>
#include <runtime/local/kernels/CheckEqApprox.h>

#include <cassert>
#include <catch.hpp>
#include <tags.h>

template<class DT>
void checkBatchNorm2DBackwardCUDA(const DT* in, const DT* dOut, const DT* gamma, const DT* mean, const DT* invVar, const DT* exp1,
const DT* exp2, const DT* exp3, DaphneContext* dctx) {
DT* dX = nullptr;
DT* dGamma = nullptr;
DT* dBeta = nullptr;

typename DT::VT epsilon = 1e-5;
CUDA::BatchNorm::Backward<DT, DT>::apply(dX, dGamma, dBeta, mean, invVar, in, dOut, gamma, epsilon, dctx);
CHECK(checkEqApprox(dX, exp1, 1e-5, nullptr));
CHECK(checkEqApprox(dGamma, exp2, 1e-4, nullptr));
CHECK(checkEqApprox(dBeta, exp3, 1e-5, nullptr));
}

TEMPLATE_PRODUCT_TEST_CASE("batch_norm_bwd_cuda", TAG_DNN, (DenseMatrix), (float, double)) { // NOLINT(cert-err58-cpp)
auto dctx = setupContextAndLogger();
using DT = TestType;

auto in = genGivenVals<DT>(2, { 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,

1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12});

auto dOut = genGivenVals<DT>(2, { 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,

1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12});

auto gamma = genGivenVals<DT>(3, { 1, 1, 1 });
auto mean = genGivenVals<DT>(3, { 2.5, 6.5, 10.5 });
auto invVar = genGivenVals<DT>(3, { 1 / std::sqrt(1.25 + 1e-5), 1 / std::sqrt(1.25 + 1e-5), 1 / std::sqrt(1.25 + 1e-5) });
auto res1 = genGivenVals<DT>(2, {-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05});
auto res2 = genGivenVals<DT>(3, {8.9442, 8.9442, 8.9442 });
auto res3 = genGivenVals<DT>(3, {20, 52, 84 });

checkBatchNorm2DBackwardCUDA(in, dOut, gamma, mean, invVar, res1, res2, res3, dctx.get());

DataObjectFactory::destroy(in);
DataObjectFactory::destroy(dOut);
DataObjectFactory::destroy(gamma);
DataObjectFactory::destroy(mean);
DataObjectFactory::destroy(invVar);
DataObjectFactory::destroy(res1);
DataObjectFactory::destroy(res2);
DataObjectFactory::destroy(res3);
}
#endif // USE_CUDA

#include "run_tests.h"

#include <api/cli/DaphneUserConfig.h>
#include <runtime/local/datagen/GenGivenVals.h>
#include <runtime/local/datastructures/DenseMatrix.h>
#include "runtime/local/kernels/BatchNorm2DBackward.h"
#include "runtime/local/kernels/CUDA/BatchNorm.h"
#include <runtime/local/kernels/CheckEq.h>
#include <runtime/local/kernels/CheckEqApprox.h>

#include <cassert>
#include <catch.hpp>
#include <tags.h>

#include <iostream>

template<class DT>
void checkBatchNorm2DBackward(const DT* in, const DT* dOut, const DT* gamma, const DT* mean, const DT* invVar, const DT* exp1,
const DT* exp2, const DT* exp3, DaphneContext* dctx)
{
DT* dX = nullptr;
DT* dGamma = nullptr;
DT* dBeta = nullptr;

typename DT::VT epsilon = 1e-5;
BatchNorm2DBackward<DT, DT>::apply(dX, dGamma, dBeta, mean, invVar, in, dOut, gamma, epsilon, dctx);

CHECK(checkEqApprox(dX, exp1, 1e-5, nullptr));
CHECK(checkEqApprox(dGamma, exp2, 1e-4, nullptr));
CHECK(checkEqApprox(dBeta, exp3, 1e-5, nullptr));
}

TEMPLATE_PRODUCT_TEST_CASE("batch_norm_bwd", TAG_DNN, (DenseMatrix), (float, double)) { // NOLINT(cert-err58-cpp)
auto dctx = setupContextAndLogger();
using DT = TestType;

auto in = genGivenVals<DT>(2, { 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,

1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12});

auto dOut = genGivenVals<DT>(2, { 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,

1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12});

auto gamma = genGivenVals<DT>(3, { 1, 1, 1 });
auto mean = genGivenVals<DT>(3, { 2.5, 6.5, 10.5 });
auto invVar = genGivenVals<DT>(3, { 1 / std::sqrt(1.25 + 1e-5), 1 / std::sqrt(1.25 + 1e-5), 1 / std::sqrt(1.25 + 1e-5) });
auto res1 = genGivenVals<DT>(2, {-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05});
auto res2 = genGivenVals<DT>(3, {8.9442, 8.9442, 8.9442 });
auto res3 = genGivenVals<DT>(3, {20, 52, 84 });

checkBatchNorm2DBackward(in, dOut, gamma, mean, invVar, res1, res2, res3, dctx.get());

DataObjectFactory::destroy(in);
DataObjectFactory::destroy(dOut);
DataObjectFactory::destroy(gamma);
DataObjectFactory::destroy(mean);
DataObjectFactory::destroy(invVar);
DataObjectFactory::destroy(res1);
DataObjectFactory::destroy(res2);
DataObjectFactory::destroy(res3);
}
2 changes: 1 addition & 1 deletion test/runtime/local/kernels/DNNConvolutionTest.cpp
@@ -37,7 +37,7 @@ void check(const DT* in, const DT* filter, const DT* exp, DaphneContext* dctx) {
CHECK(*res == *exp);
}

TEMPLATE_PRODUCT_TEST_CASE("conv_fwd", TAG_DNN, (DenseMatrix), (float, double)) { // NOLINT(cert-err58-cpp)
TEMPLATE_PRODUCT_TEST_CASE("conv_fwd_cuda", TAG_DNN, (DenseMatrix), (float, double)) { // NOLINT(cert-err58-cpp)
auto dctx = setupContextAndLogger();
using DT = TestType;
