cuDNN-based DNN ops #815

Open · wants to merge 4 commits into base: main
4 changes: 4 additions & 0 deletions src/runtime/local/context/CUDAContext.cpp
@@ -61,6 +61,10 @@ void CUDAContext::init() {
CHECK_CUDNN(cudnnCreateActivationDescriptor(&activation_desc));
CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
CHECK_CUDNN(cudnnCreateFilterDescriptor(&filter_desc));

CHECK_CUDNN(cudnnCreateTensorDescriptor(&dy_tensor_desc));
CHECK_CUDNN(cudnnCreateTensorDescriptor(&bn_scale_bias_tensor_desc));

CHECK_CUSOLVER(cusolverDnCreate(&cusolver_handle));

CHECK_CUDART(cudaStreamCreateWithFlags(&cusolver_stream, cudaStreamNonBlocking));
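
The hunk above only adds the create calls for the two new tensor descriptors. As a hedged reminder (the teardown code is not part of this diff), each create needs a matching cudnnDestroyTensorDescriptor in the context's cleanup path, mirroring the existing descriptors; the exact placement below is an assumption:

// Assumed counterpart to the create calls above; the location in CUDAContext's
// teardown routine is not shown in this diff.
CHECK_CUDNN(cudnnDestroyTensorDescriptor(dy_tensor_desc));
CHECK_CUDNN(cudnnDestroyTensorDescriptor(bn_scale_bias_tensor_desc));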
2 changes: 2 additions & 0 deletions src/runtime/local/context/CUDAContext.h
@@ -104,6 +104,8 @@ class CUDAContext final : public IContext {
cudnnConvolutionDescriptor_t conv_desc{};
cudnnBatchNormMode_t bn_mode = CUDNN_BATCHNORM_SPATIAL;

cudnnTensorDescriptor_t dy_tensor_desc{}, bn_scale_bias_tensor_desc{};

// A block size of 256 works well in many cases.
// Putting it here to avoid hard coding things elsewhere.
const uint32_t default_block_size = 256;
60 changes: 60 additions & 0 deletions src/runtime/local/kernels/CUDA/BatchNorm.cpp
@@ -16,6 +16,7 @@

#include "BatchNorm.h"
#include <runtime/local/datastructures/AllocationDescriptorCUDA.h>
#include <iostream>

namespace CUDA::BatchNorm {
template<typename DTRes, typename DTArg>
@@ -52,7 +53,66 @@ namespace CUDA::BatchNorm {
d_gamma, d_beta, d_ema_mean, d_ema_var, eps));
}

template<typename DTRes, typename DTArg>
void Backward<DTRes, DTArg>::apply(DTRes *&dX, DTRes *&dGamma, DTRes *&dBeta,
const DTArg *mean, const DTArg *invVar,
const DTArg *in, const DTArg *dout,
const DTArg *gamma, const typename DTArg::VT eps, DCTX(dctx))
{
const size_t deviceID = 0; //ToDo: multi device support
auto ctx = CUDAContext::get(dctx, deviceID);
AllocationDescriptorCUDA alloc_desc(dctx, deviceID);
using VT = typename DTRes::VT;
const size_t N = in->getNumRows();
const size_t CHW = in->getNumCols();
const size_t C = gamma->getNumRows();
const size_t HW = CHW / C;
// deriving H from sqrt(HW) assumes square feature maps (H == W)
auto H = static_cast<size_t>(std::sqrt(HW));

VT alphaDataDiff = 1.0;
VT betaDataDiff = 0.0;
VT alphaParamDiff = 1.0;
VT betaParamDiff = 0.0;

const VT* d_mean = mean->getValues(&alloc_desc);
const VT* d_invVar = invVar->getValues(&alloc_desc);
const VT* d_in = in->getValues(&alloc_desc);
const VT* d_gamma = gamma->getValues(&alloc_desc);
const VT* d_dout = dout->getValues(&alloc_desc);

CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->src_tensor_desc, ctx->tensor_format, ctx->getCUDNNDataType<VT>(), N, C, H, H));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->dy_tensor_desc, ctx->tensor_format, ctx->getCUDNNDataType<VT>(), N, C, H, H));

CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->dst_tensor_desc, ctx->tensor_format, ctx->getCUDNNDataType<VT>(), N, C, H, H));
CHECK_CUDNN(cudnnDeriveBNTensorDescriptor(ctx->bn_scale_bias_tensor_desc, ctx->src_tensor_desc, ctx->bn_mode));

if (dX == nullptr)
dX = DataObjectFactory::create<DenseMatrix<VT>>(N, CHW, false, &alloc_desc);
if (dGamma == nullptr)
dGamma = DataObjectFactory::create<DenseMatrix<VT>>(C, 1, false, &alloc_desc);
if (dBeta == nullptr)
dBeta = DataObjectFactory::create<DenseMatrix<VT>>(C, 1, false, &alloc_desc);

VT* d_dX = dX->getValues(&alloc_desc);
VT* d_dGamma = dGamma->getValues(&alloc_desc);
VT* d_dBeta = dBeta->getValues(&alloc_desc);

CHECK_CUDNN(cudnnBatchNormalizationBackward(ctx->getCUDNNHandle(),
ctx->bn_mode,
&alphaDataDiff, &betaDataDiff, &alphaParamDiff, &betaParamDiff,
ctx->src_tensor_desc, d_in,
ctx->dy_tensor_desc, d_dout,
ctx->dst_tensor_desc, d_dX,
ctx->bn_scale_bias_tensor_desc, d_gamma, d_dGamma, d_dBeta,
eps,
d_mean, d_invVar));

}

template struct Forward<DenseMatrix<float>, DenseMatrix<float>>;
template struct Forward<DenseMatrix<double>, DenseMatrix<double>>;

template struct Backward<DenseMatrix<float>, DenseMatrix<float>>;
template struct Backward<DenseMatrix<double>, DenseMatrix<double>>;
}
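
For orientation, the quantities returned by cudnnBatchNormalizationBackward in CUDNN_BATCHNORM_SPATIAL mode can be written as plain per-channel reductions over the N*H*W elements of each channel. The following CPU sketch is illustrative only (not part of the PR) and uses the same assumptions as the kernel above: row-major N x (C*H*W) layout and invVar = 1/sqrt(var + eps).

#include <cstddef>
#include <vector>

// Illustrative CPU reference for spatial batch-norm backward (not part of this PR).
// x, dout: N x (C*HW) row-major; mean, invVar, gamma: one value per channel.
void batchNormBackwardRef(const std::vector<double> &x, const std::vector<double> &dout,
                          const std::vector<double> &mean, const std::vector<double> &invVar,
                          const std::vector<double> &gamma,
                          size_t N, size_t C, size_t HW,
                          std::vector<double> &dX, std::vector<double> &dGamma,
                          std::vector<double> &dBeta) {
    dX.assign(N * C * HW, 0.0);
    dGamma.assign(C, 0.0);
    dBeta.assign(C, 0.0);
    const double M = static_cast<double>(N * HW); // elements per channel
    for (size_t c = 0; c < C; ++c) {
        // dBeta_c = sum(dout), dGamma_c = sum(dout * xhat), with xhat = (x - mean_c) * invVar_c
        for (size_t n = 0; n < N; ++n)
            for (size_t i = 0; i < HW; ++i) {
                const size_t idx = n * C * HW + c * HW + i;
                const double xhat = (x[idx] - mean[c]) * invVar[c];
                dBeta[c] += dout[idx];
                dGamma[c] += dout[idx] * xhat;
            }
        // dX = gamma_c * invVar_c / M * (M * dout - dBeta_c - xhat * dGamma_c)
        for (size_t n = 0; n < N; ++n)
            for (size_t i = 0; i < HW; ++i) {
                const size_t idx = n * C * HW + c * HW + i;
                const double xhat = (x[idx] - mean[c]) * invVar[c];
                dX[idx] = gamma[c] * invVar[c] / M * (M * dout[idx] - dBeta[c] - xhat * dGamma[c]);
            }
    }
}

This is consistent with the expected values in the new test below: with gamma = 1, dBeta per channel is the plain sum of dout (20, 52, 84), and dGamma per channel is close to 8.9442 (= 10 / sqrt(1.25)).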

9 changes: 9 additions & 0 deletions src/runtime/local/kernels/CUDA/BatchNorm.h
@@ -23,9 +23,18 @@
#include "HostUtils.h"

namespace CUDA::BatchNorm {

template<typename DTRes, typename DTArg>
struct Forward {
static void apply(DTRes *&res, const DTArg *data, const DTArg *gamma, const DTArg *beta, const DTArg *ema_mean,
const DTArg *ema_var, typename DTArg::VT eps, DCTX(dctx));
};

template<typename DTRes, typename DTArg>
struct Backward {
static void apply(DTRes *&dX, DTRes *&dGamma, DTRes *&dBeta,
const DTArg *mean, const DTArg *invVar,
const DTArg *in, const DTArg *dout,
const DTArg *gamma, const typename DTArg::VT eps, DCTX(dctx));
};
}
70 changes: 70 additions & 0 deletions src/runtime/local/kernels/CUDA/Pooling.cpp
@@ -61,10 +61,80 @@ namespace CUDA::NN::Pooling {
d_input, &blend_beta, ctx->dst_tensor_desc, d_res));
}

template<template<typename> class OP, typename DTRes, typename DTArg>
void Backward<OP, DTRes, DTArg>::apply(DTRes *&res,
const DTArg *input, const DTArg *output, const DTArg *dOut,
const size_t batch_size, const size_t num_channels,
const size_t img_h, const size_t img_w,
const size_t pool_h, const size_t pool_w,
const size_t stride_h, const size_t stride_w,
const size_t pad_h, const size_t pad_w,
DCTX(dctx))
{
const size_t deviceID = 0; //ToDo: multi device support
auto ctx = CUDAContext::get(dctx, deviceID);
AllocationDescriptorCUDA alloc_desc(dctx, deviceID);

using VT = typename DTRes::VT;
const VT blend_alpha = 1;
const VT blend_beta = 0;

CHECK_CUDNN(cudnnSetPooling2dDescriptor(ctx->pooling_desc,
OP<VT>::isMAX() ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING,
CUDNN_PROPAGATE_NAN,
pool_h, pool_w,
pad_h, pad_w,
stride_h, stride_w));

const VT* d_input = input->getValues(&alloc_desc);
CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->src_tensor_desc,
ctx->tensor_format,
ctx->getCUDNNDataType<VT>(),
batch_size, num_channels, img_h, img_w));

const int tensorDims = 4;
int tensorOutputDimA[tensorDims];
CHECK_CUDNN(cudnnGetPoolingNdForwardOutputDim(ctx->pooling_desc, ctx->src_tensor_desc, tensorDims,
tensorOutputDimA));

int n = tensorOutputDimA[0]; int c = tensorOutputDimA[1];
int h = tensorOutputDimA[2]; int w = tensorOutputDimA[3];

const VT* d_output = output->getValues(&alloc_desc);
CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->dst_tensor_desc,
ctx->tensor_format,
ctx->getCUDNNDataType<VT>(),
n, c, h, w));

const VT* d_dOut = dOut->getValues(&alloc_desc);
CHECK_CUDNN(cudnnSetTensor4dDescriptor(ctx->dy_tensor_desc,
ctx->tensor_format,
ctx->getCUDNNDataType<VT>(),
n, c, h, w));

if (res == nullptr) {
res = DataObjectFactory::create<DTRes>(batch_size, num_channels * img_h * img_w, false, &alloc_desc);
}
VT* d_res = res->getValues(&alloc_desc);

CHECK_CUDNN(cudnnPoolingBackward(ctx->getCUDNNHandle(),
ctx->pooling_desc, &blend_alpha,
ctx->dst_tensor_desc, d_output,
ctx->dy_tensor_desc, d_dOut,
ctx->src_tensor_desc, d_input,
&blend_beta, ctx->src_tensor_desc, d_res));
}

template struct Forward<::NN::Pooling::AVG, DenseMatrix<float>, DenseMatrix<float>>;
template struct Forward<::NN::Pooling::AVG, DenseMatrix<double>, DenseMatrix<double>>;

template struct Forward<::NN::Pooling::MAX, DenseMatrix<float>, DenseMatrix<float>>;
template struct Forward<::NN::Pooling::MAX, DenseMatrix<double>, DenseMatrix<double>>;

template struct Backward<::NN::Pooling::AVG, DenseMatrix<float>, DenseMatrix<float>>;
template struct Backward<::NN::Pooling::AVG, DenseMatrix<double>, DenseMatrix<double>>;

template struct Backward<::NN::Pooling::MAX, DenseMatrix<float>, DenseMatrix<float>>;
template struct Backward<::NN::Pooling::MAX, DenseMatrix<double>, DenseMatrix<double>>;
}
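
There is no dedicated test for the pooling backward kernel in this diff; the following is a hedged usage sketch in the style of the existing DNN tests (it reuses genGivenVals, DenseMatrix, and setupContextAndLogger from the test utilities shown elsewhere in this PR; shapes and values are illustrative assumptions):

// Illustrative only: max-pooling backward on a 1x1x4x4 input, 2x2 window, stride 2
// (not a test contained in this PR).
auto dctx = setupContextAndLogger();
using DT = DenseMatrix<float>;
auto in   = genGivenVals<DT>(1, {  1,  2,  3,  4,
                                   5,  6,  7,  8,
                                   9, 10, 11, 12,
                                  13, 14, 15, 16 });
auto out  = genGivenVals<DT>(1, { 6, 8, 14, 16 });  // forward max-pool result
auto dOut = genGivenVals<DT>(1, { 1, 1, 1, 1 });    // upstream gradient
DT* dIn = nullptr;
CUDA::NN::Pooling::Backward<::NN::Pooling::MAX, DT, DT>::apply(
        dIn, in, out, dOut,
        /*batch_size=*/1, /*num_channels=*/1, /*img_h=*/4, /*img_w=*/4,
        /*pool_h=*/2, /*pool_w=*/2, /*stride_h=*/2, /*stride_w=*/2,
        /*pad_h=*/0, /*pad_w=*/0, dctx.get());
// dIn is 1 x 16; the gradient is routed only to the max positions (6, 8, 14, 16).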

12 changes: 12 additions & 0 deletions src/runtime/local/kernels/CUDA/Pooling.h
@@ -35,4 +35,16 @@ namespace CUDA::NN::Pooling {
size_t pool_h, size_t pool_w, size_t stride_h, size_t stride_w,
size_t pad_h, size_t pad_w, DCTX(dctx));
};

template<template<typename> class OP, typename DTRes, typename DTArg>
struct Backward {
static void apply(DTRes *&res,
const DTArg *input, const DTArg *output, const DTArg *dOut,
const size_t batch_size, const size_t num_channels,
const size_t img_h, const size_t img_w,
const size_t pool_h, const size_t pool_w,
const size_t stride_h, const size_t stride_w,
const size_t pad_h, const size_t pad_w,
DCTX(dctx));
};
}
172 changes: 172 additions & 0 deletions test/runtime/local/kernels/DNNBatchNorm2DBackwardTest.cpp
@@ -0,0 +1,172 @@
/*
* Copyright 2021 The DAPHNE Consortium
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifdef USE_CUDA
#include "run_tests.h"

#include <api/cli/DaphneUserConfig.h>
#include <runtime/local/datagen/GenGivenVals.h>
#include <runtime/local/datastructures/DenseMatrix.h>
#include "runtime/local/kernels/CUDA/BatchNorm.h"
#include <runtime/local/kernels/CheckEq.h>
#include <runtime/local/kernels/CheckEqApprox.h>

#include <cassert>
#include <catch.hpp>
#include <tags.h>

template<class DT>
void checkBatchNorm2DBackwardCUDA(const DT* in, const DT* dOut, const DT* gamma, const DT* mean, const DT* invVar, const DT* exp1,
const DT* exp2, const DT* exp3, DaphneContext* dctx) {
DT* dX = nullptr;
DT* dGamma = nullptr;
DT* dBeta = nullptr;

typename DT::VT epsilon = 1e-5;
CUDA::BatchNorm::Backward<DT, DT>::apply(dX, dGamma, dBeta, mean, invVar, in, dOut, gamma, epsilon, dctx);
CHECK(checkEqApprox(dX, exp1, 1e-5, nullptr));
CHECK(checkEqApprox(dGamma, exp2, 1e-4, nullptr));
CHECK(checkEqApprox(dBeta, exp3, 1e-5, nullptr));
}

TEMPLATE_PRODUCT_TEST_CASE("batch_norm_bwd_cuda", TAG_DNN, (DenseMatrix), (float, double)) { // NOLINT(cert-err58-cpp)
auto dctx = setupContextAndLogger();
using DT = TestType;

auto in = genGivenVals<DT>(2, { 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,

1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12});

auto dOut = genGivenVals<DT>(2, { 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,

1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12});

auto gamma = genGivenVals<DT>(3, { 1, 1, 1 });
auto mean = genGivenVals<DT>(3, { 2.5, 6.5, 10.5 });
auto invVar = genGivenVals<DT>(3, { 1 / std::sqrt(1.25 + 1e-5), 1 / std::sqrt(1.25 + 1e-5), 1 / std::sqrt(1.25 + 1e-5) });
auto res1 = genGivenVals<DT>(2, {-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05});
auto res2 = genGivenVals<DT>(3, {8.9442, 8.9442, 8.9442 });
auto res3 = genGivenVals<DT>(3, {20, 52, 84 });

checkBatchNorm2DBackwardCUDA(in, dOut, gamma, mean, invVar, res1, res2, res3, dctx.get());

DataObjectFactory::destroy(in);
DataObjectFactory::destroy(dOut);
DataObjectFactory::destroy(gamma);
DataObjectFactory::destroy(mean);
DataObjectFactory::destroy(invVar);
DataObjectFactory::destroy(res1);
DataObjectFactory::destroy(res2);
DataObjectFactory::destroy(res3);
}
#endif // USE_CUDA

#include "run_tests.h"

#include <api/cli/DaphneUserConfig.h>
#include <runtime/local/datagen/GenGivenVals.h>
#include <runtime/local/datastructures/DenseMatrix.h>
#include "runtime/local/kernels/BatchNorm2DBackward.h"
#include "runtime/local/kernels/CUDA/BatchNorm.h"
#include <runtime/local/kernels/CheckEq.h>
#include <runtime/local/kernels/CheckEqApprox.h>

#include <cassert>
#include <catch.hpp>
#include <tags.h>

#include <iostream>

template<class DT>
void checkBatchNorm2DBackward(const DT* in, const DT* dOut, const DT* gamma, const DT* mean, const DT* invVar, const DT* exp1,
const DT* exp2, const DT* exp3, DaphneContext* dctx)
{
DT* dX = nullptr;
DT* dGamma = nullptr;
DT* dBeta = nullptr;

typename DT::VT epsilon = 1e-5;
BatchNorm2DBackward<DT, DT>::apply(dX, dGamma, dBeta, mean, invVar, in, dOut, gamma, epsilon, dctx);

CHECK(checkEqApprox(dX, exp1, 1e-5, nullptr));
CHECK(checkEqApprox(dGamma, exp2, 1e-4, nullptr));
CHECK(checkEqApprox(dBeta, exp3, 1e-5, nullptr));
}

TEMPLATE_PRODUCT_TEST_CASE("batch_norm_bwd", TAG_DNN, (DenseMatrix), (float, double)) { // NOLINT(cert-err58-cpp)
auto dctx = setupContextAndLogger();
using DT = TestType;

auto in = genGivenVals<DT>(2, { 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,

1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12});

auto dOut = genGivenVals<DT>(2, { 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,

1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12});

auto gamma = genGivenVals<DT>(3, { 1, 1, 1 });
auto mean = genGivenVals<DT>(3, { 2.5, 6.5, 10.5 });
auto invVar = genGivenVals<DT>(3, { 1 / std::sqrt(1.25 + 1e-5), 1 / std::sqrt(1.25 + 1e-5), 1 / std::sqrt(1.25 + 1e-5) });
auto res1 = genGivenVals<DT>(2, {-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05,
-1.0733e-05, -3.5777e-06, 3.5777e-06, 1.0733e-05});
auto res2 = genGivenVals<DT>(3, {8.9442, 8.9442, 8.9442 });
auto res3 = genGivenVals<DT>(3, {20, 52, 84 });

checkBatchNorm2DBackward(in, dOut, gamma, mean, invVar, res1, res2, res3, dctx.get());

DataObjectFactory::destroy(in);
DataObjectFactory::destroy(dOut);
DataObjectFactory::destroy(gamma);
DataObjectFactory::destroy(mean);
DataObjectFactory::destroy(invVar);
DataObjectFactory::destroy(res1);
DataObjectFactory::destroy(res2);
DataObjectFactory::destroy(res3);
}
2 changes: 1 addition & 1 deletion test/runtime/local/kernels/DNNConvolutionTest.cpp
@@ -37,7 +37,7 @@ void check(const DT* in, const DT* filter, const DT* exp, DaphneContext* dctx) {
CHECK(*res == *exp);
}

TEMPLATE_PRODUCT_TEST_CASE("conv_fwd", TAG_DNN, (DenseMatrix), (float, double)) { // NOLINT(cert-err58-cpp)
TEMPLATE_PRODUCT_TEST_CASE("conv_fwd_cuda", TAG_DNN, (DenseMatrix), (float, double)) { // NOLINT(cert-err58-cpp)
auto dctx = setupContextAndLogger();
using DT = TestType;
