cavis/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu

276 lines
13 KiB
Plaintext
Raw Normal View History

cuDNN integration (#150) * initial commit Signed-off-by: raver119 <raver119@gmail.com> * one file Signed-off-by: raver119 <raver119@gmail.com> * few more includes Signed-off-by: raver119 <raver119@gmail.com> * m? Signed-off-by: raver119 <raver119@gmail.com> * const Signed-off-by: raver119 <raver119@gmail.com> * cudnn linkage in tests Signed-off-by: raver119 <raver119@gmail.com> * culibos Signed-off-by: raver119 <raver119@gmail.com> * static reminder Signed-off-by: raver119 <raver119@gmail.com> * platform engine tag Signed-off-by: raver119 <raver119@gmail.com> * HAVE_CUDNN moved to config.h.in Signed-off-by: raver119 <raver119@gmail.com> * include Signed-off-by: raver119 <raver119@gmail.com> * include Signed-off-by: raver119 <raver119@gmail.com> * skip cudnn handle creation if there's not cudnn Signed-off-by: raver119 <raver119@gmail.com> * meh Signed-off-by: raver119 <raver119@gmail.com> * target device in context Signed-off-by: raver119 <raver119@gmail.com> * platform engines Signed-off-by: raver119 <raver119@gmail.com> * platform engines Signed-off-by: raver119 <raver119@gmail.com> * allow multiple -h args Signed-off-by: raver119 <raver119@gmail.com> * allow multiple -h args Signed-off-by: raver119 <raver119@gmail.com> * move mkldnn out of CPU block Signed-off-by: raver119 <raver119@gmail.com> * link to mkldnn on cuda Signed-off-by: raver119 <raver119@gmail.com> * less prints Signed-off-by: raver119 <raver119@gmail.com> * minor tweaks Signed-off-by: raver119 <raver119@gmail.com> * next step Signed-off-by: raver119 <raver119@gmail.com> * conv2d NCHW draft Signed-off-by: raver119 <raver119@gmail.com> * conv2d biasAdd Signed-off-by: raver119 <raver119@gmail.com> * test for MKL/CUDNN combined use Signed-off-by: raver119 <raver119@gmail.com> * - provide additional code for conv2d ff based on cudnn api, not tested yet Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on conv2d helper based on using cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - fixing several cuda bugs which appeared after cudnn lib had been started to use Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation of conv2d backprop op based on cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - implementaion of conv3d and conv3d_bp ops based on cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - bugs fixing in conv3d/conv3d_bp ops (cudnn in use) Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation of depthwiseConv2d (ff/bp) op based on cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation of batchnorm ff op based on cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - disable cudnn batchnorm temporary Signed-off-by: Yurii <iuriish@yahoo.com> * - add minor change in cmake Signed-off-by: Yurii <iuriish@yahoo.com> * engine for depthwise mkldnn Signed-off-by: raver119 <raver119@gmail.com> * couple of includes Signed-off-by: raver119 <raver119@gmail.com> * - provide permutation to cudnn batchnorm ff when format is NHWC Signed-off-by: Yurii <iuriish@yahoo.com> * lgamma fix Signed-off-by: raver119 <raver119@gmail.com> * - eliminate memory leak in two tests Signed-off-by: Yurii <iuriish@yahoo.com> Co-authored-by: Yurii Shyrma <iuriish@yahoo.com>
2020-01-20 19:32:46 +01:00
/*******************************************************************************
* Copyright (c) 2019 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author Yurii Shyrma (iuriish@yahoo.com)
//
#include "cudnnUtils.h"
#include <ops/declarable/helpers/convolutions.h>
namespace nd4j {
namespace ops {
namespace platforms {
//////////////////////////////////////////////////////////////////////////
static void batchnormCUDNN(const LaunchContext* context,
const NDArray* input, const NDArray* mean, const NDArray* variance,
const NDArray* gamma, const NDArray* beta,
NDArray* output,
const double epsilon, const bool isSpatialMode) {
// input, output -> 4D:nchw, 5D:ncdhw
// mean, variance, gamma, beta -> 1xCx1x1 for 4D and 1xCx1x1x1 for 5D for BATCHNORM_MODE_SPATIAL mode
// -> 1xCxHxW for 4D and 1xCxDxHxW for 5D for BATCHNORM_MODE_PER_ACTIVATION mode
const cudnnDataType_t dataType = cudnnDataType(input->dataType());
const int xRank = input->rankOf();
auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err);
const std::vector<int> xShape = input->getShapeAsVectorInt(); // input and output have same shapes
std::vector<int> paramsShape, paramsStrides; // mean, variance, gamma and beta have same shapes
if(isSpatialMode) { // 1xCx1x1
const int iC = mean->lengthOf();
const int stride0 = mean->strideAt(0);
paramsShape = xRank == 4 ? std::vector<int>({1, iC, 1, 1}) : std::vector<int>({1, iC, 1, 1, 1});
paramsStrides = xRank == 4 ? std::vector<int>({iC*stride0, stride0, 1, 1}) : std::vector<int>({iC*stride0, stride0, 1, 1, 1});
}
else {
paramsShape = mean->getShapeAsVectorInt();
paramsStrides = xRank == 4 ? std::vector<int>({(int)mean->strideAt(0), (int)mean->strideAt(1), (int)mean->strideAt(2), (int)mean->strideAt(3)}) : std::vector<int>({(int)mean->strideAt(0), (int)mean->strideAt(1), (int)mean->strideAt(2), (int)mean->strideAt(3), (int)mean->strideAt(4)});
}
std::vector<int> xStrides = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3)};
std::vector<int> zStrides = {(int)output->strideAt(0), (int)output->strideAt(1), (int)output->strideAt(2), (int)output->strideAt(3)};
if(xRank > 4) { // 5D
xStrides.push_back((int)input->strideAt(4));
zStrides.push_back((int)output->strideAt(4));
}
cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
// input descriptor
cudnnTensorDescriptor_t x;
cudnnCreateTensorDescriptor(&x);
if(input->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(x, format, dataType, xRank, xShape.data());
else
err = cudnnSetTensorNdDescriptor(x, dataType, xRank, xShape.data(), xStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err);
// output descriptor
cudnnTensorDescriptor_t z;
cudnnCreateTensorDescriptor(&z);
if(output->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(z, format, dataType, xRank, xShape.data());
else
err = cudnnSetTensorNdDescriptor(z, dataType, xRank, xShape.data(), zStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err);
// mean, variance, gamma and beta descriptor, the same descriptor for all of them
cudnnTensorDescriptor_t params;
cudnnCreateTensorDescriptor(&params);
if(mean->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(params, format, dataType, xRank, paramsShape.data());
else
err = cudnnSetTensorNdDescriptor(params, dataType, xRank, paramsShape.data(), paramsStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for mean/variance/gamma/beta failed", err);
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetConvolutionNdDescriptor failed", err);
// provide scaling parameters
const float alpha32(1), beta32(0);
const double alpha64(1), beta64(0);
const void* ptrAlpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
const void* ptrBeta = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
NDArray::prepareSpecialUse({output}, {input, mean, variance, gamma, beta});
// calculations
err = cudnnBatchNormalizationForwardInference(*handle, isSpatialMode ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION,
ptrAlpha, ptrBeta,
x, input->getSpecialBuffer(),
z, output->getSpecialBuffer(),
params,
gamma ? gamma->getSpecialBuffer(): nullptr,
beta ? beta->getSpecialBuffer() : nullptr,
mean->getSpecialBuffer(), variance->getSpecialBuffer(), epsilon);
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnBatchNormalizationForwardInference failed", err);
// cudaErr = cudaStreamSynchronize(*context->getCudaStream());
// if (cudaErr != 0)
// throw cuda_exception::build("batchnormCUDNN: cudaStreamSynchronize failed !", cudaErr);
NDArray::registerSpecialUse({output}, {input, mean, variance, gamma, beta});
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(batchnorm, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0);
auto mean = INPUT_VARIABLE(1);
auto variance = INPUT_VARIABLE(2);
NDArray* gamma = nullptr;
NDArray* beta = nullptr;
auto output = OUTPUT_VARIABLE(0);
const bool applyScale = (bool)INT_ARG(0);
const bool applyOffset = (bool)INT_ARG(1);
const double epsilon = T_ARG(0);
if(applyScale)
gamma = INPUT_VARIABLE(3);
if(applyOffset)
beta = INPUT_VARIABLE(3 + (int)applyScale);
const int numOfIntArgs = block.getIArguments()->size();
const int inRank = input->rankOf();
// get axes args to normalize input array over
std::vector<int> axes;
if(numOfIntArgs > 2)
for(int i = 2; i < numOfIntArgs; ++i)
axes.push_back(INT_ARG(i));
else
axes.push_back(inRank-1); // default dimension to reduce along is last dimension
const int numOfAxes = axes.size();
REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM CUDNN op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
// evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
// for example if input shape is {2,3,4,5,6} and axes = {1,3}, then expected shape would be {1,3,1,5,1}, and if axes = {3}, then expected shape would be {5}
std::vector<Nd4jLong> expShape;
if(numOfAxes == 1)
expShape.push_back(input->sizeAt(axes[0]));
else { // get, for example, something like {1, inputDim1, 1, inputDim3, 1} if axes = {1, 3}
expShape = std::vector<Nd4jLong>(inRank, 1);
for(uint i = 0; i < numOfAxes; ++i)
expShape[axes[i]] = input->sizeAt(axes[i]);
}
REQUIRE_TRUE(mean->isSameShape(expShape) , 0, "BATCHNORM CUDNN op: wrong shape of mean array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(mean).c_str());
REQUIRE_TRUE(variance->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of variance array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(variance).c_str());
if(gamma)
REQUIRE_TRUE(gamma->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of gamma array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(gamma).c_str());
if(beta)
REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str());
// types of all input arrays should be the same
for(int i = 1; i < block.width(); ++i)
REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM CUDNN op: types of all input arrays should be the same !");
// cudnn supports NCHW format only
const bool needPermut = axes.size() == 1 && mean->lengthOf() == input->sizeAt(-1);
if(needPermut) { // if NHWC
std::vector<int> perm = {0, 3, 1, 2}; // NHWC -> NCHW
input = new NDArray(input->permute(perm));
output = new NDArray(output->permute(perm));
}
// calculations
batchnormCUDNN(block.launchContext(), input, mean, variance, gamma, beta, output, epsilon, axes.size() == 1);
if(needPermut) {
delete input;
delete output;
}
return Status::OK();
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(batchnorm, ENGINE_CUDA) {
const bool applyScale = (bool)INT_ARG(0);
const bool applyOffset = (bool)INT_ARG(1);
NDArray* input = INPUT_VARIABLE(0);
NDArray* mean = INPUT_VARIABLE(1);
NDArray* variance = INPUT_VARIABLE(2);
NDArray* gamma = applyScale ? INPUT_VARIABLE(3) : nullptr;
NDArray* beta = applyOffset ? INPUT_VARIABLE(3 + (int)applyScale) : nullptr;
const int numOfIntArgs = block.getIArguments()->size();
const int xRank = input->rankOf();
// disable cudnn batchnorm so far
return false;
// *********************************** //
if(xRank != 4 && xRank != 5)
return false;
// *********************************** //
const bool badType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
if(badType)
return false;
// *********************************** //
// get axes args to normalize input array over
std::vector<int> axes;
if(numOfIntArgs > 2)
for(int i = 2; i < numOfIntArgs; ++i)
axes.push_back(INT_ARG(i));
else
axes.push_back(xRank-1); // default dimension to reduce along is last dimension
if(axes.size() != 1 && axes.size() != 3 && axes.size() != 4)
return false;
// *********************************** //
bool allParamsHaveSameShapeAndStrides = shape::haveSameShapeAndStrides(mean->getShapeInfo(), variance->getShapeInfo());
if(gamma)
allParamsHaveSameShapeAndStrides &= shape::haveSameShapeAndStrides(mean->getShapeInfo(), gamma->getShapeInfo());
if(beta)
allParamsHaveSameShapeAndStrides &= shape::haveSameShapeAndStrides(mean->getShapeInfo(), beta->getShapeInfo());
if(!allParamsHaveSameShapeAndStrides)
return false;
// *********************************** //
bool isFormatGood = false;
if(axes.size() == 1)
isFormatGood = mean->lengthOf() == input->sizeAt(1) || mean->lengthOf() == input->sizeAt(-1); // mean [C]
else {
auto inputShapeModif = input->getShapeAsVector(); // [dim0,dim1,dim2,dim3] 4D or [dim0,dim1,dim2,dim3,dim4]
inputShapeModif[0] = 1;
isFormatGood = mean->isSameShape(inputShapeModif); // mean [1,dim1,dim2,dim3] 4D or [1,dim1,dim2,dim3,dim4]
}
if(!isFormatGood)
return false;
return true;
}
}
}
}