Shyrma merge max ind (#443)
* - provide correct possible output types in mergeMaxIndex op
* - cleaning up the unneeded backprop arg in reverse_bp op
* - improve clipByNorm, both ff and bp
* - implementation and testing of clipByAvgNorm_bp op
* - pass biases in any way in dnnl lstm op; they are zeros when the user doesn't provide them to us
* - start working on mkldnn concat op
* - further work on mkldnn concat
* missing declaration fix
* - polishing mkl ops
* - testing and fixing bugs in mkl concat op
* - fix linkage error for windows cuda build
* - further conflict resolution with master
* - fix format tags in mkldnn matmul op
* - provide additional type cast in clip.cu
* - finally the bug in mkldnn tanh_bp was caught

Signed-off-by: Yurii <iuriish@yahoo.com>
Signed-off-by: raver119@gmail.com <raver119@gmail.com>
Co-authored-by: raver119@gmail.com <raver119@gmail.com>
parent 872a511042
commit 76f3553679
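For orientation, the rule that the clipbynorm / clipbyavgnorm pair implements can be summarized with a minimal standalone sketch. This is not the library code; it only mirrors the scaling logic visible in the diff below, where useAverage selects the L2 norm divided by the element count instead of the plain L2 norm.

    #include <cmath>
    #include <vector>

    // Sketch: clip a flat vector so that its (possibly averaged) L2 norm does not exceed clipNorm.
    void clipByNormSketch(std::vector<float>& x, float clipNorm, bool useAverage) {
        float sumSq = 0.f;
        for (float v : x) sumSq += v * v;
        float norm = std::sqrt(sumSq);                          // plain L2 norm
        if (useAverage)
            norm /= static_cast<float>(x.size());               // "average norm" variant used by clipbyavgnorm
        if (norm > clipNorm)
            for (float& v : x) v *= clipNorm / norm;            // rescale so the norm equals clipNorm
    }

In the real ops the same rule is applied either to the whole array or independently to every sub-array taken along the requested dimensions.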
@@ -981,12 +981,12 @@ namespace sd {
         *  these methods suited for FlatBuffers use
         */
         template <typename T>
-        std::vector<T> getBufferAsVector();
+        std::vector<T> getBufferAsVector() const;
         std::vector<Nd4jLong> getShapeAsVector() const;
         std::vector<int> getShapeAsVectorInt() const;
-        std::vector<Nd4jLong> getShapeInfoAsVector();
+        std::vector<Nd4jLong> getShapeInfoAsVector() const;
-        std::vector<int64_t> getShapeInfoAsFlatVector();
+        std::vector<int64_t> getShapeInfoAsFlatVector() const;
-        std::vector<int64_t> getShapeAsFlatVector();
+        std::vector<int64_t> getShapeAsFlatVector() const;

        /**
        *  set new order and shape in case of suitable array length (in-place operation)
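The only change above is const-qualification of the serialization getters, so they can be called through a const reference. A hypothetical usage fragment (the helper name printShape and the variable arr are illustrative, not from the diff; it assumes the NDArray header is included):

    // With the const overloads, read-only helpers can accept const references.
    void printShape(const sd::NDArray& arr) {
        auto shape     = arr.getShapeAsVector();      // dimensions as Nd4jLong
        auto shapeInfo = arr.getShapeInfoAsVector();  // full shape descriptor
        // ... print or log the vectors ...
    }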
@@ -982,16 +982,16 @@ std::string NDArray::asString(Nd4jLong limit) {

 ////////////////////////////////////////////////////////////////////////
 template<typename T>
-std::vector<T> NDArray::getBufferAsVector() {
+std::vector<T> NDArray::getBufferAsVector() const {
     std::vector<T> vector(lengthOf());
     for (Nd4jLong e = 0; e < lengthOf(); e++)
         vector[e] = this->e<T>(e);
     return vector;
 }
-BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT std::vector, NDArray::getBufferAsVector(), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT std::vector, NDArray::getBufferAsVector() const, LIBND4J_TYPES);

 ////////////////////////////////////////////////////////////////////////
-std::vector<int64_t> NDArray::getShapeAsFlatVector() {
+std::vector<int64_t> NDArray::getShapeAsFlatVector() const {
     std::vector<int64_t> vector(this->rankOf());
     for (int e = 0; e < this->rankOf(); e++)
         vector[e] = static_cast<int64_t>(this->sizeAt(e));
@@ -1019,7 +1019,7 @@ std::vector<int> NDArray::getShapeAsVectorInt() const {
 }

 ////////////////////////////////////////////////////////////////////////
-std::vector<int64_t> NDArray::getShapeInfoAsFlatVector() {
+std::vector<int64_t> NDArray::getShapeInfoAsFlatVector() const {
     int magicNumber = shape::shapeInfoLength(this->rankOf());
     std::vector<int64_t> vector(magicNumber);

@@ -1030,7 +1030,7 @@ std::vector<int64_t> NDArray::getShapeInfoAsFlatVector() {
 }

 ////////////////////////////////////////////////////////////////////////
-std::vector<Nd4jLong> NDArray::getShapeInfoAsVector() {
+std::vector<Nd4jLong> NDArray::getShapeInfoAsVector() const {
     int magicNumber = shape::shapeInfoLength(this->rankOf());
     std::vector<Nd4jLong> vector(magicNumber);
     for (int e = 0; e < magicNumber; e++)
@@ -16,6 +16,7 @@

 //
 // @author raver119@gmail.com
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include <system/op_boilerplate.h>
@@ -27,24 +28,58 @@
 namespace sd {
 namespace ops {

+//////////////////////////////////////////////////////////////////////////
 CONFIGURABLE_OP_IMPL(clipbyavgnorm, 1, 1, true, 1, 0) {

     auto input = INPUT_VARIABLE(0);
     auto output = OUTPUT_VARIABLE(0);

     const bool isInplace = block.isInplace();
-    auto ts = NDArrayFactory::create(T_ARG(0), block.launchContext());
+    auto clipNorm = NDArrayFactory::create(T_ARG(0), block.launchContext());

-    helpers::clipByAveraged(block.launchContext(), *input, *output, *block.getIArguments(), ts, isInplace);
+    helpers::clipByNorm(block.launchContext(), *input, *output, *block.getIArguments(), clipNorm, isInplace, true);

     return Status::OK();
 }

 DECLARE_TYPES(clipbyavgnorm) {
     getOpDescriptor()
         ->setAllowedInputTypes(sd::DataType::ANY)
         ->setAllowedOutputTypes({ALL_FLOATS});
 }

+//////////////////////////////////////////////////////////////////////////
+CUSTOM_OP_IMPL(clipbyavgnorm_bp, 2, 1, false, 1, 0) {
+
+    auto input = INPUT_VARIABLE(0);
+    auto gradO = INPUT_VARIABLE(1);
+
+    auto gradI = OUTPUT_VARIABLE(0);
+
+    const auto clipNorm = NDArrayFactory::create(gradI->dataType(), T_ARG(0), block.launchContext());
+
+    helpers::clipByNormBp(block.launchContext(), *input, *gradO, *gradI, *block.getIArguments(), clipNorm, true);
+
+    return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+DECLARE_SHAPE_FN(clipbyavgnorm_bp) {
+
+    Nd4jLong *newShape = nullptr;
+    COPY_SHAPE(inputShape->at(1), newShape);
+
+    return SHAPELIST(CONSTANT(newShape));
+}
+
+
+DECLARE_TYPES(clipbyavgnorm_bp) {
+    getOpDescriptor()
+        ->setAllowedInputTypes(0, DataType::ANY)
+        ->setAllowedInputTypes(1, {ALL_FLOATS})
+        ->setAllowedOutputTypes(0, {ALL_FLOATS});
+}
+
+
 }
 }

@@ -31,10 +31,10 @@ namespace ops {
     auto input = INPUT_VARIABLE(0);
     auto output = OUTPUT_VARIABLE(0);

-    const auto clipNorm = NDArrayFactory::create(input->dataType(), T_ARG(0), block.launchContext());
+    const auto clipNorm = NDArrayFactory::create(output->dataType(), T_ARG(0), block.launchContext());
     const bool isInplace = block.isInplace();

-    helpers::clipByNorm(block.launchContext(), *input, *output, *block.getIArguments(), clipNorm, isInplace);
+    helpers::clipByNorm(block.launchContext(), *input, *output, *block.getIArguments(), clipNorm, isInplace, false);

     return Status::OK();
 }
@@ -45,15 +45,15 @@ namespace ops {
     auto gradO = INPUT_VARIABLE(1);

     auto gradI = OUTPUT_VARIABLE(0);
-    const auto clipNorm = NDArrayFactory::create(T_ARG(0));
+    const auto clipNorm = NDArrayFactory::create(gradI->dataType(), T_ARG(0), block.launchContext());

-    helpers::clipByNormBP(block.launchContext(), *input, *gradO, *gradI, *block.getIArguments(), clipNorm);
+    helpers::clipByNormBp(block.launchContext(), *input, *gradO, *gradI, *block.getIArguments(), clipNorm, false);

     return Status::OK();
 }

 DECLARE_SHAPE_FN(clipbynorm_bp) {
-    auto inShapeInfo = inputShape->at(0);
+    auto inShapeInfo = inputShape->at(1);

     Nd4jLong *newShape = nullptr;
     COPY_SHAPE(inShapeInfo, newShape);

@@ -85,6 +85,7 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) {

     // ******** input validation ******** //
     REQUIRE_TRUE(allOfSameType, 0, "CONCAT op: all of input arrays must have same type !");
+    REQUIRE_TRUE(nonEmptyArrs[0]->dataType() == OUTPUT_VARIABLE(0)->dataType(), 0, "CONCAT op: output array should have the same type as inputs arrays !");
     REQUIRE_TRUE(0 <= axis && (axis < rank || (axis == 0 && rank == 0)), 0, "CONCAT op: input axis must be in range [0, %i], but got %i instead!", rank-1, axis);

     for(int i = 1; i < numOfNonEmptyArrs; ++i)
@@ -46,7 +46,8 @@ DECLARE_SYN(MergeMaxIndex, mergemaxindex);

 DECLARE_TYPES(mergemaxindex) {
     getOpDescriptor()
-        ->setAllowedInputTypes({ALL_INTS, ALL_FLOATS});
+        ->setAllowedInputTypes({ALL_INTS, ALL_FLOATS})
+        ->setAllowedOutputTypes({ALL_INDICES});
 }
 }
 DECLARE_SHAPE_FN(mergemaxindex) {
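Restricting the output to ALL_INDICES matches what the op computes: an element-wise argmax across the input arrays. A minimal sketch of that semantics (plain C++ over flat float vectors, not the helper itself; names are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // For each position e, return the index of the array holding the largest value at e.
    std::vector<int64_t> mergeMaxIndexSketch(const std::vector<std::vector<float>>& arrs) {
        std::vector<int64_t> out(arrs.empty() ? 0 : arrs[0].size(), 0);
        for (std::size_t e = 0; e < out.size(); ++e) {
            float max = arrs[0][e];
            for (std::size_t i = 1; i < arrs.size(); ++i)
                if (arrs[i][e] > max) { max = arrs[i][e]; out[e] = static_cast<int64_t>(i); }
        }
        return out;
    }

Since the result holds array indices rather than values, an integer index type, rather than the input type, is the natural output type.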
@@ -52,7 +52,7 @@ namespace ops {
     else {
         // check the consistency of input dimensions to reverse along
         shape::checkDimensions(input->rankOf(), axis);
-        helpers::reverse(block.launchContext(), input, output, &axis, false);
+        helpers::reverse(block.launchContext(), input, output, &axis);
     }

     return Status::OK();
@@ -85,7 +85,7 @@ namespace ops {
         // check the consistency of input dimensions to reverse along
         shape::checkDimensions(input->rankOf(), axis);
         // we just reverse back original array
-        helpers::reverse(block.launchContext(), eps, output, &axis, false);
+        helpers::reverse(block.launchContext(), eps, output, &axis);
     }

     return Status::OK();
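Dropping the extra backprop flag relies on reverse being its own inverse: the gradient of a reverse along a set of axes is just the incoming gradient reversed along the same axes, so the forward and backward ops can share one helper. A tiny self-contained illustration of that involution property (assumed example, not library code):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
        std::vector<int> x{1, 2, 3, 4};
        auto y = x;
        std::reverse(y.begin(), y.end());   // forward: reverse along the axis
        std::reverse(y.begin(), y.end());   // backward applies the same reverse again
        assert(y == x);                     // reversing twice restores the original order
        return 0;
    }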
@@ -36,6 +36,7 @@ namespace sd {

 #if NOT_EXCLUDED(OP_clipbyavgnorm)
     DECLARE_CONFIGURABLE_OP(clipbyavgnorm, 1, 1, true, 1, 0);
+    DECLARE_CUSTOM_OP(clipbyavgnorm_bp, 2, 1, false, 1, 0);
 #endif

 #if NOT_EXCLUDED(OP_cumsum)
@@ -15,83 +15,134 @@
 ******************************************************************************/

 //
-// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
+// @author Yurii Shyrma (iuriish@yahoo.com)
+// @author sgazeos@gmail.com
+// @author raver119@gmail.com
 //

 #include <ops/declarable/helpers/transforms.h>
-#include <helpers/Loops.h>
+#include <execution/Threads.h>

 namespace sd {
 namespace ops {
 namespace helpers {

 //////////////////////////////////////////////////////////////////////////
-template<typename T>
-static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {
-
-    const int rank = input.rankOf();
-    const auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions);
-
-    const T normActual = norm2.e<T>(0);
-    const T normClip = clipNorm.e<T>(0);
-
-    if (isInplace) {
-
-        if(norm2.lengthOf() == 1) {
-
-            if(normActual > normClip)
-                input *= (normClip / normActual);
-        }
-        else {
-
-            auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions);
-
-            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i++) {
-                    const T iNormActual = norm2.e<T>(i);
-                    if (iNormActual > normClip)
-                        *listOfInSubArrs.at(i) *= normClip / iNormActual;
-                }
-            };
-            samediff::Threads::parallel_tad(func, 0, listOfInSubArrs.size());
-        }
-    }
-    else {
-
-        if(norm2.lengthOf() == 1) {
-
-            if(normActual > normClip)
-                output.assign(input * (normClip / normActual));
-            else
-                output.assign(input);
-        }
-        else {
-
-            auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions);
-            auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions);
-
-            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i++) {
-                    auto inputSubArr = listOfInSubArrs.at(i);
-                    auto outputSubArr = listOfOutSubArrs.at(i);
-                    outputSubArr->assign(inputSubArr);
-
-                    const T iNormActual = norm2.e<T>(i);
-
-                    if (iNormActual > clipNorm.e<T>(0))
-                        *outputSubArr *= clipNorm / iNormActual;
-                }
-            };
-            samediff::Threads::parallel_tad(func, 0, listOfInSubArrs.size());
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {
-    BUILD_SINGLE_SELECTOR(output.dataType(), clipByNorm_, (input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES);
-}
+void clipByNorm(sd::LaunchContext* context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace, const bool useAverage) {
+
+    NDArray* z = nullptr;
+
+    if(isInplace) {
+        z = &input;
+    }
+    else {
+        output.assign(input);
+        z = &output;
+    }
+
+    if(dimensions.empty()) {
+
+        const NDArray actualNorm = useAverage ? z->reduceAlongDimension(reduce::Norm2, {}) / z->lengthOf() : z->reduceAlongDimension(reduce::Norm2, {});
+
+        if(actualNorm.e<float>(0) > clipNorm.e<float>(0))
+            *z *= clipNorm / actualNorm;
+    }
+    else {
+
+        auto listOfSubArrs = z->allTensorsAlongDimension(dimensions);
+
+        auto func = PRAGMA_THREADS_FOR {
+            for (auto i = start; i < stop; i++) {
+                const NDArray actualNorm = useAverage ? listOfSubArrs.at(i)->reduceAlongDimension(reduce::Norm2, {}) / listOfSubArrs.at(i)->lengthOf() : listOfSubArrs.at(i)->reduceAlongDimension(reduce::Norm2, {});
+                if(actualNorm.e<float>(0) > clipNorm.e<float>(0))
+                    *listOfSubArrs.at(i) *= clipNorm / actualNorm;
+            }
+        };
+        samediff::Threads::parallel_tad(func, 0, listOfSubArrs.size());
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+template<typename T>
+static void clipByNormBp_(const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool useAverage) {
+
+    const int rank = input.rankOf();
+
+    auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions);
+    auto sums = input.reduceAlongDimension(reduce::Sum, dimensions);
+
+    if(norm2.lengthOf() == 1) {
+
+        const T norm = useAverage ? norm2.e<T>(0) / input.lengthOf() : norm2.e<T>(0);
+
+        auto clipVal = clipNorm.e<T>(0);
+
+        if(norm > clipVal) {
+
+            const T sum = sums.e<T>(0); // reduce to scalar
+            const T factor1 = clipVal / norm;
+            const T factor2 = static_cast<T>(1.f) / (norm * norm); // 1 / (norm*norm*norm)
+
+            auto lambda = LAMBDA_TT(x, y, sum, factor1, factor2) {
+                return factor1 * y * (static_cast<T>(1.f) - factor2 * x * sum);
+            };
+
+            const_cast<NDArray&>(input).applyPairwiseLambda<T>(const_cast<NDArray&>(gradO), lambda, gradI);
+        }
+        else
+            gradI.assign(gradO);
+    }
+    else {
+
+        auto gradISubArrs = gradI.allTensorsAlongDimension({dimensions});
+        auto gradOSubArrs = gradO.allTensorsAlongDimension({dimensions});
+        auto inputSubArrs = input.allTensorsAlongDimension({dimensions});
+
+        auto clipVal = clipNorm.e<T>(0);
+
+        auto func = PRAGMA_THREADS_FOR {
+
+            for (auto i = start; i < stop; i++) {
+
+                auto gradOSubArr = gradOSubArrs.at(i);
+                auto gradISubArr = gradISubArrs.at(i);
+
+                const T norm = useAverage ? norm2.e<T>(i) / gradISubArr->lengthOf() : norm2.e<T>(i);
+
+                if (norm > clipVal) {
+
+                    auto inputSubArr = inputSubArrs.at(i);
+
+                    const T sum = sums.e<T>(i); // reduce to scalar
+                    const T factor1 = clipVal / norm;
+                    const T factor2 = static_cast<T>(1.f) / (norm * norm); // 1 / (norm*norm*norm)
+
+                    auto lambda = LAMBDA_TT(x, y, sum, factor1, factor2) {
+                        return factor1 * y * (static_cast<T>(1.f) - factor2 * x * sum);
+                    };
+
+                    inputSubArr->applyPairwiseLambda<T>(*gradOSubArr, lambda, *gradISubArr);
+                }
+                else
+                    gradISubArr->assign(gradOSubArr);
+            }
+        };
+        samediff::Threads::parallel_tad(func, 0, gradISubArrs.size());
+    }
+}
+BUILD_SINGLE_TEMPLATE(template void clipByNormBp_, (const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool useAverage), FLOAT_TYPES);
+
+//////////////////////////////////////////////////////////////////////////
+void clipByNormBp(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool useAverage) {
+
+    const NDArray& castedInput = gradI.dataType() == input.dataType() ? input : input.cast(gradI.dataType());
+
+    BUILD_SINGLE_SELECTOR(gradI.dataType(), clipByNormBp_, (castedInput, gradO, gradI, dimensions, clipNorm, useAverage), FLOAT_TYPES);
+}


 template <typename T>
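For readers unfamiliar with the LAMBDA_TT / applyPairwiseLambda machinery used above: in the whole-array branch the new backprop helper combines the input x and the incoming gradient y element-wise. A plain-C++ sketch of that branch, mirroring the lambda in the diff (illustrative only, float specialization over flat vectors, assumes gradI is already sized like gradO):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Mirrors the norm2.lengthOf() == 1 branch of clipByNormBp_ for T = float.
    void clipByNormBpScalarSketch(const std::vector<float>& x, const std::vector<float>& gradO,
                                  std::vector<float>& gradI, float clipVal, bool useAverage) {
        float sum = 0.f, sumSq = 0.f;
        for (float v : x) { sum += v; sumSq += v * v; }
        float norm = std::sqrt(sumSq);
        if (useAverage) norm /= static_cast<float>(x.size());
        if (norm > clipVal) {
            const float factor1 = clipVal / norm;
            const float factor2 = 1.f / (norm * norm);
            for (std::size_t e = 0; e < x.size(); ++e)             // pairwise lambda applied per element
                gradI[e] = factor1 * gradO[e] * (1.f - factor2 * x[e] * sum);
        } else {
            gradI = gradO;                                          // norm within the limit: gradient passes through
        }
    }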
@@ -132,125 +183,6 @@ void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, co

 BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace), FLOAT_TYPES);

-//////////////////////////////////////////////////////////////////////////
-template<typename T>
-static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector<int>& dimensions, const NDArray& clipNorm) {
-
-    const int rank = input.rankOf();
-
-    auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions);
-
-    if(norm2.lengthOf() == 1) {
-
-        const T N = norm2.e<T>(0);
-
-        auto cn = clipNorm.e<T>(0);
-
-        if(N > cn) {
-
-            const T sumOfProd = (input * gradO).reduceNumber(reduce::Sum).e<T>(0); // reduce to scalar
-            const T factor1 = static_cast<T>(1.f) / N;
-            const T factor3 = factor1 / (N * N); // 1 / (N*N*N)
-
-            auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) {
-                return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd);
-            };
-
-            (const_cast<NDArray&>(input)).applyPairwiseLambda<T>(const_cast<NDArray&>(gradO), lambda, gradI);
-        }
-        else
-            gradI.assign(gradO);
-    }
-    else {
-
-        auto gradISubArrs = gradI.allTensorsAlongDimension({dimensions});
-        auto gradOSubArrs = gradO.allTensorsAlongDimension({dimensions});
-        auto inputSubArrs = input.allTensorsAlongDimension({dimensions});
-
-        auto cn = clipNorm.e<T>(0);
-
-        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i++) {
-                T N = norm2.e<T>(i);
-
-                auto gradOSubArr = gradOSubArrs.at(i);
-                auto gradISubArr = gradISubArrs.at(i);
-
-                if (N > cn) {
-                    auto inputSubArr = inputSubArrs.at(i);
-                    const T sumOfProd = (*inputSubArr * *gradOSubArr).reduceNumber(reduce::Sum).e<T>(0); // reduce to scalar
-                    const T factor1 = static_cast<T>(1.f) / N;
-                    const T factor3 = factor1 / (N * N); // 1 / (N*N*N)
-
-                    auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) {
-                        return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd);
-                    };
-
-                    inputSubArr->applyPairwiseLambda<T>(*gradOSubArr, lambda, *gradISubArr);
-                } else
-                    gradISubArr->assign(gradOSubArr);
-            }
-        };
-        samediff::Threads::parallel_tad(func, 0, gradISubArrs.size());
-    }
-}
-
-void clipByNormBP(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector<int>& dimensions, const NDArray& clipNorm) {
-    BUILD_SINGLE_SELECTOR(gradI.dataType(), clipByNormBP_, (input, gradO, gradI, dimensions, clipNorm), FLOAT_TYPES);
-}
-
-BUILD_SINGLE_TEMPLATE(template void clipByNormBP_, (const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector<int>& dimensions, const NDArray& clipNorm), FLOAT_TYPES);
-
-
-//////////////////////////////////////////////////////////////////////////
-template<typename T>
-static void clipByAveraged_(NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {
-
-    auto cn = clipNorm.e<T>(0);
-    if (dimensions.size() == 0) {
-        // all-reduce
-        T n2 = input.reduceNumber(reduce::Norm2).e<T>(0) / input.lengthOf();
-        if (n2 <= cn) {
-            if (!isInplace)
-                output.assign(input);
-        }
-        else {
-            const T factor = cn / n2;
-            auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
-            input.applyLambda<T>(lambda, output);
-        }
-    }
-    else {
-        // along dimension
-        auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions, false);
-        if (!isInplace)
-            output.assign(input);
-        auto tads = output.allTensorsAlongDimension(dimensions);
-        // TODO: make this CUDA-compliant somehow
-        for (int e = 0; e < tads.size(); e++) {
-            T n2 = norm2.e<T>(e) / tads.at(e)->lengthOf();
-            const T factor = cn / n2;
-            if (n2 > cn) {
-                auto lambda = LAMBDA_T(_x, factor) {return _x * factor;};
-                tads.at(e)->applyLambda<T>(lambda, output);
-            }
-        }
-    }
-}
-
-void clipByAveraged(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {
-    BUILD_SINGLE_SELECTOR(input.dataType(), clipByAveraged_, (input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES);
-}
-
-BUILD_SINGLE_TEMPLATE(template void clipByAveraged_, (NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace), FLOAT_TYPES);
-
-/*
-    if (d1 > params[1])
-        return params[1];
-    else if (d1 < params[0])
-        return params[0];
-    else return d1;
-*/

 template <typename T>
 static void clipByValue_(NDArray& input, double leftBound, double rightBound, NDArray& output) {
@@ -29,7 +29,7 @@ namespace helpers {


 //////////////////////////////////////////////////////////////////////////
-template<typename T>
+template<typename X, typename Z>
 static void mergeMaxIndex_(const std::vector<const NDArray*>& inArrs, NDArray& output) {

     const Nd4jLong numArgs = inArrs.size();
@@ -37,17 +37,18 @@ static void mergeMaxIndex_(const std::vector<const NDArray*>& inArrs, NDArray& o

     auto func = PRAGMA_THREADS_FOR {
         for (auto e = start; e < stop; e++) {
-            T max = -DataTypeUtils::max<T>();
-            Nd4jLong idx = 0;
+            X max = -DataTypeUtils::max<X>();
+            Z idx = static_cast<Z>(0);

             for (Nd4jLong i = 0; i < numArgs; i++) {
-                T v = inArrs[i]->e<T>(e);
+                X v = inArrs[i]->t<X>(e);
                 if (v > max) {
                     max = v;
-                    idx = i;
+                    idx = static_cast<Z>(i);
                 }
             }
-            output.p(e, idx);
+            // FIXME, use .r<Z>(e)
+            output.t<Z>(e) = static_cast<Z>(idx);
         }
     };

@@ -55,7 +56,7 @@ static void mergeMaxIndex_(const std::vector<const NDArray*>& inArrs, NDArray& o
 }

 void mergeMaxIndex(sd::LaunchContext * context, const std::vector<const NDArray*>& inArrs, NDArray& output) {
-    BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES);
+    BUILD_DOUBLE_SELECTOR(inArrs[0]->dataType(), output.dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES, INDEXING_TYPES);
 }


@@ -193,13 +193,10 @@ static void reverseSequence_(sd::LaunchContext * context, const NDArray* input,
 }

 //////////////////////////////////////////////////////////////////////////
-void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector<int>* intArgs, bool isBackProp) {
+void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector<int>* intArgs) {

-    // we need to reverse axis only if that's new op
-    std::vector<int> dimensions = isBackProp ? ShapeUtils::evalDimsToExclude(input->rankOf(), *intArgs) : *intArgs;
-
-    auto listOut = output->allTensorsAlongDimension(dimensions);
-    auto listIn = input->allTensorsAlongDimension(dimensions);
+    auto listOut = output->allTensorsAlongDimension(*intArgs);
+    auto listIn = input->allTensorsAlongDimension(*intArgs);

     NDArray *subArrIn, *subArrOut;

@ -0,0 +1,334 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (c) 2019 Konduit K.K.
|
||||||
|
*
|
||||||
|
* This program and the accompanying materials are made available under the
|
||||||
|
* terms of the Apache License, Version 2.0 which is available at
|
||||||
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
* License for the specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
******************************************************************************/
|
||||||
|
|
||||||
|
//
|
||||||
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
|
// @author sgazeos@gmail.com
|
||||||
|
// @author raver119@gmail.com
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
|
#include <ops/declarable/helpers/transforms.h>
|
||||||
|
#include <helpers/ShapeUtils.h>
|
||||||
|
#include <helpers/PointersManager.h>
|
||||||
|
#include <helpers/ConstantTadHelper.h>
|
||||||
|
|
||||||
|
namespace sd {
|
||||||
|
namespace ops {
|
||||||
|
namespace helpers {
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
template<typename T>
|
||||||
|
__global__ static void clipByNormCuda(const void* vClipNorm, const void* vNorm, const Nd4jLong* normShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int* dimensions, const int dimsLen, const bool useAverage) {
|
||||||
|
|
||||||
|
const T clipNorm = *reinterpret_cast<const T*>(vClipNorm);
|
||||||
|
const T* norm = reinterpret_cast<const T*>(vNorm);
|
||||||
|
T* z = reinterpret_cast<T*>(vz);
|
||||||
|
|
||||||
|
__shared__ Nd4jLong zLen, tadLen, totalThreads;
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
|
||||||
|
zLen = shape::length(zShapeInfo);
|
||||||
|
tadLen = zLen / shape::length(normShapeInfo);
|
||||||
|
totalThreads = gridDim.x * blockDim.x;
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
int zCoords[MAX_RANK], normCoords[MAX_RANK];
|
||||||
|
|
||||||
|
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
|
||||||
|
|
||||||
|
shape::index2coords(i, zShapeInfo, zCoords);
|
||||||
|
|
||||||
|
// deduce norm coords
|
||||||
|
for (int j = 0; j < dimsLen; ++j)
|
||||||
|
normCoords[j] = zCoords[dimensions[j]];
|
||||||
|
|
||||||
|
const T actualNorm = useAverage ? norm[shape::getOffset(normShapeInfo, normCoords)] / tadLen : norm[shape::getOffset(normShapeInfo, normCoords)];
|
||||||
|
|
||||||
|
if(actualNorm > clipNorm)
|
||||||
|
z[shape::getOffset(zShapeInfo, zCoords)] *= clipNorm / actualNorm;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
template<typename T>
|
||||||
|
__host__ static void clipByNormCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream,
|
||||||
|
const void* vClipNorm, const void* vNorm, const Nd4jLong* normShapeInfo, void* vz, const Nd4jLong* zShapeInfo,
|
||||||
|
const int* dimensions, const int dimsLen, const bool useAverage) {
|
||||||
|
|
||||||
|
clipByNormCuda<T><<<blocksPerGrid, threadsPerBlock, 512, *stream>>>(vClipNorm, vNorm, normShapeInfo, vz, zShapeInfo, dimensions, dimsLen, useAverage);
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
void clipByNorm(sd::LaunchContext* context, NDArray& input, NDArray& output, const std::vector<int>& dims, const NDArray& clipNorm, const bool isInplace, const bool useAverage) {
|
||||||
|
|
||||||
|
NDArray* z = nullptr;
|
||||||
|
|
||||||
|
if(isInplace) {
|
||||||
|
z = &input;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
output.assign(input);
|
||||||
|
z = &output;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(dims.empty()) {
|
||||||
|
|
||||||
|
const NDArray actualNorm = useAverage ? z->reduceAlongDimension(reduce::Norm2, {}) / z->lengthOf() : z->reduceAlongDimension(reduce::Norm2, {});
|
||||||
|
|
||||||
|
if(actualNorm.e<float>(0) > clipNorm.e<float>(0))
|
||||||
|
*z *= clipNorm / actualNorm;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
const NDArray actualNorms = z->reduceAlongDimension(reduce::Norm2, dims);
|
||||||
|
|
||||||
|
std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(z->rankOf(), dims);
|
||||||
|
|
||||||
|
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||||
|
const int blocksPerGrid = (z->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||||
|
|
||||||
|
PointersManager manager(context, "clipByNorm");
|
||||||
|
|
||||||
|
const int* dimensions = reinterpret_cast<const int*>(manager.replicatePointer(dimsToExclude.data(), dimsToExclude.size() * sizeof(int)));
|
||||||
|
|
||||||
|
NDArray::prepareSpecialUse({z}, {z, &actualNorms, &clipNorm});
|
||||||
|
BUILD_SINGLE_SELECTOR(z->dataType(), clipByNormCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), clipNorm.specialBuffer(), actualNorms.specialBuffer(), actualNorms.specialShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), dimensions, (int)dimsToExclude.size(), useAverage), FLOAT_TYPES);
|
||||||
|
NDArray::registerSpecialUse({z}, {z, &actualNorms, &clipNorm});
|
||||||
|
|
||||||
|
manager.synchronize();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
template<typename T>
|
||||||
|
__global__ static void clipByNormBpCuda(const void* vClipNorm,
|
||||||
|
const void* vx, const Nd4jLong* xShapeInfo, // input
|
||||||
|
const void* vy, const Nd4jLong* yShapeInfo, // gradO
|
||||||
|
const void* vNorm, const Nd4jLong* normShapeInfo,
|
||||||
|
const void* vSum, const Nd4jLong* sumShapeInfo,
|
||||||
|
void* vz, const Nd4jLong* zShapeInfo, // gradI
|
||||||
|
const int* dimensions, const int dimsLen, const bool useAverage) {
|
||||||
|
|
||||||
|
const T clipNorm = *reinterpret_cast<const T*>(vClipNorm);
|
||||||
|
const T* norm = reinterpret_cast<const T*>(vNorm);
|
||||||
|
const T* sum = reinterpret_cast<const T*>(vSum);
|
||||||
|
const T* x = reinterpret_cast<const T*>(vx);
|
||||||
|
const T* y = reinterpret_cast<const T*>(vy);
|
||||||
|
T* z = reinterpret_cast<T*>(vz);
|
||||||
|
|
||||||
|
__shared__ Nd4jLong zLen, tadLen, totalThreads;
|
||||||
|
__shared__ bool sameOffsets;
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
|
||||||
|
zLen = shape::length(zShapeInfo);
|
||||||
|
tadLen = zLen / shape::length(normShapeInfo);
|
||||||
|
totalThreads = gridDim.x * blockDim.x;
|
||||||
|
|
||||||
|
sameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo);
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
int zCoords[MAX_RANK], normCoords[MAX_RANK];
|
||||||
|
|
||||||
|
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
|
||||||
|
|
||||||
|
shape::index2coords(i, zShapeInfo, zCoords);
|
||||||
|
|
||||||
|
const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
|
||||||
|
const auto yOffset = sameOffsets ? zOffset : shape::getOffset(yShapeInfo, zCoords);
|
||||||
|
|
||||||
|
// deduce norm coords
|
||||||
|
for (int j = 0; j < dimsLen; ++j)
|
||||||
|
normCoords[j] = zCoords[dimensions[j]];
|
||||||
|
|
||||||
|
const T actualNorm = useAverage ? norm[shape::getOffset(normShapeInfo, normCoords)] / tadLen : norm[shape::getOffset(normShapeInfo, normCoords)];
|
||||||
|
|
||||||
|
if(actualNorm > clipNorm) {
|
||||||
|
|
||||||
|
const T sumVal = sum[shape::getOffset(sumShapeInfo, normCoords)];
|
||||||
|
const auto xOffset = sameOffsets ? zOffset : shape::getOffset(xShapeInfo, zCoords);
|
||||||
|
|
||||||
|
z[zOffset] = (clipNorm / actualNorm) * y[yOffset] * (static_cast<T>(1.f) - (x[xOffset] * sumVal) / (actualNorm * actualNorm));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
z[zOffset] = y[yOffset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
template<typename T>
|
||||||
|
void clipByNormBp_(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dims, const NDArray& clipNorm, const bool useAverage) {
|
||||||
|
|
||||||
|
const int rank = input.rankOf();
|
||||||
|
|
||||||
|
auto actualNorms = input.reduceAlongDimension(reduce::Norm2, dims);
|
||||||
|
|
||||||
|
if(actualNorms.lengthOf() == 1) {
|
||||||
|
|
||||||
|
const T norm = useAverage ? actualNorms.e<T>(0) / static_cast<T>(input.lengthOf()) : actualNorms.e<T>(0);
|
||||||
|
|
||||||
|
auto clipVal = clipNorm.e<T>(0);
|
||||||
|
|
||||||
|
if(norm > clipVal) {
|
||||||
|
|
||||||
|
const T sum = input.reduceNumber(reduce::Sum).e<T>(0); // reduce to scalar
|
||||||
|
const T factor1 = clipVal / norm;
|
||||||
|
const T factor2 = static_cast<T>(1.f) / (norm * norm); // 1 / (norm*norm*norm)
|
||||||
|
|
||||||
|
auto lambda = LAMBDA_TT(x, y, sum, factor1, factor2) {
|
||||||
|
return factor1 * y * (static_cast<T>(1.f) - factor2 * x * sum);
|
||||||
|
};
|
||||||
|
|
||||||
|
const_cast<NDArray&>(input).applyPairwiseLambda(const_cast<NDArray&>(gradO), lambda, gradI);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
gradI.assign(gradO);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
const NDArray actualNorms = input.reduceAlongDimension(reduce::Norm2, dims);
|
||||||
|
const NDArray sums = input.reduceAlongDimension(reduce::Sum, dims);
|
||||||
|
|
||||||
|
std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(gradI.rankOf(), dims);
|
||||||
|
|
||||||
|
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||||
|
const int blocksPerGrid = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||||
|
|
||||||
|
PointersManager manager(context, "clipByNormBp");
|
||||||
|
|
||||||
|
const int* dimensions = reinterpret_cast<const int*>(manager.replicatePointer(dimsToExclude.data(), dimsToExclude.size() * sizeof(int)));
|
||||||
|
|
||||||
|
NDArray::prepareSpecialUse({&gradI}, {&actualNorms, &sums, &clipNorm, &input, &gradO});
|
||||||
|
clipByNormBpCuda<T><<<blocksPerGrid, threadsPerBlock, 512, *context->getCudaStream()>>>(clipNorm.specialBuffer(), input.specialBuffer(), input.specialShapeInfo(), gradO.specialBuffer(), gradO.specialShapeInfo(), actualNorms.specialBuffer(), actualNorms.specialShapeInfo(), sums.specialBuffer(), sums.specialShapeInfo(), gradI.specialBuffer(), gradI.specialShapeInfo(), dimensions, (int)dimsToExclude.size(), useAverage);
|
||||||
|
NDArray::registerSpecialUse({&gradI}, {&actualNorms, &sums, &clipNorm, &input, &gradO});
|
||||||
|
|
||||||
|
manager.synchronize();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BUILD_SINGLE_TEMPLATE(template void clipByNormBp_, (sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool useAverage), FLOAT_TYPES);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
void clipByNormBp(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool useAverage) {
|
||||||
|
|
||||||
|
const NDArray& castedInput = gradI.dataType() == input.dataType() ? input : input.cast(gradI.dataType());
|
||||||
|
BUILD_SINGLE_SELECTOR(gradI.dataType(), clipByNormBp_, (context, castedInput, gradO, gradI, dimensions, clipNorm, useAverage), FLOAT_TYPES);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void clipByGlobalNorm_(sd::LaunchContext * context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace) {
|
||||||
|
NDArray globalNorm = NDArrayFactory::create<T>(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list]))
|
||||||
|
|
||||||
|
for (auto i = 0; i < inputs.size(); i++) {
|
||||||
|
auto input = inputs[i];
|
||||||
|
auto l2norm = input->reduceNumber(reduce::Norm2);
|
||||||
|
globalNorm += l2norm * l2norm;
|
||||||
|
}
|
||||||
|
|
||||||
|
globalNorm.applyTransform(transform::Sqrt, globalNorm); // = sd::math::nd4j_sqrt(globalNorm);
|
||||||
|
outputs[inputs.size()]->p(0, globalNorm);
|
||||||
|
globalNorm.syncToHost();
|
||||||
|
const T factor = static_cast<T>(clipNorm) / globalNorm.e<T>(0);
|
||||||
|
|
||||||
|
for (size_t e = 0; e < inputs.size(); e++) {
|
||||||
|
// all-reduce
|
||||||
|
auto input = inputs[e];
|
||||||
|
auto output = outputs[e];
|
||||||
|
|
||||||
|
if (globalNorm.e<double>(0) <= clipNorm) {
|
||||||
|
output->assign(input);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
|
||||||
|
input->applyLambda(lambda, *output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void clipByGlobalNorm(sd::LaunchContext * context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace) {
|
||||||
|
BUILD_SINGLE_SELECTOR(outputs[0]->dataType(), clipByGlobalNorm_, (context, inputs, clipNorm, workspace, outputs, isInplace), FLOAT_TYPES);
|
||||||
|
}
|
||||||
|
|
||||||
|
BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (sd::LaunchContext * context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace), FLOAT_TYPES);
|
||||||
|
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static void __global__ clipByValueKernel(void* input, const Nd4jLong* inputShape, void* output, const Nd4jLong* outputShape, double leftBound, double rightBound) {
|
||||||
|
__shared__ T* outputBuf;
|
||||||
|
__shared__ T* inputBuf;
|
||||||
|
__shared__ Nd4jLong length;
|
||||||
|
__shared__ bool linearBuffers;
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
outputBuf = reinterpret_cast<T *>(output);
|
||||||
|
inputBuf = reinterpret_cast<T *>(input);
|
||||||
|
length = shape::length(inputShape);
|
||||||
|
linearBuffers = shape::elementWiseStride(inputShape) == shape::elementWiseStride(outputShape) && shape::elementWiseStride(inputShape) == 1;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
const auto step = gridDim.x * blockDim.x;
|
||||||
|
|
||||||
|
for (Nd4jLong e = tid; e < length; e += step) {
|
||||||
|
if (linearBuffers) {
|
||||||
|
if (inputBuf[e] > rightBound) outputBuf[e] = (T) rightBound;
|
||||||
|
else if (inputBuf[e] < leftBound) outputBuf[e] = (T) leftBound;
|
||||||
|
else outputBuf[e] = inputBuf[e];
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
auto inputOffset = shape::getIndexOffset(e, inputShape);
|
||||||
|
auto outputOffset = shape::getIndexOffset(e, outputShape);
|
||||||
|
if (inputBuf[inputOffset] > rightBound) outputBuf[outputOffset] = (T) rightBound;
|
||||||
|
else if (inputBuf[inputOffset] < leftBound) outputBuf[outputOffset] = (T) leftBound;
|
||||||
|
else outputBuf[outputOffset] = inputBuf[outputOffset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static void clipByValue_(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) {
|
||||||
|
auto stream = context->getCudaStream();
|
||||||
|
if (!input.isActualOnDeviceSide())
|
||||||
|
input.syncToDevice();
|
||||||
|
NDArray::prepareSpecialUse({&output}, {&input});
|
||||||
|
clipByValueKernel<T><<<256, 512, 8192, *stream>>>(input.specialBuffer(), input.specialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), leftBound, rightBound);
|
||||||
|
NDArray::registerSpecialUse({&output}, {&input});
|
||||||
|
}
|
||||||
|
|
||||||
|
void clipByValue(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) {
|
||||||
|
BUILD_SINGLE_SELECTOR(input.dataType(), clipByValue_, (context, input, leftBound, rightBound, output), FLOAT_TYPES);
|
||||||
|
}
|
||||||
|
|
||||||
|
BUILD_SINGLE_TEMPLATE(template void clipByValue_, (sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output);, FLOAT_TYPES);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@@ -210,14 +210,10 @@ namespace helpers {
 }

 //////////////////////////////////////////////////////////////////////////
-void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector<int>* intArgs, bool isBackProp) {
+void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector<int>* intArgs) {

-    // we need to reverse axis only if that's new op
-    std::vector<int> dimensions = isBackProp ? ShapeUtils::evalDimsToExclude(input->rankOf(), *intArgs) : *intArgs;
-    std::vector<int> axis = ShapeUtils::evalDimsToExclude(input->rankOf(), dimensions);
-    auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->shapeInfo(), dimensions);
-    auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), dimensions);
+    auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->shapeInfo(), *intArgs);
+    auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), *intArgs);

     NDArray::prepareSpecialUse({output}, {input});

@ -300,269 +300,6 @@ void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray
|
||||||
manager.synchronize();
|
manager.synchronize();
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// x - input, y - gradO, z - gradI
|
|
||||||
template<typename X, typename Z>
|
|
||||||
__global__ static void clipByNormBPWholeArrCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, void* vreducBuff, const Z clipNormVal) {
|
|
||||||
|
|
||||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
|
|
||||||
if(tid >= shape::length(zShapeInfo))
|
|
||||||
return;
|
|
||||||
|
|
||||||
const auto x = reinterpret_cast<const X*>(vx);
|
|
||||||
const auto y = reinterpret_cast<const Z*>(vy);
|
|
||||||
auto z = reinterpret_cast<Z*>(vz);
|
|
||||||
|
|
||||||
auto reducBuff = reinterpret_cast<Z*>(vreducBuff);
|
|
||||||
uint* count = reinterpret_cast<uint*>(vreducBuff) + 16384;
|
|
||||||
|
|
||||||
__shared__ Z* shMem;
|
|
||||||
__shared__ Nd4jLong len;
|
|
||||||
__shared__ bool amIinLastBlock;
|
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
|
||||||
extern __shared__ unsigned char shmem[];
|
|
||||||
shMem = reinterpret_cast<Z*>(shmem);
|
|
||||||
|
|
||||||
len = shape::length(zShapeInfo); // xLen = yLen = zLen
|
|
||||||
}
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
// fill shared memory with array elements
|
|
||||||
const auto xVal = x[shape::getIndexOffset(tid, xShapeInfo)];
|
|
||||||
const auto yVal = y[shape::getIndexOffset(tid, yShapeInfo)];
|
|
||||||
|
|
||||||
shMem[2*threadIdx.x] = static_cast<Z>(xVal * xVal); // for norm
|
|
||||||
shMem[2*threadIdx.x + 1] = static_cast<Z>(xVal * yVal); // for input * gradO
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
// accumulate sum per block
|
|
||||||
for (int activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
|
|
||||||
|
|
||||||
if (threadIdx.x < activeThreads && tid + activeThreads < len) {
|
|
||||||
|
|
||||||
shMem[2*threadIdx.x] += shMem[2*(threadIdx.x + activeThreads)];
|
|
||||||
shMem[2*threadIdx.x + 1] += shMem[2*(threadIdx.x + activeThreads) + 1];
|
|
||||||
}
|
|
||||||
__syncthreads();
|
|
||||||
}
|
|
||||||
|
|
||||||
// store accumulated sums in reduction buffer (reducBuff)
|
|
||||||
if (threadIdx.x == 0) {
|
|
||||||
|
|
||||||
reducBuff[2*blockIdx.x] = shMem[0];
|
|
||||||
reducBuff[2*blockIdx.x + 1] = shMem[1];
|
|
||||||
|
|
||||||
__threadfence();
|
|
||||||
|
|
||||||
amIinLastBlock = gridDim.x == 1 || (atomicInc(count, gridDim.x) == gridDim.x - 1);
|
|
||||||
}
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
// shared memory of last block is used for final summation of values stored in reduction buffer
|
|
||||||
if (amIinLastBlock) {
|
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < gridDim.x; i += blockDim.x) {
|
|
||||||
|
|
||||||
shMem[2*threadIdx.x] = (i == threadIdx.x ) ? reducBuff[2*i] : reducBuff[2*i] + shMem[2*threadIdx.x];
|
|
||||||
shMem[2*threadIdx.x + 1] = (i == threadIdx.x ) ? reducBuff[2*i + 1] : reducBuff[2*i + 1] + shMem[2*threadIdx.x + 1];
|
|
||||||
}
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
// accumulate sum
|
|
||||||
for (int activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
|
|
||||||
|
|
||||||
if (threadIdx.x < activeThreads && threadIdx.x + activeThreads < gridDim.x) {
|
|
||||||
shMem[2*threadIdx.x] += shMem[2*(threadIdx.x + activeThreads)];
|
|
||||||
shMem[2*threadIdx.x + 1] += shMem[2*(threadIdx.x + activeThreads) + 1];
|
|
||||||
}
|
|
||||||
__syncthreads();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
|
||||||
|
|
||||||
reducBuff[0] = math::nd4j_sqrt<Z,Z>(shMem[0]);
|
|
||||||
reducBuff[1] = shMem[1];
|
|
||||||
count = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// x - input, y - gradO, z - gradI
|
|
||||||
template<typename X, typename Z>
|
|
||||||
__global__ static void clipByNormBPCalcGradCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, void* vreducBuff, const Z clipNormVal) {
|
|
||||||
|
|
||||||
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

    const Nd4jLong len = shape::length(zShapeInfo);     // xLen = yLen = zLen

    if(tid >= len)
        return;

    const auto x = reinterpret_cast<const X*>(vx);
    const auto y = reinterpret_cast<const Z*>(vy);
          auto z = reinterpret_cast<Z*>(vz);

    __shared__ Z norm, sumOfProd;

    if (threadIdx.x == 0) {

        norm      = reinterpret_cast<Z*>(vreducBuff)[0];
        sumOfProd = reinterpret_cast<Z*>(vreducBuff)[1];
    }
    __syncthreads();

    const auto yOffset = shape::getIndexOffset(tid, yShapeInfo);
    const auto zOffset = shape::getIndexOffset(tid, zShapeInfo);

    if(norm > clipNormVal) {

        const auto xOffset = shape::getIndexOffset(tid, xShapeInfo);

        const Z factor1 = static_cast<Z>(1) / norm;      // 1 / norm
        const Z factor2 = factor1 / (norm * norm);       // 1 / (norm * norm * norm)

        z[zOffset] = clipNormVal * (factor1 * y[yOffset] - factor2 * sumOfProd * x[xOffset]);
    }
    else {
        z[zOffset] = y[yOffset];
    }
}

//////////////////////////////////////////////////////////////////////////
// x - input, y - gradO, z - gradI
template<typename X, typename Z>
__global__ static void clipByNormBPTadsCuda(const void* vx, const Nd4jLong* xTadShapeInfo, const Nd4jLong* xTadOffsets, const void* vy, const Nd4jLong* yTadShapeInfo, const Nd4jLong* yTadOffsets, void* vz, const Nd4jLong* zTadShapeInfo, const Nd4jLong* zTadOffsets, const Z clipNormVal) {

    const auto x = reinterpret_cast<const X*>(vx);
    const auto y = reinterpret_cast<const Z*>(vy);
          auto z = reinterpret_cast<Z*>(vz);

    __shared__ Z* shMem;
    __shared__ Nd4jLong tadLen;

    if (threadIdx.x == 0) {

        extern __shared__ unsigned char shmem[];
        shMem = reinterpret_cast<Z*>(shmem);
        tadLen = shape::length(zTadShapeInfo);           // xTadLen = yTadLen = zTadLen
    }
    __syncthreads();

    const auto* xTad = x + xTadOffsets[blockIdx.x];
    const auto* yTad = y + yTadOffsets[blockIdx.x];
          auto* zTad = z + zTadOffsets[blockIdx.x];

    // *** FIRST STAGE - ACCUMULATE REQUIRED SUMS *** //

    Z norm = 0;
    Z sumOfProd = 0;

    for (uint i = threadIdx.x; i < tadLen; i += blockDim.x) {

        const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo);
        const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo);

        shMem[2*threadIdx.x]     = static_cast<Z>(xTad[xOffset] * xTad[xOffset]);   // for norm
        shMem[2*threadIdx.x + 1] = static_cast<Z>(xTad[xOffset] * yTad[yOffset]);   // for input * gradO

        __syncthreads();

        // accumulate sum per block
        for (uint activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {

            if (threadIdx.x < activeThreads && i + activeThreads < tadLen) {

                shMem[2*threadIdx.x]     += shMem[2*(threadIdx.x + activeThreads)];
                shMem[2*threadIdx.x + 1] += shMem[2*(threadIdx.x + activeThreads) + 1];
            }
            __syncthreads();
        }

        norm      += shMem[0];
        sumOfProd += shMem[1];
    }

    // *** SECOND STAGE - GRADIENT CALCULATION *** //

    norm = math::nd4j_sqrt<Z,Z>(norm);

    for (uint i = threadIdx.x; i < tadLen; i += blockDim.x) {

        const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo);
        const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo);

        if(norm > clipNormVal) {

            const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo);

            const Z factor1 = static_cast<Z>(1) / norm;  // 1 / norm
            const Z factor2 = factor1 / (norm * norm);   // 1 / (norm * norm * norm)

            zTad[zOffset] = clipNormVal * (factor1 * yTad[yOffset] - factor2 * sumOfProd * xTad[xOffset]);
        }
        else {
            zTad[zOffset] = yTad[yOffset];
        }
    }
}

//////////////////////////////////////////////////////////////////////////
template<typename X, typename Z>
static void clipByNormBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                     const void* vx, const Nd4jLong* xShapeInfo, const Nd4jLong* xTadOffsets,
                                     const void* vy, const Nd4jLong* yShapeInfo, const Nd4jLong* yTadOffsets,
                                           void* vz, const Nd4jLong* zShapeInfo, const Nd4jLong* zTadOffsets,
                                           void* vreducBuff, const double clipNormVal) {

    if(xTadOffsets == nullptr) {    // means whole array
        clipByNormBPWholeArrCuda<X,Z><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vreducBuff, static_cast<Z>(clipNormVal));
        clipByNormBPCalcGradCuda<X,Z><<<blocksPerGrid, threadsPerBlock, 256, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vreducBuff, static_cast<Z>(clipNormVal));
    }
    else                            // means tads using
        clipByNormBPTadsCuda<X,Z><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, xTadOffsets, vy, yShapeInfo, yTadOffsets, vz, zShapeInfo, zTadOffsets, static_cast<Z>(clipNormVal));
}
BUILD_DOUBLE_TEMPLATE(template void clipByNormBPCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const Nd4jLong* xTadOffsets, const void *vy, const Nd4jLong *yShapeInfo, const Nd4jLong* yTadOffsets, void *vz, const Nd4jLong *zShapeInfo, const Nd4jLong* zTadOffsets, void* vreducBuff, const double clipNormVal), FLOAT_TYPES, FLOAT_TYPES);

//////////////////////////////////////////////////////////////////////////
void clipByNormBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector<int>& dimensions, const NDArray& clipNorm) {

    PointersManager manager(context, "clipByNormBP");

    const double clipNormVal = clipNorm.e<double>(0);

    const auto xType = input.dataType();
    const auto zType = gradI.dataType();

    const int threadsPerBlock = MAX_NUM_THREADS / 2;
    const int sharedMem = threadsPerBlock * 2 * input.sizeOfT() + 128;

    NDArray::prepareSpecialUse({&gradI}, {&input, &gradO});

    if(dimensions.empty() || dimensions.size() == input.rankOf()) {    // means whole array

        const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
        BUILD_DOUBLE_SELECTOR(xType, zType, clipByNormBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.specialBuffer(), input.specialShapeInfo(), nullptr, gradO.specialBuffer(), gradO.specialShapeInfo(), nullptr, gradI.specialBuffer(), gradI.specialShapeInfo(), nullptr, context->getReductionPointer(), clipNormVal), FLOAT_TYPES, FLOAT_TYPES);
    }
    else {                                                             // means tads using

        auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input.shapeInfo(), dimensions);
        auto packY = ConstantTadHelper::getInstance()->tadForDimensions(gradO.shapeInfo(), dimensions);
        auto packZ = ConstantTadHelper::getInstance()->tadForDimensions(gradI.shapeInfo(), dimensions);

        const int blocksPerGrid = packX.numberOfTads();
        BUILD_DOUBLE_SELECTOR(xType, zType, clipByNormBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), gradO.specialBuffer(), packY.platformShapeInfo(), packY.platformOffsets(), gradI.specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), nullptr, clipNormVal), FLOAT_TYPES, FLOAT_TYPES);
    }

    NDArray::registerSpecialUse({&gradI}, {&input, &gradO});

    manager.synchronize();
}
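
For reference, the kernels above implement the derivative of y = clipNorm * x / norm(x) when norm(x) exceeds clipNorm, and the identity otherwise. A minimal host-side sketch of the same math, assuming plain float buffers (the function name and signature below are illustrative, not part of libnd4j):

#include <cmath>
#include <cstddef>

// hypothetical CPU reference for the clipByNorm backprop math used by the kernels above:
// gradI = clipNorm * (gradO / norm - (sum(x * gradO) / norm^3) * x)   when norm > clipNorm
// gradI = gradO                                                       otherwise
static void clipByNormBpReference(const float* x, const float* gradO, float* gradI,
                                  std::size_t len, float clipNorm) {
    float sumSq = 0.f, sumOfProd = 0.f;
    for (std::size_t i = 0; i < len; ++i) {
        sumSq     += x[i] * x[i];        // for norm
        sumOfProd += x[i] * gradO[i];    // for input * gradO
    }
    const float norm = std::sqrt(sumSq);

    for (std::size_t i = 0; i < len; ++i) {
        if (norm > clipNorm) {
            const float factor1 = 1.f / norm;                 // 1 / norm
            const float factor2 = factor1 / (norm * norm);    // 1 / norm^3
            gradI[i] = clipNorm * (factor1 * gradO[i] - factor2 * sumOfProd * x[i]);
        } else {
            gradI[i] = gradO[i];
        }
    }
}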
template <typename T>
static __global__ void swapShuffleKernel(T* input, Nd4jLong const* shape, Nd4jLong firstDim, sd::graph::RandomGenerator* rng) {
    auto tid = blockIdx.x * blockDim.x;

@@ -692,252 +429,6 @@ void clipByNormBP(sd::LaunchContext* context, const NDArray& input, const NDArra
    output.setIdentity();
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T>
static __global__ void clipByNormInplaceKernel(Nd4jLong numOfSubArrs, T* inputBuffer, Nd4jLong const* shape, Nd4jLong const* inputOffsets, T* norm2Buf, Nd4jLong const* norm2shape, T clipNorm) {
    for (int arr = blockIdx.x; arr < numOfSubArrs; arr += gridDim.x) {
        __shared__ T* z;
        __shared__ Nd4jLong len;
        if (threadIdx.x == 0) {
            len = shape::length(shape);
            z = inputBuffer + inputOffsets[arr];
        }
        __syncthreads();
        for (int j = threadIdx.x; j < len; j += blockDim.x) {
            auto xIndex = shape::getIndexOffset(j, shape);

            if(norm2Buf[arr] > clipNorm)
                z[xIndex] *= clipNorm / norm2Buf[arr]; // case with ews = 1 and ordering is 'c'
        }
    }
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T>
static __global__ void clipByNormKernel(Nd4jLong numOfSubArrs, T* inputBuffer, Nd4jLong const* shape, Nd4jLong const* inputOffsets, T* outputBuffer, Nd4jLong const* outputShape, Nd4jLong const* outputOffsets, T* norm2Buf, Nd4jLong const* norm2shape, T clipNorm) {

    for (Nd4jLong arr = blockIdx.x; arr < numOfSubArrs; arr += gridDim.x) {
        __shared__ T *x, *z;
        __shared__ Nd4jLong lenZ;
        __shared__ T norm2;

        if (threadIdx.x == 0) {
            x = inputBuffer + inputOffsets[arr];
            z = outputBuffer + outputOffsets[arr];
            lenZ = shape::length(outputShape);
            norm2 = norm2Buf[shape::getIndexOffset(arr, norm2shape)];
        }
        __syncthreads();
        for (Nd4jLong j = threadIdx.x; j < lenZ; j += blockDim.x) {
            auto xIndex = shape::getIndexOffset(j, shape);
            auto zIndex = shape::getIndexOffset(j, outputShape);
            if(norm2 > clipNorm) {
                z[zIndex] = x[xIndex] * clipNorm / norm2; // case with ews = 1 and ordering is 'c'
            } else {
                z[zIndex] = x[xIndex];
            }
            //printf("%lld: %lf %lf\n", j, z[zIndex], x[xIndex]);
        }
        __syncthreads();
    }
}

//////////////////////////////////////////////////////////////////////////
template<typename T>
static void clipByNorm_(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, NDArray const& clipNormA, const bool isInplace) {
    const int rank = input.rankOf();
    auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions);
    clipNormA.syncToHost();
    //norm2.printBuffer("Norm2");
    T const clipNorm = clipNormA.e<T>(0);
    //clipNormA.printBuffer("ClipNorm");
    auto stream = context->getCudaStream();
    if (isInplace) {
        if(norm2.lengthOf() == 1) {
            norm2.syncToHost();
            T norm2Val = norm2.e<T>(0);
            if(norm2Val > clipNorm)
                input *= clipNorm / norm2Val;
        }
        else {

            std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(rank, dimensions);
            const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input.shapeInfo(), dimsToExclude);
            auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.shapeInfo(), dimensions);
            //auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.shapeInfo(), dimsToExclude);
            T* inputBuffer = reinterpret_cast<T*>(input.specialBuffer());
            T* norm2buf = reinterpret_cast<T*>(norm2.specialBuffer());

            clipByNormInplaceKernel<T><<<256, 512, 1024, *stream>>>(numOfSubArrs, inputBuffer, packX.specialShapeInfo(), packX.specialOffsets(), norm2buf, norm2.specialShapeInfo(), clipNorm);
        }
    }
    else {

        if(norm2.lengthOf() == 1) {
            norm2.syncToHost();
            T norm2Val = norm2.e<T>(0);

            if(norm2Val > clipNorm)
                output.assign( input * (clipNorm / norm2Val));
            else
                output.assign( input );
        }
        else {

            std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(rank, dimensions);
            const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input.shapeInfo(), dimsToExclude);
            auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.shapeInfo(), dimensions);
            auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.shapeInfo(), dimensions);
            T* inputBuffer = reinterpret_cast<T*>(input.specialBuffer());
            T* norm2buf = reinterpret_cast<T*>(norm2.specialBuffer());
            T* outputBuffer = reinterpret_cast<T*>(output.specialBuffer());

            clipByNormKernel<T><<<256, 512, 1024, *stream>>>(numOfSubArrs, inputBuffer, packX.specialShapeInfo(), packX.specialOffsets(), outputBuffer, packZ.specialShapeInfo(), packZ.specialOffsets(), norm2buf, norm2.specialShapeInfo(), clipNorm);
        }
    }
}

void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {
    BUILD_SINGLE_SELECTOR(output.dataType(), clipByNorm_, (context, input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES);
}

BUILD_SINGLE_TEMPLATE(template void clipByNorm_, (sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace), FLOAT_TYPES);

template <typename T>
void clipByGlobalNorm_(sd::LaunchContext * context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace) {
    NDArray globalNorm = NDArrayFactory::create<T>(0, inputs[0]->getContext()); // sqrt(sum([l2norm(t)**2 for t in t_list]))

    for (auto i = 0; i < inputs.size(); i++) {
        auto input = inputs[i];
        auto l2norm = input->reduceNumber(reduce::Norm2);
        globalNorm += l2norm * l2norm;
    }

    globalNorm.applyTransform(transform::Sqrt, globalNorm); // = sd::math::nd4j_sqrt(globalNorm);
    outputs[inputs.size()]->p(0, globalNorm);
    globalNorm.syncToHost();
    const T factor = static_cast<T>(clipNorm) / globalNorm.e<T>(0);

    for (size_t e = 0; e < inputs.size(); e++) {
        // all-reduce
        auto input = inputs[e];
        auto output = outputs[e];

        if (globalNorm.e<double>(0) <= clipNorm) {
            output->assign(input);
        }
        else {

            auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
            input->applyLambda(lambda, *output);
        }
    }
}

void clipByGlobalNorm(sd::LaunchContext * context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace) {
    BUILD_SINGLE_SELECTOR(outputs[0]->dataType(), clipByGlobalNorm_, (context, inputs, clipNorm, workspace, outputs, isInplace), FLOAT_TYPES);
}

BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (sd::LaunchContext * context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace), FLOAT_TYPES);
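
The global-norm variant above scales every tensor by the same factor clipNorm / globalNorm, where globalNorm = sqrt(sum of the squared L2 norms of all tensors). A small standalone sketch of that rule, assuming plain std::vector buffers and an illustrative function name:

#include <cmath>
#include <vector>

// hypothetical reference for clip-by-global-norm: all tensors share one scaling factor
static void clipByGlobalNormReference(std::vector<std::vector<float>>& tensors, float clipNorm) {
    float sumSq = 0.f;
    for (const auto& t : tensors)
        for (float v : t)
            sumSq += v * v;

    const float globalNorm = std::sqrt(sumSq);   // sqrt(sum([l2norm(t)**2 for t in t_list]))
    if (globalNorm <= clipNorm)
        return;                                  // norm already within the limit, nothing to scale

    const float factor = clipNorm / globalNorm;
    for (auto& t : tensors)
        for (float& v : t)
            v *= factor;
}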

//////////////////////////////////////////////////////////////////////////
template<typename T>
static void clipByAveraged_(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {
    auto cn = clipNorm.e<T>(0);
    if (dimensions.size() == 0) {
        // all-reduce
        T n2 = input.reduceNumber(reduce::Norm2).e<T>(0) / static_cast<T>(input.lengthOf());
        if (n2 <= cn) {
            if (!isInplace)
                output.assign(input);
        }
        else {
            const T factor = cn / n2;
            //auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
            //input.applyLambda<T>(lambda, output);
            output.assign(input * factor);
        }
    }
    else {
        // along dimension
        auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions, false);
        if (!isInplace)
            output.assign(input);
        auto tads = output.allTensorsAlongDimension(dimensions);
        auto outTads = output.allTensorsAlongDimension(dimensions);
        // TODO: make this CUDA-compliant somehow
        for (int e = 0; e < tads.size(); e++) {
            T n2 = norm2.e<T>(e) / static_cast<T>(tads.at(e)->lengthOf());
            const T factor = cn / n2;
            if (n2 > cn) {
                //auto lambda = LAMBDA_T(_x, factor) {return _x * factor;};
                tads.at(e)->applyScalar(scalar::Multiply, factor, *outTads.at(e)); //applyLambda<T>(lambda, &output);
            }
        }
    }
}

void clipByAveraged(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {
    BUILD_SINGLE_SELECTOR(input.dataType(), clipByAveraged_, (context, input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES);
}

BUILD_SINGLE_TEMPLATE(template void clipByAveraged_, (sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace), FLOAT_TYPES);
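
clipByAveraged_ above applies the threshold to the L2 norm divided by the element count (an averaged norm) rather than to the raw norm. A one-buffer sketch of that rule, with illustrative names and a plain float array standing in for the NDArray:

#include <cmath>
#include <cstddef>

// hypothetical reference for clip-by-averaged-norm: the threshold is compared to norm2 / length
static void clipByAveragedNormReference(float* data, std::size_t len, float clipNorm) {
    float sumSq = 0.f;
    for (std::size_t i = 0; i < len; ++i)
        sumSq += data[i] * data[i];

    const float avgNorm = std::sqrt(sumSq) / static_cast<float>(len);
    if (avgNorm <= clipNorm)
        return;                                   // within the allowed averaged norm

    const float factor = clipNorm / avgNorm;
    for (std::size_t i = 0; i < len; ++i)
        data[i] *= factor;
}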

/*
    if (d1 > params[1])
        return params[1];
    else if (d1 < params[0])
        return params[0];
    else return d1;
*/
template <typename T>
static __global__ void clipByValueKernel(void* input, Nd4jLong const* inputShape, void* output, Nd4jLong const* outputShape, double leftBound, double rightBound) {
    __shared__ T* outputBuf;
    __shared__ T* inputBuf;
    __shared__ Nd4jLong length;
    __shared__ bool linearBuffers;
    if (threadIdx.x == 0) {
        outputBuf = reinterpret_cast<T *>(output);
        inputBuf = reinterpret_cast<T *>(input);
        length = shape::length(inputShape);
        linearBuffers = shape::elementWiseStride(inputShape) == shape::elementWiseStride(outputShape) && shape::elementWiseStride(inputShape) == 1;
    }
    __syncthreads();
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    const auto step = gridDim.x * blockDim.x;

    for (Nd4jLong e = tid; e < length; e += step) {
        if (linearBuffers) {
            if (inputBuf[e] > rightBound) outputBuf[e] = (T) rightBound;
            else if (inputBuf[e] < leftBound) outputBuf[e] = (T) leftBound;
            else outputBuf[e] = inputBuf[e];
        }
        else {
            auto inputOffset = shape::getIndexOffset(e, inputShape);
            auto outputOffset = shape::getIndexOffset(e, outputShape);
            if (inputBuf[inputOffset] > rightBound) outputBuf[outputOffset] = (T) rightBound;
            else if (inputBuf[inputOffset] < leftBound) outputBuf[outputOffset] = (T) leftBound;
            else outputBuf[outputOffset] = inputBuf[inputOffset];
        }
    }
}

template <typename T>
static void clipByValue_(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) {
    auto stream = context->getCudaStream();
    if (!input.isActualOnDeviceSide())
        input.syncToDevice();
    NDArray::prepareSpecialUse({&output}, {&input});
    clipByValueKernel<T><<<256, 512, 8192, *stream>>>(input.specialBuffer(), input.specialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), leftBound, rightBound);
    NDArray::registerSpecialUse({&output}, {&input});
}

void clipByValue(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) {
    BUILD_SINGLE_SELECTOR(input.dataType(), clipByValue_, (context, input, leftBound, rightBound, output), FLOAT_TYPES);
}

BUILD_SINGLE_TEMPLATE(template void clipByValue_, (sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output), FLOAT_TYPES);

}
}
}

@@ -29,7 +29,7 @@ namespace helpers {

    void reverseSequence(sd::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim);

    void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector<int>* intArgs, bool isBackProp);
    void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector<int>* intArgs);

@@ -63,13 +63,13 @@
    void mergeAdd(sd::LaunchContext * context, const std::vector<const NDArray*>& inArrs, NDArray& output);
    void mergeAddBp(sd::LaunchContext* context, const NDArray& gradient, std::vector<NDArray*>& outArrs);

    void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace);
    void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace, const bool useAverage);

    void clipByGlobalNorm(sd::LaunchContext * context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace);

    void clipByNormBP(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector<int>& dimensions, const NDArray& clipNorm);
    void clipByNormBp(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool useAverage);

    void clipByAveraged(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace);
    void clipByAveragedNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace);
    void clipByValue(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output);

    void mirrorPad(sd::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode);

@@ -1093,7 +1093,7 @@ namespace sd {
            return ND4J_STATUS_OK;

        NDArray *a0 = block.array(0);
        for (int e = 0; e < block.width(); e++) {
        for (int e = 1; e < block.width(); e++) {
            auto aV = block.array(e);
            if (!shape::equalsSoft(a0->shapeInfo(), aV->shapeInfo()))
                return ND4J_STATUS_BAD_DIMENSIONS;

@@ -90,13 +90,12 @@ static void batchnormMKLDNN(const NDArray* x, const NDArray* mean, const NDArray
    // x
    dnnl::memory::desc x_mkl_md  = dnnl::memory::desc(dims, type, format);
    dnnl::memory::desc x_user_md = dnnl::memory::desc(dims, type, format);
    mkldnnUtils::setBlockStrides(*x, x_user_md);
    mkldnnUtils::setBlockStrides(x, x_user_md);

    // z, output
    dnnl::memory::desc z_mkl_md  = dnnl::memory::desc(dims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc z_user_md = dnnl::memory::desc(dims, type, format);
    mkldnnUtils::setBlockStrides(*z, z_user_md);
    mkldnnUtils::setBlockStrides(z, z_user_md);

    auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -112,15 +111,10 @@ static void batchnormMKLDNN(const NDArray* x, const NDArray* mean, const NDArray
    // provide memory and check whether reorder is required

    // x
    mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, op_ff_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
    mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, op_ff_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

    // z
    auto z_user_mem = dnnl::memory(z_user_md, engine, z->buffer());
    auto z_user_mem = mkldnnUtils::loadDataToMklStream(*z, engine, stream, z_user_md, op_ff_prim_desc.dst_desc(), args[DNNL_ARG_DST]);
    const bool zReorder = op_ff_prim_desc.dst_desc() != z_user_mem.get_desc();
    auto z_mkl_mem = zReorder ? dnnl::memory(op_ff_prim_desc.dst_desc(), engine) : z_user_mem;
    if (zReorder)
        dnnl::reorder(z_user_mem, z_mkl_mem).execute(stream, z_user_mem, z_mkl_mem);
    args[DNNL_ARG_DST] = z_mkl_mem;

    // mean
    auto mean_mkl_mem = dnnl::memory(op_ff_prim_desc.mean_desc(), engine, const_cast<void*>(mean->buffer()));

@@ -141,8 +135,8 @@ static void batchnormMKLDNN(const NDArray* x, const NDArray* mean, const NDArray
    dnnl::batch_normalization_forward(op_ff_prim_desc).execute(stream, args);

    // reorder outputs if necessary
    if (zReorder)
    if (op_ff_prim_desc.dst_desc() != z_user_mem.get_desc())
        dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
        dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

    stream.wait();

@@ -151,7 +145,7 @@ static void batchnormMKLDNN(const NDArray* x, const NDArray* mean, const NDArray

//////////////////////////////////////////////////////////////////////////
static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const NDArray* variance, const NDArray &dLdO, const NDArray* weights,
static void batchnormBpMKLDNN(const NDArray* x, const NDArray* mean, const NDArray* variance, const NDArray &dLdO, const NDArray* weights,
                              NDArray* dLdI, NDArray* dLdW, const float epsilon, const bool isNCHW) {

    // unfortunately mkl dnn doesn't support any format (dnnl::memory::format_tag::any) for x

@@ -206,20 +200,17 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const
    // x
    dnnl::memory::desc x_mkl_md  = dnnl::memory::desc(dims, type, format);
    dnnl::memory::desc x_user_md = dnnl::memory::desc(dims, type, format);
    mkldnnUtils::setBlockStrides(*x, x_user_md);
    mkldnnUtils::setBlockStrides(x, x_user_md);

    // dLdO
    dnnl::memory::desc dLdO_mkl_md  = dnnl::memory::desc(dims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc dLdO_user_md = dnnl::memory::desc(dims, type, format);
    mkldnnUtils::setBlockStrides(dLdO, dLdO_user_md);
    mkldnnUtils::setBlockStrides(&dLdO, dLdO_user_md);

    // dLdI
    dnnl::memory::desc dLdI_mkl_md  = dnnl::memory::desc(dims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc dLdI_user_md = dnnl::memory::desc(dims, type, format);
    mkldnnUtils::setBlockStrides(*dLdI, dLdI_user_md);
    mkldnnUtils::setBlockStrides(dLdI, dLdI_user_md);

    auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -239,10 +230,10 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const
    // provide memory and check whether reorder is required

    // x
    mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, op_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
    mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, op_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

    // dLdO
    mkldnnUtils::loadDataToMklStream(&dLdO, engine, stream, dLdO_user_md, op_bp_prim_desc.diff_dst_desc(), args[DNNL_ARG_DIFF_DST]);
    mkldnnUtils::loadDataToMklStream(dLdO, engine, stream, dLdO_user_md, op_bp_prim_desc.diff_dst_desc(), args[DNNL_ARG_DIFF_DST]);

    // mean
    auto mean_mkl_mem = dnnl::memory(op_bp_prim_desc.mean_desc(), engine, const_cast<void*>(mean->buffer()));

@@ -253,10 +244,7 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const
    args[DNNL_ARG_VARIANCE] = var_mkl_mem;

    // dLdI
    auto dLdI_user_mem = dnnl::memory(dLdI_user_md, engine, dLdI->buffer());
    auto dLdI_user_mem = mkldnnUtils::loadDataToMklStream(*dLdI, engine, stream, dLdI_user_md, op_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);
    const bool dLdIReorder = op_bp_prim_desc.diff_src_desc() != dLdI_user_mem.get_desc();
    auto dLdI_mkl_mem = dLdIReorder ? dnnl::memory(op_bp_prim_desc.diff_src_desc(), engine) : dLdI_user_mem;
    args[DNNL_ARG_DIFF_SRC] = dLdI_mkl_mem;

    // gamma and beta (and their gradients) if they are present
    if(weights != nullptr) {

@@ -272,8 +260,8 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const
    dnnl::batch_normalization_backward(op_bp_prim_desc).execute(stream, args);

    // reorder outputs if necessary
    if (dLdIReorder)
    if (op_bp_prim_desc.diff_src_desc() != dLdI_user_mem.get_desc())
        dnnl::reorder(dLdI_mkl_mem, dLdI_user_mem).execute(stream, dLdI_mkl_mem, dLdI_user_mem);
        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], dLdI_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], dLdI_user_mem);

    stream.wait();

@@ -662,9 +650,9 @@ PLATFORM_IMPL(batchnorm_bp, ENGINE_CPU) {
    const bool isNCHW = !(axes[0] == inRank - 1 && inRank > 2);

    if (shape::strideDescendingCAscendingF(dLdO->shapeInfo()))
        batchnormBackPropMKLDNN(input, mean, variance, *dLdO, weights, dLdI, dLdW, epsilon, isNCHW);
        batchnormBpMKLDNN(input, mean, variance, *dLdO, weights, dLdI, dLdW, epsilon, isNCHW);
    else
        batchnormBackPropMKLDNN(input, mean, variance, dLdO->dup(), weights, dLdI, dLdW, epsilon, isNCHW);
        batchnormBpMKLDNN(input, mean, variance, dLdO->dup(), weights, dLdI, dLdW, epsilon, isNCHW);

    *dLdM = 0;
    *dLdV = 0;
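
The lines removed above keep collapsing the same boilerplate: create a user-side dnnl::memory over the array's buffer, compare its descriptor with the one the primitive wants, allocate an intermediate memory and register it in args when they differ, then reorder back after execution. As an assumption about what the extended loadDataToMklStream call now covers for outputs (a sketch of the pattern, not the helper's actual source):

#include <dnnl.hpp>

// sketch: prepare a user-side memory for an output buffer; if the primitive prefers another
// layout, register a temporary memory in the primitive's argument slot instead, so the caller
// can reorder from argSlot back into the returned user memory once the primitive has run
static dnnl::memory prepareOutputMemory(void* userBuffer,
                                        const dnnl::memory::desc& userMd,
                                        const dnnl::memory::desc& primMd,
                                        const dnnl::engine& engine,
                                        dnnl::memory& argSlot) {
    dnnl::memory userMem(userMd, engine, userBuffer);
    argSlot = (primMd != userMd) ? dnnl::memory(primMd, engine) : userMem;
    return userMem;
}

// after execution the caller checks the descriptors again and reorders if needed:
//   if (primMd != userMem.get_desc())
//       dnnl::reorder(argSlot, userMem).execute(stream, argSlot, userMem);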

@@ -0,0 +1,186 @@
/*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
// @author Yurii Shyrma (iuriish@yahoo.com)
//

#include <ops/declarable/PlatformHelper.h>
#include <ops/declarable/OpRegistrator.h>
#include <system/platform_boilerplate.h>

#include <helpers/MKLDNNStream.h>
#include "mkldnnUtils.h"
#include <numeric>


namespace sd {
namespace ops {
namespace platforms {


//////////////////////////////////////////////////////////////////////////
static void concatMKLDNN(const std::vector<const NDArray*>& inArrs, NDArray& output, const int axis) {

    // data type
    dnnl::memory::data_type type;
    if(output.dataType() == DataType::FLOAT32)
        type = dnnl::memory::data_type::f32;
    else if(output.dataType() == DataType::HALF)
        type = dnnl::memory::data_type::f16;
    else if(output.dataType() == DataType::BFLOAT16)
        type = dnnl::memory::data_type::bf16;
    else if(output.dataType() == DataType::UINT8)
        type = dnnl::memory::data_type::u8;
    else
        type = dnnl::memory::data_type::s8;

    std::vector<dnnl::memory::desc> x_user_md(inArrs.size()), x_mkl_md(inArrs.size());

    // inputs
    for (int i = 0; i < inArrs.size(); ++i) {

        dnnl::memory::dims dims = inArrs[i]->getShapeAsFlatVector();
        x_user_md[i] = x_mkl_md[i] = dnnl::memory::desc(dims, type, mkldnnUtils::getFormat(*inArrs[i]));
        mkldnnUtils::setBlockStrides(*inArrs[i], x_user_md[i]);
    }

    // output
    dnnl::memory::dims dims = output.getShapeAsFlatVector();
    dnnl::memory::desc z_mkl_md  = dnnl::memory::desc(dims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc z_user_md = dnnl::memory::desc(dims, type, mkldnnUtils::getFormat(output));
    mkldnnUtils::setBlockStrides(output, z_user_md);

    std::unordered_map<int, dnnl::memory> args;

    auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

    dnnl::concat::primitive_desc op_prim_desc(axis, x_mkl_md, engine);

    dnnl::stream stream(engine);

    // inputs
    for (int i = 0; i < inArrs.size(); ++i)
        mkldnnUtils::loadDataToMklStream(*inArrs[i], engine, stream, x_user_md[i], op_prim_desc.src_desc(i), args[DNNL_ARG_MULTIPLE_SRC + i]);

    // outputs
    auto z_user_mem = mkldnnUtils::loadDataToMklStream(output, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

    // primitive execution
    dnnl::concat(op_prim_desc).execute(stream, args);

    // reorder output if necessary
    if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
        dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

    stream.wait();
}

//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(concat, ENGINE_CPU) {

    REQUIRE_TRUE(block.width() > 0, 0, "CONCAT MKLDNN op: No input arrays were provided");

    const bool isAxisInLastArr = block.getBArguments()->size() == 0 ? false : B_ARG(0);

    const int numOfInArrs = isAxisInLastArr ? block.width() - 1 : block.width();

    // first of all take into account possible presence of empty arrays
    // also if scalar is present -> copy its value to vector with length=1
    std::vector<const NDArray*> nonEmptyArrs;
    std::vector<int> arrsToDelete;
    int index = 0;
    bool allOfSameType = true;
    auto rankOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0;
    auto typeOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType();

    for(int i = 0; i < numOfInArrs; ++i) {
        auto input = INPUT_VARIABLE(i);
        auto currentRank = input->rankOf();

        if(!input->isEmpty()) {

            allOfSameType &= (typeOfFirstArr == input->dataType());

            if(input->rankOf() == 0) {
                auto vec = new NDArray('c', {1}, input->dataType(), block.launchContext());
                vec->assign(input);
                nonEmptyArrs.push_back(vec);
                arrsToDelete.push_back(index);
            }
            else {
                nonEmptyArrs.push_back(input);
            }
            ++index;
        }
    }

    const int numOfNonEmptyArrs = nonEmptyArrs.size();

    if(numOfNonEmptyArrs == 0){
        // all inputs are empty arrays -> return empty, mainly for TF import compatibility (no op)
        REQUIRE_TRUE(OUTPUT_VARIABLE(0)->isEmpty(), 0, "CONCAT MKLDNN op: If all input variables are empty, output must be empty");
        return Status::OK();
    }

    const int rank = nonEmptyArrs[0]->rankOf();                             // look up to first non-empty array
    int axis = isAxisInLastArr ? INPUT_VARIABLE(block.width() - 1)->e<int>(0) : INT_ARG(0);
    if(axis < 0){
        axis += rank;
    }

    // ******** input validation ******** //
    REQUIRE_TRUE(allOfSameType, 0, "CONCAT MKLDNN op: all of input arrays must have same type !");
    REQUIRE_TRUE(nonEmptyArrs[0]->dataType() == OUTPUT_VARIABLE(0)->dataType(), 0, "CONCAT MKLDNN op: output array should have the same type as inputs arrays !");
    REQUIRE_TRUE(0 <= axis && (axis < rank || (axis == 0 && rank == 0)), 0, "CONCAT MKLDNN op: input axis must be in range [0, %i], but got %i instead!", rank-1, axis);

    for(int i = 1; i < numOfNonEmptyArrs; ++i)
        REQUIRE_TRUE(nonEmptyArrs[i]->rankOf() == rank, 0, "CONCAT MKLDNN op: all input arrays must have the same rank !");

    for(int i = 1; i < numOfNonEmptyArrs; ++i) {
        for(int dim = 0; dim < rank; ++dim)
            if(dim != axis)
                REQUIRE_TRUE(nonEmptyArrs[i]->sizeAt(dim) == nonEmptyArrs[0]->sizeAt(dim), 0, "CONCAT MKLDNN op: all input arrays must have the same dimensions (except those on input axis) !");
    }
    // ******** end of input validation ******** //

    auto output = OUTPUT_VARIABLE(0);

    if(numOfNonEmptyArrs == 1)
        output->assign(nonEmptyArrs[0]);
    else
        concatMKLDNN(nonEmptyArrs, *output, axis);

    // delete dynamically allocated vectors with length=1
    for(int index : arrsToDelete)
        delete nonEmptyArrs[index];

    return Status::OK();
}

//////////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(concat, ENGINE_CPU) {

    auto z = OUTPUT_VARIABLE(0);

    const auto zType = z->dataType();

    return z->rankOf() < 7 && (zType==DataType::FLOAT32 || zType==DataType::HALF || zType==DataType::BFLOAT16 || zType==DataType::UINT8 || zType==DataType::INT8);
}

}
}
}
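
The validation in PLATFORM_IMPL(concat, ...) above reduces to: normalize a negative axis by the rank of the first non-empty input, then require every other non-empty input to match it on all dimensions except the concatenation axis. A compact standalone sketch of that check (illustrative types and name, not the op's code):

#include <cstddef>
#include <vector>

// hypothetical shape-validation helper mirroring the checks performed by the concat op above
static bool concatShapesCompatible(const std::vector<std::vector<long long>>& shapes, int axis) {
    if (shapes.empty())
        return false;

    const int rank = static_cast<int>(shapes[0].size());
    if (axis < 0)
        axis += rank;                       // negative axis counts from the end
    if (axis < 0 || axis >= rank)
        return false;

    for (std::size_t i = 1; i < shapes.size(); ++i) {
        if (static_cast<int>(shapes[i].size()) != rank)
            return false;                   // ranks must match
        for (int dim = 0; dim < rank; ++dim)
            if (dim != axis && shapes[i][dim] != shapes[0][dim])
                return false;               // all dims equal except along the concat axis
    }
    return true;
}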

@@ -62,33 +62,23 @@ static void conv2dMKLDNN(const NDArray *input, const NDArray *weights,

    auto type = dnnl::memory::data_type::f32;

    std::vector<int> permut;
    if(0 == wFormat)
        permut = {3,2,0,1};                             // [kH, kW, iC, oC] -> [oC, iC, kH, kW]
    else if(2 == wFormat)
        permut = {0,3,1,2};                             // [oC, kH, kW, iC] -> [oC, iC, kH, kW]

    // memory descriptors for arrays

    // input
    dnnl::memory::desc x_mkl_md  = dnnl::memory::desc(xDims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, type, xzFormatMkl);
    mkldnnUtils::setBlockStrides(input, x_user_md);
    mkldnnUtils::setBlockStrides(*input, x_user_md);

    // weights
    dnnl::memory::desc w_mkl_md  = dnnl::memory::desc(wDims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, type, wFormatMkl);
    if(weights->ews() != 1 || weights->ordering() != 'c' || 1 != wFormat) {
    mkldnnUtils::setBlockStrides(*weights, w_user_md, permut);
        w_user_md.data.format_kind = dnnl_blocked;      // overrides format
        uint i0, i1, i2, i3;
        if(0 == wFormat) {
            i0 = 3; i1 = 2; i2 = 0; i3 = 1;             // [kH, kW, iC, oC] -> [oC, iC, kH, kW]
        }
        else if(1 == wFormat) {
            i0 = 0; i1 = 1; i2 = 2; i3 = 3;
        }
        else {
            i0 = 0; i1 = 3; i2 = 1; i3 = 2;             // [oC, kH, kW, iC] -> [oC, iC, kH, kW]
        }
        w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(i0);
        w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(i1);
        w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(i2);
        w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(i3);
    }

    // bias
    dnnl::memory::desc b_mkl_md;

@@ -98,7 +88,7 @@ static void conv2dMKLDNN(const NDArray *input, const NDArray *weights,
    // output
    dnnl::memory::desc z_mkl_md  = dnnl::memory::desc(zDims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc z_user_md = dnnl::memory::desc(zDims, type, xzFormatMkl);
    mkldnnUtils::setBlockStrides(output, z_user_md);
    mkldnnUtils::setBlockStrides(*output, z_user_md);

    auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -114,10 +104,10 @@ static void conv2dMKLDNN(const NDArray *input, const NDArray *weights,
    // provide memory buffers and check whether reorder is required

    // input
    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

    // weights
    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

    // bias
    if(bias != nullptr) {

@@ -126,17 +116,14 @@ static void conv2dMKLDNN(const NDArray *input, const NDArray *weights,
    }

    // output
    auto z_user_mem = dnnl::memory(z_user_md, engine, output->buffer());
    auto z_user_mem = mkldnnUtils::loadDataToMklStream(*output, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);
    const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
    auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
    args[DNNL_ARG_DST] = z_mkl_mem;

    // run calculations
    dnnl::convolution_forward(op_prim_desc).execute(stream, args);

    // reorder outputs if necessary
    if (zReorder)
    if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
        dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
        dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

    stream.wait();
    // shape::printArray(z_mkl_mem.map_data<float>(),8);

@@ -170,64 +157,38 @@ static void conv2dBpMKLDNN(const NDArray *input, const NDArray *weights, const N

    auto type = dnnl::memory::data_type::f32;

    std::vector<int> permut;
    if(0 == wFormat)
        permut = {3,2,0,1};                             // [kH, kW, iC, oC] -> [oC, iC, kH, kW]
    else if(2 == wFormat)
        permut = {0,3,1,2};                             // [oC, kH, kW, iC] -> [oC, iC, kH, kW]

    // memory descriptors for arrays

    // input
    dnnl::memory::desc x_mkl_md  = dnnl::memory::desc(xDims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, type, xzFormatMkl);
    mkldnnUtils::setBlockStrides(input, x_user_md);
    mkldnnUtils::setBlockStrides(*input, x_user_md);

    // weights
    dnnl::memory::desc w_mkl_md  = dnnl::memory::desc(wDims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, type, wFormatMkl);
    if(weights->ews() != 1 || weights->ordering() != 'c' || 1 != wFormat) {
    mkldnnUtils::setBlockStrides(*weights, w_user_md, permut);
        w_user_md.data.format_kind = dnnl_blocked;      // overrides format
        uint i0, i1, i2, i3;
        if(0 == wFormat) {
            i0 = 3; i1 = 2; i2 = 0; i3 = 1;             // [kH, kW, iC, oC] -> [oC, iC, kH, kW]
        }
        else if(1 == wFormat) {
            i0 = 0; i1 = 1; i2 = 2; i3 = 3;
        }
        else {
            i0 = 0; i1 = 3; i2 = 1; i3 = 2;             // [oC, kH, kW, iC] -> [oC, iC, kH, kW]
        }
        w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(i0);
        w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(i1);
        w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(i2);
        w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(i3);
    }

    // gradO
    dnnl::memory::desc gradO_mkl_md  = dnnl::memory::desc(zDims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc gradO_user_md = dnnl::memory::desc(zDims, type, xzFormatMkl);
    mkldnnUtils::setBlockStrides(gradO, gradO_user_md);
    mkldnnUtils::setBlockStrides(*gradO, gradO_user_md);

    // gradI
    dnnl::memory::desc gradI_mkl_md  = dnnl::memory::desc(xDims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc gradI_user_md = dnnl::memory::desc(xDims, type, xzFormatMkl);
    mkldnnUtils::setBlockStrides(gradI, gradI_user_md);
    mkldnnUtils::setBlockStrides(*gradI, gradI_user_md);

    // gradW
    dnnl::memory::desc gradW_mkl_md  = dnnl::memory::desc(wDims, type, dnnl::memory::format_tag::any);
    dnnl::memory::desc gradW_user_md = dnnl::memory::desc(wDims, type, wFormatMkl);
    if(gradW->ews() != 1 || gradW->ordering() != 'c' || 1 != wFormat) {
    mkldnnUtils::setBlockStrides(*gradW, gradW_user_md, permut);
        gradW_user_md.data.format_kind = dnnl_blocked;  // overrides format
        uint i0, i1, i2, i3;
        if(0 == wFormat) {
            i0 = 3; i1 = 2; i2 = 0; i3 = 1;             // [kH, kW, iC, oC] -> [oC, iC, kH, kW]
        }
        else if(1 == wFormat) {
            i0 = 0; i1 = 1; i2 = 2; i3 = 3;
        }
        else {
            i0 = 0; i1 = 3; i2 = 1; i3 = 2;             // [oC, kH, kW, iC] -> [oC, iC, kH, kW]
        }
        gradW_user_md.data.format_desc.blocking.strides[0] = gradW->strideAt(i0);
        gradW_user_md.data.format_desc.blocking.strides[1] = gradW->strideAt(i1);
        gradW_user_md.data.format_desc.blocking.strides[2] = gradW->strideAt(i2);
        gradW_user_md.data.format_desc.blocking.strides[3] = gradW->strideAt(i3);
    }

    // gradB
    dnnl::memory::desc gradB_mkl_md;

@@ -256,10 +217,10 @@ static void conv2dBpMKLDNN(const NDArray *input, const NDArray *weights, const N
    // provide memory buffers and check whether reorder is required

    // input
    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

    // weights
    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

    // gradO
    auto gradO_user_mem = dnnl::memory(gradO_user_md, engine, const_cast<void*>(gradO->buffer()));

@@ -274,16 +235,10 @@ static void conv2dBpMKLDNN(const NDArray *input, const NDArray *weights, const N
    args[DNNL_ARG_DIFF_DST] = gradO_mkl_memD;

    // gradI
    auto gradI_user_mem = dnnl::memory(gradI_user_md, engine, gradI->buffer());
    auto gradI_user_mem = mkldnnUtils::loadDataToMklStream(*gradI, engine, stream, gradI_user_md, op_data_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);
    const bool gradIReorder = op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc();
    auto gradI_mkl_mem = gradIReorder ? dnnl::memory(op_data_bp_prim_desc.diff_src_desc(), engine) : gradI_user_mem;
    args[DNNL_ARG_DIFF_SRC] = gradI_mkl_mem;

    // gradW
    auto gradW_user_mem = dnnl::memory(gradW_user_md, engine, gradW->buffer());
    auto gradW_user_mem = mkldnnUtils::loadDataToMklStream(*gradW, engine, stream, gradW_user_md, op_weights_bp_prim_desc.diff_weights_desc(), args[DNNL_ARG_DIFF_WEIGHTS]);
    const bool gradWReorder = op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc();
    auto gradW_mkl_mem = gradWReorder ? dnnl::memory(op_weights_bp_prim_desc.diff_weights_desc(), engine) : gradW_user_mem;
    args[DNNL_ARG_DIFF_WEIGHTS] = gradW_mkl_mem;

    // gradB
    if(gradB != nullptr) {

@@ -301,10 +256,10 @@ static void conv2dBpMKLDNN(const NDArray *input, const NDArray *weights, const N
    dnnl::convolution_backward_weights(op_weights_bp_prim_desc).execute(stream, args);

    // reorder gradI if necessary
    if (gradIReorder)
    if (op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc())
        dnnl::reorder(gradI_mkl_mem, gradI_user_mem).execute(stream, gradI_mkl_mem, gradI_user_mem);
        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], gradI_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], gradI_user_mem);
    if (gradWReorder)
    if (op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc())
        dnnl::reorder(gradW_mkl_mem, gradW_user_mem).execute(stream, gradW_mkl_mem, gradW_user_mem);
        dnnl::reorder(args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem).execute(stream, args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem);

    stream.wait();
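
Both the removed inline stride blocks and the new setBlockStrides(..., permut) calls in the conv2d hunks above express the same idea: the oneDNN weights descriptor is always laid out as [oC, iC, kH, kW], so when the array is stored as [kH, kW, iC, oC] (wFormat 0) or [oC, kH, kW, iC] (wFormat 2) its strides must be read through a permutation. A small sketch of that stride gathering, with illustrative names:

#include <cstddef>
#include <vector>

// hypothetical helper: gather strides of a weights array into the [oC, iC, kH, kW] order;
// perm[d] names the source dimension that supplies descriptor dimension d,
// e.g. {3, 2, 0, 1} maps [kH, kW, iC, oC] -> [oC, iC, kH, kW]
static std::vector<long long> permuteStrides(const std::vector<long long>& srcStrides,
                                             const std::vector<int>& perm) {
    std::vector<long long> dstStrides(perm.size());
    for (std::size_t d = 0; d < perm.size(); ++d)
        dstStrides[d] = srcStrides[perm[d]];
    return dstStrides;
}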
||||||
|
|
|
@ -63,6 +63,12 @@ static void conv3dMKLDNN(const NDArray *input, const NDArray *weights,
|
||||||
dnnl::memory::dims wDims = {oC, iC, kD, kH, kW};
|
dnnl::memory::dims wDims = {oC, iC, kD, kH, kW};
|
||||||
dnnl::memory::dims zDims = {bS, oC, oD, oH, oW};
|
dnnl::memory::dims zDims = {bS, oC, oD, oH, oW};
|
||||||
|
|
||||||
|
std::vector<int> permut;
|
||||||
|
if(0 == wFormat)
|
||||||
|
permut = {4,3,0,1,2}; // [kD, kH, kW, iC, oC] -> [oC, iC, kD, kH, kW]
|
||||||
|
else if(2 == wFormat)
|
||||||
|
permut = {0,4,1,2,3}; // [oC, kD, kH, kW, iC] -> [oC, iC, kD, kH, kW]
|
||||||
|
|
||||||
auto type = dnnl::memory::data_type::f32;
|
auto type = dnnl::memory::data_type::f32;
|
||||||
|
|
||||||
// memory descriptors for arrays
|
// memory descriptors for arrays
|
||||||
@@ -70,29 +76,12 @@ static void conv3dMKLDNN(const NDArray *input, const NDArray *weights,
     // input
     dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, type, dnnl::memory::format_tag::any);
     dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, type, xzFormatMkl);
-    mkldnnUtils::setBlockStrides(input, x_user_md);
+    mkldnnUtils::setBlockStrides(*input, x_user_md);

     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, type, dnnl::memory::format_tag::any);
     dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, type, wFormatMkl);
-    if(weights->ews() != 1 || weights->ordering() != 'c' || 1 != wFormat) {
-        w_user_md.data.format_kind = dnnl_blocked; // overrides format
-        uint i0, i1, i2, i3, i4;
-        if(0 == wFormat) {
-            i0 = 4; i1 = 3; i2 = 0; i3 = 1; i4 = 2; // [kD, kH, kW, iC, oC] -> [oC, iC, kD, kH, kW]
-        }
-        else if(1 == wFormat) {
-            i0 = 0; i1 = 1; i2 = 2; i3 = 3; i4 = 4;
-        }
-        else {
-            i0 = 0; i1 = 4; i2 = 1; i3 = 2; i4 = 3; // [oC, kD, kH, kW, iC] -> [oC, iC, kD, kH, kW]
-        }
-        w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(i0);
-        w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(i1);
-        w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(i2);
-        w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(i3);
-        w_user_md.data.format_desc.blocking.strides[4] = weights->strideAt(i4);
-    }
+    mkldnnUtils::setBlockStrides(*weights, w_user_md, permut);

     // bias
     dnnl::memory::desc b_mkl_md;
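The real setBlockStrides lives in mkldnnUtils and is not part of this diff; judging only from the call sites above and from the removed code it replaces, it presumably marks the descriptor as blocked and copies the array strides, optionally permuted. A rough sketch under that assumption, with a template parameter standing in for sd::NDArray so the snippet stays self-contained:

    #include <vector>
    #include <dnnl.hpp>

    // Sketch only: mirrors what the removed manual code did at every call site above.
    template <typename Array>   // any type exposing rankOf() and strideAt(int), e.g. sd::NDArray
    void setBlockStridesSketch(const Array& arr, dnnl::memory::desc& md,
                               const std::vector<int>& permut = {}) {
        md.data.format_kind = dnnl_blocked;                                   // overrides format
        for (int i = 0; i < arr.rankOf(); ++i) {
            const int userAxis = permut.empty() ? i : permut[i];              // identity when no permutation given
            md.data.format_desc.blocking.strides[i] = arr.strideAt(userAxis);
        }
    }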
@@ -102,7 +91,7 @@ static void conv3dMKLDNN(const NDArray *input, const NDArray *weights,
     // output
     dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zDims, type, dnnl::memory::format_tag::any);
     dnnl::memory::desc z_user_md = dnnl::memory::desc(zDims, type, xzFormatMkl);
-    mkldnnUtils::setBlockStrides(output, z_user_md);
+    mkldnnUtils::setBlockStrides(*output, z_user_md);

     auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -118,10 +107,10 @@ static void conv3dMKLDNN(const NDArray *input, const NDArray *weights,
     // provide memory buffers and check whether reorder is required

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // bias
     if(bias != nullptr) {
@@ -130,17 +119,14 @@ static void conv3dMKLDNN(const NDArray *input, const NDArray *weights,
     }

     // output
-    auto z_user_mem = dnnl::memory(z_user_md, engine, output->buffer());
-    const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
-    auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
-    args[DNNL_ARG_DST] = z_mkl_mem;
+    auto z_user_mem = mkldnnUtils::loadDataToMklStream(*output, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

     // run calculations
     dnnl::convolution_forward(op_prim_desc).execute(stream, args);

     // reorder outputs if necessary
-    if (zReorder)
-        dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+    if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

     stream.wait();
 }
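loadDataToMklStream is likewise an mkldnnUtils helper whose body is outside this diff. From the way it is now also used for the destination (it returns the user memory and fills the args slot), a plausible reading is sketched below; the names and exact signature are assumptions based solely on the calls above.

    #include <dnnl.hpp>

    // Sketch only: wrap the array's buffer as user memory and register either it or a
    // primitive-layout buffer under the given args slot; the caller keeps the returned
    // user memory so it can reorder the result back after execution if layouts differ.
    template <typename Array>   // stands in for sd::NDArray
    dnnl::memory loadDataToMklStreamSketch(Array& arr, const dnnl::engine& engine,
                                           const dnnl::stream& stream,
                                           const dnnl::memory::desc& userMd,
                                           const dnnl::memory::desc& primMd,
                                           dnnl::memory& argSlot) {
        auto userMem = dnnl::memory(userMd, engine, arr.buffer());
        const bool differentLayout = primMd != userMem.get_desc();
        argSlot = differentLayout ? dnnl::memory(primMd, engine) : userMem;
        (void) stream;   // for a source argument the real helper would reorder data into argSlot here
        return userMem;
    }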
@@ -177,68 +163,40 @@ static void conv3dBpMKLDNN(const NDArray *input, const NDArray *weights, const N

     auto type = dnnl::memory::data_type::f32;

+    std::vector<int> permut;
+    if(0 == wFormat)
+        permut = {4,3,0,1,2}; // [kD, kH, kW, iC, oC] -> [oC, iC, kD, kH, kW]
+    else if(2 == wFormat)
+        permut = {0,4,1,2,3}; // [oC, kD, kH, kW, iC] -> [oC, iC, kD, kH, kW]
+
     // memory descriptors for arrays

     // input
     dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, type, dnnl::memory::format_tag::any);
     dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, type, xzFormatMkl);
-    mkldnnUtils::setBlockStrides(input, x_user_md);
+    mkldnnUtils::setBlockStrides(*input, x_user_md);

     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, type, dnnl::memory::format_tag::any);
     dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, type, wFormatMkl);
-    if(weights->ews() != 1 || weights->ordering() != 'c' || 1 != wFormat) {
-        w_user_md.data.format_kind = dnnl_blocked; // overrides format
-        uint i0, i1, i2, i3, i4;
-        if(0 == wFormat) {
-            i0 = 4; i1 = 3; i2 = 0; i3 = 1; i4 = 2; // [kD, kH, kW, iC, oC] -> [oC, iC, kD, kH, kW]
-        }
-        else if(1 == wFormat) {
-            i0 = 0; i1 = 1; i2 = 2; i3 = 3; i4 = 4;
-        }
-        else {
-            i0 = 0; i1 = 4; i2 = 1; i3 = 2; i4 = 3; // [oC, kD, kH, kW, iC] -> [oC, iC, kD, kH, kW]
-        }
-        w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(i0);
-        w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(i1);
-        w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(i2);
-        w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(i3);
-        w_user_md.data.format_desc.blocking.strides[4] = weights->strideAt(i4);
-    }
+    mkldnnUtils::setBlockStrides(*weights, w_user_md, permut);

     // gradO
     dnnl::memory::desc gradO_mkl_md = dnnl::memory::desc(zDims, type, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradO_user_md = dnnl::memory::desc(zDims, type, xzFormatMkl);

-    mkldnnUtils::setBlockStrides(gradO, gradO_user_md);
+    mkldnnUtils::setBlockStrides(*gradO, gradO_user_md);

     // gradI
     dnnl::memory::desc gradI_mkl_md = dnnl::memory::desc(xDims, type, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradI_user_md = dnnl::memory::desc(xDims, type, xzFormatMkl);

-    mkldnnUtils::setBlockStrides(gradI, gradI_user_md);
+    mkldnnUtils::setBlockStrides(*gradI, gradI_user_md);

     // gradW
     dnnl::memory::desc gradW_mkl_md = dnnl::memory::desc(wDims, type, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradW_user_md = dnnl::memory::desc(wDims, type, wFormatMkl);
-    if(gradW->ews() != 1 || gradW->ordering() != 'c' || 1 != wFormat) {
-        gradW_user_md.data.format_kind = dnnl_blocked; // overrides format
-        uint i0, i1, i2, i3, i4;
-        if(0 == wFormat) {
-            i0 = 4; i1 = 3; i2 = 0; i3 = 1; i4 = 2; // [kD, kH, kW, iC, oC] -> [oC, iC, kD, kH, kW]
-        }
-        else if(1 == wFormat) {
-            i0 = 0; i1 = 1; i2 = 2; i3 = 3; i4 = 4;
-        }
-        else {
-            i0 = 0; i1 = 4; i2 = 1; i3 = 2; i4 = 3; // [oC, kD, kH, kW, iC] -> [oC, iC, kD, kH, kW]
-        }
-        gradW_user_md.data.format_desc.blocking.strides[0] = gradW->strideAt(i0);
-        gradW_user_md.data.format_desc.blocking.strides[1] = gradW->strideAt(i1);
-        gradW_user_md.data.format_desc.blocking.strides[2] = gradW->strideAt(i2);
-        gradW_user_md.data.format_desc.blocking.strides[3] = gradW->strideAt(i3);
-        gradW_user_md.data.format_desc.blocking.strides[4] = gradW->strideAt(i4);
-    }
+    mkldnnUtils::setBlockStrides(*gradW, gradW_user_md, permut);

     // gradB
     dnnl::memory::desc gradB_mkl_md;
@@ -267,10 +225,10 @@ static void conv3dBpMKLDNN(const NDArray *input, const NDArray *weights, const N
     // provide memory buffers and check whether reorder is required

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // gradO
     auto gradO_user_mem = dnnl::memory(gradO_user_md, engine, const_cast<void*>(gradO->buffer()));
@@ -285,16 +243,10 @@ static void conv3dBpMKLDNN(const NDArray *input, const NDArray *weights, const N
     args[DNNL_ARG_DIFF_DST] = gradO_mkl_memD;

     // gradI
-    auto gradI_user_mem = dnnl::memory(gradI_user_md, engine, gradI->buffer());
-    const bool gradIReorder = op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc();
-    auto gradI_mkl_mem = gradIReorder ? dnnl::memory(op_data_bp_prim_desc.diff_src_desc(), engine) : gradI_user_mem;
-    args[DNNL_ARG_DIFF_SRC] = gradI_mkl_mem;
+    auto gradI_user_mem = mkldnnUtils::loadDataToMklStream(*gradI, engine, stream, gradI_user_md, op_data_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);

     // gradW
-    auto gradW_user_mem = dnnl::memory(gradW_user_md, engine, gradW->buffer());
-    const bool gradWReorder = op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc();
-    auto gradW_mkl_mem = gradWReorder ? dnnl::memory(op_weights_bp_prim_desc.diff_weights_desc(), engine) : gradW_user_mem;
-    args[DNNL_ARG_DIFF_WEIGHTS] = gradW_mkl_mem;
+    auto gradW_user_mem = mkldnnUtils::loadDataToMklStream(*gradW, engine, stream, gradW_user_md, op_weights_bp_prim_desc.diff_weights_desc(), args[DNNL_ARG_DIFF_WEIGHTS]);

     // gradB
     if(gradB != nullptr) {
@@ -312,10 +264,10 @@ static void conv3dBpMKLDNN(const NDArray *input, const NDArray *weights, const N
     dnnl::convolution_backward_weights(op_weights_bp_prim_desc).execute(stream, args);

     // reorder gradI if necessary
-    if (gradIReorder)
-        dnnl::reorder(gradI_mkl_mem, gradI_user_mem).execute(stream, gradI_mkl_mem, gradI_user_mem);
-    if (gradWReorder)
-        dnnl::reorder(gradW_mkl_mem, gradW_user_mem).execute(stream, gradW_mkl_mem, gradW_user_mem);
+    if (op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], gradI_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], gradI_user_mem);
+    if (op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem).execute(stream, args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem);

     stream.wait();
@@ -47,16 +47,13 @@ static void deconv2dMKLDNN(const NDArray* input, const NDArray* weights, const N
     dnnl::memory::dims padding_r = { (iH - 1) * sH - oH + kH - pH, (iW - 1) * sW - oW + kW - pW };
     dnnl::memory::dims dilation = { dH-1, dW-1 };

-    uint i0, i1, i2, i3;
-    if(0 == wFormat) {
-        i0 = 2; i1 = 3; i2 = 0; i3 = 1; // [kH, kW, oC, iC] -> [oC, iC, kH, kW]
-    }
-    else if(1 == wFormat) {
-        i0 = 1; i1 = 0; i2 = 2; i3 = 3; // [iC, oC, kH, kW] -> [oC, iC, kH, kW]
-    }
-    else {
-        i0 = 3; i1 = 0; i2 = 1; i3 = 2; // [iC, kH, kW, oC] -> [oC, iC, kH, kW]
-    }
+    std::vector<int> permut;
+    if(0 == wFormat)
+        permut = {2,3,0,1}; // [kH, kW, oC, iC] -> [oC, iC, kH, kW]
+    else if(1 == wFormat)
+        permut = {1,0,2,3}; // [iC, oC, kH, kW] -> [oC, iC, kH, kW]
+    else
+        permut = {3,0,1,2}; // [iC, kH, kW, oC] -> [oC, iC, kH, kW]

     // input type
     dnnl::memory::data_type xType;
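The dilation dims in this hunk are passed as dH-1 and dW-1; a tiny standalone check of the convention this assumes (oneDNN counts only the extra dilation, so 0 means an ordinary kernel), with made-up numbers:

    #include <cassert>

    int main() {
        const int k = 3, d = 2;                                   // kernel size and framework-style dilation
        const int mklDilation = d - 1;                            // what the code above hands to MKL-DNN
        const int extent = (k - 1) * (mklDilation + 1) + 1;       // effective kernel extent seen by oneDNN
        assert(extent == (k - 1) * d + 1);                        // same extent the framework convention implies (5 here)
        return 0;
    }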
@@ -99,16 +96,12 @@ static void deconv2dMKLDNN(const NDArray* input, const NDArray* weights, const N
     // input
     dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, xType, dnnl::memory::format_tag::any);
     dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, xType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(input, x_user_md);
+    mkldnnUtils::setBlockStrides(*input, x_user_md);

     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, wType, dnnl::memory::format_tag::any);
     dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, wType, wFormatMkl);
-    w_user_md.data.format_kind = dnnl_blocked; // overrides format
-    w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(i0);
-    w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(i1);
-    w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(i2);
-    w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(i3);
+    mkldnnUtils::setBlockStrides(*weights, w_user_md, permut);

     // bias
     dnnl::memory::desc b_mkl_md;
@@ -118,7 +111,7 @@ static void deconv2dMKLDNN(const NDArray* input, const NDArray* weights, const N
     // output
     dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zDims, zType, dnnl::memory::format_tag::any);
     dnnl::memory::desc z_user_md = dnnl::memory::desc(zDims, zType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(output, z_user_md);
+    mkldnnUtils::setBlockStrides(*output, z_user_md);

     auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -135,10 +128,10 @@ static void deconv2dMKLDNN(const NDArray* input, const NDArray* weights, const N
     // provide memory buffers and check whether reorder is required

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // bias
     if(bias != nullptr) {
@@ -147,17 +140,14 @@ static void deconv2dMKLDNN(const NDArray* input, const NDArray* weights, const N
     }

     // output
-    auto z_user_mem = dnnl::memory(z_user_md, engine, output->buffer());
-    const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
-    auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
-    args[DNNL_ARG_DST] = z_mkl_mem;
+    auto z_user_mem = mkldnnUtils::loadDataToMklStream(*output, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

     // run calculations
     dnnl::deconvolution_forward(op_prim_desc).execute(stream, args);

     // reorder outputs if necessary
-    if (zReorder)
-        dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+    if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

     stream.wait();
@@ -180,16 +170,13 @@ static void deconv2dBpMKLDNN(const NDArray* input, const NDArray* weights, const
     dnnl::memory::dims padding_r = { (iH - 1) * sH - oH + kH - pH, (iW - 1) * sW - oW + kW - pW };
     dnnl::memory::dims dilation = { dH-1, dW-1 };

-    uint i0, i1, i2, i3;
-    if(0 == wFormat) {
-        i0 = 2; i1 = 3; i2 = 0; i3 = 1; // [kH, kW, oC, iC] -> [oC, iC, kH, kW]
-    }
-    else if(1 == wFormat) {
-        i0 = 1; i1 = 0; i2 = 2; i3 = 3; // [iC, oC, kH, kW] -> [oC, iC, kH, kW]
-    }
-    else {
-        i0 = 3; i1 = 0; i2 = 1; i3 = 2; // [iC, kH, kW, oC] -> [oC, iC, kH, kW]
-    }
+    std::vector<int> permut;
+    if(0 == wFormat)
+        permut = {2,3,0,1}; // [kH, kW, oC, iC] -> [oC, iC, kH, kW]
+    else if(1 == wFormat)
+        permut = {1,0,2,3}; // [iC, oC, kH, kW] -> [oC, iC, kH, kW]
+    else
+        permut = {3,0,1,2}; // [iC, kH, kW, oC] -> [oC, iC, kH, kW]

     // input type
     dnnl::memory::data_type xType = input->dataType() == DataType::FLOAT32 ? dnnl::memory::data_type::f32 : dnnl::memory::data_type::bf16;
@@ -216,35 +203,27 @@ static void deconv2dBpMKLDNN(const NDArray* input, const NDArray* weights, const
     // input
     dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, xType, dnnl::memory::format_tag::any);
     dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, xType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(input, x_user_md);
+    mkldnnUtils::setBlockStrides(*input, x_user_md);

     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, wType, dnnl::memory::format_tag::any);
     dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, wType, wFormatMkl);
-    w_user_md.data.format_kind = dnnl_blocked; // overrides format
-    w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(i0);
-    w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(i1);
-    w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(i2);
-    w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(i3);
+    mkldnnUtils::setBlockStrides(*weights, w_user_md, permut);

     // gradO
     dnnl::memory::desc gradO_mkl_md = dnnl::memory::desc(zDims, gradOType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradO_user_md = dnnl::memory::desc(zDims, gradOType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(gradO, gradO_user_md);
+    mkldnnUtils::setBlockStrides(*gradO, gradO_user_md);

     // gradI
     dnnl::memory::desc gradI_mkl_md = dnnl::memory::desc(xDims, gradIType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradI_user_md = dnnl::memory::desc(xDims, gradIType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(gradI, gradI_user_md);
+    mkldnnUtils::setBlockStrides(*gradI, gradI_user_md);

     // gradW
     dnnl::memory::desc gradW_mkl_md = dnnl::memory::desc(wDims, gradWType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradW_user_md = dnnl::memory::desc(wDims, gradWType, wFormatMkl);
-    gradW_user_md.data.format_kind = dnnl_blocked; // overrides format
-    gradW_user_md.data.format_desc.blocking.strides[0] = gradW->strideAt(i0);
-    gradW_user_md.data.format_desc.blocking.strides[1] = gradW->strideAt(i1);
-    gradW_user_md.data.format_desc.blocking.strides[2] = gradW->strideAt(i2);
-    gradW_user_md.data.format_desc.blocking.strides[3] = gradW->strideAt(i3);
+    mkldnnUtils::setBlockStrides(*gradW, gradW_user_md, permut);

     // gradB
     dnnl::memory::desc gradB_mkl_md;
@@ -273,10 +252,10 @@ static void deconv2dBpMKLDNN(const NDArray* input, const NDArray* weights, const
     // provide memory buffers and check whether reorder is required

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // gradO
     auto gradO_user_mem = dnnl::memory(gradO_user_md, engine, const_cast<void*>(gradO->buffer()));
@@ -291,16 +270,10 @@ static void deconv2dBpMKLDNN(const NDArray* input, const NDArray* weights, const
     args[DNNL_ARG_DIFF_DST] = gradO_mkl_memD;

     // gradI
-    auto gradI_user_mem = dnnl::memory(gradI_user_md, engine, gradI->buffer());
-    const bool gradIReorder = op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc();
-    auto gradI_mkl_mem = gradIReorder ? dnnl::memory(op_data_bp_prim_desc.diff_src_desc(), engine) : gradI_user_mem;
-    args[DNNL_ARG_DIFF_SRC] = gradI_mkl_mem;
+    auto gradI_user_mem = mkldnnUtils::loadDataToMklStream(*gradI, engine, stream, gradI_user_md, op_data_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);

     // gradW
-    auto gradW_user_mem = dnnl::memory(gradW_user_md, engine, gradW->buffer());
-    const bool gradWReorder = op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc();
-    auto gradW_mkl_mem = gradWReorder ? dnnl::memory(op_weights_bp_prim_desc.diff_weights_desc(), engine) : gradW_user_mem;
-    args[DNNL_ARG_DIFF_WEIGHTS] = gradW_mkl_mem;
+    auto gradW_user_mem = mkldnnUtils::loadDataToMklStream(*gradW, engine, stream, gradW_user_md, op_weights_bp_prim_desc.diff_weights_desc(), args[DNNL_ARG_DIFF_WEIGHTS]);

     // gradB
     if(gradB != nullptr) {
@@ -318,10 +291,10 @@ static void deconv2dBpMKLDNN(const NDArray* input, const NDArray* weights, const
     dnnl::deconvolution_backward_weights(op_weights_bp_prim_desc).execute(stream, args);

     // reorder gradI if necessary
-    if (gradIReorder)
-        dnnl::reorder(gradI_mkl_mem, gradI_user_mem).execute(stream, gradI_mkl_mem, gradI_user_mem);
-    if (gradWReorder)
-        dnnl::reorder(gradW_mkl_mem, gradW_user_mem).execute(stream, gradW_mkl_mem, gradW_user_mem);
+    if (op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], gradI_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], gradI_user_mem);
+    if (op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem).execute(stream, args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem);

     stream.wait();
@@ -31,7 +31,7 @@ namespace ops {
 namespace platforms {

 //////////////////////////////////////////////////////////////////////////
-static void deconv2TFdBackPropMKLDNN(const NDArray* weights, const NDArray* gradO, NDArray* gradI,
+static void deconv2TFdBpMKLDNN(const NDArray* weights, const NDArray* gradO, NDArray* gradI,
                                const int bS, const int iC, const int iH, const int iW, const int oC, const int oH, const int oW,
                                const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW,
                                const bool isNCHW, const int wFormat) {
@@ -67,21 +67,17 @@ static void deconv2TFdBackPropMKLDNN(const NDArray* weights, const NDArray* grad
     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, wType, dnnl::memory::format_tag::any);
     dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, wType, wFormatMkl);
-    w_user_md.data.format_kind = dnnl_blocked; // overrides format
-    w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(3); // permute [kH, kW, iC, oC] -> [oC, iC, kH, kW]
-    w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(2);
-    w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(0);
-    w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(1);
+    mkldnnUtils::setBlockStrides(*weights, w_user_md, {3,2,0,1}); // permute [kH, kW, iC, oC] -> [oC, iC, kH, kW]

     // gradO
     dnnl::memory::desc gradO_mkl_md = dnnl::memory::desc(zDims, gradOType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradO_user_md = dnnl::memory::desc(zDims, gradOType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(gradO, gradO_user_md);
+    mkldnnUtils::setBlockStrides(*gradO, gradO_user_md);

     // gradI
     dnnl::memory::desc gradI_mkl_md = dnnl::memory::desc(xDims, gradIType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradI_user_md = dnnl::memory::desc(xDims, gradIType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(gradI, gradI_user_md);
+    mkldnnUtils::setBlockStrides(*gradI, gradI_user_md);

     auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -101,23 +97,20 @@ static void deconv2TFdBackPropMKLDNN(const NDArray* weights, const NDArray* grad
     // provide memory buffers and check whether reorder is required

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // gradO
-    mkldnnUtils::loadDataToMklStream(gradO, engine, stream, gradO_user_md, op_data_bp_prim_desc.diff_dst_desc(), args[DNNL_ARG_DIFF_DST]);
+    mkldnnUtils::loadDataToMklStream(*gradO, engine, stream, gradO_user_md, op_data_bp_prim_desc.diff_dst_desc(), args[DNNL_ARG_DIFF_DST]);

     // gradI
-    auto gradI_user_mem = dnnl::memory(gradI_user_md, engine, gradI->buffer());
-    const bool gradIReorder = op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc();
-    auto gradI_mkl_mem = gradIReorder ? dnnl::memory(op_data_bp_prim_desc.diff_src_desc(), engine) : gradI_user_mem;
-    args[DNNL_ARG_DIFF_SRC] = gradI_mkl_mem;
+    auto gradI_user_mem = mkldnnUtils::loadDataToMklStream(*gradI, engine, stream, gradI_user_md, op_data_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);

     // run backward data calculations
     dnnl::convolution_backward_data(op_data_bp_prim_desc).execute(stream, args);

     // reorder gradI if necessary
-    if (gradIReorder)
-        dnnl::reorder(gradI_mkl_mem, gradI_user_mem).execute(stream, gradI_mkl_mem, gradI_user_mem);
+    if (op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], gradI_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], gradI_user_mem);

     stream.wait();

@@ -189,7 +182,7 @@ PLATFORM_IMPL(deconv2d_tf, ENGINE_CPU) {
     // gradO = new NDArray(gradO->permute({0,3,1,2})); // [bS, oH, oW, oC] -> [bS, oC, oH, oW]
     // }

-    deconv2TFdBackPropMKLDNN(weights, gradO, gradI, bS, iC, iH, iW, oC, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW, wFormat);
+    deconv2TFdBpMKLDNN(weights, gradO, gradI, bS, iC, iH, iW, oC, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW, wFormat);

     // delete weights;
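In the deconv2d_tf hunks the permutation is passed inline as {3,2,0,1} rather than through a permut variable; a short standalone illustration of why that works (a braced list binds to a const std::vector<int>& parameter as a temporary) and of what the mapping does for TF-style [kH, kW, iC, oC] weights — example code only, not from the library:

    #include <array>
    #include <cassert>
    #include <string>
    #include <vector>

    // Accepts the permutation the same way the setBlockStrides call above does.
    static std::array<std::string, 4> applyPermut(const std::array<std::string, 4>& user,
                                                  const std::vector<int>& permut) {
        std::array<std::string, 4> mkl;
        for (size_t i = 0; i < permut.size(); ++i)
            mkl[i] = user[permut[i]];
        return mkl;
    }

    int main() {
        const std::array<std::string, 4> tfWeights = {"kH", "kW", "iC", "oC"};
        const auto mkl = applyPermut(tfWeights, {3, 2, 0, 1});    // braced list becomes a temporary vector
        assert((mkl == std::array<std::string, 4>{"oC", "iC", "kH", "kW"}));
        return 0;
    }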
@@ -48,16 +48,13 @@ static void deconv3dMKLDNN(const NDArray* input, const NDArray* weights, const N
     dnnl::memory::dims padding_r = { (iD - 1) * sD - oD + kD - pD, (iH - 1) * sH - oH + kH - pH, (iW - 1) * sW - oW + kW - pW };
     dnnl::memory::dims dilation = { dD-1, dH-1, dW-1 };

-    uint i0, i1, i2, i3, i4;
-    if(0 == wFormat) {
-        i0 = 3; i1 = 4; i2 = 0; i3 = 1; i4 = 2; // [kD, kH, kW, oC, iC] -> [oC, iC, kD, kH, kW]
-    }
-    else if(1 == wFormat) {
-        i0 = 1; i1 = 0; i2 = 2; i3 = 3; i4 = 4; // [iC, oC, kD, kH, kW] -> [oC, iC, kD, kH, kW]
-    }
-    else {
-        i0 = 4; i1 = 0; i2 = 1; i3 = 2; i4 = 3; // [iC, kD, kH, kW, oC] -> [oC, iC, kD, kH, kW]
-    }
+    std::vector<int> permut;
+    if(0 == wFormat)
+        permut = {3,4,0,1,2}; // [kD, kH, kW, oC, iC] -> [oC, iC, kD, kH, kW]
+    else if(1 == wFormat)
+        permut = {1,0,2,3,4}; // [iC, oC, kD, kH, kW] -> [oC, iC, kD, kH, kW]
+    else
+        permut = {4,0,1,2,3}; // [iC, kD, kH, kW, oC] -> [oC, iC, kD, kH, kW]

     // input type
     dnnl::memory::data_type xType;
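padding_r above is derived as (i-1)*s - o + k - p per spatial axis. A small self-contained check that this is consistent with the usual transposed-convolution shape formula o = s*(i-1) + k - 2*p (no dilation, no output padding); the numbers are made up for illustration:

    #include <cassert>

    int main() {
        const int iH = 4, sH = 2, kH = 3, pH = 1;          // arbitrary example values
        const int oH = sH * (iH - 1) + kH - 2 * pH;        // standard deconv output size -> 7
        const int paddingR = (iH - 1) * sH - oH + kH - pH; // formula used in the hunk above
        assert(paddingR == pH);                            // right padding matches the left padding here
        return 0;
    }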
@@ -100,17 +97,12 @@ static void deconv3dMKLDNN(const NDArray* input, const NDArray* weights, const N
     // input
     dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, xType, dnnl::memory::format_tag::any);
     dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, xType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(input, x_user_md);
+    mkldnnUtils::setBlockStrides(*input, x_user_md);

     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, wType, dnnl::memory::format_tag::any);
     dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, wType, wFormatMkl);
-    w_user_md.data.format_kind = dnnl_blocked; // overrides format
-    w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(i0);
-    w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(i1);
-    w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(i2);
-    w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(i3);
-    w_user_md.data.format_desc.blocking.strides[4] = weights->strideAt(i4);
+    mkldnnUtils::setBlockStrides(*weights, w_user_md, permut);

     // bias
     dnnl::memory::desc b_mkl_md;
@@ -120,7 +112,7 @@ static void deconv3dMKLDNN(const NDArray* input, const NDArray* weights, const N
     // output
     dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zDims, zType, dnnl::memory::format_tag::any);
     dnnl::memory::desc z_user_md = dnnl::memory::desc(zDims, zType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(output, z_user_md);
+    mkldnnUtils::setBlockStrides(*output, z_user_md);

     auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -137,10 +129,10 @@ static void deconv3dMKLDNN(const NDArray* input, const NDArray* weights, const N
     // provide memory buffers and check whether reorder is required

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // bias
     if(bias != nullptr) {
@@ -149,17 +141,14 @@ static void deconv3dMKLDNN(const NDArray* input, const NDArray* weights, const N
     }

     // output
-    auto z_user_mem = dnnl::memory(z_user_md, engine, output->buffer());
-    const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
-    auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
-    args[DNNL_ARG_DST] = z_mkl_mem;
+    auto z_user_mem = mkldnnUtils::loadDataToMklStream(*output, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

     // run calculations
     dnnl::deconvolution_forward(op_prim_desc).execute(stream, args);

     // reorder outputs if necessary
-    if (zReorder)
-        dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+    if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

     stream.wait();
@@ -185,16 +174,13 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights,
     dnnl::memory::dims padding_r = { (iD - 1) * sD - oD + kD - pD, (iH - 1) * sH - oH + kH - pH, (iW - 1) * sW - oW + kW - pW };
     dnnl::memory::dims dilation = { dD-1, dH-1, dW-1 };

-    uint i0, i1, i2, i3, i4;
-    if(0 == wFormat) {
-        i0 = 3; i1 = 4; i2 = 0; i3 = 1; i4 = 2; // [kD, kH, kW, oC, iC] -> [oC, iC, kD, kH, kW]
-    }
-    else if(1 == wFormat) {
-        i0 = 1; i1 = 0; i2 = 2; i3 = 3; i4 = 4; // [iC, oC, kD, kH, kW] -> [oC, iC, kD, kH, kW]
-    }
-    else {
-        i0 = 4; i1 = 0; i2 = 1; i3 = 2; i4 = 3; // [iC, kD, kH, kW, oC] -> [oC, iC, kD, kH, kW]
-    }
+    std::vector<int> permut;
+    if(0 == wFormat)
+        permut = {3,4,0,1,2}; // [kD, kH, kW, oC, iC] -> [oC, iC, kD, kH, kW]
+    else if(1 == wFormat)
+        permut = {1,0,2,3,4}; // [iC, oC, kD, kH, kW] -> [oC, iC, kD, kH, kW]
+    else
+        permut = {4,0,1,2,3}; // [iC, kD, kH, kW, oC] -> [oC, iC, kD, kH, kW]

     // input type
     dnnl::memory::data_type xType = input->dataType() == DataType::FLOAT32 ? dnnl::memory::data_type::f32 : dnnl::memory::data_type::bf16;
@@ -221,37 +207,27 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights,
     // input
     dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, xType, dnnl::memory::format_tag::any);
     dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, xType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(input, x_user_md);
+    mkldnnUtils::setBlockStrides(*input, x_user_md);

     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, wType, dnnl::memory::format_tag::any);
     dnnl::memory::desc w_user_md = dnnl::memory::desc(wDims, wType, wFormatMkl);
-    w_user_md.data.format_kind = dnnl_blocked; // overrides format
-    w_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(i0);
-    w_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(i1);
-    w_user_md.data.format_desc.blocking.strides[2] = weights->strideAt(i2);
-    w_user_md.data.format_desc.blocking.strides[3] = weights->strideAt(i3);
-    w_user_md.data.format_desc.blocking.strides[4] = weights->strideAt(i4);
+    mkldnnUtils::setBlockStrides(*weights, w_user_md, permut);

     // gradO
     dnnl::memory::desc gradO_mkl_md = dnnl::memory::desc(zDims, gradOType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradO_user_md = dnnl::memory::desc(zDims, gradOType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(gradO, gradO_user_md);
+    mkldnnUtils::setBlockStrides(*gradO, gradO_user_md);

     // gradI
     dnnl::memory::desc gradI_mkl_md = dnnl::memory::desc(xDims, gradIType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradI_user_md = dnnl::memory::desc(xDims, gradIType, xFormatMkl);
-    mkldnnUtils::setBlockStrides(gradI, gradI_user_md);
+    mkldnnUtils::setBlockStrides(*gradI, gradI_user_md);

     // gradW
     dnnl::memory::desc gradW_mkl_md = dnnl::memory::desc(wDims, gradWType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradW_user_md = dnnl::memory::desc(wDims, gradWType, wFormatMkl);
-    gradW_user_md.data.format_kind = dnnl_blocked; // overrides format
-    gradW_user_md.data.format_desc.blocking.strides[0] = gradW->strideAt(i0);
-    gradW_user_md.data.format_desc.blocking.strides[1] = gradW->strideAt(i1);
-    gradW_user_md.data.format_desc.blocking.strides[2] = gradW->strideAt(i2);
-    gradW_user_md.data.format_desc.blocking.strides[3] = gradW->strideAt(i3);
-    gradW_user_md.data.format_desc.blocking.strides[4] = gradW->strideAt(i4);
+    mkldnnUtils::setBlockStrides(*gradW, gradW_user_md, permut);

     // gradB
     dnnl::memory::desc gradB_mkl_md;
@@ -281,10 +257,10 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights,
     // provide memory buffers and check whether reorder is required

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // gradO
     auto gradO_user_mem = dnnl::memory(gradO_user_md, engine, const_cast<void*>(gradO->buffer()));
@@ -299,16 +275,10 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights,
     args[DNNL_ARG_DIFF_DST] = gradO_mkl_memD;

     // gradI
-    auto gradI_user_mem = dnnl::memory(gradI_user_md, engine, gradI->buffer());
-    const bool gradIReorder = op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc();
-    auto gradI_mkl_mem = gradIReorder ? dnnl::memory(op_data_bp_prim_desc.diff_src_desc(), engine) : gradI_user_mem;
-    args[DNNL_ARG_DIFF_SRC] = gradI_mkl_mem;
+    auto gradI_user_mem = mkldnnUtils::loadDataToMklStream(*gradI, engine, stream, gradI_user_md, op_data_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);

     // gradW
-    auto gradW_user_mem = dnnl::memory(gradW_user_md, engine, gradW->buffer());
-    const bool gradWReorder = op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc();
-    auto gradW_mkl_mem = gradWReorder ? dnnl::memory(op_weights_bp_prim_desc.diff_weights_desc(), engine) : gradW_user_mem;
-    args[DNNL_ARG_DIFF_WEIGHTS] = gradW_mkl_mem;
+    auto gradW_user_mem = mkldnnUtils::loadDataToMklStream(*gradW, engine, stream, gradW_user_md, op_weights_bp_prim_desc.diff_weights_desc(), args[DNNL_ARG_DIFF_WEIGHTS]);

     // gradB
     if(gradB != nullptr) {
@@ -326,10 +296,10 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights,
     dnnl::deconvolution_backward_weights(op_weights_bp_prim_desc).execute(stream, args);

     // reorder gradI if necessary
-    if (gradIReorder)
-        dnnl::reorder(gradI_mkl_mem, gradI_user_mem).execute(stream, gradI_mkl_mem, gradI_user_mem);
-    if (gradWReorder)
-        dnnl::reorder(gradW_mkl_mem, gradW_user_mem).execute(stream, gradW_mkl_mem, gradW_user_mem);
+    if (op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], gradI_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], gradI_user_mem);
+    if (op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem).execute(stream, args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem);

     stream.wait();
@@ -109,7 +109,7 @@ static void depthwiseConv2dMKLDNN(const NDArray* input, const NDArray* weights,
     // input
     dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, xType, dnnl::memory::format_tag::any);
     dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, xType, xzFormatMkl);
-    mkldnnUtils::setBlockStrides(input, x_user_md);
+    mkldnnUtils::setBlockStrides(*input, x_user_md);

     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, wType, dnnl::memory::format_tag::any);
@@ -129,7 +129,7 @@ static void depthwiseConv2dMKLDNN(const NDArray* input, const NDArray* weights,
     // output
     dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zDims, zType, dnnl::memory::format_tag::any);
     dnnl::memory::desc z_user_md = dnnl::memory::desc(zDims, zType, xzFormatMkl);
-    mkldnnUtils::setBlockStrides(output, z_user_md);
+    mkldnnUtils::setBlockStrides(*output, z_user_md);

     auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -146,10 +146,10 @@ static void depthwiseConv2dMKLDNN(const NDArray* input, const NDArray* weights,
     // provide memory buffers and check whether reorder is required

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // bias
     if(bias != nullptr) {
@@ -158,24 +158,21 @@ static void depthwiseConv2dMKLDNN(const NDArray* input, const NDArray* weights,
     }

     // output
-    auto z_user_mem = dnnl::memory(z_user_md, engine, output->buffer());
-    const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
-    auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
-    args[DNNL_ARG_DST] = z_mkl_mem;
+    auto z_user_mem = mkldnnUtils::loadDataToMklStream(*output, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

     // run calculations
     dnnl::convolution_forward(op_prim_desc).execute(stream, args);

     // reorder outputs if necessary
-    if (zReorder)
-        dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+    if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

     stream.wait();
     // shape::printArray(z_mkl_mem.map_data<float>(),8);
 }

 //////////////////////////////////////////////////////////////////////////
-static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* weights, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB,
+static void depthwiseConv2dBpMKLDNN(const NDArray* input, const NDArray* weights, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB,
                                     const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW,
                                     const int paddingMode, const bool isNCHW, const int wFormat) {
@@ -235,7 +232,7 @@ static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* w
     // input
     dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, xType, dnnl::memory::format_tag::any);
     dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, xType, xzFormatMkl);
-    mkldnnUtils::setBlockStrides(input, x_user_md);
+    mkldnnUtils::setBlockStrides(*input, x_user_md);

     // weights
     dnnl::memory::desc w_mkl_md = dnnl::memory::desc(wDims, wType, dnnl::memory::format_tag::any);
@@ -250,12 +247,12 @@ static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* w
     // gradO
     dnnl::memory::desc gradO_mkl_md = dnnl::memory::desc(zDims, gradOType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradO_user_md = dnnl::memory::desc(zDims, gradOType, xzFormatMkl);
-    mkldnnUtils::setBlockStrides(gradO, gradO_user_md);
+    mkldnnUtils::setBlockStrides(*gradO, gradO_user_md);

     // gradI
     dnnl::memory::desc gradI_mkl_md = dnnl::memory::desc(xDims, gradIType, dnnl::memory::format_tag::any);
     dnnl::memory::desc gradI_user_md = dnnl::memory::desc(xDims, gradIType, xzFormatMkl);
-    mkldnnUtils::setBlockStrides(gradI, gradI_user_md);
+    mkldnnUtils::setBlockStrides(*gradI, gradI_user_md);

     // gradW
     dnnl::memory::desc gradW_mkl_md = dnnl::memory::desc(wDims, gradWType, dnnl::memory::format_tag::any);
@@ -294,10 +291,10 @@ static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* w
     // provide memory buffers and check whether reorder is required

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_weights_bp_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // weights
-    mkldnnUtils::loadDataToMklStream(weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
+    mkldnnUtils::loadDataToMklStream(*weights, engine, stream, w_user_md, op_data_bp_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

     // gradO
     auto gradO_user_mem = dnnl::memory(gradO_user_md, engine, const_cast<void*>(gradO->buffer()));
@@ -312,16 +309,10 @@ static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* w
     args[DNNL_ARG_DIFF_DST] = gradO_mkl_memD;

     // gradI
-    auto gradI_user_mem = dnnl::memory(gradI_user_md, engine, gradI->buffer());
-    const bool gradIReorder = op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc();
-    auto gradI_mkl_mem = gradIReorder ? dnnl::memory(op_data_bp_prim_desc.diff_src_desc(), engine) : gradI_user_mem;
-    args[DNNL_ARG_DIFF_SRC] = gradI_mkl_mem;
+    auto gradI_user_mem = mkldnnUtils::loadDataToMklStream(*gradI, engine, stream, gradI_user_md, op_data_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);

     // gradW
-    auto gradW_user_mem = dnnl::memory(gradW_user_md, engine, gradW->buffer());
-    const bool gradWReorder = op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc();
-    auto gradW_mkl_mem = gradWReorder ? dnnl::memory(op_weights_bp_prim_desc.diff_weights_desc(), engine) : gradW_user_mem;
-    args[DNNL_ARG_DIFF_WEIGHTS] = gradW_mkl_mem;
+    auto gradW_user_mem = mkldnnUtils::loadDataToMklStream(*gradW, engine, stream, gradW_user_md, op_weights_bp_prim_desc.diff_weights_desc(), args[DNNL_ARG_DIFF_WEIGHTS]);

     // gradB
     if(gradB != nullptr) {
@@ -339,10 +330,10 @@ static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* w
     dnnl::convolution_backward_weights(op_weights_bp_prim_desc).execute(stream, args);

     // reorder gradI if necessary
-    if (gradIReorder)
-        dnnl::reorder(gradI_mkl_mem, gradI_user_mem).execute(stream, gradI_mkl_mem, gradI_user_mem);
-    if (gradWReorder)
-        dnnl::reorder(gradW_mkl_mem, gradW_user_mem).execute(stream, gradW_mkl_mem, gradW_user_mem);
+    if (op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], gradI_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], gradI_user_mem);
+    if (op_weights_bp_prim_desc.diff_weights_desc() != gradW_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem).execute(stream, args[DNNL_ARG_DIFF_WEIGHTS], gradW_user_mem);

     stream.wait();
|
@ -458,7 +449,7 @@ PLATFORM_IMPL(depthwise_conv2d_bp, ENGINE_CPU) {
|
||||||
if(bias)
|
if(bias)
|
||||||
REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM DEPTHWISECONV2D_BP MKL OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
|
REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM DEPTHWISECONV2D_BP MKL OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
|
||||||
|
|
||||||
depthwiseConv2dNackPropMKLDNN(input, weights, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat);
|
depthwiseConv2dBpMKLDNN(input, weights, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat);
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
|
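The gradI and gradW handling above settles into one output-side pattern: let loadDataToMklStream bind the user buffer (allocating a scratch memory only if the primitive prefers another layout), run the primitive against the args map, then reorder back into the user memory only when the descriptors differ. A minimal sketch of that pattern for gradI, assuming the engine, stream, descriptors and op_data_bp_prim_desc are set up as in the hunks above and that the remaining inputs (src, diff_dst) have already been loaded into args; the execute call for the data-backward primitive is not shown in these hunks, so the one below is illustrative only:

    // sketch only, not the file's literal code
    std::unordered_map<int, dnnl::memory> args;

    // bind the user gradI buffer; args[DNNL_ARG_DIFF_SRC] ends up holding either that memory
    // or a scratch memory in the layout the primitive prefers
    auto gradI_user_mem = mkldnnUtils::loadDataToMklStream(*gradI, engine, stream, gradI_user_md,
                                                           op_data_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);

    dnnl::convolution_backward_data(op_data_bp_prim_desc).execute(stream, args);

    // copy the result back only when the primitive wrote into the scratch layout
    if (op_data_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc())
        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], gradI_user_mem)
            .execute(stream, args[DNNL_ARG_DIFF_SRC], gradI_user_mem);

    stream.wait();

gradW follows the same shape with op_weights_bp_prim_desc and DNNL_ARG_DIFF_WEIGHTS.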
@@ -169,71 +169,43 @@ static void lstmLayerMKLDNN(const NDArray* x, const NDArray* Wx, const NDArray*
 x_lstm_md = dnnl::memory::desc({sL, bS, nIn}, xType, dnnl::memory::format_tag::any);
 // x_user_md = dataFormat == 0 ? dnnl::memory::desc({sL, bS, nIn}, type, dnnl::memory::format_tag::tnc) : dnnl::memory::desc({bS, sL, nIn}, type, dnnl::memory::format_tag::ntc);
 x_user_md = dnnl::memory::desc({sL, bS, nIn}, xType, dnnl::memory::format_tag::tnc);
-x_user_md.data.format_kind = dnnl_blocked; // overrides format
-x_user_md.data.format_desc.blocking.strides[0] = x->stridesOf()[0];
-x_user_md.data.format_desc.blocking.strides[1] = x->stridesOf()[1];
-x_user_md.data.format_desc.blocking.strides[2] = x->stridesOf()[2];
+mkldnnUtils::setBlockStrides(*x, x_user_md);

 // wx
 wx_lstm_md = dnnl::memory::desc({1,dirDim,nIn,4,nOut}, wType, dnnl::memory::format_tag::any);
 wx_user_md = dnnl::memory::desc({1,dirDim,nIn,4,nOut}, wType, dnnl::memory::format_tag::ldigo);
-wx_user_md.data.format_kind = dnnl_blocked; // overrides format
-wx_user_md.data.format_desc.blocking.strides[0] = Wx->stridesOf()[0];
-wx_user_md.data.format_desc.blocking.strides[1] = Wx->stridesOf()[1];
-wx_user_md.data.format_desc.blocking.strides[2] = Wx->stridesOf()[2];
-wx_user_md.data.format_desc.blocking.strides[3] = Wx->stridesOf()[3];
-wx_user_md.data.format_desc.blocking.strides[4] = Wx->stridesOf()[4];
+mkldnnUtils::setBlockStrides(*Wx, wx_user_md);

 // wr
 wr_lstm_md = dnnl::memory::desc({1,dirDim,nOut,4,nOut}, wType, dnnl::memory::format_tag::any);
 wr_user_md = dnnl::memory::desc({1,dirDim,nOut,4,nOut}, wType, dnnl::memory::format_tag::ldigo);
-wr_user_md.data.format_kind = dnnl_blocked; // overrides format
-wr_user_md.data.format_desc.blocking.strides[0] = Wr->stridesOf()[0];
-wr_user_md.data.format_desc.blocking.strides[1] = Wr->stridesOf()[1];
-wr_user_md.data.format_desc.blocking.strides[2] = Wr->stridesOf()[2];
-wr_user_md.data.format_desc.blocking.strides[3] = Wr->stridesOf()[3];
-wr_user_md.data.format_desc.blocking.strides[4] = Wr->stridesOf()[4];
+mkldnnUtils::setBlockStrides(*Wr, wr_user_md);

 // h
 h_lstm_md = dnnl::memory::desc({sL, bS, hDirDim*nOut}, hType, dnnl::memory::format_tag::any);
 // h_user_md = dataFormat == 0 ? dnnl::memory::desc({sL, bS, hDirDim*nOut}, type, dnnl::memory::format_tag::tnc) : dnnl::memory::desc({bS, sL, hDirDim*nOut}, type, dnnl::memory::format_tag::ntc);
 h_user_md = dnnl::memory::desc({sL, bS, hDirDim*nOut}, hType, dnnl::memory::format_tag::tnc);
-h_user_md.data.format_kind = dnnl_blocked; // overrides format
-h_user_md.data.format_desc.blocking.strides[0] = h->stridesOf()[0];
-h_user_md.data.format_desc.blocking.strides[1] = h->stridesOf()[1];
-h_user_md.data.format_desc.blocking.strides[2] = h->stridesOf()[2];
+mkldnnUtils::setBlockStrides(*h, h_user_md);

 // b
 if(b) {
     b_lstm_md = dnnl::memory::desc({1,dirDim,4,nOut}, bType, dnnl::memory::format_tag::any);
     b_user_md = dnnl::memory::desc({1,dirDim,4,nOut}, bType, dnnl::memory::format_tag::ldgo);
-    b_user_md.data.format_kind = dnnl_blocked; // overrides format
-    b_user_md.data.format_desc.blocking.strides[0] = b->stridesOf()[0];
-    b_user_md.data.format_desc.blocking.strides[1] = b->stridesOf()[1];
-    b_user_md.data.format_desc.blocking.strides[2] = b->stridesOf()[2];
-    b_user_md.data.format_desc.blocking.strides[3] = b->stridesOf()[3];
+    mkldnnUtils::setBlockStrides(*b, b_user_md);
 }

 // hI
 if(hI) {
     hI_lstm_md = dnnl::memory::desc({1,dirDim,bS,nOut}, xType, dnnl::memory::format_tag::any);
     hI_user_md = dnnl::memory::desc({1,dirDim,bS,nOut}, xType, dnnl::memory::format_tag::ldnc);
-    hI_user_md.data.format_kind = dnnl_blocked; // overrides format
-    hI_user_md.data.format_desc.blocking.strides[0] = hI->stridesOf()[0];
-    hI_user_md.data.format_desc.blocking.strides[1] = hI->stridesOf()[1];
-    hI_user_md.data.format_desc.blocking.strides[2] = hI->stridesOf()[2];
-    hI_user_md.data.format_desc.blocking.strides[3] = hI->stridesOf()[3];
+    mkldnnUtils::setBlockStrides(*hI, hI_user_md);
 }

 // cI
 if(cI) {
     cI_lstm_md = dnnl::memory::desc({1,dirDim,bS,nOut}, xType, dnnl::memory::format_tag::any);
     cI_user_md = dnnl::memory::desc({1,dirDim,bS,nOut}, xType, dnnl::memory::format_tag::ldnc);
-    cI_user_md.data.format_kind = dnnl_blocked; // overrides format
-    cI_user_md.data.format_desc.blocking.strides[0] = cI->stridesOf()[0];
-    cI_user_md.data.format_desc.blocking.strides[1] = cI->stridesOf()[1];
-    cI_user_md.data.format_desc.blocking.strides[2] = cI->stridesOf()[2];
-    cI_user_md.data.format_desc.blocking.strides[2] = cI->stridesOf()[3];
+    mkldnnUtils::setBlockStrides(*cI, cI_user_md);
 }

 // hL

@@ -241,20 +213,13 @@ static void lstmLayerMKLDNN(const NDArray* x, const NDArray* Wx, const NDArray*
     hL_lstm_md = dnnl::memory::desc({1,dirDim,bS,nOut}, hType, dnnl::memory::format_tag::any);
     hL_user_md = dnnl::memory::desc({1,dirDim,bS,nOut}, hType, dnnl::memory::format_tag::ldnc);
     hL_user_md.data.format_kind = dnnl_blocked; // overrides format
-    hL_user_md.data.format_desc.blocking.strides[0] = hL->stridesOf()[0];
-    hL_user_md.data.format_desc.blocking.strides[1] = hL->stridesOf()[1];
-    hL_user_md.data.format_desc.blocking.strides[2] = hL->stridesOf()[2];
-    hL_user_md.data.format_desc.blocking.strides[3] = hL->stridesOf()[3];
+    mkldnnUtils::setBlockStrides(*hL, hL_user_md);
 }

 if(cL) {
     cL_lstm_md = dnnl::memory::desc({1,dirDim,bS,nOut}, hType, dnnl::memory::format_tag::ldnc);
     cL_user_md = dnnl::memory::desc({1,dirDim,bS,nOut}, hType, dnnl::memory::format_tag::ldnc);
-    cL_user_md.data.format_kind = dnnl_blocked; // overrides format
-    cL_user_md.data.format_desc.blocking.strides[0] = cL->stridesOf()[0];
-    cL_user_md.data.format_desc.blocking.strides[1] = cL->stridesOf()[1];
-    cL_user_md.data.format_desc.blocking.strides[2] = cL->stridesOf()[2];
-    cL_user_md.data.format_desc.blocking.strides[3] = cL->stridesOf()[3];
+    mkldnnUtils::setBlockStrides(*cL, cL_user_md);
 }

 // lstm memory description

@@ -272,64 +237,49 @@ static void lstmLayerMKLDNN(const NDArray* x, const NDArray* Wx, const NDArray*

 // provide memory and check whether reorder is required
 // x
-mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, lstm_prim_desc.src_layer_desc(), args[DNNL_ARG_SRC_LAYER]);
+mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, lstm_prim_desc.src_layer_desc(), args[DNNL_ARG_SRC_LAYER]);

 // wx
-mkldnnUtils::loadDataToMklStream(Wx, engine, stream, wx_user_md, lstm_prim_desc.weights_layer_desc(), args[DNNL_ARG_WEIGHTS_LAYER]);
+mkldnnUtils::loadDataToMklStream(*Wx, engine, stream, wx_user_md, lstm_prim_desc.weights_layer_desc(), args[DNNL_ARG_WEIGHTS_LAYER]);

 // wr
-mkldnnUtils::loadDataToMklStream(Wr, engine, stream, wr_user_md, lstm_prim_desc.weights_iter_desc(), args[DNNL_ARG_WEIGHTS_ITER]);
+mkldnnUtils::loadDataToMklStream(*Wr, engine, stream, wr_user_md, lstm_prim_desc.weights_iter_desc(), args[DNNL_ARG_WEIGHTS_ITER]);

 // h
-auto h_user_mem = dnnl::memory(h_user_md, engine, h->buffer());
-const bool hReorder = lstm_prim_desc.dst_layer_desc() != h_user_mem.get_desc();
-auto h_lstm_mem = hReorder ? dnnl::memory(lstm_prim_desc.dst_layer_desc(), engine) : h_user_mem;
-args[DNNL_ARG_DST_LAYER] = h_lstm_mem;
+auto h_user_mem = mkldnnUtils::loadDataToMklStream(*h, engine, stream, h_user_md, lstm_prim_desc.dst_layer_desc(), args[DNNL_ARG_DST_LAYER]);

 // b
-if(b) {
-    mkldnnUtils::loadDataToMklStream(b, engine, stream, b_user_md, lstm_prim_desc.bias_desc(), args[DNNL_ARG_BIAS]);
-}
+if(b)
+    mkldnnUtils::loadDataToMklStream(*b, engine, stream, b_user_md, lstm_prim_desc.bias_desc(), args[DNNL_ARG_BIAS]);

 // hI
-if(hI) {
-    mkldnnUtils::loadDataToMklStream(hI, engine, stream, hI_user_md, lstm_prim_desc.src_iter_desc(), args[DNNL_ARG_SRC_ITER]);
-}
+if(hI)
+    mkldnnUtils::loadDataToMklStream(*hI, engine, stream, hI_user_md, lstm_prim_desc.src_iter_desc(), args[DNNL_ARG_SRC_ITER]);

 // cI
-if(cI) {
-    mkldnnUtils::loadDataToMklStream(cI, engine, stream, cI_user_md, lstm_prim_desc.src_iter_c_desc(), args[DNNL_ARG_SRC_ITER_C]);
-}
+if(cI)
+    mkldnnUtils::loadDataToMklStream(*cI, engine, stream, cI_user_md, lstm_prim_desc.src_iter_c_desc(), args[DNNL_ARG_SRC_ITER_C]);

-bool hLReorder(false), cLReorder(false);
 dnnl::memory hL_user_mem, cL_user_mem, hL_lstm_mem, cL_lstm_mem;

 // hL
-if(hL) {
-    hL_user_mem = dnnl::memory(hL_user_md, engine, hL->buffer());
-    hLReorder = lstm_prim_desc.dst_iter_desc() != hL_user_mem.get_desc();
-    hL_lstm_mem = hLReorder ? dnnl::memory(lstm_prim_desc.dst_iter_desc(), engine) : hL_user_mem;
-    args[DNNL_ARG_DST_ITER] = hL_lstm_mem;
-}
+if(hL)
+    hL_user_mem = mkldnnUtils::loadDataToMklStream(*hL, engine, stream, hL_user_md, lstm_prim_desc.dst_iter_desc(), args[DNNL_ARG_DST_ITER]);

 // cL
-if(cL) {
-    cL_user_mem = dnnl::memory(cL_user_md, engine, cL->buffer());
-    cLReorder = lstm_prim_desc.dst_iter_c_desc() != cL_user_mem.get_desc();
-    cL_lstm_mem = cLReorder ? dnnl::memory(lstm_prim_desc.dst_iter_c_desc(), engine) : cL_user_mem;
-    args[DNNL_ARG_DST_ITER_C] = cL_lstm_mem;
-}
+if(cL)
+    cL_user_mem = mkldnnUtils::loadDataToMklStream(*cL, engine, stream, cL_user_md, lstm_prim_desc.dst_iter_c_desc(), args[DNNL_ARG_DST_ITER_C]);

 // run calculations
 lstm_forward(lstm_prim_desc).execute(stream, args);

 // reorder outputs if necessary
-if (hReorder)
-    reorder(h_lstm_mem, h_user_mem).execute(stream, h_lstm_mem, h_user_mem);
-if(hLReorder)
-    reorder(hL_lstm_mem, hL_user_mem).execute(stream, hL_lstm_mem, hL_user_mem);
-if(cLReorder)
-    reorder(cL_lstm_mem, cL_user_mem).execute(stream, cL_lstm_mem, cL_user_mem);
+if (lstm_prim_desc.dst_layer_desc() != h_user_mem.get_desc())
+    reorder(args[DNNL_ARG_DST_LAYER], h_user_mem).execute(stream, args[DNNL_ARG_DST_LAYER], h_user_mem);
+if(lstm_prim_desc.dst_iter_desc() != hL_user_mem.get_desc())
+    reorder(args[DNNL_ARG_DST_ITER], hL_user_mem).execute(stream, args[DNNL_ARG_DST_ITER], hL_user_mem);
+if(lstm_prim_desc.dst_iter_c_desc() != cL_user_mem.get_desc())
+    reorder(args[DNNL_ARG_DST_ITER_C], cL_user_mem).execute(stream, args[DNNL_ARG_DST_ITER_C], cL_user_mem);

 stream.wait();
 }

@@ -377,9 +327,9 @@ PLATFORM_IMPL(lstmLayer, ENGINE_CPU) {
 auto cL = retLastC ? OUTPUT_VARIABLE(count++) : nullptr; // cell state at last step

 // evaluate dimensions
-const Nd4jLong sL = dataFormat == 3 ? x->sizeAt(0) : x->sizeAt(dataFormat);
-const Nd4jLong bS = dataFormat == 1 || dataFormat == 2 ? x->sizeAt(0) : x->sizeAt(-2);
-const Nd4jLong nIn = dataFormat == 2 ? x->sizeAt(1) : x->sizeAt(-1);
+const Nd4jLong sL = x->sizeAt(dataFormat);
+const Nd4jLong bS = dataFormat == 0 ? x->sizeAt(1) : x->sizeAt(0);
+const Nd4jLong nIn = x->sizeAt(2);
 const Nd4jLong nOut = Wx->sizeAt(-1) / 4;

 // inputs validations

@@ -435,14 +385,21 @@ PLATFORM_IMPL(lstmLayer, ENGINE_CPU) {

 WxR = new NDArray(Wx->reshape(Wx->ordering(), {1,dirDim,nIn,4,nOut}));
 WrR = new NDArray(Wr->reshape(Wr->ordering(), {1,dirDim,nOut,4,nOut}));

 if(b)
     bR = new NDArray(b->reshape(b->ordering(), {1,dirDim,4,nOut}));
+else
+    bR = new NDArray(x->ordering(), {1,dirDim,4,nOut}, x->dataType(), x->getContext()); // already nullified

 if(hI)
     hIR = new NDArray(hI->reshape(hI->ordering(), {1,dirDim,bS,nOut}));

 if(cI)
     cIR = new NDArray(cI->reshape(cI->ordering(), {1,dirDim,bS,nOut}));

 if(hL)
     hLR = new NDArray(hL->reshape(hL->ordering(), {1,dirDim,bS,nOut}, false));

 if(cL)
     cLR = new NDArray(cL->reshape(cL->ordering(), {1,dirDim,bS,nOut}, false));

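The tightened sL/bS/nIn lookups in the lstmLayer hunk only hold if x reaches this helper in one of two layouts, TNC (dataFormat 0) or NTC (dataFormat 1); that is an assumption read off the new formulas, not something stated in the hunk itself. Spelled out:

    // Illustration of what the new dimension lookups resolve to under that assumption.
    // dataFormat == 0 : x is [sL, bS, nIn] (TNC)
    // dataFormat == 1 : x is [bS, sL, nIn] (NTC)
    const Nd4jLong sL  = x->sizeAt(dataFormat);                         // dim 0 for TNC, dim 1 for NTC
    const Nd4jLong bS  = dataFormat == 0 ? x->sizeAt(1) : x->sizeAt(0);
    const Nd4jLong nIn = x->sizeAt(2);                                  // last dimension in both layouts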
@@ -31,20 +31,6 @@ namespace sd {
 namespace ops {
 namespace platforms {

-dnnl::memory::format_tag get_format_tag(const sd::NDArray &array) {
-    switch (array.rankOf()) {
-        case 1:
-            return dnnl::memory::format_tag::ab;
-        case 2:
-            return array.ordering() == 'c' ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba;
-        case 3:
-            return array.ordering() == 'c' ? dnnl::memory::format_tag::abc : dnnl::memory::format_tag::cba;
-        default:
-            throw std::runtime_error("MKLDNN matmul only supports 2D/3D arrays");
-    }
-}

 //////////////////////////////////////////////////////////////////////////
 static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const bool transX, const bool transY, float alpha = 1.f, float beta = 0.f) {

@@ -123,11 +109,16 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
 else if(z->dataType() == DataType::INT8)
     zType = dnnl::memory::data_type::s8;

+const auto xFormat = xRank == 1 ? dnnl::memory::format_tag::ab : mkldnnUtils::getFormat(*xTR);
+const auto yFormat = yRank == 1 ? dnnl::memory::format_tag::ab : mkldnnUtils::getFormat(*yTR);
+const auto zFormat = zRank == 1 ? dnnl::memory::format_tag::ab : mkldnnUtils::getFormat(*zR);

 // memory descriptors for arrays
+dnnl::memory::desc x_mkl_md, x_user_md, y_mkl_md, y_user_md, z_mkl_md, z_user_md;

 // x
-dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, get_format_tag(*xTR));
-dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, get_format_tag(*xTR));
+x_user_md = x_mkl_md = dnnl::memory::desc(xShape, xType, xFormat);
 if(xTR->ews() != 1) {
     x_user_md.data.format_kind = dnnl_blocked; // overrides format
     x_user_md.data.format_desc.blocking.strides[0] = xRank == 1 ? 1 : xTR->strideAt(0);

@@ -137,8 +128,7 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
 }

 // y
-dnnl::memory::desc y_mkl_md = dnnl::memory::desc(yShape, yType, get_format_tag(*yTR));
-dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, get_format_tag(*yTR));
+y_user_md = y_mkl_md = dnnl::memory::desc(yShape, yType, yFormat);
 if(yTR->ews() != 1) {
     y_user_md.data.format_kind = dnnl_blocked; // overrides format
     y_user_md.data.format_desc.blocking.strides[0] = yRank == 1 ? 1 : yTR->strideAt(0);

@@ -148,8 +138,7 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
 }

 // z
-dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, get_format_tag(*zR));
-dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, get_format_tag(*zR));
+z_user_md = z_mkl_md = dnnl::memory::desc(zShape, zType, zFormat);
 if(zR->ews() != 1) {
     z_user_md.data.format_kind = dnnl_blocked; // overrides format
     z_user_md.data.format_desc.blocking.strides[0] = zRank == 1 ? 1 : zR->strideAt(0);

@@ -181,37 +170,20 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
 // provide memory buffers and check whether reorder is required

 // input
-mkldnnUtils::loadDataToMklStream(xTR, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
-/*
-auto x_user_mem = dnnl::memory(x_user_md, engine, xTR->buffer());
-const bool xReorder = op_prim_desc.src_desc() != x_user_mem.get_desc();
-auto x_mkl_mem = xReorder ? dnnl::memory(op_prim_desc.src_desc(), engine) : x_user_mem;
-if (xReorder)
-    dnnl::reorder(x_user_mem, x_mkl_mem).execute(stream, x_user_mem, x_mkl_mem);
-args[DNNL_ARG_SRC] = x_mkl_mem;
-*/
+mkldnnUtils::loadDataToMklStream(*xTR, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

 // y
-mkldnnUtils::loadDataToMklStream(yTR, engine, stream, y_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
-/*
-auto y_user_mem = dnnl::memory(y_user_md, engine, yTR->buffer());
-const bool yReorder = op_prim_desc.weights_desc() != y_user_mem.get_desc();
-auto y_mkl_mem = yReorder ? dnnl::memory(op_prim_desc.weights_desc(), engine) : y_user_mem;
-if (yReorder)
-    dnnl::reorder(y_user_mem, y_mkl_mem).execute(stream, y_user_mem, y_mkl_mem);
-args[DNNL_ARG_WEIGHTS] = y_mkl_mem;
-*/
+mkldnnUtils::loadDataToMklStream(*yTR, engine, stream, y_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);

 // z
-auto z_user_mem = dnnl::memory(z_user_md, engine, zR->buffer());
-const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
-auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
-args[DNNL_ARG_DST] = z_mkl_mem;
+auto z_user_mem = mkldnnUtils::loadDataToMklStream(*zR, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

 // run calculations
 dnnl::matmul(op_prim_desc).execute(stream, args);

 // reorder outputs if necessary
-if (zReorder)
-    dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
+    dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

 stream.wait();

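With the local get_format_tag removed, the matmul operands take their tags from the shared mkldnnUtils::getFormat, which keys off rank and ordering; rank-1 operands are still pinned to ab, presumably because this path widens vectors to 2-D beforehand (inferred from the hunk, not shown in it). For the 2-D and 3-D operands used here the selection amounts to the following sketch:

    // Sketch of the tag selection for the operands xTR/yTR and the result zR.
    dnnl::memory::format_tag tag;
    if (arr.rankOf() == 2)
        tag = arr.ordering() == 'c' ? dnnl::memory::format_tag::ab  : dnnl::memory::format_tag::ba;
    else if (arr.rankOf() == 3)
        tag = arr.ordering() == 'c' ? dnnl::memory::format_tag::abc : dnnl::memory::format_tag::cba;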
@@ -38,45 +38,65 @@ void getDims(const NDArray* array, const int rank, dnnl::memory::dims& mklDims){
     mklDims = dnnl::memory::dims(vDims);
 }
 //////////////////////////////////////////////////////////////////////
-dnnl::memory::format_tag getFormat(const int rank){
+dnnl::memory::format_tag getFormat(const NDArray& arr) {

-    if (2 == rank) {
-        return dnnl::memory::format_tag::ab;
+    dnnl::memory::format_tag result;
+
+    switch (arr.rankOf()) {
+        case 1:
+            result = dnnl::memory::format_tag::a;
+            break;
+        case 2:
+            result = arr.ordering() == 'c' ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba;
+            break;
+        case 3:
+            result = arr.ordering() == 'c' ? dnnl::memory::format_tag::abc : dnnl::memory::format_tag::cba;
+            break;
+        case 4:
+            result = dnnl::memory::format_tag::abcd;
+            break;
+        case 5:
+            result = dnnl::memory::format_tag::abcde;
+            break;
+        case 6:
+            result = dnnl::memory::format_tag::abcdef;
+            break;
+        default:
+            throw std::invalid_argument("MKLDNN getFormat: do we really want to use arras with rank > 6 ?");
     }
-    else if (3 == rank) {
-        return dnnl::memory::format_tag::abc;
-    }
-    else if (4 == rank) {
-        return dnnl::memory::format_tag::abcd;
-    }
-    else if (5 == rank) {
-        return dnnl::memory::format_tag::abcde;
-    }
-    else if (6 == rank) {
-        return dnnl::memory::format_tag::abcdef;
-    }
-    return dnnl::memory::format_tag::a; // 1 == dataSetRank
+
+    return result;
 }

 //////////////////////////////////////////////////////////////////////
-void setBlockStrides(const NDArray* array, dnnl::memory::desc& mklMd){
+void setBlockStrides(const NDArray& array, dnnl::memory::desc& mklMd, const std::vector<int>& permut) {

-    if (array->ews() != 1 || array->ordering() != 'c') {
+    if (array.ews() != 1 || (array.rankOf() > 3 && array.ordering() == 'f') || !permut.empty()) {

         mklMd.data.format_kind = dnnl_blocked; // overrides format
-        for (auto i = 0; i < array->rankOf(); ++i) {
-            mklMd.data.format_desc.blocking.strides[i] = array->strideAt(i);
+
+        if(permut.empty())
+            for (auto i = 0; i < array.rankOf(); ++i)
+                mklMd.data.format_desc.blocking.strides[i] = array.strideAt(i);
+        else {
+            if(array.rankOf() != permut.size())
+                throw std::invalid_argument("mkldnnUtils::setBlockStrides: size of permut vector is not equal to array rank !");
+            for (auto i = 0; i < array.rankOf(); ++i)
+                mklMd.data.format_desc.blocking.strides[i] = array.strideAt(permut[i]);
         }
     }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////
-void loadDataToMklStream(const NDArray* array, const dnnl::engine& engine, const dnnl::stream& stream, const dnnl::memory::desc& user_md, const dnnl::memory::desc& primitive_md,
-                         dnnl::memory& arg) {
+dnnl::memory loadDataToMklStream(const NDArray& array, const dnnl::engine& engine, const dnnl::stream& stream,
+                                 const dnnl::memory::desc& user_md, const dnnl::memory::desc& primitive_md, dnnl::memory& arg) {

-    auto user_mem = dnnl::memory(user_md, engine,const_cast<void*>(array->buffer()));
+    auto user_mem = dnnl::memory(user_md, engine, const_cast<NDArray&>(array).buffer());
     const bool bReorder = primitive_md != user_mem.get_desc();
     auto mkl_mem = bReorder ? dnnl::memory(primitive_md, engine) : user_mem;
     if (bReorder)
         dnnl::reorder(user_mem, mkl_mem).execute(stream, user_mem, mkl_mem);
     arg = mkl_mem;
+    return user_mem;
 }

 //////////////////////////////////////////////////////////////////////
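The permut parameter added to setBlockStrides lets a descriptor whose dims are listed in one logical order point at a buffer stored in another: strides[i] is read from array.strideAt(permut[i]). A small hedged example with made-up shapes, the way the pooling hunks further down use it for NHWC inputs (assumes 'input' points at an NDArray stored as [bS, iH, iW, iC]):

    // Hypothetical example: a 4-D NHWC array described by an NCHW-ordered desc.
    const int bS = 2, iC = 3, iH = 8, iW = 8;                       // made-up sizes
    dnnl::memory::dims xDims = {bS, iC, iH, iW};                    // dims given in NCHW order
    dnnl::memory::desc x_user_md(xDims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nhwc);
    // stride for desc dim i is taken from array dim permut[i]:
    // batch <- 0, channels <- 3, height <- 1, width <- 2
    mkldnnUtils::setBlockStrides(*input, x_user_md, {0, 3, 1, 2});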
@@ -122,33 +142,21 @@ void poolingMKLDNN(const NDArray *input, NDArray *output,
     xzFrmat = isNCHW ? dnnl::memory::format_tag::ncdhw : dnnl::memory::format_tag::ndhwc;
 }

+std::vector<int> permut;
+if(!isNCHW)
+    permut = rank == 4 ? std::vector<int>({0,3,1,2}) : std::vector<int>({0,4,1,2,3});

 // memory descriptors for arrays

 // input
 dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, type, xzFrmat);
 dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, type, xzFrmat);
-if(input->ews() != 1 || input->ordering() != 'c') {
-    x_user_md.data.format_kind = dnnl_blocked; // overrides format
-    x_user_md.data.format_desc.blocking.strides[0] = input->strideAt(0);
-    x_user_md.data.format_desc.blocking.strides[1] = input->strideAt(isNCHW ? 1 :-1);
-    x_user_md.data.format_desc.blocking.strides[2] = input->strideAt(isNCHW ? 2 : 1);
-    x_user_md.data.format_desc.blocking.strides[3] = input->strideAt(isNCHW ? 3 : 2);
-    if(rank == 5)
-        x_user_md.data.format_desc.blocking.strides[4] = input->strideAt(isNCHW ? 4 : 3);
-}
+mkldnnUtils::setBlockStrides(*input, x_user_md, permut);

 // output
 dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zDims, type, dnnl::memory::format_tag::any);
 dnnl::memory::desc z_user_md = dnnl::memory::desc(zDims, type, xzFrmat);
-if(output->ews() != 1 || output->ordering() != 'c') {
-    z_user_md.data.format_kind = dnnl_blocked; // overrides format
-    z_user_md.data.format_desc.blocking.strides[0] = output->strideAt(0);
-    z_user_md.data.format_desc.blocking.strides[1] = output->strideAt(isNCHW ? 1 :-1);
-    z_user_md.data.format_desc.blocking.strides[2] = output->strideAt(isNCHW ? 2 : 1);
-    z_user_md.data.format_desc.blocking.strides[3] = output->strideAt(isNCHW ? 3 : 2);
-    if(rank == 5)
-        z_user_md.data.format_desc.blocking.strides[4] = output->strideAt(isNCHW ? 4 : 3);
-}
+mkldnnUtils::setBlockStrides(*output, z_user_md, permut);

 auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -164,20 +172,17 @@ void poolingMKLDNN(const NDArray *input, NDArray *output,
 // provide memory buffers and check whether reorder is required

 // input
-mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

 // output
-auto z_user_mem = dnnl::memory(z_user_md, engine, output->buffer());
-const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
-auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
-args[DNNL_ARG_DST] = z_mkl_mem;
+auto z_user_mem = mkldnnUtils::loadDataToMklStream(*output, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

 // run calculations
 dnnl::pooling_forward(op_prim_desc).execute(stream, args);

 // reorder outputs if necessary
-if (zReorder)
-    dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
+    dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

 stream.wait();
 }

@@ -226,46 +231,27 @@ void poolingBpMKLDNN(const NDArray *input, const NDArray *gradO, NDArray *gradI,
     xzFrmat = isNCHW ? dnnl::memory::format_tag::ncdhw : dnnl::memory::format_tag::ndhwc;
 }

+std::vector<int> permut;
+if(!isNCHW)
+    permut = rank == 4 ? std::vector<int>({0,3,1,2}) : std::vector<int>({0,4,1,2,3});

 // memory descriptors for arrays

 // input
 dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xDims, type, xzFrmat);
 dnnl::memory::desc x_user_md = dnnl::memory::desc(xDims, type, xzFrmat);
-if(input->ews() != 1 || input->ordering() != 'c') {
-    x_user_md.data.format_kind = dnnl_blocked; // overrides format
-    x_user_md.data.format_desc.blocking.strides[0] = input->strideAt(0);
-    x_user_md.data.format_desc.blocking.strides[1] = input->strideAt(isNCHW ? 1 :-1);
-    x_user_md.data.format_desc.blocking.strides[2] = input->strideAt(isNCHW ? 2 : 1);
-    x_user_md.data.format_desc.blocking.strides[3] = input->strideAt(isNCHW ? 3 : 2);
-    if(rank == 5)
-        x_user_md.data.format_desc.blocking.strides[4] = input->strideAt(isNCHW ? 4 : 3);
-}
+mkldnnUtils::setBlockStrides(*input, x_user_md, permut);

 // gradO
 dnnl::memory::desc gradO_mkl_md = dnnl::memory::desc(zDims, type, dnnl::memory::format_tag::any);
 dnnl::memory::desc gradO_user_md = dnnl::memory::desc(zDims, type, xzFrmat);
-if(gradO->ews() != 1 || gradO->ordering() != 'c') {
-    gradO_user_md.data.format_kind = dnnl_blocked; // overrides format
-    gradO_user_md.data.format_desc.blocking.strides[0] = gradO->strideAt(0);
-    gradO_user_md.data.format_desc.blocking.strides[1] = gradO->strideAt(isNCHW ? 1 :-1);
-    gradO_user_md.data.format_desc.blocking.strides[2] = gradO->strideAt(isNCHW ? 2 : 1);
-    gradO_user_md.data.format_desc.blocking.strides[3] = gradO->strideAt(isNCHW ? 3 : 2);
-    if(rank == 5)
-        gradO_user_md.data.format_desc.blocking.strides[4] = gradO->strideAt(isNCHW ? 4 : 3);
-}
+mkldnnUtils::setBlockStrides(*gradO, gradO_user_md, permut);

 // gradI
 dnnl::memory::desc gradI_mkl_md = dnnl::memory::desc(xDims, type, dnnl::memory::format_tag::any);
 dnnl::memory::desc gradI_user_md = dnnl::memory::desc(xDims, type, xzFrmat);
-if(gradI->ews() != 1 || gradI->ordering() != 'c') {
-    gradI_user_md.data.format_kind = dnnl_blocked; // overrides format
-    gradI_user_md.data.format_desc.blocking.strides[0] = gradI->strideAt(0);
-    gradI_user_md.data.format_desc.blocking.strides[1] = gradI->strideAt(isNCHW ? 1 :-1);
-    gradI_user_md.data.format_desc.blocking.strides[2] = gradI->strideAt(isNCHW ? 2 : 1);
-    gradI_user_md.data.format_desc.blocking.strides[3] = gradI->strideAt(isNCHW ? 3 : 2);
-    if(rank == 5)
-        gradI_user_md.data.format_desc.blocking.strides[4] = gradI->strideAt(isNCHW ? 4 : 3);
-}
+mkldnnUtils::setBlockStrides(*gradI, gradI_user_md, permut);

 auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());
 dnnl::stream stream(engine);

@@ -282,18 +268,15 @@ void poolingBpMKLDNN(const NDArray *input, const NDArray *gradO, NDArray *gradI,
 std::unordered_map<int, dnnl::memory> args;

 // gradO
-mkldnnUtils::loadDataToMklStream(gradO, engine, stream, gradO_user_md, op_bp_prim_desc.diff_dst_desc(), args[DNNL_ARG_DIFF_DST]);
+mkldnnUtils::loadDataToMklStream(*gradO, engine, stream, gradO_user_md, op_bp_prim_desc.diff_dst_desc(), args[DNNL_ARG_DIFF_DST]);

 // gradI
-auto gradI_user_mem = dnnl::memory(gradI_user_md, engine, gradI->buffer());
-const bool gradIReorder = op_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc();
-auto gradI_mkl_mem = gradIReorder ? dnnl::memory(op_bp_prim_desc.diff_src_desc(), engine) : gradI_user_mem;
-args[DNNL_ARG_DIFF_SRC] = gradI_mkl_mem;
+auto gradI_user_mem = mkldnnUtils::loadDataToMklStream(*gradI, engine, stream, gradI_user_md, op_bp_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);

 if(mode == algorithm::pooling_max) {

     // input
-    mkldnnUtils::loadDataToMklStream(input, engine, stream, x_user_md, op_ff_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*input, engine, stream, x_user_md, op_ff_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // z
     auto z_mkl_mem = dnnl::memory(op_ff_prim_desc.dst_desc(), engine);

@@ -310,10 +293,9 @@ void poolingBpMKLDNN(const NDArray *input, const NDArray *gradO, NDArray *gradI,
 // run backward calculations
 dnnl::pooling_backward(op_bp_prim_desc).execute(stream, args);

 // reorder gradI if necessary
-if (gradIReorder)
-    dnnl::reorder(gradI_mkl_mem, gradI_user_mem).execute(stream, gradI_mkl_mem, gradI_user_mem);
+if (op_bp_prim_desc.diff_src_desc() != gradI_user_mem.get_desc())
+    dnnl::reorder(args[DNNL_ARG_DIFF_SRC], gradI_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], gradI_user_mem);

 stream.wait();
 }

@@ -100,6 +100,8 @@ namespace sd {

 DECLARE_PLATFORM(xw_plus_b_bp, ENGINE_CPU);

+DECLARE_PLATFORM(concat, ENGINE_CPU);
+
 }
 }

@@ -123,19 +125,13 @@ namespace sd {
 */
 void getDims(const NDArray* array, const int rank, dnnl::memory::dims& mklDims);
 /**
-* This function generate memory format tag based on rank
-* @param const array rank
+* This function evaluate memory format tag based on array shapeInfo
+* @param const array
 * @return memory format
 */
-dnnl::memory::format_tag getFormat(const int rank);
-/**
-* This function generate memory format tag based on rank
-* @param const pointer to dataset
-* @param const dataset rank
-* @param reference to memory descriptor
-* @return memory format
-*/
-void setBlockStrides(const NDArray* array, dnnl::memory::desc& mklMd);
+dnnl::memory::format_tag getFormat(const NDArray& arr);
+void setBlockStrides(const NDArray& array, dnnl::memory::desc& mklMd, const std::vector<int>& permut = {});
 //////////////////////////////////////////////////////////////////////
 /**
 * This function load and reorder user memory to mkl

@@ -147,7 +143,7 @@ namespace sd {
 * @param primitive memory descriptor
 * @param dnnl arg activation enumerator
 */
-void loadDataToMklStream(const NDArray* array, const dnnl::engine& engine, const dnnl::stream& stream, const dnnl::memory::desc& user_md, const dnnl::memory::desc& primitive_md,
+dnnl::memory loadDataToMklStream(const NDArray& array, const dnnl::engine& engine, const dnnl::stream& stream, const dnnl::memory::desc& user_md, const dnnl::memory::desc& primitive_md,
                                  dnnl::memory& arg);

 /**
@@ -35,32 +35,37 @@ namespace sd {
 //////////////////////////////////////////////////////////////////////
 static void softmaxMKLDNN(const NDArray* x, NDArray* z, const int axis) {

-const auto xRank = x->rankOf();
-dnnl::memory::dims xShape, zShape;
-
-mkldnnUtils::getDims(x, xRank, xShape);
-mkldnnUtils::getDims(z, xRank, zShape);
-
-dnnl::memory::format_tag format = mkldnnUtils::getFormat(xRank);
+dnnl::memory::dims shape = x->getShapeAsFlatVector();
+
+const int xRank = x->rankOf();
+
+dnnl::memory::format_tag xFormat = mkldnnUtils::getFormat(*x);
+dnnl::memory::format_tag zFormat = mkldnnUtils::getFormat(*z);
+
 // optimized cases
 if (2 == xRank && 0 == axis) {
-    format = dnnl::memory::format_tag::ba;
+    if(x->ews() == 1)
+        xFormat = dnnl::memory::format_tag::ba;
+    if(z->ews() == 1)
+        zFormat = dnnl::memory::format_tag::ba;
 }
 else if (4 == xRank && 1 == axis && (x->sizeAt(2) * x->sizeAt(3)) > 1) {
-    format = dnnl::memory::format_tag::acdb;
+    if(x->ews() == 1)
+        xFormat = dnnl::memory::format_tag::acdb;
+    if(z->ews() == 1)
+        zFormat = dnnl::memory::format_tag::acdb;
 }

 dnnl::memory::data_type xType = dnnl::memory::data_type::f32;

-dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, format);
-dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, format);
-mkldnnUtils::setBlockStrides(x, x_user_md);
+dnnl::memory::desc x_mkl_md, x_user_md, z_mkl_md, z_user_md;
+
+x_user_md = x_mkl_md = dnnl::memory::desc(shape, xType, xFormat);
+mkldnnUtils::setBlockStrides(*x, x_user_md);

 // z
-dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, xType, format);
-dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, xType, format);
-mkldnnUtils::setBlockStrides(z, z_user_md);
+z_user_md = z_mkl_md = dnnl::memory::desc(shape, xType, zFormat);
+mkldnnUtils::setBlockStrides(*z, z_user_md);

 auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -80,20 +85,17 @@ namespace sd {
 // provide memory buffers and check whether reorder is required

 // input
-mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

 // z
-auto z_user_mem = dnnl::memory(z_user_md, engine, z->buffer());
-const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
-auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
-args[DNNL_ARG_DST] = z_mkl_mem;
+auto z_user_mem = mkldnnUtils::loadDataToMklStream(*z, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

 // run calculations
 dnnl::softmax_forward(op_prim_desc).execute(stream, args);

 // reorder outputs if necessary
-if (zReorder)
-    dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
+    dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

 stream.wait();
 }

@@ -142,33 +144,19 @@ namespace sd {
 //////////////////////////////////////////////////////////////////////
 static void softmaxBpMKLDNN(const NDArray* x, const NDArray* dLdz, NDArray* dLdx, const int axis) {

-const auto xRank = x->rankOf();
-const auto dLdzRank = dLdz->rankOf();
-
-dnnl::memory::dims xShape, dLdxShape, dLdzShape;
-
-mkldnnUtils::getDims(x, xRank, xShape);
-mkldnnUtils::getDims(dLdx, xRank, dLdxShape);
-mkldnnUtils::getDims(dLdz, dLdzRank, dLdzShape);
-
-dnnl::memory::format_tag format = mkldnnUtils::getFormat(xRank);
+dnnl::memory::desc x_user_md, x_mkl_md, dLdx_mkl_md, dLdx_user_md, dLdz_mkl_md, dLdz_user_md;

 // x
-dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-mkldnnUtils::setBlockStrides(x, x_user_md);
+x_mkl_md = x_user_md = dnnl::memory::desc(x->getShapeAsFlatVector(), dnnl::memory::data_type::f32, mkldnnUtils::getFormat(*x));
+mkldnnUtils::setBlockStrides(*x, x_user_md);

 // dLdx
-dnnl::memory::desc dLdx_mkl_md = dnnl::memory::desc(dLdxShape, dnnl::memory::data_type::f32, format);
-dnnl::memory::desc dLdx_user_md = dnnl::memory::desc(dLdxShape, dnnl::memory::data_type::f32, format);
-mkldnnUtils::setBlockStrides(dLdx, dLdx_user_md);
-// todo if mkl does not support broadcast we can remove this
-format = mkldnnUtils::getFormat(dLdzRank);
+dLdx_mkl_md = dLdx_user_md = dnnl::memory::desc(dLdx->getShapeAsFlatVector(), dnnl::memory::data_type::f32, mkldnnUtils::getFormat(*dLdx));
+mkldnnUtils::setBlockStrides(*dLdx, dLdx_user_md);

 // dLdz
-dnnl::memory::desc dLdz_mkl_md = dnnl::memory::desc(dLdzShape, dnnl::memory::data_type::f32, format);
-dnnl::memory::desc dLdz_user_md = dnnl::memory::desc(dLdzShape, dnnl::memory::data_type::f32, format);
-mkldnnUtils::setBlockStrides(dLdz, dLdz_user_md);
+dLdz_mkl_md = dLdz_user_md = dnnl::memory::desc(dLdz->getShapeAsFlatVector(), dnnl::memory::data_type::f32, mkldnnUtils::getFormat(*dLdz));
+mkldnnUtils::setBlockStrides(*dLdz, dLdz_user_md);

 auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -188,19 +176,18 @@ namespace sd {

 // provide memory buffers and check whether reorder is required for forward
 // input
-mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, op_ff_prim_desc.src_desc(), argsff[DNNL_ARG_SRC]);
+mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, op_ff_prim_desc.src_desc(), argsff[DNNL_ARG_SRC]);

+// dLdz
+mkldnnUtils::loadDataToMklStream(*dLdz, engine, stream, dLdz_user_md, op_bp_prim_desc.diff_dst_desc(), argsbp[DNNL_ARG_DIFF_DST]);
+
 // dLdx
-auto dLdx_user_mem = dnnl::memory(dLdx_user_md, engine, dLdx->buffer());
-const bool dLdxReorder = op_ff_prim_desc.dst_desc() != dLdx_user_mem.get_desc();
-auto dLdx_mkl_mem = dLdxReorder ? dnnl::memory(op_ff_prim_desc.dst_desc(), engine) : dLdx_user_mem;
-argsff[DNNL_ARG_DST] = dLdx_mkl_mem;
+auto dLdx_user_mem = mkldnnUtils::loadDataToMklStream(*dLdx, engine, stream, dLdx_user_md, op_ff_prim_desc.src_desc(), argsff[DNNL_ARG_DST]);

 // check and arg set for backprob
-argsbp[DNNL_ARG_DIFF_SRC] = dLdx_mkl_mem;
-argsbp[DNNL_ARG_DST] = dLdx_mkl_mem;
-// dLdz
-mkldnnUtils::loadDataToMklStream(dLdz, engine, stream, dLdz_user_md, op_bp_prim_desc.diff_dst_desc(), argsbp[DNNL_ARG_DIFF_DST]);
+argsbp[DNNL_ARG_DIFF_SRC] = argsff[DNNL_ARG_DST];
+argsbp[DNNL_ARG_DST] = argsff[DNNL_ARG_DST];

 // run calculations forward
 dnnl::softmax_forward(op_ff_prim_desc).execute(stream, argsff);

@@ -209,8 +196,8 @@ namespace sd {
 dnnl::softmax_backward(op_bp_prim_desc).execute(stream, argsbp);

 // reorder outputs if necessary
-if (dLdxReorder)
-    dnnl::reorder(dLdx_mkl_mem, dLdx_user_mem).execute(stream, dLdx_mkl_mem, dLdx_user_mem);
+if (op_ff_prim_desc.dst_desc() != dLdx_user_mem.get_desc())
+    dnnl::reorder(argsff[DNNL_ARG_DST], dLdx_user_mem).execute(stream, argsff[DNNL_ARG_DST], dLdx_user_mem);

 stream.wait();
 }

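One detail of the softmax_bp hunk: argsff[DNNL_ARG_DST] is reused as both argsbp[DNNL_ARG_DST] and argsbp[DNNL_ARG_DIFF_SRC], so the backward primitive reads softmax(x) from and writes dLdx into the same memory, and a single reorder back to dLdx_user_mem covers the final result. Restated as it appears above (argsff/argsbp as in the hunk):

    // aliasing set up by the hunk above
    argsbp[DNNL_ARG_DST]      = argsff[DNNL_ARG_DST];   // forward result softmax(x), read by the bp primitive
    argsbp[DNNL_ARG_DIFF_SRC] = argsff[DNNL_ARG_DST];   // dLdx is written over it in place
    // after softmax_backward executes, only this memory may still need reordering back into dLdx_user_mem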
@@ -34,22 +34,16 @@ namespace sd {

 //////////////////////////////////////////////////////////////////////
 static void tanhMKLDNN(const NDArray* x, NDArray* z) {

-    const auto xRank = x->rankOf();
-    dnnl::memory::dims xShape, zShape;
-    mkldnnUtils::getDims(x, xRank, xShape);
-    mkldnnUtils::getDims(z, xRank, zShape);
-
-    dnnl::memory::format_tag format = mkldnnUtils::getFormat(xRank);
-
-    dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-    dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-    mkldnnUtils::setBlockStrides(x, x_user_md);
+    dnnl::memory::dims shape = x->getShapeAsFlatVector();
+
+    dnnl::memory::desc x_mkl_md, x_user_md, z_mkl_md, z_user_md;
+
+    x_user_md = x_mkl_md = dnnl::memory::desc(shape, dnnl::memory::data_type::f32, mkldnnUtils::getFormat(*x));
+    mkldnnUtils::setBlockStrides(*x, x_user_md);

     // z
-    dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, dnnl::memory::data_type::f32, format);
-    dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, dnnl::memory::data_type::f32, format);
-    mkldnnUtils::setBlockStrides(z, z_user_md);
+    z_user_md = z_mkl_md = dnnl::memory::desc(shape, dnnl::memory::data_type::f32, mkldnnUtils::getFormat(*z));
+    mkldnnUtils::setBlockStrides(*z, z_user_md);

     auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -68,20 +62,17 @@ namespace sd {

     // provide memory buffers and check whether reorder is required
     // input
-    mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // z
-    auto z_user_mem = dnnl::memory(z_user_md, engine, z->buffer());
-    const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
-    auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
-    args[DNNL_ARG_DST] = z_mkl_mem;
+    auto z_user_mem = mkldnnUtils::loadDataToMklStream(*z, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);

     // run calculations
     dnnl::eltwise_forward(op_prim_desc).execute(stream, args);

     // reorder outputs if necessary
-    if (zReorder)
-        dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+    if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);

     stream.wait();
 }
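Note (an illustration, not code from this patch): stripped of the mkldnnUtils reorder handling, the eltwise-tanh forward primitive that tanhMKLDNN drives boils down to the following standalone sketch, assuming oneDNN 1.x headers are available and the buffers are contiguous f32:

    #include <cstdint>
    #include "dnnl.hpp"

    // Apply tanh element-wise to a contiguous float buffer via the oneDNN eltwise primitive.
    void tanhOneDnn(const float* src, float* dst, std::int64_t n) {
        dnnl::engine engine(dnnl::engine::kind::cpu, 0);
        dnnl::stream stream(engine);

        // a plain 1-D f32 descriptor is enough for a contiguous buffer
        dnnl::memory::desc md({n}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::a);

        dnnl::eltwise_forward::desc ffDesc(dnnl::prop_kind::forward_inference,
                                           dnnl::algorithm::eltwise_tanh, md, 0.f, 0.f);
        dnnl::eltwise_forward::primitive_desc ffPrimDesc(ffDesc, engine);

        dnnl::memory srcMem(md, engine, const_cast<float*>(src));
        dnnl::memory dstMem(md, engine, dst);

        dnnl::eltwise_forward(ffPrimDesc).execute(stream, {{DNNL_ARG_SRC, srcMem}, {DNNL_ARG_DST, dstMem}});
        stream.wait();
    }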
@@ -121,28 +112,21 @@ namespace sd {

 //////////////////////////////////////////////////////////////////////
 static void tanhBpMKLDNN(const NDArray* x, const NDArray* dLdz, NDArray* dLdx) {

-    const auto xRank = x->rankOf();
-    dnnl::memory::dims xShape, dLdzShape, dLdxShape;
-    mkldnnUtils::getDims(x, xRank, xShape);
-    mkldnnUtils::getDims(dLdz, xRank, dLdzShape);
-    mkldnnUtils::getDims(dLdx, xRank, dLdxShape);
-
-    dnnl::memory::format_tag format = mkldnnUtils::getFormat(xRank);
-
-    dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-    dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-    mkldnnUtils::setBlockStrides(x, x_user_md);
+    dnnl::memory::dims shape = x->getShapeAsFlatVector();
+
+    dnnl::memory::desc x_mkl_md, x_user_md, dLdx_mkl_md, dLdx_user_md, dLdz_mkl_md, dLdz_user_md;
+
+    // x
+    x_user_md = x_mkl_md = dnnl::memory::desc(shape, dnnl::memory::data_type::f32, mkldnnUtils::getFormat(*x));
+    mkldnnUtils::setBlockStrides(*x, x_user_md);

     // dLdz
-    dnnl::memory::desc dLdz_mkl_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-    dnnl::memory::desc dLdz_user_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-    mkldnnUtils::setBlockStrides(dLdz, dLdz_user_md);
+    dLdz_user_md = dLdz_mkl_md = dnnl::memory::desc(shape, dnnl::memory::data_type::f32, mkldnnUtils::getFormat(*dLdz));
+    mkldnnUtils::setBlockStrides(*dLdz, dLdz_user_md);

     // dLdx
-    dnnl::memory::desc dLdx_mkl_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-    dnnl::memory::desc dLdx_user_md = dnnl::memory::desc(xShape, dnnl::memory::data_type::f32, format);
-    mkldnnUtils::setBlockStrides(dLdx, dLdx_user_md);
+    dLdx_user_md = dLdx_mkl_md = dnnl::memory::desc(shape, dnnl::memory::data_type::f32, mkldnnUtils::getFormat(*dLdx));
+    mkldnnUtils::setBlockStrides(*dLdx, dLdx_user_md);

     auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());

@@ -162,23 +146,20 @@ namespace sd {

     // provide memory buffers and check whether reorder is required for forward
     // input
-    mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
+    mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);

     // dLdz
-    mkldnnUtils::loadDataToMklStream(dLdz, engine, stream, dLdz_user_md, op_prim_desc.diff_dst_desc(), args[DNNL_ARG_DIFF_DST]);
+    mkldnnUtils::loadDataToMklStream(*dLdz, engine, stream, dLdz_user_md, op_prim_desc.diff_dst_desc(), args[DNNL_ARG_DIFF_DST]);

     // dLdx
-    auto dLdx_user_mem = dnnl::memory(dLdx_user_md, engine, dLdx->buffer());
-    const bool dLdxReorder = op_prim_desc.diff_src_desc() != dLdx_user_mem.get_desc();
-    auto dLdx_mkl_mem = dLdxReorder ? dnnl::memory(op_prim_desc.diff_src_desc(), engine) : dLdx_user_mem;
-    args[DNNL_ARG_DIFF_SRC] = dLdx_mkl_mem;
+    auto dLdx_user_mem = mkldnnUtils::loadDataToMklStream(*dLdx, engine, stream, dLdx_user_md, op_prim_desc.diff_src_desc(), args[DNNL_ARG_DIFF_SRC]);

     // run calculations backward
     dnnl::eltwise_backward(op_prim_desc).execute(stream, args);

     // reorder outputs if necessary
-    if (dLdxReorder)
-        dnnl::reorder(dLdx_mkl_mem, dLdx_user_mem).execute(stream, dLdx_mkl_mem, dLdx_user_mem);
+    if (op_prim_desc.diff_src_desc() != dLdx_user_mem.get_desc())
+        dnnl::reorder(args[DNNL_ARG_DIFF_SRC], dLdx_user_mem).execute(stream, args[DNNL_ARG_DIFF_SRC], dLdx_user_mem);

     stream.wait();
 }
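Note (hand-derived, not taken from the patch): with the original input x supplied as DNNL_ARG_SRC and the incoming gradient dLdz as DNNL_ARG_DIFF_DST, the eltwise_tanh backward primitive is expected to produce

    \frac{\partial L}{\partial x} = \frac{\partial L}{\partial z}\,\bigl(1 - \tanh^{2}(x)\bigr)

into DNNL_ARG_DIFF_SRC, which is the quantity the refactored tanhBpMKLDNN writes back into dLdx.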
@@ -82,33 +82,23 @@ namespace sd {
|
||||||
// memory descriptors for arrays
|
// memory descriptors for arrays
|
||||||
// x
|
// x
|
||||||
dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, dnnl::memory::format_tag::any);
|
dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, format);
|
dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, mkldnnUtils::getFormat(*x));
|
||||||
mkldnnUtils::setBlockStrides(x, x_user_md);
|
mkldnnUtils::setBlockStrides(*x, x_user_md);
|
||||||
|
|
||||||
// weights
|
// weights
|
||||||
dnnl::memory::desc weights_mkl_md = dnnl::memory::desc(wShape, wType, dnnl::memory::format_tag::any);
|
dnnl::memory::desc weights_mkl_md = dnnl::memory::desc(wShape, wType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc weights_user_md = dnnl::memory::desc(wShape, wType, format);
|
dnnl::memory::desc weights_user_md = dnnl::memory::desc(wShape, wType, mkldnnUtils::getFormat(*weights));
|
||||||
if (weights->ews() != 1 || weights->ordering() != 'c' || bShouldTransp) {
|
mkldnnUtils::setBlockStrides(*weights, weights_user_md, bShouldTransp ? std::vector<int>({1,0}) : std::vector<int>());
|
||||||
|
|
||||||
weights_user_md.data.format_kind = dnnl_blocked; // overrides format
|
|
||||||
if (bShouldTransp) {
|
|
||||||
weights_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(1);
|
|
||||||
weights_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(0);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
weights_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(0);
|
|
||||||
weights_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// bias
|
// bias
|
||||||
dnnl::memory::desc bias_mkl_md = dnnl::memory::desc(bShape, bType, dnnl::memory::format_tag::x);
|
dnnl::memory::desc bias_mkl_md = dnnl::memory::desc(bShape, bType, dnnl::memory::format_tag::a);
|
||||||
dnnl::memory::desc bias_user_md = dnnl::memory::desc(bShape, bType, dnnl::memory::format_tag::x);
|
dnnl::memory::desc bias_user_md = dnnl::memory::desc(bShape, bType, dnnl::memory::format_tag::a);
|
||||||
mkldnnUtils::setBlockStrides(bias, bias_user_md);
|
mkldnnUtils::setBlockStrides(*bias, bias_user_md);
|
||||||
|
|
||||||
// z
|
// z
|
||||||
dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, dnnl::memory::format_tag::any);
|
dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, format);
|
dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, mkldnnUtils::getFormat(*z));
|
||||||
mkldnnUtils::setBlockStrides(z, z_user_md);
|
mkldnnUtils::setBlockStrides(*z, z_user_md);
|
||||||
|
|
||||||
auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());
|
auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());
|
||||||
|
|
||||||
|
@@ -125,27 +115,24 @@ namespace sd {
|
||||||
// provide memory buffers and check whether reorder is required
|
// provide memory buffers and check whether reorder is required
|
||||||
|
|
||||||
// input
|
// input
|
||||||
mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
|
mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, op_prim_desc.src_desc(), args[DNNL_ARG_SRC]);
|
||||||
|
|
||||||
// weights
|
// weights
|
||||||
mkldnnUtils::loadDataToMklStream(weights, engine, stream, weights_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
|
mkldnnUtils::loadDataToMklStream(*weights, engine, stream, weights_user_md, op_prim_desc.weights_desc(), args[DNNL_ARG_WEIGHTS]);
|
||||||
|
|
||||||
// bias
|
// bias
|
||||||
auto bias_mkl_mem = dnnl::memory(bias_mkl_md, engine, const_cast<void*>(bias->buffer()));
|
auto bias_mkl_mem = dnnl::memory(bias_mkl_md, engine, const_cast<void*>(bias->buffer()));
|
||||||
args[DNNL_ARG_BIAS] = bias_mkl_mem;
|
args[DNNL_ARG_BIAS] = bias_mkl_mem;
|
||||||
|
|
||||||
// z
|
// z
|
||||||
auto z_user_mem = dnnl::memory(z_user_md, engine, z->buffer());
|
auto z_user_mem = mkldnnUtils::loadDataToMklStream(*z, engine, stream, z_user_md, op_prim_desc.dst_desc(), args[DNNL_ARG_DST]);
|
||||||
const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
|
|
||||||
auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
|
|
||||||
args[DNNL_ARG_DST] = z_mkl_mem;
|
|
||||||
|
|
||||||
// run calculations
|
// run calculations
|
||||||
dnnl::inner_product_forward(op_prim_desc).execute(stream, args);
|
dnnl::inner_product_forward(op_prim_desc).execute(stream, args);
|
||||||
|
|
||||||
// reorder outputs if necessary
|
// reorder outputs if necessary
|
||||||
if (zReorder)
|
if (op_prim_desc.dst_desc() != z_user_mem.get_desc())
|
||||||
dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
|
dnnl::reorder(args[DNNL_ARG_DST], z_user_mem).execute(stream, args[DNNL_ARG_DST], z_user_mem);
|
||||||
|
|
||||||
stream.wait();
|
stream.wait();
|
||||||
}
|
}
|
||||||
|
@@ -168,71 +155,53 @@ namespace sd {
|
||||||
dnnl::memory::dims dLdzShape = dnnl::memory::dims({ M, N });
|
dnnl::memory::dims dLdzShape = dnnl::memory::dims({ M, N });
|
||||||
|
|
||||||
dnnl::memory::dims bShape = dnnl::memory::dims({ N });
|
dnnl::memory::dims bShape = dnnl::memory::dims({ N });
|
||||||
|
|
||||||
// output dims
|
// output dims
|
||||||
dnnl::memory::dims dLdxShape = xShape;
|
dnnl::memory::dims dLdxShape = xShape;
|
||||||
dnnl::memory::dims dLdwShape = wShape;
|
dnnl::memory::dims dLdwShape = wShape;
|
||||||
|
|
||||||
dnnl::memory::format_tag format = dnnl::memory::format_tag::ab;
|
|
||||||
dnnl::memory::data_type dataType = dnnl::memory::data_type::f32;
|
dnnl::memory::data_type dataType = dnnl::memory::data_type::f32;
|
||||||
|
|
||||||
// memory descriptors for arrays
|
// memory descriptors for arrays
|
||||||
// x
|
// x
|
||||||
dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, dataType, dnnl::memory::format_tag::any);
|
dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, dataType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, dataType, format);
|
dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, dataType, mkldnnUtils::getFormat(*x));
|
||||||
mkldnnUtils::setBlockStrides(x, x_user_md);
|
mkldnnUtils::setBlockStrides(*x, x_user_md);
|
||||||
|
|
||||||
// weights
|
// weights
|
||||||
dnnl::memory::desc weights_mkl_md = dnnl::memory::desc(wShape, dataType, dnnl::memory::format_tag::any);
|
dnnl::memory::desc weights_mkl_md = dnnl::memory::desc(wShape, dataType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc weights_user_md = dnnl::memory::desc(wShape, dataType, format);
|
dnnl::memory::desc weights_user_md = dnnl::memory::desc(wShape, dataType, mkldnnUtils::getFormat(*weights));
|
||||||
if (weights->ews() != 1 || weights->ordering() != 'c' || bShouldTransp) {
|
mkldnnUtils::setBlockStrides(*weights, weights_user_md, bShouldTransp ? std::vector<int>({1,0}) : std::vector<int>());
|
||||||
|
|
||||||
weights_user_md.data.format_kind = dnnl_blocked; // overrides format
|
|
||||||
if (bShouldTransp) {
|
|
||||||
weights_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(1);
|
|
||||||
weights_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(0);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
weights_user_md.data.format_desc.blocking.strides[0] = weights->strideAt(0);
|
|
||||||
weights_user_md.data.format_desc.blocking.strides[1] = weights->strideAt(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// bias
|
// bias
|
||||||
dnnl::memory::desc bias_mkl_md = dnnl::memory::desc(bShape, dataType, dnnl::memory::format_tag::x);
|
dnnl::memory::desc bias_mkl_md = dnnl::memory::desc(bShape, dataType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc bias_user_md = dnnl::memory::desc(bShape, dataType, dnnl::memory::format_tag::x);
|
dnnl::memory::desc bias_user_md = dnnl::memory::desc(bShape, dataType, mkldnnUtils::getFormat(*bias));
|
||||||
mkldnnUtils::setBlockStrides(bias, bias_user_md);
|
mkldnnUtils::setBlockStrides(*bias, bias_user_md);
|
||||||
|
|
||||||
// dLdz
|
// dLdz
|
||||||
dnnl::memory::desc dLdz_mkl_md = dnnl::memory::desc(dLdzShape, dataType, dnnl::memory::format_tag::any);
|
dnnl::memory::desc dLdz_mkl_md = dnnl::memory::desc(dLdzShape, dataType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc dLdz_user_md = dnnl::memory::desc(dLdzShape, dataType, format);
|
dnnl::memory::desc dLdz_user_md = dnnl::memory::desc(dLdzShape, dataType, mkldnnUtils::getFormat(*dLdz));
|
||||||
mkldnnUtils::setBlockStrides(dLdz, dLdz_user_md);
|
mkldnnUtils::setBlockStrides(*dLdz, dLdz_user_md);
|
||||||
|
|
||||||
|
|
||||||
// dLdw
|
// dLdw
|
||||||
dnnl::memory::desc dLdw_mkl_md = dnnl::memory::desc(wShape, dataType, format);
|
dnnl::memory::desc dLdw_mkl_md = dnnl::memory::desc(wShape, dataType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc dLdw_user_md = dnnl::memory::desc(wShape, dataType, format);
|
dnnl::memory::desc dLdw_user_md = dnnl::memory::desc(wShape, dataType, mkldnnUtils::getFormat(*dLdw));
|
||||||
if (dLdw->ews() != 1 || dLdw->ordering() != 'c' || bShouldTransp) {
|
mkldnnUtils::setBlockStrides(*dLdw, dLdw_user_md, bShouldTransp ? std::vector<int>({1,0}) : std::vector<int>());
|
||||||
|
|
||||||
dLdw_user_md.data.format_kind = dnnl_blocked; // overrides format
|
|
||||||
if (bShouldTransp) {
|
|
||||||
dLdw_user_md.data.format_desc.blocking.strides[0] = dLdw->strideAt(1);
|
|
||||||
dLdw_user_md.data.format_desc.blocking.strides[1] = dLdw->strideAt(0);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
dLdw_user_md.data.format_desc.blocking.strides[0] = dLdw->strideAt(0);
|
|
||||||
dLdw_user_md.data.format_desc.blocking.strides[1] = dLdw->strideAt(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// dLdb
|
// dLdb
|
||||||
dnnl::memory::desc dLdb_mkl_md = dnnl::memory::desc(bShape, dataType, dnnl::memory::format_tag::x);
|
dnnl::memory::desc dLdb_mkl_md = dnnl::memory::desc(bShape, dataType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc dLdb_user_md = dnnl::memory::desc(bShape, dataType, dnnl::memory::format_tag::x);
|
dnnl::memory::desc dLdb_user_md = dnnl::memory::desc(bShape, dataType, mkldnnUtils::getFormat(*dLdb));
|
||||||
mkldnnUtils::setBlockStrides(dLdb, dLdb_user_md);
|
mkldnnUtils::setBlockStrides(*dLdb, dLdb_user_md);
|
||||||
|
|
||||||
// dLdx
|
// dLdx
|
||||||
dnnl::memory::desc dLdx_mkl_md = dnnl::memory::desc(xShape, dataType, dnnl::memory::format_tag::any);
|
dnnl::memory::desc dLdx_mkl_md = dnnl::memory::desc(xShape, dataType, dnnl::memory::format_tag::any);
|
||||||
dnnl::memory::desc dLdx_user_md = dnnl::memory::desc(xShape, dataType, format);
|
dnnl::memory::desc dLdx_user_md = dnnl::memory::desc(xShape, dataType, mkldnnUtils::getFormat(*dLdx));
|
||||||
mkldnnUtils::setBlockStrides(dLdx, dLdx_user_md);
|
mkldnnUtils::setBlockStrides(*dLdx, dLdx_user_md);
|
||||||
|
|
||||||
|
// create engine
|
||||||
auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());
|
auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());
|
||||||
|
|
||||||
// forward
|
// forward
|
||||||
// operation primitive description
|
// operation primitive description
|
||||||
dnnl::inner_product_forward::desc op_ff_desc(dnnl::prop_kind::forward_inference, x_mkl_md, weights_mkl_md, bias_mkl_md, dLdz_mkl_md);
|
dnnl::inner_product_forward::desc op_ff_desc(dnnl::prop_kind::forward_inference, x_mkl_md, weights_mkl_md, bias_mkl_md, dLdz_mkl_md);
|
||||||
|
@@ -254,34 +223,25 @@ namespace sd {
|
||||||
dnnl::stream stream(engine);
|
dnnl::stream stream(engine);
|
||||||
|
|
||||||
// dLdz dw
|
// dLdz dw
|
||||||
mkldnnUtils::loadDataToMklStream(dLdz, engine, stream, dLdz_user_md, op_bpdw_prim_desc.diff_dst_desc(), argsDw[DNNL_ARG_DIFF_DST]);
|
mkldnnUtils::loadDataToMklStream(*dLdz, engine, stream, dLdz_user_md, op_bpdw_prim_desc.diff_dst_desc(), argsDw[DNNL_ARG_DIFF_DST]);
|
||||||
|
|
||||||
// dLdz - dx
|
// dLdz - dx
|
||||||
mkldnnUtils::loadDataToMklStream(dLdz, engine, stream, dLdz_user_md, op_bpdx_prim_desc.diff_dst_desc(), argsDx[DNNL_ARG_DIFF_DST]);
|
mkldnnUtils::loadDataToMklStream(*dLdz, engine, stream, dLdz_user_md, op_bpdx_prim_desc.diff_dst_desc(), argsDx[DNNL_ARG_DIFF_DST]);
|
||||||
|
|
||||||
// input x for dw
|
// input x for dw
|
||||||
mkldnnUtils::loadDataToMklStream(x, engine, stream, x_user_md, op_bpdw_prim_desc.src_desc(), argsDw[DNNL_ARG_SRC]);
|
mkldnnUtils::loadDataToMklStream(*x, engine, stream, x_user_md, op_bpdw_prim_desc.src_desc(), argsDw[DNNL_ARG_SRC]);
|
||||||
|
|
||||||
// weights - dx
|
// weights - dx
|
||||||
mkldnnUtils::loadDataToMklStream(weights, engine, stream, weights_user_md, op_bpdx_prim_desc.weights_desc(), argsDx[DNNL_ARG_WEIGHTS]);
|
mkldnnUtils::loadDataToMklStream(*weights, engine, stream, weights_user_md, op_bpdx_prim_desc.weights_desc(), argsDx[DNNL_ARG_WEIGHTS]);
|
||||||
|
|
||||||
// dLdw
|
// dLdw
|
||||||
auto dLdw_user_mem = dnnl::memory(dLdw_user_md, engine, dLdw->buffer());
|
auto dLdw_user_mem = mkldnnUtils::loadDataToMklStream(*dLdw, engine, stream, dLdw_user_md, op_bpdw_prim_desc.diff_weights_desc(), argsDw[DNNL_ARG_DIFF_WEIGHTS]);
|
||||||
const bool dLdwReorder = op_bpdw_prim_desc.diff_weights_desc() != dLdw_user_mem.get_desc();
|
|
||||||
auto dLdw_mkl_mem = dLdwReorder ? dnnl::memory(op_bpdw_prim_desc.diff_weights_desc(), engine) : dLdw_user_mem;
|
|
||||||
argsDw[DNNL_ARG_DIFF_WEIGHTS] = dLdw_mkl_mem;
|
|
||||||
|
|
||||||
// dLdx
|
// dLdx
|
||||||
auto dLdx_user_mem = dnnl::memory(dLdx_user_md, engine, dLdx->buffer());
|
auto dLdx_user_mem = mkldnnUtils::loadDataToMklStream(*dLdx, engine, stream, dLdx_user_md, op_bpdx_prim_desc.diff_src_desc(), argsDx[DNNL_ARG_DIFF_SRC]);
|
||||||
const bool dLdxReorder = op_bpdx_prim_desc.diff_src_desc() != dLdx_user_mem.get_desc();
|
|
||||||
auto dLdx_mkl_mem = dLdxReorder ? dnnl::memory(op_bpdx_prim_desc.diff_src_desc(), engine) : dLdx_user_mem;
|
|
||||||
argsDx[DNNL_ARG_DIFF_SRC] = dLdx_mkl_mem;
|
|
||||||
|
|
||||||
// dLdb
|
// dLdb
|
||||||
auto dLdb_user_mem = dnnl::memory(dLdb_user_md, engine, dLdb->buffer());
|
auto dLdb_user_mem = mkldnnUtils::loadDataToMklStream(*dLdb, engine, stream, dLdb_user_md, op_bpdw_prim_desc.diff_bias_desc(), argsDw[DNNL_ARG_DIFF_BIAS]);
|
||||||
const bool dLdbReorder = op_bpdw_prim_desc.diff_bias_desc() != dLdb_user_mem.get_desc();
|
|
||||||
auto dLdb_mkl_mem = dLdbReorder ? dnnl::memory(op_bpdw_prim_desc.diff_bias_desc(), engine) : dLdb_user_mem;
|
|
||||||
argsDw[DNNL_ARG_DIFF_BIAS] = dLdb_mkl_mem;
|
|
||||||
|
|
||||||
// run calculations dw
|
// run calculations dw
|
||||||
dnnl::inner_product_backward_weights(op_bpdw_prim_desc).execute(stream, argsDw);
|
dnnl::inner_product_backward_weights(op_bpdw_prim_desc).execute(stream, argsDw);
|
||||||
|
@@ -289,14 +249,14 @@ namespace sd {
|
||||||
dnnl::inner_product_backward_data(op_bpdx_prim_desc).execute(stream, argsDx);
|
dnnl::inner_product_backward_data(op_bpdx_prim_desc).execute(stream, argsDx);
|
||||||
|
|
||||||
// reorder outputs if necessary
|
// reorder outputs if necessary
|
||||||
if (dLdxReorder)
|
if (op_bpdx_prim_desc.diff_src_desc() != dLdx_user_mem.get_desc())
|
||||||
dnnl::reorder(dLdx_mkl_mem, dLdx_user_mem).execute(stream, dLdx_mkl_mem, dLdx_user_mem);
|
dnnl::reorder(argsDx[DNNL_ARG_DIFF_SRC], dLdx_user_mem).execute(stream, argsDx[DNNL_ARG_DIFF_SRC], dLdx_user_mem);
|
||||||
|
|
||||||
if (dLdwReorder)
|
if (op_bpdw_prim_desc.diff_weights_desc() != dLdw_user_mem.get_desc())
|
||||||
dnnl::reorder(dLdw_mkl_mem, dLdw_user_mem).execute(stream, dLdw_mkl_mem, dLdw_user_mem);
|
dnnl::reorder(argsDw[DNNL_ARG_DIFF_WEIGHTS], dLdw_user_mem).execute(stream, argsDw[DNNL_ARG_DIFF_WEIGHTS], dLdw_user_mem);
|
||||||
|
|
||||||
if (dLdbReorder)
|
if (op_bpdw_prim_desc.diff_bias_desc() != dLdb_user_mem.get_desc())
|
||||||
dnnl::reorder(dLdb_mkl_mem, dLdb_user_mem).execute(stream, dLdb_mkl_mem, dLdb_user_mem);
|
dnnl::reorder(argsDw[DNNL_ARG_DIFF_BIAS], dLdb_user_mem).execute(stream, argsDw[DNNL_ARG_DIFF_BIAS], dLdb_user_mem);
|
||||||
|
|
||||||
stream.wait();
|
stream.wait();
|
||||||
}
|
}
|
||||||
|
|
|
@@ -107,6 +107,25 @@ namespace sd {
|
||||||
// samediff::Threads::parallel_tad(func, 0, numOfArrs);
|
// samediff::Threads::parallel_tad(func, 0, numOfArrs);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
// static Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
|
||||||
|
|
||||||
|
// Nd4jLong result = 9223372036854775807LL;
|
||||||
|
|
||||||
|
// for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
|
||||||
|
|
||||||
|
// const auto currentStride = shape::stride(inShapeInfo)[i];
|
||||||
|
|
||||||
|
// if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
|
||||||
|
// continue;
|
||||||
|
|
||||||
|
// if(result > currentStride)
|
||||||
|
// result = currentStride;
|
||||||
|
// }
|
||||||
|
|
||||||
|
// return result == 9223372036854775807LL ? 1 : result;
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inArrs, NDArray& output, const int axis) {
|
void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inArrs, NDArray& output, const int axis) {
|
||||||
|
|
||||||
|
@@ -150,7 +169,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
|
||||||
// if(!areInputsContin || !allSameOrder)
|
// if(!areInputsContin || !allSameOrder)
|
||||||
// break;
|
// break;
|
||||||
|
|
||||||
// strideOfContigStride[i] = shape::strideOverContigAxis(axis, inArrs[i]->shapeInfo());
|
// strideOfContigStride[i] = strideOverContigAxis(axis, inArrs[i]->getShapeInfo());
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
@@ -158,7 +177,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
|
||||||
|
|
||||||
// if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array
|
// if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array
|
||||||
|
|
||||||
// const auto zStep = shape::strideOverContigAxis(axis, output.shapeInfo());
|
// const auto zStep = strideOverContigAxis(axis, output.getShapeInfo());
|
||||||
|
|
||||||
// for (uint i = 0; i < output.lengthOf() / output.sizeAt(axis); ++i) {
|
// for (uint i = 0; i < output.lengthOf() / output.sizeAt(axis); ++i) {
|
||||||
|
|
||||||
|
|
|
@@ -184,7 +184,7 @@ TEST_F(DeclarableOpsTests16, test_range_2) {
     double tArgs[] = { -1.0, 1.0, 0.01 };

     auto shapes = ::calculateOutputShapes2(nullptr, op.getOpHash(), nullptr, nullptr, 0, tArgs, 3, nullptr, 0, nullptr, 0, nullptr, 0);
-    shape::printShapeInfoLinear("Result", shapes->at(0));
+    // shape::printShapeInfoLinear("Result", shapes->at(0));
     ASSERT_TRUE(shape::shapeEquals(z.shapeInfo(), shapes->at(0)));

     delete shapes;
@@ -1074,3 +1074,422 @@ TEST_F(DeclarableOpsTests16, test_yiq_to_rgb_6) {
|
||||||
ASSERT_EQ(ND4J_STATUS_OK, status);
|
ASSERT_EQ(ND4J_STATUS_OK, status);
|
||||||
ASSERT_TRUE(expected.equalsTo(actual));
|
ASSERT_TRUE(expected.equalsTo(actual));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_1) {
|
||||||
|
auto x= NDArrayFactory::create<double>('c', {2, 3}, {-3.0, 0.0, 0.0, 4.0, 0.0, 0.0});
|
||||||
|
auto exp= NDArrayFactory::create<double>('c', {2, 3}, {-2.4, 0.0, 0.0, 3.2, 0.0, 0.0});
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {4.0}, {});
|
||||||
|
|
||||||
|
auto z = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(z));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(z));
|
||||||
|
|
||||||
|
}
|
||||||
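Note (a plain C++ sketch of the rule the expected arrays above encode, not the library kernel): clipping the whole array by its L2 norm simply rescales it when the norm exceeds the threshold, e.g. norm2({-3,0,0,4,0,0}) = 5 and a clip value of 4 gives {-2.4, 0, 0, 3.2, 0, 0}:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // y = x * clip / norm2(x) when norm2(x) > clip, otherwise y = x.
    std::vector<double> clipByNorm(const std::vector<double>& x, double clip) {
        double sumSq = 0.0;
        for (double v : x)
            sumSq += v * v;
        const double norm = std::sqrt(sumSq);
        if (norm <= clip)
            return x;                       // already within the allowed norm
        std::vector<double> y(x.size());
        for (std::size_t i = 0; i < x.size(); ++i)
            y[i] = x[i] * clip / norm;      // uniform rescaling preserves direction
        return y;
    }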
|
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_2) {
|
||||||
|
auto x= NDArrayFactory::create<double>('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f});
|
||||||
|
auto exp= NDArrayFactory::create<double>('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f});
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {6.0}, {});
|
||||||
|
|
||||||
|
auto z = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(z));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(z));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_3) {
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {3, 5});
|
||||||
|
auto unities = NDArrayFactory::create<double>('c', {3, 1}, {1., 1., 1.});
|
||||||
|
auto scale = NDArrayFactory::create<double>('c', {3, 1}, {1.1, 1., 0.9});
|
||||||
|
|
||||||
|
x.linspace(100.);
|
||||||
|
|
||||||
|
auto xNorm1 = x.reduceAlongDimension(reduce::Norm2, {1}, true);
|
||||||
|
x /= xNorm1;
|
||||||
|
xNorm1 = x.reduceAlongDimension(reduce::Norm2,{1}, true);
|
||||||
|
|
||||||
|
ASSERT_TRUE(unities.isSameShape(xNorm1));
|
||||||
|
ASSERT_TRUE(unities.equalsTo(xNorm1));
|
||||||
|
|
||||||
|
x *= scale;
|
||||||
|
xNorm1 = x.reduceAlongDimension(reduce::Norm2, {1}, true);
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {1.0}, {1});
|
||||||
|
auto z = result.at(0);
|
||||||
|
|
||||||
|
auto zNorm1 = z->reduceAlongDimension(reduce::Norm2, {1}, true);
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {3, 1}, {1., 1., xNorm1.e<double>(2)});
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(&zNorm1));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(&zNorm1));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_4) {
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {3, 5}, {0.7044955, 0.55606544, 0.15833677, 0.001874401, 0.61595726, 0.3924779, 0.7414847, 0.4127324, 0.24026828, 0.26093036, 0.46741188, 0.01863421, 0.08528871, 0.529365, 0.5510694});
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 0.317105});
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {1.f}, {});
|
||||||
|
auto output = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_5) {
|
||||||
|
|
||||||
|
// auto x = NDArrayFactory::create<double>('c', {3, 5}, {1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5});
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {3, 5});
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {3, 5}, {1., 2., 2.89271, 3.50524, 4.00892, 6., 7., 7.71389, 7.88678, 8.01784, 11., 12., 12.53507, 12.26833, 12.02676});
|
||||||
|
// auto exp = NDArrayFactory::create<double>('c', {3, 5}, {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1});
|
||||||
|
|
||||||
|
x.linspace(1);
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {15.f}, {0});
|
||||||
|
auto output = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_6) {
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {3, 5});
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {3, 5}, {1., 2., 3., 4., 5., 4.95434, 5.78006, 6.60578, 7.43151, 8.25723, 5.64288, 6.15587, 6.66886, 7.18185, 7.69484});
|
||||||
|
|
||||||
|
x.linspace(1);
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {15.f}, {1});
|
||||||
|
auto output = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_7) {
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {3, 5});
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957});
|
||||||
|
|
||||||
|
x.linspace(1);
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {15.f}, {0,1});
|
||||||
|
auto output = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_8) {
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {3, 5});
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957});
|
||||||
|
|
||||||
|
x.linspace(1);
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {15.}, {});
|
||||||
|
auto output = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_9) {
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {2}, {3., 4.});
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {2}, {2.4, 3.2});
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {4.}, {});
|
||||||
|
auto output = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_10) {
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>(6.);
|
||||||
|
auto exp = NDArrayFactory::create<double>(5.);
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {5.}, {});
|
||||||
|
auto output = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_11) {
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {2, 3, 4});
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {2, 3, 4}, {1., 2., 3., 4., 4.44787, 5.33745, 6.22702, 7.1166 , 6.33046, 7.03384, 7.73723, 8.44061,
|
||||||
|
13., 14., 15., 16., 15.12277, 16.01235, 16.90192, 17.7915 ,14.77107, 15.47446, 16.17784, 16.88123});
|
||||||
|
|
||||||
|
x.linspace(1);
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {35.}, {0, 2});
|
||||||
|
auto output = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_12) {
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {3, 3}, {1, 2, 3, 4, 5,6, 7, 8, 9});
|
||||||
|
auto e = NDArrayFactory::create<double>('c', {3, 3}, {0.03198684, 0.06397368, 0.09596053, 0.12794736, 0.15993419, 0.19192106, 0.22390789, 0.25589472, 0.28788155});
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&x}, {0.54}, {});
|
||||||
|
|
||||||
|
ASSERT_EQ(e, *result.at(0));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_13) {
|
||||||
|
|
||||||
|
const int bS = 5;
|
||||||
|
const int nOut = 4;
|
||||||
|
const int axis = 0;
|
||||||
|
const double clip = 2.;
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.897 ,0.173 ,0.931 ,0.736 ,0.540 ,0.953 ,0.278 ,0.573 ,0.787 ,0.320 ,0.776 ,0.338 ,0.311 ,0.835 ,0.909 ,0.890 ,0.290}); // uniform random in range [0,1]
|
||||||
|
auto colVect = NDArrayFactory::create<double>('c', {bS, 1}, {0.9, 0.95, 1.00, 1.05, 1.1});
|
||||||
|
auto expect = NDArrayFactory::create<double>('c', {bS, nOut});
|
||||||
|
|
||||||
|
auto norm2 = x.reduceAlongDimension(reduce::Norm2, {axis}, true); // norm2 has shape [1, nOut]
|
||||||
|
|
||||||
|
auto y = ( (x / norm2) * clip) * colVect ;
|
||||||
|
auto temp = (x / norm2) * clip;
|
||||||
|
|
||||||
|
for (int j = 0; j < nOut; ++j) {
|
||||||
|
auto yCol = y({0,0, j,j+1});
|
||||||
|
const double norm2Col = yCol.reduceNumber(reduce::Norm2).e<double>(0);
|
||||||
|
if (norm2Col <= clip)
|
||||||
|
expect({0,0, j,j+1}).assign(yCol);
|
||||||
|
else
|
||||||
|
expect({0,0, j,j+1}).assign ( yCol * (clip / norm2Col) );
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::ops::clipbynorm op;
|
||||||
|
auto result = op.evaluate({&y}, {clip}, {axis});
|
||||||
|
auto outFF = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(expect.isSameShape(outFF));
|
||||||
|
ASSERT_TRUE(expect.equalsTo(outFF));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_bp_1) {
|
||||||
|
|
||||||
|
const int bS = 2;
|
||||||
|
const int nOut = 3;
|
||||||
|
const double clip = 0.7;
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.173 ,0.736 ,0.540 }); // uniform random in range [0,1]
|
||||||
|
auto gradO = NDArrayFactory::create<double>('c', {bS, nOut});
|
||||||
|
|
||||||
|
const OpArgsHolder argsHolderFF({&x}, {clip}, {});
|
||||||
|
const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {});
|
||||||
|
|
||||||
|
sd::ops::clipbynorm opFF;
|
||||||
|
sd::ops::clipbynorm_bp opBP;
|
||||||
|
|
||||||
|
const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);
|
||||||
|
|
||||||
|
ASSERT_TRUE(isGradCorrect);
|
||||||
|
}
|
||||||
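Note (an assumption about the general technique, not the GradCheck implementation): the clipbynorm_bp tests above rely on comparing the analytic backprop output against a numeric derivative, along the lines of this central-difference sketch:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <functional>
    #include <vector>

    // Compare one analytic gradient entry against a central-difference estimate of d(loss)/d(x[i]).
    bool checkGradElement(const std::function<double(const std::vector<double>&)>& loss,
                          std::vector<double> x, double analyticGrad, std::size_t i,
                          double eps = 1e-5, double relTol = 1e-3) {
        x[i] += eps;
        const double lossPlus = loss(x);
        x[i] -= 2.0 * eps;
        const double lossMinus = loss(x);
        const double numericGrad = (lossPlus - lossMinus) / (2.0 * eps);
        return std::fabs(numericGrad - analyticGrad) <= relTol * std::max(1.0, std::fabs(numericGrad));
    }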
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_bp_2) {
|
||||||
|
|
||||||
|
const int bS = 2;
|
||||||
|
const int nOut = 3;
|
||||||
|
const int axis = 0;
|
||||||
|
const double clip = 0.7;
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.173 ,0.736 ,0.540 }); // uniform random in range [0,1]
|
||||||
|
auto gradO = NDArrayFactory::create<double>('c', {bS, nOut});
|
||||||
|
|
||||||
|
const OpArgsHolder argsHolderFF({&x}, {clip}, {axis});
|
||||||
|
const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {axis});
|
||||||
|
|
||||||
|
sd::ops::clipbynorm opFF;
|
||||||
|
sd::ops::clipbynorm_bp opBP;
|
||||||
|
|
||||||
|
const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);
|
||||||
|
|
||||||
|
ASSERT_TRUE(isGradCorrect);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbynorm_bp_3) {
|
||||||
|
|
||||||
|
const int bS = 2;
|
||||||
|
const int nOut = 3;
|
||||||
|
const int axis = 1;
|
||||||
|
const double clip = 1.;
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.173 ,0.736 ,0.540 }); // uniform random in range [0,1]
|
||||||
|
auto gradO = NDArrayFactory::create<double>('c', {bS, nOut});
|
||||||
|
|
||||||
|
const OpArgsHolder argsHolderFF({&x}, {clip}, {axis});
|
||||||
|
const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {axis});
|
||||||
|
|
||||||
|
sd::ops::clipbynorm opFF;
|
||||||
|
sd::ops::clipbynorm_bp opBP;
|
||||||
|
|
||||||
|
const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);
|
||||||
|
|
||||||
|
ASSERT_TRUE(isGradCorrect);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbyavgnorm_1) {
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {2, 3}, {-3.0, 0.0, 0.0, 4.0, 0.0, 0.0});
|
||||||
|
auto exp = NDArrayFactory::create<double>('c', {2, 3}, {-2.88, 0.0, 0.0, 3.84, 0.0, 0.0});
|
||||||
|
|
||||||
|
sd::ops::clipbyavgnorm op;
|
||||||
|
auto result = op.evaluate({&x}, {0.8}, {});
|
||||||
|
|
||||||
|
auto z = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(z));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(z));
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbyavgnorm_2) {
|
||||||
|
auto x= NDArrayFactory::create<float>('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f});
|
||||||
|
auto exp= NDArrayFactory::create<float>('c', {2, 3}, {-3.f, 0.0f, 0.0f, 4.f, 0.0f, 0.0f});
|
||||||
|
|
||||||
|
sd::ops::clipbyavgnorm op;
|
||||||
|
auto result = op.evaluate({&x}, {0.9}, {});
|
||||||
|
|
||||||
|
auto z = result.at(0);
|
||||||
|
|
||||||
|
ASSERT_TRUE(exp.isSameShape(z));
|
||||||
|
ASSERT_TRUE(exp.equalsTo(z));
|
||||||
|
|
||||||
|
}
|
||||||
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbyavgnorm_bp_1) {
|
||||||
|
|
||||||
|
const int bS = 2;
|
||||||
|
const int nOut = 3;
|
||||||
|
const double clip = 0.7;
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.173 ,0.736 ,0.540 }); // uniform random in range [0,1]
|
||||||
|
auto gradO = NDArrayFactory::create<double>('c', {bS, nOut});
|
||||||
|
|
||||||
|
const OpArgsHolder argsHolderFF({&x}, {clip}, {});
|
||||||
|
const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {});
|
||||||
|
|
||||||
|
sd::ops::clipbyavgnorm opFF;
|
||||||
|
sd::ops::clipbyavgnorm_bp opBP;
|
||||||
|
|
||||||
|
const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);
|
||||||
|
|
||||||
|
ASSERT_TRUE(isGradCorrect);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbyavgnorm_bp_2) {
|
||||||
|
|
||||||
|
const int bS = 2;
|
||||||
|
const int nOut = 3;
|
||||||
|
const int axis = 1;
|
||||||
|
const double clip = 1.;
|
||||||
|
|
||||||
|
auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.173 ,0.736 ,0.540 }); // uniform random in range [0,1]
|
||||||
|
auto gradO = NDArrayFactory::create<double>('c', {bS, nOut});
|
||||||
|
|
||||||
|
const OpArgsHolder argsHolderFF({&x}, {clip}, {axis});
|
||||||
|
const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {axis});
|
||||||
|
|
||||||
|
sd::ops::clipbyavgnorm opFF;
|
||||||
|
sd::ops::clipbyavgnorm_bp opBP;
|
||||||
|
|
||||||
|
const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);
|
||||||
|
|
||||||
|
ASSERT_TRUE(isGradCorrect);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST_F(DeclarableOpsTests16, clipbyavgnorm_bp_3) {
|
||||||
|
|
||||||
|
NDArray x('c', {2, 3, 4}, {-0.14 ,0.96 ,0.47 ,-0.98 ,0.03 ,0.95 ,0.33 ,-0.97 ,0.59 ,-0.92 ,-0.12 ,-0.33 ,0.82 ,-0.76 ,-0.69 ,-0.95 ,-0.77 ,0.25 ,-0.35 ,0.94 ,0.50 ,0.04 ,0.61 ,0.99}, sd::DataType::DOUBLE);
|
||||||
|
NDArray gradO('c', {2, 3, 4}, sd::DataType::DOUBLE);
|
||||||
|
|
||||||
|
const OpArgsHolder argsHolderFF({&x}, {0.7}, {0,2});
|
||||||
|
const OpArgsHolder argsHolderBP({&x, &gradO}, {0.7}, {0,2});
|
||||||
|
|
||||||
|
sd::ops::clipbyavgnorm opFF;
|
||||||
|
sd::ops::clipbyavgnorm_bp opBP;
|
||||||
|
|
||||||
|
const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);
|
||||||
|
|
||||||
|
ASSERT_TRUE(isGradCorrect);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@@ -244,94 +244,6 @@ TEST_F(DeclarableOpsTests3, Test_Norm_2) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST_F(DeclarableOpsTests3, Test_ClipByAvgNorm_1) {
|
|
||||||
auto x = NDArrayFactory::create<double>('c', {2, 3}, {-3.0, 0.0, 0.0, 4.0, 0.0, 0.0});
|
|
||||||
auto exp = NDArrayFactory::create<double>('c', {2, 3}, {-2.88, 0.0, 0.0, 3.84, 0.0, 0.0});
|
|
||||||
|
|
||||||
sd::ops::clipbyavgnorm op;
|
|
||||||
auto result = op.evaluate({&x}, {0.8}, {});
|
|
||||||
|
|
||||||
auto z = result.at(0);
|
|
||||||
|
|
||||||
ASSERT_TRUE(exp.isSameShape(z));
|
|
||||||
ASSERT_TRUE(exp.equalsTo(z));
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST_F(DeclarableOpsTests3, Test_ClipByAvgNorm_2) {
|
|
||||||
auto x= NDArrayFactory::create<float>('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f});
|
|
||||||
auto exp= NDArrayFactory::create<float>('c', {2, 3}, {-3.f, 0.0f, 0.0f, 4.f, 0.0f, 0.0f});
|
|
||||||
|
|
||||||
sd::ops::clipbyavgnorm op;
|
|
||||||
auto result = op.evaluate({&x}, {0.9}, {});
|
|
||||||
|
|
||||||
auto z = result.at(0);
|
|
||||||
|
|
||||||
ASSERT_TRUE(exp.isSameShape(z));
|
|
||||||
ASSERT_TRUE(exp.equalsTo(z));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
TEST_F(DeclarableOpsTests3, Test_ClipByNorm_1) {
|
|
||||||
auto x= NDArrayFactory::create<double>('c', {2, 3}, {-3.0, 0.0, 0.0, 4.0, 0.0, 0.0});
|
|
||||||
auto exp= NDArrayFactory::create<double>('c', {2, 3}, {-2.4, 0.0, 0.0, 3.2, 0.0, 0.0});
|
|
||||||
|
|
||||||
sd::ops::clipbynorm op;
|
|
||||||
auto result = op.evaluate({&x}, {4.0}, {});
|
|
||||||
|
|
||||||
auto z = result.at(0);
|
|
||||||
|
|
||||||
ASSERT_TRUE(exp.isSameShape(z));
|
|
||||||
ASSERT_TRUE(exp.equalsTo(z));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST_F(DeclarableOpsTests3, Test_ClipByNorm_2) {
|
|
||||||
auto x= NDArrayFactory::create<double>('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f});
|
|
||||||
auto exp= NDArrayFactory::create<double>('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f});
|
|
||||||
|
|
||||||
sd::ops::clipbynorm op;
|
|
||||||
auto result = op.evaluate({&x}, {6.0}, {});
|
|
||||||
|
|
||||||
auto z = result.at(0);
|
|
||||||
|
|
||||||
ASSERT_TRUE(exp.isSameShape(z));
|
|
||||||
ASSERT_TRUE(exp.equalsTo(z));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
TEST_F(DeclarableOpsTests3, Test_ClipByNorm_3) {
|
|
||||||
|
|
||||||
auto x = NDArrayFactory::create<double>('c', {3, 5});
|
|
||||||
auto unities = NDArrayFactory::create<double>('c', {3, 1}, {1., 1., 1.});
|
|
||||||
auto scale = NDArrayFactory::create<double>('c', {3, 1}, {1.1, 1., 0.9});
|
|
||||||
|
|
||||||
x.linspace(100.);
|
|
||||||
|
|
||||||
auto xNorm1 = x.reduceAlongDimension(reduce::Norm2, {1}, true);
|
|
||||||
x /= xNorm1;
|
|
||||||
xNorm1 = x.reduceAlongDimension(reduce::Norm2,{1}, true);
|
|
||||||
|
|
||||||
ASSERT_TRUE(unities.isSameShape(xNorm1));
|
|
||||||
ASSERT_TRUE(unities.equalsTo(xNorm1));
|
|
||||||
|
|
||||||
x *= scale;
|
|
||||||
xNorm1 = x.reduceAlongDimension(reduce::Norm2, {1}, true);
|
|
||||||
|
|
||||||
sd::ops::clipbynorm op;
|
|
||||||
auto result = op.evaluate({&x}, {1.0}, {1});
|
|
||||||
auto z = result.at(0);
|
|
||||||
|
|
||||||
auto zNorm1 = z->reduceAlongDimension(reduce::Norm2, {1}, true);
|
|
||||||
auto exp = NDArrayFactory::create<double>('c', {3, 1}, {1., 1., xNorm1.e<double>(2)});
|
|
||||||
|
|
||||||
ASSERT_TRUE(exp.isSameShape(&zNorm1));
|
|
||||||
ASSERT_TRUE(exp.equalsTo(&zNorm1));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST_F(DeclarableOpsTests3, Test_ListDiff_1) {
|
TEST_F(DeclarableOpsTests3, Test_ListDiff_1) {
|
||||||
auto x= NDArrayFactory::create<float>('c', {6}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
|
auto x= NDArrayFactory::create<float>('c', {6}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
|
||||||
auto y= NDArrayFactory::create<float>('c', {3}, {1.f, 3.f, 5.f});
|
auto y= NDArrayFactory::create<float>('c', {3}, {1.f, 3.f, 5.f});
|
||||||
|
|
|
@@ -2432,6 +2432,7 @@ TEST_F(DeclarableOpsTests5, ZeroFraction_3) {
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests5, XWPlusB_1) {
|
TEST_F(DeclarableOpsTests5, XWPlusB_1) {
|
||||||
|
|
||||||
|
@@ -2451,6 +2452,7 @@ TEST_F(DeclarableOpsTests5, XWPlusB_1) {
|
||||||
ASSERT_TRUE(exp.isSameShape(output));
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
ASSERT_TRUE(exp.equalsTo(output));
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests5, XWPlusB_2) {
|
TEST_F(DeclarableOpsTests5, XWPlusB_2) {
|
||||||
|
|
||||||
|
|
|
@@ -779,9 +779,6 @@ TEST_F(DeclarableOpsTests6, TestMergeMaxIndex_1) {

     auto res = op.evaluate({&x, &y, &z}, {}, {}, {});

     ASSERT_EQ(ND4J_STATUS_OK, res.status());
-    // res.at(0)->printIndexedBuffer("MergeMaxIndex Result is ");
-    // res.at(0)->printShapeInfo("Shape info for MergeMaxIdex");
-    // x.printIndexedBuffer("Input is");
     ASSERT_TRUE(res.at(0)->equalsTo(exp));

 }

@@ -789,22 +786,35 @@ TEST_F(DeclarableOpsTests6, TestMergeMaxIndex_1) {

 ////////////////////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests6, TestMergeMaxIndex_2) {

-    auto x = NDArrayFactory::create<double>('c', {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f});
+    auto x = NDArrayFactory::create<double>('c', {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 5.f, 60.f, 7.f, 8.f});
     auto y = NDArrayFactory::create<double>('c', {2, 2, 2}, {10.f, 2.f, 30.f, 4.f, 50.f, 6.f, 70.f, 8.f});
-    auto z = NDArrayFactory::create<double>('c', {2, 2, 2}, {1.f, 20.f, 3.f, 40.f, 5.f, 60.f, 7.f, 80.f});
+    auto z = NDArrayFactory::create<double>('c', {2, 2, 2}, {1.f, 20.f, 3.f, 40.f, 5.f, 6.f, 7.f, 80.f});
-    auto exp = NDArrayFactory::create<Nd4jLong>('c', {2, 2, 2}, {1, 2, 1, 2, 1, 2, 1, 2});
+    auto exp = NDArrayFactory::create<Nd4jLong>('c', {2, 2, 2}, {1, 2, 1, 2, 1, 0, 1, 2});
     sd::ops::mergemaxindex op;

     auto ress = op.evaluate({&x, &y, &z}, {}, {sd::DataType::INT64});

     ASSERT_EQ(ND4J_STATUS_OK, ress.status());
-    // res.at(0)->printIndexedBuffer("MergeMaxIndex2 Result is ");
-    // res.at(0)->printShapeInfo("Shape info for MergeMaxIdex2");
-    // x.printIndexedBuffer("Input is");
     ASSERT_TRUE(ress.at(0)->equalsTo(exp));

 }

+////////////////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests6, TestMergeMaxIndex_3) {
+
+    auto x1 = NDArrayFactory::create<double>('c', {3}, {1.f, 0.f, 0.f});
+    auto x2 = NDArrayFactory::create<double>('c', {3}, {0.f, 1.f, 0.f});
+    auto x3 = NDArrayFactory::create<double>('c', {3}, {0.f, 0.f, 1.f});
+    NDArray z('c', {3}, sd::DataType::INT32);
+    NDArray expZ('c', {3}, {0, 1, 2}, sd::DataType::INT32);
+
+    sd::ops::mergemaxindex op;
+    auto result = op.execute({&x1, &x2, &x3}, {&z}, {}, {}, {});
+
+    ASSERT_EQ(Status::OK(), result);
+    ASSERT_TRUE(z.equalsTo(expZ));
+}

 ////////////////////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests6, TestDropout_1) {
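Note (a sketch of what the mergemaxindex tests above assert, not the library kernel): for every position the op is expected to return the index of whichever input holds the largest value there, so {1,0,0}, {0,1,0}, {0,0,1} yields {0, 1, 2}:

    #include <cstddef>
    #include <vector>

    // Element-wise argmax across several same-sized input arrays; ties keep the earliest input.
    std::vector<int> mergeMaxIndex(const std::vector<std::vector<double>>& inputs) {
        std::vector<int> out(inputs[0].size(), 0);
        for (std::size_t e = 0; e < out.size(); ++e)
            for (std::size_t i = 1; i < inputs.size(); ++i)
                if (inputs[i][e] > inputs[static_cast<std::size_t>(out[e])][e])
                    out[e] = static_cast<int>(i);
        return out;
    }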
@@ -2675,160 +2675,6 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test10) {
|
||||||
ASSERT_TRUE(expected.equalsTo(output));
|
ASSERT_TRUE(expected.equalsTo(output));
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
TEST_F(DeclarableOpsTests8, clipbynorm_test4) {
|
|
||||||
|
|
||||||
auto x = NDArrayFactory::create<double>('c', {3, 5}, {0.7044955, 0.55606544, 0.15833677, 0.001874401, 0.61595726, 0.3924779, 0.7414847, 0.4127324, 0.24026828, 0.26093036, 0.46741188, 0.01863421, 0.08528871, 0.529365, 0.5510694});
|
|
||||||
auto exp = NDArrayFactory::create<double>('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 0.317105});
|
|
||||||
|
|
||||||
sd::ops::clipbynorm op;
|
|
||||||
auto result = op.evaluate({&x}, {1.f}, {});
|
|
||||||
auto output = result.at(0);
|
|
||||||
|
|
||||||
ASSERT_TRUE(exp.isSameShape(output));
|
|
||||||
ASSERT_TRUE(exp.equalsTo(output));
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
TEST_F(DeclarableOpsTests8, clipbynorm_test5) {
|
|
||||||
|
|
||||||
// auto x = NDArrayFactory::create<double>('c', {3, 5}, {1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5});
|
|
||||||
auto x = NDArrayFactory::create<double>('c', {3, 5});
|
|
||||||
auto exp = NDArrayFactory::create<double>('c', {3, 5}, {1., 2., 2.89271, 3.50524, 4.00892, 6., 7., 7.71389, 7.88678, 8.01784, 11., 12., 12.53507, 12.26833, 12.02676});
|
|
||||||
// auto exp = NDArrayFactory::create<double>('c', {3, 5}, {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1});
|
|
||||||
|
|
||||||
x.linspace(1);
|
|
||||||
|
|
||||||
sd::ops::clipbynorm op;
|
|
||||||
auto result = op.evaluate({&x}, {15.f}, {0});
|
|
||||||
auto output = result.at(0);
|
|
||||||
|
|
||||||
ASSERT_TRUE(exp.isSameShape(output));
|
|
||||||
ASSERT_TRUE(exp.equalsTo(output));
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
TEST_F(DeclarableOpsTests8, clipbynorm_test6) {

    auto x = NDArrayFactory::create<double>('c', {3, 5});
    auto exp = NDArrayFactory::create<double>('c', {3, 5}, {1., 2., 3., 4., 5., 4.95434, 5.78006, 6.60578, 7.43151, 8.25723, 5.64288, 6.15587, 6.66886, 7.18185, 7.69484});

    x.linspace(1);

    sd::ops::clipbynorm op;
    auto result = op.evaluate({&x}, {15.f}, {1});
    auto output = result.at(0);

    ASSERT_TRUE(exp.isSameShape(output));
    ASSERT_TRUE(exp.equalsTo(output));
}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests8, clipbynorm_test7) {

    auto x = NDArrayFactory::create<double>('c', {3, 5});
    auto exp = NDArrayFactory::create<double>('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957});

    x.linspace(1);

    sd::ops::clipbynorm op;
    auto result = op.evaluate({&x}, {15.f}, {0,1});
    auto output = result.at(0);

    ASSERT_TRUE(exp.isSameShape(output));
    ASSERT_TRUE(exp.equalsTo(output));
}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests8, clipbynorm_test8) {

    auto x = NDArrayFactory::create<double>('c', {3, 5});
    auto exp = NDArrayFactory::create<double>('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957});

    x.linspace(1);

    sd::ops::clipbynorm op;
    auto result = op.evaluate({&x}, {15.}, {});
    auto output = result.at(0);

    ASSERT_TRUE(exp.isSameShape(output));
    ASSERT_TRUE(exp.equalsTo(output));
}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests8, clipbynorm_test9) {

    auto x = NDArrayFactory::create<double>('c', {2}, {3., 4.});
    auto exp = NDArrayFactory::create<double>('c', {2}, {2.4, 3.2});

    sd::ops::clipbynorm op;
    auto result = op.evaluate({&x}, {4.}, {});
    auto output = result.at(0);

    ASSERT_TRUE(exp.isSameShape(output));
    ASSERT_TRUE(exp.equalsTo(output));
}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests8, clipbynorm_test10) {

    auto x = NDArrayFactory::create<double>(6.);
    auto exp = NDArrayFactory::create<double>(5.);

    sd::ops::clipbynorm op;
    auto result = op.evaluate({&x}, {5.}, {});
    auto output = result.at(0);

    ASSERT_TRUE(exp.isSameShape(output));
    ASSERT_TRUE(exp.equalsTo(output));
}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests8, clipbynorm_test11) {

    auto x = NDArrayFactory::create<double>('c', {2, 3, 4});
    auto exp = NDArrayFactory::create<double>('c', {2, 3, 4}, {1., 2., 3., 4., 4.44787, 5.33745, 6.22702, 7.1166 , 6.33046, 7.03384, 7.73723, 8.44061,
                                                               13., 14., 15., 16., 15.12277, 16.01235, 16.90192, 17.7915 ,14.77107, 15.47446, 16.17784, 16.88123});

    x.linspace(1);

    sd::ops::clipbynorm op;
    auto result = op.evaluate({&x}, {35.}, {0, 2});
    auto output = result.at(0);

    ASSERT_TRUE(exp.isSameShape(output));
    ASSERT_TRUE(exp.equalsTo(output));
}

TEST_F(DeclarableOpsTests8, clipbynorm_test_tf_119_1) {

    auto x = NDArrayFactory::create<double>('c', {3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
    auto e = NDArrayFactory::create<double>('c', {3, 3}, {0.03198684, 0.06397368, 0.09596053, 0.12794736, 0.15993419, 0.19192106, 0.22390789, 0.25589472, 0.28788155});

    sd::ops::clipbynorm op;
    auto result = op.evaluate({&x}, {0.54}, {});

    ASSERT_EQ(e, *result.at(0));
}
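// For orientation: the clipbynorm expectations above follow the usual clip-by-norm rule --
// when the L2 norm of the array (or of each slice, if axes are passed as integer args)
// exceeds the clip value, the values are rescaled by clip / norm; otherwise they are left
// unchanged. A minimal standalone sketch of the whole-array case, independent of the
// NDArray API (helper name and signature are illustrative only):
#include <cmath>
#include <vector>

void clipByNormRef(std::vector<double>& values, double clipNorm) {
    double sumSq = 0.0;
    for (double v : values) sumSq += v * v;              // sum of squares
    const double norm = std::sqrt(sumSq);                // L2 norm of the whole array
    if (norm > clipNorm)
        for (double& v : values) v *= clipNorm / norm;   // rescale, direction preserved
}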
////////////////////////////////////////////////////////////////////////////////
@@ -236,10 +236,10 @@ TEST_F(DeclarableOpsTests9, ScalarOpTest_MixedOrders_1) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test1) {

-    auto x0 = NDArrayFactory::create<double>('c', {2,3,4});
+    auto x0 = NDArrayFactory::create<float>('c', {2,3,4});
-    auto x1 = NDArrayFactory::create<double>('c', {2,2,4});
+    auto x1 = NDArrayFactory::create<float>('c', {2,2,4});
-    auto x2 = NDArrayFactory::create<double>('c', {2,1,4});
+    auto x2 = NDArrayFactory::create<float>('c', {2,1,4});
-    auto exp = NDArrayFactory::create<double>('c', {2,6,4}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 1.f, 2.f, 3.f, 4.f,
+    auto exp = NDArrayFactory::create<float>('c', {2,6,4}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 1.f, 2.f, 3.f, 4.f,
                                                             13.f, 14.f, 15.f, 16.f,17.f, 18.f, 19.f, 20.f,21.f, 22.f, 23.f, 24.f, 9.f, 10.f, 11.f, 12.f,13.f, 14.f, 15.f, 16.f, 5.f, 6.f, 7.f, 8.});

    x0.linspace(1);
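// For orientation: concat_test1 above joins inputs of shapes {2,3,4}, {2,2,4} and {2,1,4}
// into a {2,6,4} result, i.e. the sizes along the concatenation dimension add up while all
// other dimensions must match. A small sketch of that shape rule (illustrative only, not
// the library implementation):
#include <cassert>
#include <vector>

std::vector<long long> concatShape(const std::vector<std::vector<long long>>& shapes, int axis) {
    std::vector<long long> out = shapes.front();
    out[axis] = 0;
    for (const auto& s : shapes) {
        for (int d = 0; d < static_cast<int>(s.size()); ++d)
            if (d != axis) assert(s[d] == out[d]);       // non-concat dims must agree
        out[axis] += s[axis];                            // concat dim sizes accumulate
    }
    return out;
}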
@@ -261,10 +261,10 @@ TEST_F(DeclarableOpsTests9, concat_test1) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test2) {

-    auto x0 = NDArrayFactory::create<double>('c', {1,3,1});
+    auto x0 = NDArrayFactory::create<float>('c', {1,3,1});
-    auto x1 = NDArrayFactory::create<double>('c', {1,2,1});
+    auto x1 = NDArrayFactory::create<float>('c', {1,2,1});
-    auto x2 = NDArrayFactory::create<double>('c', {1,1,1});
+    auto x2 = NDArrayFactory::create<float>('c', {1,1,1});
-    auto exp = NDArrayFactory::create<double>('c', {1,6,1}, {1.f, 2.f, 3.f, 1.f, 2.f, 1.f});
+    auto exp = NDArrayFactory::create<float>('c', {1,6,1}, {1.f, 2.f, 3.f, 1.f, 2.f, 1.f});

    x0.linspace(1);
    x1.linspace(1);

@@ -285,10 +285,10 @@ TEST_F(DeclarableOpsTests9, concat_test2) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test3) {

-    auto x0 = NDArrayFactory::create<double>('c', {3});
+    auto x0 = NDArrayFactory::create<float>('c', {3});
-    auto x1 = NDArrayFactory::create<double>('c', {2});
+    auto x1 = NDArrayFactory::create<float>('c', {2});
-    auto x2 = NDArrayFactory::create<double>('c', {1});
+    auto x2 = NDArrayFactory::create<float>('c', {1});
-    auto exp = NDArrayFactory::create<double>('c', {6}, {1.f, 2.f, 3.f, 1.f, 2.f, 1.f});
+    auto exp = NDArrayFactory::create<float>('c', {6}, {1.f, 2.f, 3.f, 1.f, 2.f, 1.f});

    x0.linspace(1);
    x1.linspace(1);

@@ -300,21 +300,17 @@ TEST_F(DeclarableOpsTests9, concat_test3) {
    ASSERT_EQ(ND4J_STATUS_OK, result.status());
    auto output = result.at(0);

-    output->printBuffer();

    ASSERT_TRUE(exp.isSameShape(output));
    ASSERT_TRUE(exp.equalsTo(output));

}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test4) {

-    auto x0 = NDArrayFactory::create<double>('c', {1,1,1}, {1.f});
+    auto x0 = NDArrayFactory::create<float>('c', {1,1,1}, {1.f});
-    auto x1 = NDArrayFactory::create<double>('c', {1,1,1}, {2.f});
+    auto x1 = NDArrayFactory::create<float>('c', {1,1,1}, {2.f});
-    auto x2 = NDArrayFactory::create<double>('c', {1,1,1}, {3.f});
+    auto x2 = NDArrayFactory::create<float>('c', {1,1,1}, {3.f});
-    auto exp = NDArrayFactory::create<double>('c', {1,3,1}, {1.f, 2.f, 3.f});
+    auto exp = NDArrayFactory::create<float>('c', {1,3,1}, {1.f, 2.f, 3.f});

    sd::ops::concat op;
@@ -331,10 +327,10 @@ TEST_F(DeclarableOpsTests9, concat_test4) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test5) {

-    auto x0 = NDArrayFactory::create<double>(1.f);
+    auto x0 = NDArrayFactory::create<float>(1.f);
-    auto x1 = NDArrayFactory::create<double>('c', {1}, {2.f});
+    auto x1 = NDArrayFactory::create<float>('c', {1}, {2.f});
-    auto x2 = NDArrayFactory::create<double>(3.f);
+    auto x2 = NDArrayFactory::create<float>(3.f);
-    auto exp = NDArrayFactory::create<double>('c', {3}, {1.f, 2.f, 3.f});
+    auto exp = NDArrayFactory::create<float>('c', {3}, {1.f, 2.f, 3.f});

    sd::ops::concat op;

@@ -351,10 +347,10 @@ TEST_F(DeclarableOpsTests9, concat_test5) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test6) {

-    auto x0 = NDArrayFactory::create<double>(1.f);
+    auto x0 = NDArrayFactory::create<float>(1.f);
-    auto x1 = NDArrayFactory::create<double>('c', {2}, {2.f, 20.f});
+    auto x1 = NDArrayFactory::create<float>('c', {2}, {2.f, 20.f});
-    auto x2 = NDArrayFactory::create<double>(3.f);
+    auto x2 = NDArrayFactory::create<float>(3.f);
-    auto exp = NDArrayFactory::create<double>('c', {4}, {1.f, 2.f, 20.f, 3.f});
+    auto exp = NDArrayFactory::create<float>('c', {4}, {1.f, 2.f, 20.f, 3.f});

    sd::ops::concat op;

@@ -371,10 +367,10 @@ TEST_F(DeclarableOpsTests9, concat_test6) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test7) {

-    auto x0 = NDArrayFactory::create<double>(1.f);
+    auto x0 = NDArrayFactory::create<float>(1.f);
-    auto x1 = NDArrayFactory::create<double>(2.f);
+    auto x1 = NDArrayFactory::create<float>(2.f);
-    auto x2 = NDArrayFactory::create<double>(3.f);
+    auto x2 = NDArrayFactory::create<float>(3.f);
-    auto exp = NDArrayFactory::create<double>('c', {3}, {1.f, 2.f, 3.f});
+    auto exp = NDArrayFactory::create<float>('c', {3}, {1.f, 2.f, 3.f});

    sd::ops::concat op;

@@ -391,8 +387,8 @@ TEST_F(DeclarableOpsTests9, concat_test7) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test8) {

-    auto x0 = NDArrayFactory::create<double>(1.f);
+    auto x0 = NDArrayFactory::create<float>(1.f);
-    auto exp = NDArrayFactory::create<double>('c', {1}, {1.f});
+    auto exp = NDArrayFactory::create<float>('c', {1}, {1.f});

    sd::ops::concat op;

@@ -409,8 +405,8 @@ TEST_F(DeclarableOpsTests9, concat_test8) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test9) {

-    auto x0 = NDArrayFactory::create<double>('c', {1}, {1.f});
+    auto x0 = NDArrayFactory::create<float>('c', {1}, {1.f});
-    auto exp = NDArrayFactory::create<double>('c', {1}, {1.f});
+    auto exp = NDArrayFactory::create<float>('c', {1}, {1.f});

    sd::ops::concat op;

@@ -427,10 +423,10 @@ TEST_F(DeclarableOpsTests9, concat_test9) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test10) {

-    auto x0 = NDArrayFactory::create<double>('c', {2,3,4});
+    auto x0 = NDArrayFactory::create<float>('c', {2,3,4});
-    auto x1 = NDArrayFactory::create<double>('f', {2,2,4});
+    auto x1 = NDArrayFactory::create<float>('f', {2,2,4});
-    auto x2 = NDArrayFactory::create<double>('c', {2,1,4});
+    auto x2 = NDArrayFactory::create<float>('c', {2,1,4});
-    auto exp = NDArrayFactory::create<double>('c', {2,6,4}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 1.f, 2.f, 3.f, 4.f,
+    auto exp = NDArrayFactory::create<float>('c', {2,6,4}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 1.f, 2.f, 3.f, 4.f,
                                                             13.f, 14.f, 15.f, 16.f,17.f, 18.f, 19.f, 20.f,21.f, 22.f, 23.f, 24.f, 9.f, 10.f, 11.f, 12.f,13.f, 14.f, 15.f, 16.f, 5.f, 6.f, 7.f, 8.f});

    x0.linspace(1);
@@ -452,10 +448,10 @@ TEST_F(DeclarableOpsTests9, concat_test10) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test11) {

-    auto x0 = NDArrayFactory::create<double>('c', {2,3,4});
+    auto x0 = NDArrayFactory::create<float>('c', {2,3,4});
-    auto x1 = NDArrayFactory::create<double>('f', {2,2,4});
+    auto x1 = NDArrayFactory::create<float>('f', {2,2,4});
-    auto x2 = NDArrayFactory::create<double>('f', {2,1,4});
+    auto x2 = NDArrayFactory::create<float>('f', {2,1,4});
-    auto exp = NDArrayFactory::create<double>('c', {2,6,4}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 1.f, 2.f, 3.f, 4.f,
+    auto exp = NDArrayFactory::create<float>('c', {2,6,4}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 1.f, 2.f, 3.f, 4.f,
                                                             13.f, 14.f, 15.f, 16.f,17.f, 18.f, 19.f, 20.f,21.f, 22.f, 23.f, 24.f, 9.f, 10.f, 11.f, 12.f,13.f, 14.f, 15.f, 16.f, 5.f, 6.f, 7.f, 8.f});

    x0.linspace(1);

@@ -477,10 +473,10 @@ TEST_F(DeclarableOpsTests9, concat_test11) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test12) {

-    auto x0 = NDArrayFactory::create<double>('c', {2,3,4});
+    auto x0 = NDArrayFactory::create<float>('c', {2,3,4});
-    auto x1 = NDArrayFactory::create<double>('f', {2,2,4});
+    auto x1 = NDArrayFactory::create<float>('f', {2,2,4});
-    auto x2 = NDArrayFactory::create<double>('f', {2,1,4});
+    auto x2 = NDArrayFactory::create<float>('f', {2,1,4});
-    auto exp = NDArrayFactory::create<double>('c', {2,6,4}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 1.f, 2.f, 3.f, 4.f,
+    auto exp = NDArrayFactory::create<float>('c', {2,6,4}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 1.f, 2.f, 3.f, 4.f,
                                                             13.f, 14.f, 15.f, 16.f,17.f, 18.f, 19.f, 20.f,21.f, 22.f, 23.f, 24.f, 9.f, 10.f, 11.f, 12.f,13.f, 14.f, 15.f, 16.f, 5.f, 6.f, 7.f, 8.f});

    x0.linspace(1);

@@ -502,10 +498,10 @@ TEST_F(DeclarableOpsTests9, concat_test12) {
////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test13) {

-    auto x0 = NDArrayFactory::create<double>('f', {2,3,4});
+    auto x0 = NDArrayFactory::create<float>('f', {2,3,4});
-    auto x1 = NDArrayFactory::create<double>('f', {2,2,4});
+    auto x1 = NDArrayFactory::create<float>('f', {2,2,4});
-    auto x2 = NDArrayFactory::create<double>('f', {2,1,4});
+    auto x2 = NDArrayFactory::create<float>('f', {2,1,4});
-    auto exp = NDArrayFactory::create<double>('f', {2,6,4}, { 1.f, 13.f, 5.f, 17.f, 9.f, 21.f, 1.f, 9.f, 5.f, 13.f, 1.f, 5.f, 2.f, 14.f, 6.f, 18.f,10.f, 22.f, 2.f, 10.f, 6.f, 14.f, 2.f, 6.f,
+    auto exp = NDArrayFactory::create<float>('f', {2,6,4}, { 1.f, 13.f, 5.f, 17.f, 9.f, 21.f, 1.f, 9.f, 5.f, 13.f, 1.f, 5.f, 2.f, 14.f, 6.f, 18.f,10.f, 22.f, 2.f, 10.f, 6.f, 14.f, 2.f, 6.f,
                                                             3.f, 15.f, 7.f, 19.f,11.f, 23.f, 3.f, 11.f, 7.f, 15.f, 3.f, 7.f, 4.f, 16.f, 8.f, 20.f,12.f, 24.f, 4.f, 12.f, 8.f, 16.f, 4.f, 8.f});

    x0.linspace(1);

@@ -527,8 +523,8 @@ TEST_F(DeclarableOpsTests9, concat_test13) {

TEST_F(DeclarableOpsTests9, concat_test14) {

-    NDArray x0('c', {1, 40, 60}, sd::DataType::DOUBLE);
+    NDArray x0('c', {1, 40, 60}, sd::DataType::FLOAT32);
-    NDArray x1('c', {1, 40, 60}, sd::DataType::DOUBLE);
+    NDArray x1('c', {1, 40, 60}, sd::DataType::FLOAT32);

    x0 = 1.;
    x1 = 2.;
@@ -544,7 +540,7 @@ TEST_F(DeclarableOpsTests9, concat_test14) {

    for (int e = 0; e < numOfTads; ++e) {
        NDArray tad = (*z)(e, {0});
-        auto mean = tad.meanNumber().e<double>(0);
+        auto mean = tad.meanNumber().e<float>(0);
        ASSERT_NEAR((e+1)*1., mean, 1e-5);
    }

@@ -552,9 +548,9 @@ TEST_F(DeclarableOpsTests9, concat_test14) {
}

TEST_F(DeclarableOpsTests9, concat_test15) {
-    auto x = NDArrayFactory::create<double>('c', {2}, {1, 0});
+    auto x = NDArrayFactory::create<float>('c', {2}, {1, 0});
-    auto y = NDArrayFactory::create<double> (3.0f);
+    auto y = NDArrayFactory::create<float> (3.0f);
-    auto exp = NDArrayFactory::create<double>('c', {3}, {1, 0, 3});
+    auto exp = NDArrayFactory::create<float>('c', {3}, {1, 0, 3});

    sd::ops::concat op;
    auto result = op.evaluate({&x, &y}, {}, {0});

@@ -571,9 +567,9 @@ TEST_F(DeclarableOpsTests9, concat_test15) {
//////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test16) {

-    auto x = NDArrayFactory::create<double>('c', {0,2,3});
+    auto x = NDArrayFactory::create<float>('c', {0,2,3});
-    auto y = NDArrayFactory::create<double>('c', {0,2,3});
+    auto y = NDArrayFactory::create<float>('c', {0,2,3});
-    auto exp = NDArrayFactory::create<double>('c', {0,2,3});
+    auto exp = NDArrayFactory::create<float>('c', {0,2,3});

    sd::ops::concat op;
    auto result = op.evaluate({&x, &y}, {}, {0});

@@ -587,8 +583,8 @@ TEST_F(DeclarableOpsTests9, concat_test16) {
//////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, concat_test17) {

-    NDArray x0('c', {1, 55, 40}, sd::DataType::DOUBLE);
+    NDArray x0('c', {1, 55, 40}, sd::DataType::FLOAT32);
-    NDArray x1('c', {1, 55, 40}, sd::DataType::DOUBLE);
+    NDArray x1('c', {1, 55, 40}, sd::DataType::FLOAT32);

    x0 = 1.;
    x1 = 2.;

@@ -606,7 +602,7 @@ TEST_F(DeclarableOpsTests9, concat_test17) {

    for (int e = 0; e < numOfTads; ++e) {
        NDArray tad = (*z)(e, {0});
-        auto mean = tad.meanNumber().e<double>(0);
+        auto mean = tad.meanNumber().e<float>(0);
        ASSERT_NEAR((e+1)*1., mean, 1e-5);
    }
}
|
@ -664,10 +660,10 @@ TEST_F(DeclarableOpsTests9, concat_test19) {
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests9, concat_test20) {
|
TEST_F(DeclarableOpsTests9, concat_test20) {
|
||||||
auto x0 = NDArrayFactory::create<double>('c', {1, 100, 150});
|
auto x0 = NDArrayFactory::create<float>('c', {1, 100, 150});
|
||||||
auto x1 = NDArrayFactory::create<double>('c', {1, 100, 150});
|
auto x1 = NDArrayFactory::create<float>('c', {1, 100, 150});
|
||||||
auto x2 = NDArrayFactory::create<double>('c', {1, 100, 150});
|
auto x2 = NDArrayFactory::create<float>('c', {1, 100, 150});
|
||||||
auto x3 = NDArrayFactory::create<double>('c', {1, 100, 150});
|
auto x3 = NDArrayFactory::create<float>('c', {1, 100, 150});
|
||||||
|
|
||||||
x0.assign(1.0);
|
x0.assign(1.0);
|
||||||
x1.assign(2.0);
|
x1.assign(2.0);
|
||||||
|
@ -685,8 +681,8 @@ TEST_F(DeclarableOpsTests9, concat_test20) {
|
||||||
|
|
||||||
for (int e = 0; e < numOfTads; e++) {
|
for (int e = 0; e < numOfTads; e++) {
|
||||||
NDArray tad = (*z)(e, {0});
|
NDArray tad = (*z)(e, {0});
|
||||||
auto mean = tad.meanNumber().e<double>(0);
|
auto mean = tad.meanNumber().e<float>(0);
|
||||||
ASSERT_NEAR((double) e+1, mean, 1e-5);
|
ASSERT_NEAR((float) e+1, mean, 1e-5);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -710,10 +706,10 @@ TEST_F(DeclarableOpsTests9, concat_test21) {
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests9, concat_test22) {
|
TEST_F(DeclarableOpsTests9, concat_test22) {
|
||||||
|
|
||||||
NDArray x0('c', {1,6}, {1,2,3,4,5,6});
|
NDArray x0('c', {1,6}, {1,2,3,4,5,6}, sd::DataType::FLOAT32);
|
||||||
NDArray x1('c', {1,6}, {7,8,9,10,11,12});
|
NDArray x1('c', {1,6}, {7,8,9,10,11,12}, sd::DataType::FLOAT32);
|
||||||
NDArray output('f', {2,6}, sd::DataType::DOUBLE);
|
NDArray output('f', {2,6}, sd::DataType::FLOAT32);
|
||||||
NDArray exp('c', {2,6}, {1,2,3,4,5,6,7,8,9,10,11,12});
|
NDArray exp('c', {2,6}, {1,2,3,4,5,6,7,8,9,10,11,12}, sd::DataType::FLOAT32);
|
||||||
|
|
||||||
sd::ops::concat op;
|
sd::ops::concat op;
|
||||||
|
|
||||||
|
@ -726,10 +722,10 @@ TEST_F(DeclarableOpsTests9, concat_test22) {
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests9, concat_test23) {
|
TEST_F(DeclarableOpsTests9, concat_test23) {
|
||||||
|
|
||||||
NDArray x0('c', {1,4}, {1,2,3,4});
|
NDArray x0('c', {1,4}, {1,2,3,4},sd::DataType::FLOAT32);
|
||||||
NDArray x1('c', {1,4}, {5,6,7,8});
|
NDArray x1('c', {1,4}, {5,6,7,8},sd::DataType::FLOAT32);
|
||||||
NDArray output('c', {2,4}, sd::DataType::DOUBLE);
|
NDArray output('c', {2,4}, sd::DataType::FLOAT32);
|
||||||
NDArray exp('c', {2,4}, {1,2,3,4,5,6,7,8});
|
NDArray exp('c', {2,4}, {1,2,3,4,5,6,7,8}, sd::DataType::FLOAT32);
|
||||||
|
|
||||||
sd::ops::concat op;
|
sd::ops::concat op;
|
||||||
|
|
||||||
|
@ -741,10 +737,10 @@ TEST_F(DeclarableOpsTests9, concat_test23) {
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests9, concat_test24) {
|
TEST_F(DeclarableOpsTests9, concat_test24) {
|
||||||
auto x = NDArrayFactory::create<double>('c', {2, 1}, {1, 1});
|
auto x = NDArrayFactory::create<float>('c', {2, 1}, {1, 1});
|
||||||
auto y = NDArrayFactory::create<double>('c', {2, 1}, {0, 0});
|
auto y = NDArrayFactory::create<float>('c', {2, 1}, {0, 0});
|
||||||
auto e = NDArrayFactory::create<double>('c', {2, 2}, {1, 0, 1, 0});
|
auto e = NDArrayFactory::create<float>('c', {2, 2}, {1, 0, 1, 0});
|
||||||
auto z = NDArrayFactory::create<double>('c', {2, 2});
|
auto z = NDArrayFactory::create<float>('c', {2, 2});
|
||||||
|
|
||||||
sd::ops::concat op;
|
sd::ops::concat op;
|
||||||
auto status = op.execute({&x, &y}, {&z}, {}, {1}, {});
|
auto status = op.execute({&x, &y}, {&z}, {}, {1}, {});
|
||||||
|
@ -756,10 +752,10 @@ TEST_F(DeclarableOpsTests9, concat_test24) {
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests9, concat_test25) {
|
TEST_F(DeclarableOpsTests9, concat_test25) {
|
||||||
|
|
||||||
auto x0 = NDArrayFactory::create<double>('c', {1,4}, {1,2,3,4});
|
auto x0 = NDArrayFactory::create<float>('c', {1,4}, {1,2,3,4});
|
||||||
auto x1 = NDArrayFactory::create<double>('c', {1,4}, {5,6,7,8});
|
auto x1 = NDArrayFactory::create<float>('c', {1,4}, {5,6,7,8});
|
||||||
auto axis = NDArrayFactory::create<double>('c', {1}, {0.});
|
auto axis = NDArrayFactory::create<float>('c', {1}, {0.});
|
||||||
auto exp = NDArrayFactory::create<double>('c', {2,4}, {1,2,3,4,5,6,7,8});
|
auto exp = NDArrayFactory::create<float>('c', {2,4}, {1,2,3,4,5,6,7,8});
|
||||||
|
|
||||||
sd::ops::concat op;
|
sd::ops::concat op;
|
||||||
|
|
||||||
|
@ -793,7 +789,7 @@ TEST_F(DeclarableOpsTests9, concat_test26) {
|
||||||
|
|
||||||
ASSERT_EQ(ND4J_STATUS_OK, result.status());
|
ASSERT_EQ(ND4J_STATUS_OK, result.status());
|
||||||
auto output = result.at(0);
|
auto output = result.at(0);
|
||||||
output->printLinearBuffer();
|
// output->printLinearBuffer();
|
||||||
|
|
||||||
ASSERT_TRUE(exp.isSameShape(output));
|
ASSERT_TRUE(exp.isSameShape(output));
|
||||||
ASSERT_TRUE(exp.equalsTo(output));
|
ASSERT_TRUE(exp.equalsTo(output));
|
||||||
|
@ -802,10 +798,10 @@ TEST_F(DeclarableOpsTests9, concat_test26) {
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests9, concat_test27) {
|
TEST_F(DeclarableOpsTests9, concat_test27) {
|
||||||
|
|
||||||
auto x1 = NDArrayFactory::create<double>('c', {0,1});
|
auto x1 = NDArrayFactory::create<float>('c', {0,1});
|
||||||
auto x2 = NDArrayFactory::create<double>('c', {0,1});
|
auto x2 = NDArrayFactory::create<float>('c', {0,1});
|
||||||
auto x3 = NDArrayFactory::create<double>('c', {0,1});
|
auto x3 = NDArrayFactory::create<float>('c', {0,1});
|
||||||
auto x4 = NDArrayFactory::create<double>('c', {0,1});
|
auto x4 = NDArrayFactory::create<float>('c', {0,1});
|
||||||
|
|
||||||
std::vector<Nd4jLong> expShape = {0, 4};
|
std::vector<Nd4jLong> expShape = {0, 4};
|
||||||
|
|
||||||
|
@@ -1245,109 +1241,6 @@ TEST_F(DeclarableOpsTests9, test_unstack_SGO_1) {

}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, clipbynorm_test12) {

    const int bS   = 5;
    const int nOut = 4;
    const int axis = 0;
    const double clip = 2.;

    auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.897 ,0.173 ,0.931 ,0.736 ,0.540 ,0.953 ,0.278 ,0.573 ,0.787 ,0.320 ,0.776 ,0.338 ,0.311 ,0.835 ,0.909 ,0.890 ,0.290}); // uniform random in range [0,1]
    auto colVect = NDArrayFactory::create<double>('c', {bS, 1}, {0.9, 0.95, 1.00, 1.05, 1.1});
    auto expect = NDArrayFactory::create<double>('c', {bS, nOut});

    auto norm2 = x.reduceAlongDimension(reduce::Norm2, {axis}, true); // norm2 has shape [1, nOut]

    auto y = ( (x / norm2) * clip) * colVect ;
    auto temp = (x / norm2) * clip;

    for (int j = 0; j < nOut; ++j) {
        auto yCol = y({0,0, j,j+1});
        const double norm2Col = yCol.reduceNumber(reduce::Norm2).e<double>(0);
        if (norm2Col <= clip)
            expect({0,0, j,j+1}).assign(yCol);
        else
            expect({0,0, j,j+1}).assign( yCol * (clip / norm2Col) );
    }

    sd::ops::clipbynorm op;
    auto result = op.evaluate({&y}, {clip}, {axis});
    auto outFF = result.at(0);

    ASSERT_TRUE(expect.isSameShape(outFF));
    ASSERT_TRUE(expect.equalsTo(outFF));
}
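// For orientation: clipbynorm_test12 above builds its expected output column by column --
// with axis = 0 the L2 norm is taken down each column of the bS x nOut matrix, and only
// the columns whose norm exceeds the clip value are rescaled. A compact sketch of that
// per-column rule for a row-major matrix (illustrative only, not the library code):
#include <cmath>
#include <vector>

void clipColumnsByNorm(std::vector<double>& m, int rows, int cols, double clip) {
    for (int j = 0; j < cols; ++j) {
        double sumSq = 0.0;
        for (int i = 0; i < rows; ++i) sumSq += m[i * cols + j] * m[i * cols + j];
        const double norm = std::sqrt(sumSq);                               // L2 norm of column j
        if (norm > clip)
            for (int i = 0; i < rows; ++i) m[i * cols + j] *= clip / norm;  // rescale column j only
    }
}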

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, clipbynorm_bp_test1) {

    const int bS   = 2;
    const int nOut = 3;
    const double clip = 0.7;

    auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.173 ,0.736 ,0.540 }); // uniform random in range [0,1]
    auto gradO = NDArrayFactory::create<double>('c', {bS, nOut});

    const OpArgsHolder argsHolderFF({&x}, {clip}, {});
    const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {});

    sd::ops::clipbynorm opFF;
    sd::ops::clipbynorm_bp opBP;

    const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);

    ASSERT_TRUE(isGradCorrect);
}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, clipbynorm_bp_test2) {

    const int bS   = 2;
    const int nOut = 3;
    const int axis = 0;
    const double clip = 0.7;

    auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.173 ,0.736 ,0.540 }); // uniform random in range [0,1]
    auto gradO = NDArrayFactory::create<double>('c', {bS, nOut});

    const OpArgsHolder argsHolderFF({&x}, {clip}, {axis});
    const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {axis});

    sd::ops::clipbynorm opFF;
    sd::ops::clipbynorm_bp opBP;

    const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);

    ASSERT_TRUE(isGradCorrect);
}

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, clipbynorm_bp_test3) {

    const int bS   = 2;
    const int nOut = 3;
    const int axis = 1;
    const double clip = 1.;

    auto x = NDArrayFactory::create<double>('c', {bS, nOut}, {0.412 ,0.184 ,0.961 ,0.173 ,0.736 ,0.540 }); // uniform random in range [0,1]
    auto gradO = NDArrayFactory::create<double>('c', {bS, nOut});

    const OpArgsHolder argsHolderFF({&x}, {clip}, {axis});
    const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {axis});

    sd::ops::clipbynorm opFF;
    sd::ops::clipbynorm_bp opBP;

    const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);

    ASSERT_TRUE(isGradCorrect);
}
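// For orientation: the clipbynorm_bp tests above rely on GradCheck::checkGrad, which
// (conceptually) compares the analytic gradients produced by the _bp op against
// numerically estimated ones. A minimal, self-contained sketch of that idea via central
// differences of a scalar loss (illustrative only, not the actual GradCheck implementation):
#include <cmath>
#include <functional>
#include <vector>

bool checkGradNumeric(const std::function<double(const std::vector<double>&)>& loss,
                      std::vector<double> x,                       // point at which to check
                      const std::vector<double>& analyticGrad,
                      double eps = 1e-5, double tol = 1e-3) {
    for (std::size_t i = 0; i < x.size(); ++i) {
        const double orig = x[i];
        x[i] = orig + eps; const double lp = loss(x);              // loss at x + eps * e_i
        x[i] = orig - eps; const double lm = loss(x);              // loss at x - eps * e_i
        x[i] = orig;
        const double numeric = (lp - lm) / (2.0 * eps);            // central difference estimate
        if (std::fabs(numeric - analyticGrad[i]) > tol)
            return false;                                          // analytic and numeric disagree
    }
    return true;
}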

////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests9, cumprod_1) {