From fe47f52896a3138beeb94361ab40e5510f83f7c1 Mon Sep 17 00:00:00 2001
From: Yurii Shyrma
Date: Thu, 13 Feb 2020 19:33:54 +0200
Subject: [PATCH 01/19] Oleh tenzor mmul (#231)

* Libnd4j: TensorMMul backprop op #8174, raw implementation Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 merge master and some corrections Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 algorithm update, need testing, sync with master
* Libnd4j: TensorMMul backprop op #8174 fixed incorrect B axes calculation Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 optimize axes identification and fix bug of overlapping indices, added first test, need testing with different shapes Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 some fixes and improvements, need more testing Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 fixed order of matrix multiply Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 fixed issue of incorrect axes definition, added tests based on TF, need additional testing for the case dLdC not equal 1 Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 fixed scalar case, added test Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 fixed bp algorithm and axes definition, need some more testing with different order combinations f,c; c,f; f,f and add some checks for inputs Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 some checks and corrections, added tests; there is still a problem with support for different input orders A-f B-c and A-f B-f Signed-off-by: Oleg
* Libnd4j: TensorMMul backprop op #8174 sync master Signed-off-by: Oleg
* - correct bug in MmulHelper::tensorDot(a, b, c, axes_a, axes_b, permutForC) Signed-off-by: Yurii
* Libnd4j: TensorMMul backprop op #8174 code cleanup and refactoring Signed-off-by: Oleg
* - add check for linspace-ordered permutations in ShapeUtils::evalShapeForTensorDot Signed-off-by: Yurii
* - provide additional code in shape::reshape stuff in order to reduce the amount of allocation/copy operations during the reshaping procedure Signed-off-by: Yurii
* - further work on the problem of wrong shape evaluation during permute/reshape procedures Signed-off-by: Yurii
* - still looking for the cause of the bug in reshape/permute stuff Signed-off-by: Yurii
* - correct bug in transform cuda native ops Signed-off-by: Yurii
* - correct bug in NDArray::assign Signed-off-by: Yurii
* - remove old shape::reshape stuff Signed-off-by: Yurii
* - add possibility to disable copying of the old buffer to the new buffer during the reshape operation in the NDArray class Signed-off-by: Yurii
* - correct bug in tensorDot which had to do with wrong pointer assignments Signed-off-by: Yurii

Co-authored-by: Oleh
---
 libnd4j/blas/NDArray.h | 16 +-
 libnd4j/blas/NDArray.hpp | 165 ++--
 libnd4j/include/helpers/ShapeBuilders.h | 13 +-
 .../include/helpers/cpu/ConstantTadHelper.cpp | 2 +-
 libnd4j/include/helpers/impl/MmulHelper.cpp | 63 +-
 .../include/helpers/impl/ShapeBuilders.cpp | 10 +
 libnd4j/include/helpers/impl/ShapeUtils.cpp | 21 +-
 libnd4j/include/helpers/shape.h | 831 ++++++++++--------
 .../loops/cuda/transform/transform_any.cu | 2 +-
 .../loops/cuda/transform/transform_bool.cu | 2 +-
 .../loops/cuda/transform/transform_float.cu | 2 +-
 .../loops/cuda/transform/transform_same.cu | 2 +-
 .../loops/cuda/transform/transform_strict.cu | 2 +-
 .../declarable/generic/blas/tensormmul.cpp | 192 +++-
 .../declarable/generic/nn/convo/conv1d.cpp | 8 +-
 .../declarable/generic/nn/convo/conv3d.cpp | 2 +-
 .../declarable/generic/nn/convo/deconv2d.cpp | 2 +-
.../declarable/generic/nn/convo/deconv3d.cpp | 2 +- .../generic/parity_ops/resize_area.cpp | 6 +- .../generic/parity_ops/resize_bicubic.cpp | 6 +- .../generic/parity_ops/resize_linear.cpp | 6 +- .../generic/parity_ops/resize_neighbor.cpp | 4 +- .../ops/declarable/generic/shape/squeeze.cpp | 12 +- libnd4j/include/ops/declarable/headers/blas.h | 3 +- .../declarable/helpers/cpu/convolutions.cpp | 4 +- .../ops/declarable/helpers/cpu/cross.cpp | 2 +- .../ops/declarable/helpers/cpu/s_t_b.cpp | 8 +- .../include/ops/declarable/helpers/cross.h | 2 +- .../declarable/helpers/cuda/convolutions.cu | 6 +- .../ops/declarable/helpers/cuda/s_t_b.cu | 8 +- .../declarable/platform/mkldnn/lstmLayer.cpp | 4 +- .../layers_tests/DeclarableOpsTests1.cpp | 274 ++++++ .../layers_tests/DeclarableOpsTests12.cpp | 24 - .../layers_tests/DeclarableOpsTests15.cpp | 444 ++++++++++ .../layers_tests/DeclarableOpsTests2.cpp | 240 ----- .../layers_tests/DeclarableOpsTests9.cpp | 33 - .../dimensionalityreduction/TestPCA.java | 2 +- 37 files changed, 1524 insertions(+), 901 deletions(-) diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index fe5f90bc3..3a68edde1 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -999,14 +999,14 @@ namespace nd4j { * set new order and shape in case of suitable array length (in-place operation) * order - order to set * shape - shape to set - * + * copyToNewBuff - if true then old buffer will be copied to new buffer if last one will be allocated after reshaping * if there was permute applied before or there are weird strides, then new buffer is allocated for array */ - bool reshapei(const char order, const std::initializer_list& shape); - bool reshapei(const char order, const std::vector& shape); + bool reshapei(const char order, const std::initializer_list& shape, const bool copyToNewBuff = true); + bool reshapei(const char order, const std::vector& shape, const bool copyToNewBuff = true); - bool reshapei(const std::initializer_list& shape); - bool reshapei(const std::vector& shape); + bool reshapei(const std::initializer_list& shape, const bool copyToNewBuff = true); + bool reshapei(const std::vector& shape, const bool copyToNewBuff = true); /** * creates new array with corresponding order and shape, new array will point on _buffer of this array @@ -1015,8 +1015,8 @@ namespace nd4j { * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ - NDArray reshape(const char order, const std::vector& shape) const &; - NDArray reshape(const char order, const std::vector& shape) &&; + NDArray reshape(const char order, const std::vector& shape, const bool copyToNewBuff = true) const &; + NDArray reshape(const char order, const std::vector& shape, const bool copyToNewBuff = true) &&; /** * calculate strides and set given order @@ -1493,7 +1493,7 @@ namespace nd4j { * @return */ bool isS() const; - + template std::vector asVectorT(); diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 79137ac3a..f7bad72c3 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -42,7 +42,7 @@ ND4J_EXPORT std::u32string NDArray::e(const Nd4jLong i) const; //////////////////////////////////////////////////////////////////////// // copy constructor NDArray::NDArray(const NDArray& other) { - + _context = other._context; _offset = 0; @@ -308,7 +308,7 @@ NDArray::NDArray(const std::u16string& u16string, nd4j::DataType dtype, nd4j::La if (!unicode::isStringValidU16(u16string.data(), u16string.data() + 
u16string.size())) { throw std::invalid_argument("NDArray::NDArray: invalid character in input string"); } - + // one word that is why used 1 Nd4jLong headerLength = ShapeUtils::stringBufferHeaderRequirements(1); @@ -435,11 +435,11 @@ NDArray::NDArray(const std::string& str, nd4j::DataType dtype, nd4j::LaunchConte _offset = 0; setShapeInfo(ShapeDescriptor::scalarDescriptor(dtype)); - + memcpy(bufferAsT(), &offsets[0], 2 * sizeof(Nd4jLong)); - + auto data = reinterpret_cast(bufferAsT() + headerLength); - + if (dtype == DataType::UTF8) { memcpy(data, str.data(), str.size()); } @@ -456,13 +456,13 @@ NDArray::NDArray(const std::string& str, nd4j::DataType dtype, nd4j::LaunchConte ///////////////////////////////////////////////////////////////////////// // constructors for vector of strings NDArray::NDArray(const std::vector& shape, const std::vector& string, const nd4j::DataType dataType, nd4j::LaunchContext* context) { - + if (!DataTypeUtils::isS(dataType)) throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); if (shape::prodLong(shape.data(), shape.size()) != string.size()) throw std::invalid_argument("NDArray::NDArray: Number of strings should match length of array"); - + for (const auto& str : string) { if (!unicode::isStringValidU8(str, str + std::char_traits::length(str)) ) { throw std::invalid_argument("NDArray::NDArray: invalid character in input string"); @@ -497,7 +497,7 @@ NDArray::NDArray(const std::vector& shape, const std::vectorgetWorkspace() != nullptr); memcpy(bufferAsT(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); - + auto data = reinterpret_cast(bufferAsT() + headerLength); auto func = PRAGMA_THREADS_FOR{ @@ -631,9 +631,9 @@ NDArray::NDArray(const std::vector& shape, const std::vectorgetWorkspace() != nullptr); memcpy(bufferAsT(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); - + auto data = reinterpret_cast(bufferAsT() + headerLength); - + auto func = PRAGMA_THREADS_FOR{ for (auto e = start; e < stop; e += increment) { auto cdata = data + offsets[e]; @@ -699,7 +699,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength); - + auto func = PRAGMA_THREADS_FOR{ for (auto e = start; e < stop; e += increment) { auto cdata = data + offsets[e]; @@ -715,7 +715,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); - auto data = reinterpret_cast(bufferAsT() + headerLength); - + auto data = reinterpret_cast(bufferAsT() + headerLength); + auto func = PRAGMA_THREADS_FOR{ for (auto e = start; e < stop; e += increment) { auto cdata = data + offsets[e]; @@ -781,7 +781,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); auto data = reinterpret_cast(bufferAsT() + headerLength); - + auto func = PRAGMA_THREADS_FOR{ for (auto e = start; e < stop; e += increment) { auto cdata = data + offsets[e]; @@ -847,7 +847,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& dimensions) cons ////////////////////////////////////////////////////////////////////////// void NDArray::printShapeInfo(const char * msg) const { - //shape::printShapeInfo(_shapeInfo); - if (msg == nullptr) - shape::printShapeInfoLinear(_shapeInfo); - else { - int rank = shape::rank(_shapeInfo); - int lim = shape::shapeInfoLength(rank); - printf("%s: [", msg); - for (int i = 0; i < shape::shapeInfoLength(rank); i++) { - printf("%lld", 
(long long) _shapeInfo[i]); - if (i < lim - 1) - printf(", "); - } - printf("]\n"); + + int rank = shape::rank(_shapeInfo); + int lim = shape::shapeInfoLength(rank); + + if(msg != nullptr) + printf("shapeInfo %s: [", msg); + else + printf("shapeInfo: ["); + + printf("%i, ", rank); + for (int i = 1; i < shape::shapeInfoLength(rank) - 3; i++){ + if(i == rank + 1) + printf(" "); + printf("%lld,", _shapeInfo[i]); } + printf(" %lld,", shape::type(_shapeInfo)); + printf("%lld,", shape::elementWiseStride(_shapeInfo)); + printf("%lld]\n", (Nd4jLong)shape::order(_shapeInfo)); + fflush(stdout); } @@ -1624,7 +1629,7 @@ void NDArray::printBuffer(const char* msg, Nd4jLong limit, const bool sync) cons if (e < limit - 1) printf(", "); } - } + } else if (this->isS()) { // todo do we need this print offsets /* @@ -1773,7 +1778,7 @@ void NDArray::printIndexedBuffer(const char* msg, Nd4jLong limit) const { printf("%s\n", this->e(0)?"true":"false"); } else if (this->isS()) { - // todo do we need this + // todo do we need this // printf("\"%lld\"\n", this->getOffset(e)); printf("\"%s\"\n", this->e(0).c_str()); } @@ -1855,19 +1860,19 @@ void NDArray::updateStrides(const char order) { ////////////////////////////////////////////////////////////////////////// // set new order and shape in case of suitable array length -bool NDArray::reshapei(const char order, const std::initializer_list& shape) { +bool NDArray::reshapei(const char order, const std::initializer_list& shape, const bool copyToNewBuff) { std::vector vShape(shape); - return reshapei(order, vShape); + return reshapei(order, vShape, copyToNewBuff); } ////////////////////////////////////////////////////////////////////////// -bool NDArray::reshapei(const std::initializer_list& shape) { - return reshapei('c', shape); +bool NDArray::reshapei(const std::initializer_list& shape, const bool copyToNewBuff) { + return reshapei(ordering(), shape, copyToNewBuff); } ////////////////////////////////////////////////////////////////////////// -bool NDArray::reshapei(const std::vector& shape) { - return reshapei('c', shape); +bool NDArray::reshapei(const std::vector& shape, const bool copyToNewBuff) { + return reshapei(ordering(), shape, copyToNewBuff); } ////////////////////////////////////////////////////////////////////////// @@ -1918,18 +1923,18 @@ Nd4jLong NDArray::argMax(std::initializer_list dimensions) { ////////////////////////////////////////////////////////////////////////// // create new array with corresponding order and shape, new array will point to the same _buffer as this array -NDArray NDArray::reshape(const char order, const std::vector& shape) const & { +NDArray NDArray::reshape(const char order, const std::vector& shape, const bool copyToNewBuff) const & { NDArray newArr(getDataBuffer(), ShapeDescriptor(getShapeInfo()), getContext(), getBufferOffset()); - newArr.reshapei(order, shape); + newArr.reshapei(order, shape, copyToNewBuff); return newArr; } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reshape(const char order, const std::vector& shape) && { +NDArray NDArray::reshape(const char order, const std::vector& shape, const bool copyToNewBuff) && { - this->reshapei(order, shape); + this->reshapei(order, shape, copyToNewBuff); return std::move(*this); } @@ -2280,7 +2285,7 @@ template NDArray NDArray::asT() const{ auto result = isScalar() ? 
NDArray('c', {}, std::vector{0.}, DataTypeUtils::fromT(), this->getContext()) : NDArray(ordering(), getShapeAsVector(), DataTypeUtils::fromT(), this->getContext()); - + NDArray::prepareSpecialUse({&result}, {this}); NativeOpExecutioner::execTransformAny(getContext(), transform::AnyOps::Assign, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), result.getBuffer(), result.getShapeInfo(), result.getSpecialBuffer(), result.getSpecialShapeInfo(), nullptr, nullptr, nullptr); NDArray::registerSpecialUse({&result}, {this}); @@ -2298,15 +2303,15 @@ NDArray NDArray::asS() const { auto dtype = DataTypeUtils::fromT(); - if (!(DataTypeUtils::isS(dtype))) + if (!(DataTypeUtils::isS(dtype))) throw std::invalid_argument("NDArray::asS: invalid DataType used"); - + if (dtype == dataType()) { - + Nd4jLong offsetsLength = ShapeUtils::stringBufferHeaderRequirements(lengthOf()); const auto nInputoffsets = bufferAsT(); std::shared_ptr pBuffer = std::make_shared(offsetsLength + nInputoffsets[lengthOf()], dtype, getContext()->getWorkspace(), true); - + NDArray res(pBuffer, ShapeDescriptor(dtype, ordering(), getShapeAsVector()), getContext()); res.setAttached(getContext()->getWorkspace() != nullptr); @@ -2319,7 +2324,7 @@ NDArray NDArray::asS() const { registerPrimaryUse({ &res }, { this }); return res; } - + Nd4jLong offsetsLength = ShapeUtils::stringBufferHeaderRequirements(lengthOf()); std::vector offsets(lengthOf() + 1); @@ -2353,7 +2358,7 @@ NDArray NDArray::asS() const { NDArray res(pBuffer, ShapeDescriptor(dtype, ordering(), getShapeAsVector()), getContext()); res.setAttached(getContext()->getWorkspace() != nullptr); - + preparePrimaryUse({ &res }, { this }); memcpy(res.bufferAsT(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); @@ -2403,7 +2408,7 @@ BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray NDArray::asS, () const, LIBND //////////////////////////////////////////////////////////////////////// NDArray NDArray::asT(DataType dtype) const { - + if (isS() && !DataTypeUtils::isS(dtype)) throw std::runtime_error("NDArray::asT: you can't use this method on String array with not string DataType!"); @@ -3221,7 +3226,7 @@ BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT std::vector, NDArray::asVectorT(), LI ////////////////////////////////////////////////////////////////////////// // set new order and shape in case of suitable array length -bool NDArray::reshapei(const char order, const std::vector& cshape) { +bool NDArray::reshapei(const char order, const std::vector& cshape, const bool copyToNewBuff) { // check firstly whether cshape is identical to shape of array, if yes then reshape is unnecessary if(order == ordering() && shape::shapeEquals(rankOf(), shapeOf(), cshape.size(), cshape.data())) @@ -3293,19 +3298,15 @@ bool NDArray::reshapei(const char order, const std::vector& cshape) { Nd4jLong *shapeInfoNew; ALLOCATE(shapeInfoNew, getContext()->getWorkspace(), shape::shapeInfoLength(rank), Nd4jLong); - bool canReshape = shape::reshapeC(rankOf(), shapeInfo(), shape.size(), shape.data(), shapeInfoNew); + bool canReshape = shape::reshapeC(shapeInfo(), order, shape.size(), shape.data(), shapeInfoNew); - // we can do this only if there was no permute applied, or there are no weird strides if (canReshape) { - if(ordering() == 'c' && order == 'f') - throw std::invalid_argument("NDArray::reshapei(order, shape): in case of reshapeC it doesn't make sense to reshape from c order to f order !"); - - shape::setEws(shapeInfoNew, arrLength); setShapeInfo(shapeInfoNew); } else { NDArray temp(order, 
shape, dataType(), getContext()); - this->applyTransform(transform::Assign, temp, nullptr); + if(copyToNewBuff) + this->applyTransform(transform::Assign, temp, nullptr); *this = std::move(temp); } @@ -3463,7 +3464,7 @@ NDArray NDArray::dup(const char newOrder) const { if (isS()) { if (dataType() == DataType::UTF8) { std::vector strings(lengthOf()); - + auto func = PRAGMA_THREADS_FOR{ for (auto i = start; i < stop; i += increment) { strings[i] = std::move(this->e(i)); @@ -3521,7 +3522,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { if (isS()) { // string is special case, we'll compare them one by one, considering both arrays are guaranteed to have the same length - + if (dataType() == DataType::UTF8) { for (int e = 0; e < this->lengthOf(); e++) { auto s1 = this->e(e); @@ -3585,7 +3586,7 @@ std::string NDArray::e(const Nd4jLong i) const { if (i == lengthOf()) throw std::runtime_error("Can't get std::string for index out of range"); - + if (this->dataType() == DataType::UTF16) { auto u16 = this->e(i); std::string s; @@ -4846,7 +4847,7 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni auto shapeOf = shape::shapeOf(newShapeInfo); auto stridesOf = shape::stride(newShapeInfo); - Nd4jLong offset(0), subArrLen(1); + Nd4jLong offset = 0; int n(isStrided ? 3 : 2), first, last, stride; for (int d = rank - 1; d >= 0; --d) { @@ -4863,29 +4864,31 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni if(shapeOf[d] != 1) stridesOf[d] *= stride; } + } - subArrLen *= shapeOf[d]; + Nd4jLong *shapeInfoNoUnities = newShapeInfo; + + if(!keepUnitiesInShape) { + + std::vector dimsWithUnities; + + for (uint d = 0; d < rank; ++d) + if(idx[n*d] != idx[n*d+1] && shapeOf[d] == 1) + dimsWithUnities.push_back(d); + + if(!dimsWithUnities.empty()) + shapeInfoNoUnities = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace()); } // check if there is possibility to set ews = 1 - shape::setEws(newShapeInfo, subArrLen); + shape::checkStridesSetEwsAndOrder(shapeInfoNoUnities); - NDArray result(_buffer, ShapeDescriptor(newShapeInfo), getContext(), offset + getBufferOffset()); + NDArray result(_buffer, ShapeDescriptor(shapeInfoNoUnities), getContext(), offset + getBufferOffset()); result._isView = true; - if(!keepUnitiesInShape) { - const int coeff = isStrided ? 
3 : 2; - std::vector nonUnitDims; - - for (int d = 0; d < rank; ++d) - if(!(idx[coeff*d] != idx[coeff*d+1] && newShapeInfo[d+1] == 1)) - nonUnitDims.push_back(newShapeInfo[d+1]); - - if(nonUnitDims.size() != rank) - result.reshapei(nonUnitDims); - } - RELEASE(newShapeInfo, getContext()->getWorkspace()); + if(newShapeInfo != shapeInfoNoUnities) + RELEASE(shapeInfoNoUnities, getContext()->getWorkspace()); return result; } diff --git a/libnd4j/include/helpers/ShapeBuilders.h b/libnd4j/include/helpers/ShapeBuilders.h index 49ef20e9f..2d71c7ab2 100644 --- a/libnd4j/include/helpers/ShapeBuilders.h +++ b/libnd4j/include/helpers/ShapeBuilders.h @@ -30,15 +30,15 @@ namespace nd4j { class ND4J_EXPORT ShapeBuilders { - public: + public: static Nd4jLong* createScalarShapeInfo(nd4j::DataType dataType, nd4j::memory::Workspace* workspace = nullptr); - + static Nd4jLong* createVectorShapeInfo(const nd4j::DataType dataType, const Nd4jLong length, nd4j::memory::Workspace* workspace = nullptr); /** * create shapeInfo for given order basing on shape stored in shapeOnly vector * memory allocation for shapeInfo is on given workspace - */ + */ static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, int rank, const Nd4jLong* shapeOnly, memory::Workspace* workspace = nullptr); static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, const std::vector& shapeOnly, memory::Workspace* workspace = nullptr); static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, const std::initializer_list& shapeOnly, memory::Workspace* workspace = nullptr); @@ -51,6 +51,13 @@ namespace nd4j { static Nd4jLong* copyShapeInfoAndType(const Nd4jLong* inShapeInfo, const DataType dtype, const bool copyStrides, memory::Workspace* workspace = nullptr); static Nd4jLong* copyShapeInfoAndType(const Nd4jLong* inShapeInfo, const Nd4jLong* shapeInfoToGetTypeFrom, const bool copyStrides, memory::Workspace* workspace = nullptr); + /** + * allocates memory for new shapeInfo and copy all information from inShapeInfo to new shapeInfo except dimensions in dimsToExclude (unit dimensions) and corresponding strides + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + static Nd4jLong* copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace = nullptr); + static Nd4jLong* emptyShapeInfo(const nd4j::DataType dataType, memory::Workspace* workspace = nullptr); static Nd4jLong* emptyShapeInfo(const nd4j::DataType dataType, const char order, const std::vector &shape, memory::Workspace* workspace = nullptr); diff --git a/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp b/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp index 822b5ad0d..d48cfca61 100644 --- a/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp +++ b/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp @@ -68,7 +68,7 @@ namespace nd4j { const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(shapeInfo, dimsToExclude); const int subArrRank = (rank == dimsToExclude.size() || descriptor.areUnitiesinShape()) ? 
rank : rank - dimsToExclude.size(); - auto sPtr = new Nd4jLong[shape::shapeInfoLength(subArrRank)]; + auto sPtr = new Nd4jLong[shape::shapeInfoLength(subArrRank)]; // shape of sub-arrays (same for all for them) auto oPtr = new Nd4jLong[numOfSubArrs]; if (numOfSubArrs > 0) diff --git a/libnd4j/include/helpers/impl/MmulHelper.cpp b/libnd4j/include/helpers/impl/MmulHelper.cpp index 716062a53..abc353132 100644 --- a/libnd4j/include/helpers/impl/MmulHelper.cpp +++ b/libnd4j/include/helpers/impl/MmulHelper.cpp @@ -43,23 +43,30 @@ nd4j::NDArray* nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::N auto outShape = ShapeUtils::evalShapeForTensorDot(a, b, axes_0, axes_1, permutAt, permutBt, shapeAt, shapeBt); - NDArray aPR = a->permute(permutAt); - NDArray bPR = b->permute(permutBt); + // check whether permutation is necessary + const NDArray* aP = permutAt.empty() ? a : new NDArray(a->permute(permutAt)); + const NDArray* bP = permutBt.empty() ? b : new NDArray(b->permute(permutBt)); // check whether reshape is necessary - if(!aPR.isSameShape(shapeAt)) - aPR.reshapei( shapeAt); - if(!bPR.isSameShape(shapeBt)) - bPR.reshapei( shapeBt); + const NDArray* aPR = aP->isSameShape(shapeAt) ? aP : new NDArray(aP->reshape(aP->ordering(), shapeAt)); + const NDArray* bPR = bP->isSameShape(shapeAt) ? bP : new NDArray(bP->reshape(bP->ordering(), shapeBt)); - NDArray* c = mmul(&aPR, &bPR, nullptr, 1.0, 0.0); + NDArray* c = mmul(aPR, bPR, nullptr, 1.0, 0.0); c->reshapei(outShape); + if(aP != aPR) + delete aPR; + if(bP != bPR) + delete bPR; + if(a != aP) + delete aP; + if(b != bP) + delete bP; + return c; } - ////////////////////////////////////////////////////////////////////////// void nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, nd4j::NDArray* c, const std::vector& axes_a, const std::vector& axes_b, const std::vector& permutForC) { @@ -67,32 +74,38 @@ void nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, std::vector shapeAt, shapeBt; ShapeUtils::evalShapeForTensorDot(a, b, axes_a, axes_b, permutAt, permutBt, shapeAt, shapeBt); - NDArray *cP(c), *cPR(c); - // check whether permutation is required - if(!permutForC.empty()) - cP = new NDArray(c->permute(permutForC)); + NDArray* cP = permutForC.empty() ? c : new NDArray(c->permute(permutForC)); - auto aPR = a->permute(permutAt); - auto bPR = b->permute(permutBt); + // check whether permutation is necessary + const NDArray* aP = permutAt.empty() ? a : new NDArray(a->permute(permutAt)); + const NDArray* bP = permutBt.empty() ? b : new NDArray(b->permute(permutBt)); // check whether reshape is necessary - if(!aPR.isSameShape(shapeAt)) - aPR.reshapei(shapeAt); - if(!bPR.isSameShape(shapeBt)) - bPR.reshapei(shapeBt); + const NDArray* aPR = aP->isSameShape(shapeAt) ? aP : new NDArray(aP->reshape(aP->ordering(), shapeAt)); + const NDArray* bPR = bP->isSameShape(shapeAt) ? bP : new NDArray(bP->reshape(bP->ordering(), shapeBt)); - if(!cP->isSameShape({aPR.sizeAt(0), bPR.sizeAt(1)})) - cPR = new NDArray(cP->reshape(cP->ordering(), {aPR.sizeAt(0), bPR.sizeAt(1)})); + std::vector requiredCshape = {aPR->sizeAt(0), bPR->sizeAt(1)}; - mmul(&aPR, &bPR, cPR, 1.0, 0.0); + NDArray* cPR = cP->isSameShape(requiredCshape) ? 
cP : new NDArray(cP->reshape(cP->ordering(), requiredCshape, false)); + + mmul(aPR, bPR, cPR, 1.0, 0.0); if(cPR->getBuffer() != cP->getBuffer() || cPR->getSpecialBuffer() != cP->getSpecialBuffer() ) // this means both permute and reshape have been performed on c, cP always points on c->getBuffer() cP->assign(cPR); - if(cPR != c) + if(aP != aPR) + delete aPR; + if(bP != bPR) + delete bPR; + if(a != aP) + delete aP; + if(b != bP) + delete bP; + + if(cP != cPR) delete cPR; - if(cP != c) + if(c != cP) delete cP; } @@ -129,7 +142,7 @@ void nd4j::MmulHelper::tensorDot(const NDArray* a, const NDArray* b, NDArray* c, if(!whatToDoWithC.empty()) { cArrs = std::vector(whatToDoWithC.size()+1, c); for(int i = 0; i < cArrs.size()-1; ++i) - cArrs[i+1] = (whatToDoWithC[i] == 'p') ? new NDArray(cArrs[i]->permute(modifC[i])) : new NDArray(cArrs[i]->reshape(c->ordering(), modifC[i])); // since we ignore first element in cArrs (that is cArrs[0]) then it is always equal to c + cArrs[i+1] = (whatToDoWithC[i] == 'p') ? new NDArray(cArrs[i]->permute(modifC[i])) : new NDArray(cArrs[i]->reshape(c->ordering(), modifC[i], false)); // since we ignore first element in cArrs (that is cArrs[0]) then it is always equal to c } mmul(aPR, bPR, cArrs[cArrs.size()-1], 1.0, 0.0); @@ -208,7 +221,7 @@ nd4j::NDArray* MmulHelper::mmul(const nd4j::NDArray* A, const nd4j::NDArray* B, // vector x matrix, A{M} x B{M,N} = C{N} -> reduce to matrix x matrix A2{1,M} x B{M,N} = C2{1,N}, since there is no corresponding blas operation sgevm if(isAVector && bRank == 2) { NDArray* A2 = new NDArray(A->reshape(A->ordering(), {1, A->lengthOf()})); // A{M} -> A2{1,M} - NDArray* C2 = C ? new NDArray(C->reshape(C->ordering(), {1, C->lengthOf()})) : nullptr; // C{N} -> C2{1,N} + NDArray* C2 = C ? new NDArray(C->reshape(C->ordering(), {1, C->lengthOf()}, false)) : nullptr; // C{N} -> C2{1,N} auto result = mmulMxM(A2, B, C2, alpha, beta, outOrder); // result{1,N} delete A2; delete C2; diff --git a/libnd4j/include/helpers/impl/ShapeBuilders.cpp b/libnd4j/include/helpers/impl/ShapeBuilders.cpp index 70aa934ca..d8443e180 100644 --- a/libnd4j/include/helpers/impl/ShapeBuilders.cpp +++ b/libnd4j/include/helpers/impl/ShapeBuilders.cpp @@ -139,5 +139,15 @@ namespace nd4j { return ShapeBuilders::copyShapeInfoAndType(inShapeInfo, ArrayOptions::dataType(shapeInfoToGetTypeFrom), copyStrides, workspace); } +//////////////////////////////////////////////////////////////////////////////// +Nd4jLong* ShapeBuilders::copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace) { + + Nd4jLong *outShapeInfo = nullptr; + ALLOCATE(outShapeInfo, workspace, shape::shapeInfoLength(inShapeInfo[0] - dimsSize), Nd4jLong); + + shape::excludeUnitiesFromShapeInfo(inShapeInfo, dimsSize, dimsToExclude, outShapeInfo); + + return outShapeInfo; +} } \ No newline at end of file diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 165ed5ffd..9d002e238 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -75,10 +75,23 @@ std::vector ShapeUtils::evalShapeForTensorDot(const Nd4jLong* aShapeIn permutBt = axesB; permutBt.insert(permutBt.end(), list_B.begin(), list_B.end()); + // if permut contains something like {0,1,2,..rank-1}, then there is no need to make permutation and we return empty vector in this case + uint i1, i2; + for(i1 = 0; i1 < aRank; ++i1) + if(permutAt[i1] != i1) + break; + if(i1 == aRank) + 
permutAt = {}; + for(i2 = 0; i2 < bRank; ++i2) + if(permutBt[i2] != i2) + break; + if(i2 == bRank) + permutBt = {}; + Nd4jLong n2 = 1; for (int i = 0; i < axeAsize; i++) n2 *= aShapeInfo[axesA[i] + 1]; - shapeAt = {-1, n2}; + shapeAt = {shape::length(aShapeInfo) / n2, n2}; std::vector oldShapeA; oldShapeA.resize(list_A.size()); @@ -89,7 +102,7 @@ std::vector ShapeUtils::evalShapeForTensorDot(const Nd4jLong* aShapeIn Nd4jLong n3 = 1; for (int i = 0; i < axeBsize; i++) n3 *= bShapeInfo[axesB[i] + 1]; - shapeBt = {n3, -1}; + shapeBt = {n3, shape::length(bShapeInfo) / n3}; std::vector oldShapeB; oldShapeB.resize(list_B.size()); @@ -306,10 +319,10 @@ std::vector ShapeUtils::evalRepeatShape(int axis, const std::vector {a,b} */ ND4J_EXPORT _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo, const Nd4jLong numOfSubArrs, const int dimsSize, const int* dimsToExclude, Nd4jLong* subArrShapeInfo, Nd4jLong* subArrOffsets, bool keepUnitiesInShape = false); + /** + * for example inShapeInfo is {3, 2,1,4, 4,4,1, 16384,1,99} + * then output shapeNoUnities will contain {2,4, 4,1} - that is only shape and strides, no rank/type/ews/order + * stridesNoUnities will point on strides in shapeNoUnities that is on {4,1} + * returns number of non-unity dimensions in inShapeInfo + * if there is no unities in inShapeInfo, then no copy procedure will be performed and shapeNoUnities/stridesNoUnities will point on corresponding places in inShapeInfo + */ + ND4J_EXPORT _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities); + + /** + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo); @@ -2050,7 +2071,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn shapeInfo[i + 1 + rank] = temp[rearrange[i] + 1 + rank]; } - shape::setOrderAndEws(shapeInfo, len); + shape::checkStridesSetEwsAndOrder(shapeInfo); delete[] temp; } @@ -2227,7 +2248,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn INLINEDEF _CUDA_HD bool isCommonVector(const Nd4jLong *shapeInfo, int& posOfNonUnityDim) { if(rank(shapeInfo) > 0 && length(shapeInfo) == 1) { - posOfNonUnityDim = 0; + posOfNonUnityDim = -1; return true; } @@ -2272,6 +2293,18 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn return isVector && !shapeFirstOne; } +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape) { + + int num = 0; + + for(uint i = 0; i < rank; ++i) + if(inShape[i] != 1) + ++num; + + return num; +} + INLINEDEF _CUDA_HD int oneDimEqualToLength(Nd4jLong *shape, int rank) { for(int i = 0; i < rank; i++) { if(shape[i] == shape::prodLong(shape,rank)) @@ -2310,8 +2343,14 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn * Returns the shape portion of an information * buffer */ - INLINEDEF _CUDA_HD Nd4jLong *shapeOf(Nd4jLong *buffer) { - return buffer + 1; + INLINEDEF _CUDA_HD Nd4jLong *shapeOf(Nd4jLong *shapeInfo) { + + return shapeInfo + 1; + } + + INLINEDEF _CUDA_HD Nd4jLong *shapeOf(const Nd4jLong *shapeInfo) { + + return shape::shapeOf(const_cast(shapeInfo)); } /** @@ -2444,7 +2483,7 
@@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn newShapeBuffer[2 * newRank + 3] = shape::order(shapeBuffer); // correct order and ews if necessary - shape::setOrderAndEws(newShapeBuffer); + shape::checkStridesSetEwsAndOrder(newShapeBuffer); delete[] indices; @@ -3918,121 +3957,151 @@ INLINEDEF _CUDA_HD bool areStridesDefault(const Nd4jLong* shapeInfo) { // return true; // } -// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, const bool isFOrder, Nd4jLong* newShapeInfo) { +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { // // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements // // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo -// const int newOrder = isFOrder ? 102 : 99; -// const int oldOrder = oldShapeInfo[2 * oldRank + 3]; - // newShapeInfo[0] = newRank; // memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); -// Nd4jLong* newStrides = shape::stride(newShapeInfo); -// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); +// Nd4jLong* newStrides = shape::stride(newShapeInfo); +// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); // const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); -// int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; - +// Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; // while (newStart < newRank && oldStart < oldRank) { // newDim = newShape[newStart]; // oldDim = oldShape[oldStart]; -// while (newDim != oldDim) +// while (newDim != oldDim && newDim > 0 && oldDim > 0) // if (newDim < oldDim) newDim *= newShape[newStop++]; // else oldDim *= oldShape[oldStop++]; // // ------ Check whether the original axes can be combined ------ // -// for (int i = oldStart; i < oldStop - 1; i++) { - -// if(oldShape[i] == 1) { // ignore strides like {...,1,1,...} -// if(oldOrder == 102) ++oldStart; +// for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { +// if(oldShape[i] == 1) // skip unity-dimension and its stride // continue; -// } - -// if(oldOrder == 102 && oldStrides[i + 1] != oldShape[i] * oldStrides[i]) -// return false; // not contiguous enough -// if(oldOrder == 99 && oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) -// return false; // not contiguous enough +// while((i + step) < oldRank && oldShape[i + step] == 1) +// ++step; // skip following unity-dimensions and its strides if such are present +// if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) +// return false; // not contiguous enough // } -// // ------ Calculate new strides for all axes currently worked with ------ // -// if(isFOrder) { -// newStrides[newStart] = oldStrides[oldStart]; -// for (int i = newStart + 1; i < newStop; ++i) -// newStrides[i] = newStrides[i - 1] * newShape[i - 1]; -// } -// else { -// newStrides[newStop - 1] = oldStrides[oldStop - 1]; -// for (int i = newStop - 1; i > newStart; --i) -// newStrides[i - 1] = newStrides[i] * newShape[i]; -// } +// newStrides[newStop - 1] = oldStrides[oldStop - 1]; +// for (int i = newStop - 1; i > newStart; 
--i) +// newStrides[i - 1] = newStrides[i] * newShape[i]; // newStart = newStop++; // oldStart = oldStop++; // } -// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order -// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews -// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type +// // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) +// for (int i = newStart; i < newRank; ++i) +// newStrides[i] = 1; + +// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order +// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews +// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type // return true; // } ////////////////////////////////////////////////////////////////////// -INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { +INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, const char newOrder, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { - // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements - // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo + // copy shape from newShape into newShapeInfo + newShapeInfo[0] = newRank; + memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); - newShapeInfo[0] = newRank; - memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); + // copy order + newShapeInfo[2 * newRank + 3] = newOrder; - Nd4jLong* newStrides = shape::stride(newShapeInfo); - const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); - const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); - Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; + return shape::reshapeC(oldShapeInfo, newShapeInfo); +} - while (newStart < newRank && oldStart < oldRank) { +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShapeInfo) { - newDim = newShape[newStart]; - oldDim = oldShape[oldStart]; + // newShapeInfo contains rank, shape and order; but no strides, type and ews - while (newDim != oldDim && newDim > 0 && oldDim > 0) - if (newDim < oldDim) newDim *= newShape[newStop++]; - else oldDim *= oldShape[oldStop++]; - - // ------ Check whether the original axes can be combined ------ // - for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { - if(oldShape[i] == 1) // skip unity-dimension and its stride - continue; - while((i + step) < oldRank && oldShape[i + step] == 1) - ++step; // skip following unity-dimensions and its strides if such are present - if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) - return false; // not contiguous enough - } - - newStrides[newStop - 1] = oldStrides[oldStop - 1]; - for (int i = newStop - 1; i > newStart; --i) - newStrides[i - 1] = newStrides[i] * newShape[i]; - - newStart = newStop++; - oldStart = oldStop++; - } - - // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) - for (int i = newStart; i < newRank; ++i) - newStrides[i] = 1; - - newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order - newShapeInfo[2 * 
newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews - newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type + const int newRank = shape::rank(newShapeInfo); + // if oldShapeInfo is scalar or vector with length=1 + if(shape::length(oldShapeInfo) == 1) { + for (uint i = 0; i < newRank; ++i) + shape::stride(newShapeInfo)[i] = 1; + newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); + *shape::ews(newShapeInfo) = 1; return true; } + const auto oldOrder = shape::order(oldShapeInfo); + const auto newOrder = shape::order(newShapeInfo); + const auto oldEws = shape::elementWiseStride(const_cast(oldShapeInfo)); + + if(oldEws > 0 && oldOrder != newOrder) + return false; + + // *** FIRST STAGE - exclude unity dimensions from oldShapeInfo and newShapeInfo (if such are present of course), since they don't affect on strides evaluation, however they complicate code + + // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities) + Nd4jLong tempBuffer[4*MAX_RANK]; + Nd4jLong *oldShape = tempBuffer, *newShape = tempBuffer + 2*MAX_RANK, *oldStrides, *newStrides; + + // exclude unities from oldShapeInfo + const int oldNumOfNonUnities = shape::excludeUnitiesFromShapeInfo(oldShapeInfo, oldShape, oldStrides); + const int newNumOfNonUnities = shape::excludeUnitiesFromShapeInfo(newShapeInfo, newShape, newStrides); + + // *** SECOND STAGE - strides evaluation + + int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; + + while (newStart < newNumOfNonUnities && oldStart < oldNumOfNonUnities) { + + newDim = newShape[newStart]; + oldDim = oldShape[oldStart]; + + while (newDim != oldDim && newDim > 0 && oldDim > 0) { + + if (newDim < oldDim) + newDim *= newShape[newStop++]; + else + oldDim *= oldShape[oldStop++]; + } + + // check c-contiguous of old axes range + for(uint i = oldStart; i < oldStop - 1; ++i) // do not check value of last stride, it doesn't matter + if(oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) + return false; // not contiguous + + // fill newStrides in c manner + newStrides[newStop - 1] = oldStrides[oldStop - 1]; // copy last stride + for (int i = newStop - 2; i >= newStart; --i) + newStrides[i] = newStrides[i + 1] * newShape[i + 1]; + + newStart = newStop++; + oldStart = oldStop++; + } + + // fill new calculated strides into newShapeInfo, take into account possible unities in shape + for (int j = 0, i = 0; i < newRank; ++i) + shape::stride(newShapeInfo)[i] = (shape::shapeOf(newShapeInfo)[i] == 1) ? 
1 : newStrides[j++]; + + // set ews + if(oldEws == 0) + shape::checkStridesSetEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order + else { + newShapeInfo[2 * newRank + 3] = oldOrder; // order + *shape::ews(newShapeInfo) = oldEws; // ews + } + + newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type + + return true; +} INLINEDEF _CUDA_H bool canReshape(const int oldRank, Nd4jLong* oldShape, const int newRank, Nd4jLong* newShapeOf, bool isFOrder) { @@ -4573,129 +4642,75 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD setEws(Nd4jLong* shapeInfo, Nd4jLong len) { +INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) { + // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities) + Nd4jLong tempBuffer[2*MAX_RANK]; + Nd4jLong *shape = tempBuffer, *strides; - const int rank = shape::rank(shapeInfo); - const Nd4jLong* shape = shape::shapeOf(shapeInfo); - const Nd4jLong* strides = shape::stride(shapeInfo); - const char order = shape::order(shapeInfo); - Nd4jLong* ews = shape::ews(shapeInfo); + // exclude unities from shapeInfo + const int numOfNonUnities = shape::excludeUnitiesFromShapeInfo(shapeInfo, shape, strides); - if(len == -1) // calculate array length if it is not given - len = shape::length(shapeInfo); - - if(len <= 1) { // empty, scalar or unity-vector case - *ews = 1; - return; - } - - int nonUnityDim(0); - if(shape::isCommonVector(shapeInfo, nonUnityDim)) { - *ews = strides[nonUnityDim]; - return; - } - - // check last(c)/first(f) dimension, it should be equal to 1 - if((order == 'c' && shape[rank - 1] != 1 && strides[rank - 1] != 1) || (order == 'f' && shape[0] != 1 && strides[0] != 1)) { - *ews = 0; - return; - } - - Nd4jLong correctStride = 1; - if(order == 'c') { - for (int i = rank - 2; i >= 0 ; i--) { - correctStride *= shape[i + 1]; - if(shape[i] == 1) - continue; - if(correctStride != strides[i]) { - *ews = 0; - return; - } - } - } - else { - for (int i = 1; i < rank; ++i) { - correctStride *= shape[i - 1]; - if(shape[i] == 1) - continue; - if(correctStride != strides[i]) { - *ews = 0; - return; - } - } - } - - *ews = 1; + shape::checkStridesSetEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides); } ////////////////////////////////////////////////////////////////////// -INLINEDEF _CUDA_HD void setOrderAndEws(Nd4jLong* shapeInfo, Nd4jLong len) { +INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) { - const int rank = shape::rank(shapeInfo); - const Nd4jLong* shape = shape::shapeOf(shapeInfo); - const Nd4jLong* strides = shape::stride(shapeInfo); - const char order = shape::order(shapeInfo); - Nd4jLong* ews = shape::ews(shapeInfo); + const int rank = shape::rank(shapeInfo); - if(len == -1) // calculate array length if it is not given - len = shape::length(shapeInfo); - - if(len <= 1) { // empty, scalar or unity-vector case - *ews = 1; + if(shape::length(shapeInfo) == 1) { + *shape::ews(shapeInfo) = 1; + shapeInfo[rank * 2 + 3] = (int)proposedOrder; return; } - int nonUnityDim(0); - if(shape::isCommonVector(shapeInfo, nonUnityDim)) { // in this case we don't change order - *ews = strides[nonUnityDim]; + if(numOfNonUnities 
== 1) { // case of common vector + *shape::ews(shapeInfo) = *stridesNoUnities; + shapeInfo[rank * 2 + 3] = (int)proposedOrder; return; } - // check if strides are contiguous in respect to c-order - // firstly check last stride, it should be equal to 1 - if (strides[rank - 1] == 1 || shape[rank - 1] == 1) { // last dimension is ok, go on through the rest dimensions in reverse order - Nd4jLong correctStride = 1; - bool cContiguous = true; - for (int i = rank - 2; i >= 0 ; i--) { - correctStride *= shape[i + 1]; - if(shape[i] == 1) - continue; - if(correctStride != strides[i]) { - cContiguous = false; + bool contiguous = true; + + // *** check whether strides are in c contiguous order ***// + if(stridesNoUnities[numOfNonUnities - 1] != 1) // last stride should be always unity for c order + contiguous = false; + else { + for (uint i = 0; i < numOfNonUnities - 1; ++i) { + if(stridesNoUnities[i] != stridesNoUnities[i + 1] * shapeNoUnities[i + 1]) { + contiguous = false; break; } } - if(cContiguous) { - *ews = 1; - shapeInfo[shape::shapeInfoLength(rank) - 1] = 99; - return; - } + } + if(contiguous) { + *shape::ews(shapeInfo) = 1; + shapeInfo[rank * 2 + 3] = 99; + return; } - // now check if strides are contiguous in respect to f-order - // firstly check first stride, it should be equal to 1 - if(strides[0] == 1 || shape[0] == 1) { // first dimension is ok, go on through the rest dimensions - Nd4jLong correctStride = 1; - bool fContiguous = true; - for (int i = 1; i < rank; ++i) { - correctStride *= shape[i - 1]; - if(shape[i] == 1) - continue; - if(correctStride != strides[i]) { - fContiguous = false; + contiguous = true; + + //*** check whether strides are in f contiguous order ***// + if(stridesNoUnities[0] != 1) // first stride should be always unity for f order + contiguous = false; + else { + for (uint i = 1; i < numOfNonUnities; ++i) { + if(stridesNoUnities[i] != stridesNoUnities[i - 1] * shapeNoUnities[i - 1]) { + contiguous = false; break; } } - if(fContiguous) { - *ews = 1; - shapeInfo[shape::shapeInfoLength(rank) - 1] = 102; - return; - } + } + if(contiguous) { + *shape::ews(shapeInfo) = 1; + shapeInfo[rank * 2 + 3] = 102; + return; } - *ews = 0; - // if both cContiguous and fContiguous are false then order is preserved + *shape::ews(shapeInfo) = 0; + shapeInfo[rank * 2 + 3] = (int)proposedOrder; } ////////////////////////////////////////////////////////////////////// @@ -4709,49 +4724,42 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo return; } - Nd4jLong *outShapeInfo = new Nd4jLong[shape::shapeInfoLength(wholeShapeInfo)]; - memcpy(outShapeInfo, wholeShapeInfo, shape::shapeInfoByteLength(wholeShapeInfo)); + const int subArrRank = keepUnitiesInShape ? rank : rank - dimsSize; + + subArrShapeInfo[0] = subArrRank; // rank + subArrShapeInfo[2 * subArrRank + 1] = shape::type(wholeShapeInfo); // type + subArrShapeInfo[2 * subArrRank + 3] = shape::order(wholeShapeInfo); // order Nd4jLong* shape = new Nd4jLong[dimsSize]; Nd4jLong* strides = new Nd4jLong[dimsSize]; - const int subArrRank = keepUnitiesInShape ? 
rank : rank - dimsSize; - Nd4jLong* shapeNoUnities = nullptr; - if(!keepUnitiesInShape) - shapeNoUnities = new Nd4jLong[subArrRank]; - - Nd4jLong subArrLen = 1; - for(int k = subArrRank - 1, j = dimsSize - 1, i = rank - 1; i >= 0; --i) { + if(j >= 0 && i == dimsToExclude[j]) { - strides[j] = shape::stride(outShapeInfo)[i]; - shape[j--] = shape::shapeOf(outShapeInfo)[i]; - shape::shapeOf(outShapeInfo)[i] = 1; + + strides[j] = shape::stride(wholeShapeInfo)[i]; + shape[j--] = shape::shapeOf(wholeShapeInfo)[i]; + + if(keepUnitiesInShape) { + shape::shapeOf(subArrShapeInfo)[k] = 1; + shape::stride(subArrShapeInfo)[k--] = shape::stride(wholeShapeInfo)[i]; + } } else { - subArrLen *= shape::shapeOf(outShapeInfo)[i]; - if(!keepUnitiesInShape) - shapeNoUnities[k--] = shape::shapeOf(outShapeInfo)[i]; + shape::shapeOf(subArrShapeInfo)[k] = shape::shapeOf(wholeShapeInfo)[i]; + shape::stride(subArrShapeInfo)[k--] = shape::stride(wholeShapeInfo)[i]; } - } - // evaluate ews - shape::setEws(outShapeInfo, subArrLen); + } // calculation of sub-array offsets (subArrOffsets) shape::calcOffsets(dimsSize, shape, strides, subArrOffsets); - // remove unities from outShapeInfo if required - if(!keepUnitiesInShape) { - shape::reshapeC(rank, outShapeInfo, subArrRank, shapeNoUnities, subArrShapeInfo); - delete []shapeNoUnities; - } - else - memcpy(subArrShapeInfo, outShapeInfo, shape::shapeInfoLength(subArrRank) * sizeof(Nd4jLong)); + // evaluate ews + shape::checkStridesSetEwsAndOrder(subArrShapeInfo); delete []strides; delete []shape; - delete []outShapeInfo; } ////////////////////////////////////////////////////////////////////// @@ -4815,195 +4823,238 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, } ////////////////////////////////////////////////////////////////////// -INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { - // we assume all array have same length - const Nd4jLong len = shape::length(xShapeInfo); +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); - const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); - const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); - const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); +// const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); - const char xOrder = shape::order(xShapeInfo); - const char yOrder = shape::order(yShapeInfo); - const char zOrder = shape::order(zShapeInfo); +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); +// const char zOrder = shape::order(zShapeInfo); - const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); - if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { - xOffsets = yOffsets = zOffsets = nullptr; +// if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets 
= zOffsets = nullptr; +// } +// else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { +// xOffsets = yOffsets = nullptr; +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { +// xOffsets = zOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { +// yOffsets = zOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// } +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, yOrder); +// } +// } +// } +// else if(zEws == 1) { +// zOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, zOrder); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, zOrder); +// } +// } +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = zOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// yOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// zOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = 
shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); + +// if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = nullptr; +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities) { + + const int rank = shape::rank(inShapeInfo); + const int numOfNonUnities = shape::numOfNonUnitDims(rank, shape::shapeOf(inShapeInfo)); + + if(numOfNonUnities == rank) { // no unities in shape, no copy procedure + shapeNoUnities = const_cast(inShapeInfo) + 1; + stridesNoUnities = const_cast(inShapeInfo) + 1 + rank; + return numOfNonUnities; } - else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { - xOffsets = yOffsets = nullptr; - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets, xOrder); - } - else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { - xOffsets = zOffsets = nullptr; - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets, xOrder); - } - else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { - yOffsets = zOffsets = nullptr; - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets, yOrder); - } - else if(xEws == 1) { - xOffsets = nullptr; - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets, xOrder); - } - PRAGMA_OMP_SECTION - { - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets, xOrder); - } - } - } - else if(yEws == 1) { - yOffsets = nullptr; - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets, yOrder); - } - PRAGMA_OMP_SECTION - { - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets, yOrder); - } - } - } - else if(zEws == 1) { - zOffsets = nullptr; - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets, zOrder); - } - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets, zOrder); - } - } - } - else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, 
xOffsets); - yOffsets = zOffsets = xOffsets; - } - else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - } - PRAGMA_OMP_SECTION - { - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets); - } - } - yOffsets = xOffsets; - } - else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - } - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets); - } - } - zOffsets = xOffsets; - } - else { - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - } - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets); - } - PRAGMA_OMP_SECTION - { - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets); - } + + for(uint j = 0, i = 0; i < rank; ++i) { + if(shape::shapeOf(inShapeInfo)[i] != 1) { + shapeNoUnities[j] = shape::shapeOf(inShapeInfo)[i]; + shapeNoUnities[numOfNonUnities + j++] = shape::stride(inShapeInfo)[i]; } } + + stridesNoUnities = shapeNoUnities + numOfNonUnities; + + return numOfNonUnities; } ////////////////////////////////////////////////////////////////////// -INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { +INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo) { - // we assume all array have same length - const Nd4jLong len = shape::length(xShapeInfo); + outShapeInfo[0] = inShapeInfo[0] - dimsSize; - const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); - const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); - - const char xOrder = shape::order(xShapeInfo); - const char yOrder = shape::order(yShapeInfo); - - const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); - - if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { - xOffsets = yOffsets = nullptr; - } - else if(xEws == 1) { - xOffsets = nullptr; - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets, xOrder); - } - else if(yEws == 1) { - yOffsets = nullptr; - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets, yOrder); - } - else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - yOffsets = xOffsets; - } - else { - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - } - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets); - } + for(uint j = 0, k = 0, i = 0; i < inShapeInfo[0]; ++i) { + if(j < dimsSize && i == dimsToExclude[j]) { + ++j; + continue; } - } -} + shape::shapeOf(outShapeInfo)[k] = shape::shapeOf(inShapeInfo)[i]; + shape::stride(outShapeInfo)[k++] = shape::stride(inShapeInfo)[i]; + } + + outShapeInfo[2 * outShapeInfo[0] + 1] = shape::type(inShapeInfo); // type + *shape::ews(outShapeInfo) = shape::elementWiseStride(inShapeInfo); // ews + outShapeInfo[2 * outShapeInfo[0] + 3] = shape::order(inShapeInfo); // order +} } diff --git 
a/libnd4j/include/loops/cuda/transform/transform_any.cu b/libnd4j/include/loops/cuda/transform/transform_any.cu index 5ca6f0067..37a0ac804 100644 --- a/libnd4j/include/loops/cuda/transform/transform_any.cu +++ b/libnd4j/include/loops/cuda/transform/transform_any.cu @@ -84,7 +84,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (int i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/cuda/transform/transform_bool.cu b/libnd4j/include/loops/cuda/transform/transform_bool.cu index 0f56020b0..d64328494 100644 --- a/libnd4j/include/loops/cuda/transform/transform_bool.cu +++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu @@ -89,7 +89,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (int i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/cuda/transform/transform_float.cu b/libnd4j/include/loops/cuda/transform/transform_float.cu index 49d6ab26f..2e82efdb3 100644 --- a/libnd4j/include/loops/cuda/transform/transform_float.cu +++ b/libnd4j/include/loops/cuda/transform/transform_float.cu @@ -97,7 +97,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (Nd4jLong i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/cuda/transform/transform_same.cu b/libnd4j/include/loops/cuda/transform/transform_same.cu index 4c587111b..0a66590a5 100644 --- a/libnd4j/include/loops/cuda/transform/transform_same.cu +++ b/libnd4j/include/loops/cuda/transform/transform_same.cu @@ -87,7 +87,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (int i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/cuda/transform/transform_strict.cu b/libnd4j/include/loops/cuda/transform/transform_strict.cu index 1136ef695..35ab0b1dc 100644 --- a/libnd4j/include/loops/cuda/transform/transform_strict.cu +++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu @@ -89,7 +89,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (int i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp b/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp index 2c362b23d..3db3b6097 100644 --- a/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp @@ -21,70 +21,174 @@ #include #if NOT_EXCLUDED(OP_tensormmul) +#include #include #include #include + namespace nd4j { - namespace 
ops { - CUSTOM_OP_IMPL(tensormmul, 2, 1, false, 0, -1) { - auto a = INPUT_VARIABLE(0); - auto b = INPUT_VARIABLE(1); +namespace ops { - auto c = OUTPUT_VARIABLE(0); // +//////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(tensormmul, 2, 1, false, 0, -1) { - REQUIRE_TRUE(a->dataType() == b->dataType(), 0, "tensormmul: A, B and C data types must be the same"); + auto a = INPUT_VARIABLE(0); + auto b = INPUT_VARIABLE(1); - // building axes - int axe0_size = INT_ARG(0); - int axe1_size = INT_ARG(axe0_size+1); - std::vector axes_0(axe0_size), axes_1(axe1_size); - for (int e = 0; e < axe0_size; e++) - axes_0[e] = (int) INT_ARG(e+1); + auto c = OUTPUT_VARIABLE(0); - for (int e = 0; e < axe1_size; e++) - axes_1[e] = (int) INT_ARG(e + axe0_size + 2); + REQUIRE_TRUE(a->dataType() == b->dataType(), 0, "tensormmul: A, B and C data types must be the same"); - nd4j_verbose("axe0: %i; axe1: %i;\n", axes_0.size(), axes_1.size()); + // building axes + int axe0_size = INT_ARG(0); + int axe1_size = INT_ARG(axe0_size+1); + std::vector axes_0(axe0_size), axes_1(axe1_size); + for (int e = 0; e < axe0_size; e++) + axes_0[e] = (int)INT_ARG(e + 1); - MmulHelper::tensorDot(a, b, c, axes_0, axes_1); - return Status::OK(); - } - DECLARE_SYN(tensordot, tensormmul); + for (int e = 0; e < axe1_size; e++) + axes_1[e] = (int)INT_ARG(e + axe0_size + 2); + nd4j_verbose("axe0: %i; axe1: %i;\n", axes_0.size(), axes_1.size()); - DECLARE_SHAPE_FN(tensormmul) { - - auto aShapeInfo = inputShape->at(0); - auto bShapeInfo = inputShape->at(1); + MmulHelper::tensorDot(a, b, c, axes_0, axes_1); + return Status::OK(); +} +DECLARE_SYN(tensordot, tensormmul); - REQUIRE_TRUE(ArrayOptions::dataType(aShapeInfo) == ArrayOptions::dataType(bShapeInfo), 0, "tensormmul: A and B data types must be the same"); +//////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(tensormmul) { - // building axes - int axe0_size = INT_ARG(0); - int axe1_size = INT_ARG(axe0_size+1); - std::vector axes_0(axe0_size), axes_1(axe1_size); - for (int e = 0; e < axe0_size; e++) - axes_0[e] = (int) INT_ARG(e+1); + auto aShapeInfo = inputShape->at(0); + auto bShapeInfo = inputShape->at(1); - for (int e = 0; e < axe1_size; e++) - axes_1[e] = (int) INT_ARG(e + axe0_size + 2); + REQUIRE_TRUE(ArrayOptions::dataType(aShapeInfo) == ArrayOptions::dataType(bShapeInfo), 0, "tensormmul: A and B data types must be the same"); - // evaluate shapes - std::vector permutAt, permutBt; - std::vector shapeAt, shapeBt; - auto outShape = nd4j::ShapeUtils::evalShapeForTensorDot(aShapeInfo, bShapeInfo, axes_0, axes_1, permutAt, permutBt, shapeAt, shapeBt); + // building axes + int axe0_size = INT_ARG(0); + int axe1_size = INT_ARG(axe0_size+1); + std::vector axes_0(axe0_size), axes_1(axe1_size); + for (int e = 0; e < axe0_size; e++) + axes_0[e] = (int) INT_ARG(e+1); - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(aShapeInfo), 'c', outShape))); - } + for (int e = 0; e < axe1_size; e++) + axes_1[e] = (int) INT_ARG(e + axe0_size + 2); - DECLARE_TYPES(tensormmul) { - getOpDescriptor() - ->setAllowedInputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); - } + // evaluate shapes + std::vector permutAt, permutBt; + std::vector shapeAt, shapeBt; + auto outShape = 
nd4j::ShapeUtils::evalShapeForTensorDot(aShapeInfo, bShapeInfo, axes_0, axes_1, permutAt, permutBt, shapeAt, shapeBt); + + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(aShapeInfo), 'c', outShape))); +} + +//////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(tensormmul) { + getOpDescriptor() + ->setAllowedInputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) + ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) + ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); +} + +//////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(tensormmul_bp, 3, 2, false, 0, -1) { + + auto A = INPUT_VARIABLE(0); + auto B = INPUT_VARIABLE(1); + + auto dLdC = INPUT_VARIABLE(2); + + auto dLdA = OUTPUT_VARIABLE(0); + auto dLdB = OUTPUT_VARIABLE(1); + + REQUIRE_TRUE( (A->dataType() == B->dataType() && (dLdC->dataType() == A->dataType())), 0, "tensormmul_bp: A, B and dLdC data types must be the same"); + + int axe0Size = INT_ARG(0); + int axe1Size = INT_ARG(axe0Size + 1); + + auto Arank = A->rankOf(); + auto Brank = B->rankOf(); + auto dLdCrank = dLdC->rankOf(); + + REQUIRE_TRUE((Arank >= axe0Size), 0, "tensormmul_bp: A rank must be the higher or same as input axes 0"); + + REQUIRE_TRUE((Brank >= axe1Size), 0, "tensormmul_bp: B rank must be the higher or same as input axes 1"); + + // building axes + std::vector axes0(axe0Size), axes1(axe1Size); + for (uint e = 0; e < axe0Size; e++) + axes0[e] = (int)INT_ARG(e + 1); + for (uint e = 0; e < axe1Size; e++) + axes1[e] = (int)INT_ARG(e + axe0Size + 2); + + std::vector permutAt, permutBt; + std::vector shapeAt, shapeBt; + + ShapeUtils::evalShapeForTensorDot(A, B, axes0, axes1, permutAt, permutBt, shapeAt, shapeBt); + + // special case for scalar value + if (dLdC->isScalar()) { + + dLdA->assign((*dLdC) * *B); + dLdB->assign((*dLdC) * *A); + + return Status::OK(); } + + std::vector axesA = ShapeUtils::evalDimsToExclude(Arank, axes0); + std::vector axesB = ShapeUtils::evalDimsToExclude(Brank, axes1); + + // rank always have to be divided by 2 + std::vector axesAdLdC, axesBdLdC; + if (dLdCrank > 1) { + axesAdLdC.resize(dLdCrank / 2); + std::iota(axesAdLdC.begin(), axesAdLdC.end(), 0); + axesBdLdC = ShapeUtils::evalDimsToExclude(dLdCrank, axesAdLdC); + } + else { + axesAdLdC.push_back(0); + axesBdLdC.push_back(0); + } + + // calculate dLdA + MmulHelper::tensorDot(dLdC, B, dLdA, axesBdLdC, axesB, permutAt); + + // calculate dLdB + MmulHelper::tensorDot(A, dLdC, dLdB, axesA, axesAdLdC, permutBt); + + return Status::OK(); +} + +//////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(tensormmul_bp) { + + auto aShapeInfo = inputShape->at(0); + auto bShapeInfo = inputShape->at(1); + auto dLShapeInfo = inputShape->at(2); + + REQUIRE_TRUE((ArrayOptions::dataType(aShapeInfo) == ArrayOptions::dataType(bShapeInfo) && + (ArrayOptions::dataType(dLShapeInfo) == ArrayOptions::dataType(aShapeInfo))), 0, "tensormmul_bp: A, B and dLdC data types must be the same"); + + Nd4jLong* dLdAShapeInfo = nullptr; + Nd4jLong* dLdBShapeInfo = nullptr; + + COPY_SHAPE(aShapeInfo, dLdAShapeInfo); + COPY_SHAPE(bShapeInfo, dLdBShapeInfo); + + return SHAPELIST(CONSTANT(dLdAShapeInfo), CONSTANT(dLdBShapeInfo)); +} + +//////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(tensormmul_bp) { + getOpDescriptor() + ->setAllowedInputTypes(0, { 
DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }) // maybe better ALL_FLOATS + ->setAllowedInputTypes(1, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }) + ->setAllowedInputTypes(2, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }) + ->setAllowedOutputTypes(0, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }) + ->setAllowedOutputTypes(1, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }); +} +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp index 9cd3285f3..c5e26c73e 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp @@ -79,7 +79,7 @@ CUSTOM_OP_IMPL(conv1d, 2, 1, false, 0, 5) { } auto inputReshaped = input ->reshape(input->ordering(), reshapeForInput); - auto outputReshaped = output ->reshape(output->ordering(), reshapeForOutput); + auto outputReshaped = output ->reshape(output->ordering(), reshapeForOutput, false); auto weightsReshaped = weights->reshape(weights->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] nd4j::ops::conv2d conv2d; @@ -216,10 +216,10 @@ CUSTOM_OP_IMPL(conv1d_bp, 3, 2, false, 0, 5) { } auto inputReshaped = input ->reshape(input->ordering(), reshapeForInput); - auto gradIReshaped = gradI ->reshape(gradI->ordering(), reshapeForInput); + auto gradIReshaped = gradI ->reshape(gradI->ordering(), reshapeForInput, false); auto gradOReshaped = gradO ->reshape(gradO->ordering(), reshapeForGradO); - auto weightsReshaped = weights->reshape(weights->ordering(),{1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] - auto gradWReshaped = gradW ->reshape(gradW->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] + auto weightsReshaped = weights->reshape(weights->ordering(),{1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] + auto gradWReshaped = gradW ->reshape(gradW->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}, false);// [kW, iC, oC] -> [1, kW, iC, oC] nd4j::ops::conv2d_bp conv2dBP; auto status = conv2dBP.execute({&inputReshaped, &weightsReshaped, bias, &gradOReshaped}, {&gradIReshaped, &gradWReshaped, gradB}, {}, {1,kW, 1,sW, 0,pW, 1,dW, paddingMode, !isNCW}, {}); diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp index 0652f1840..7ce42756d 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp @@ -239,7 +239,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) { //----- calculation of gradO -----// if(gradB) { if(gradB->rankOf() == 2) - gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); + gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}, false)); gradO->reduceAlongDimension(reduce::Sum, *gradB, gradOaxesForDot); // sum over bS oD oH oW if(gradB != OUTPUT_VARIABLE(2)) delete gradB; diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp index 4a5bbd845..e3632f36a 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp +++ 
b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp @@ -233,7 +233,7 @@ CUSTOM_OP_IMPL(deconv2d_bp, 3, 2, false, 0, 9) { // ----- calculation of gradB ----- // if(gradB) { if(gradB->rankOf() == 2) - gradB = new NDArray(gradB->reshape(gradB->ordering(), {gradB->lengthOf()})); + gradB = new NDArray(gradB->reshape(gradB->ordering(), {gradB->lengthOf()}, false)); gradO->reduceAlongDimension(reduce::Sum, *gradB, {0, 2, 3}); // sum over bS, oH, oW if(gradB != OUTPUT_VARIABLE(2)) delete gradB; diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp index 1b832ea68..78d275c69 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp @@ -243,7 +243,7 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) { // ----- calculation of gradB ----- // if(gradB) { if(gradB->rankOf() == 2) - gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); + gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}, false)); gradO->reduceAlongDimension(reduce::Sum, *gradB, {0, 2, 3, 4}); // sum over bS, oD, oH, oW if(gradB != OUTPUT_VARIABLE(2)) delete gradB; diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp index 984672ad2..dc304e4a9 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp @@ -61,13 +61,13 @@ namespace nd4j { } auto source = inRank == 4?image->reshape(image->ordering(), {image->sizeAt(0), image->sizeAt(1), image->sizeAt(2), image->sizeAt(3)}):image->reshape(image->ordering(), {1, image->sizeAt(0), image->sizeAt(1), image->sizeAt(2)}); - auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}):output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}); + auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}, false) : output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}, false); return helpers::resizeAreaFunctor(block.launchContext(), &source, width, height, alignCorners, &target); } DECLARE_SHAPE_FN(resize_area) { - auto shapeList = SHAPELIST(); + auto shapeList = SHAPELIST(); auto in = inputShape->at(0); Nd4jLong* outputShape; @@ -90,7 +90,7 @@ namespace nd4j { } REQUIRE_TRUE(inRank == 4 || inRank == 3, 0, "resize_area: Source tensor should have rank 4, but %i given.", inRank); - + ALLOCATE(outputShape, block.getWorkspace(), shape::shapeInfoLength(inRank), Nd4jLong); outputShape[0] = inRank; if (inRank == 4) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp index 26ca7eec9..63da432c7 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp @@ -62,13 +62,13 @@ namespace nd4j { REQUIRE_TRUE(!halfPixelAlign || (halfPixelAlign && !alignCorners), 0, "resize_bicubic: `half_pixel_centers' should be false or true only when `align_corners' is false"); auto source = inRank == 4?image->reshape(image->ordering(), {image->sizeAt(0), image->sizeAt(1), 
image->sizeAt(2), image->sizeAt(3)}):image->reshape(image->ordering(), {1, image->sizeAt(0), image->sizeAt(1), image->sizeAt(2)}); - auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}):output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}); + auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}, false) : output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}, false); return helpers::resizeBicubicFunctorA(block.launchContext(), &source, width, height, alignCorners, halfPixelAlign, &target); } DECLARE_SHAPE_FN(resize_bicubic) { - auto shapeList = SHAPELIST(); + auto shapeList = SHAPELIST(); auto in = inputShape->at(0); Nd4jLong* outputShape; @@ -82,7 +82,7 @@ namespace nd4j { height = newImageSize->e(1); REQUIRE_TRUE(inRank == 4 || inRank == 3, 0, "resize_bicubic: Source tensor should have rank 4, but %i given.", inRank); - + ALLOCATE(outputShape, block.getWorkspace(), shape::shapeInfoLength(inRank), Nd4jLong); outputShape[0] = inRank; if (inRank == 4) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp index 652b78cf1..fa7054c29 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp @@ -43,7 +43,7 @@ namespace nd4j { REQUIRE_TRUE(inRank == output->rankOf(), 0, "resize_bilinear: Input and output ranks should be equals, but %i and %i occured.", inRank, output->rankOf()); auto source = inRank == 4?image->reshape(image->ordering(), {image->sizeAt(0), image->sizeAt(1), image->sizeAt(2), image->sizeAt(3)}):image->reshape(image->ordering(), {1, image->sizeAt(0), image->sizeAt(1), image->sizeAt(2)}); - auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}):output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}); + auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}, false) : output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}, false); if (block.width() > 1) { auto newImageSize = INPUT_VARIABLE(1); @@ -71,7 +71,7 @@ namespace nd4j { } DECLARE_SHAPE_FN(resize_bilinear) { - auto shapeList = SHAPELIST(); + auto shapeList = SHAPELIST(); auto in = inputShape->at(0); Nd4jLong* outputShape; @@ -94,7 +94,7 @@ namespace nd4j { width = INT_ARG(0); height = INT_ARG(1); } - + ALLOCATE(outputShape, block.getWorkspace(), shape::shapeInfoLength(inRank), Nd4jLong); outputShape[0] = inRank; if (inRank == 4) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp index db477f569..9d6ac8a81 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp @@ -63,13 +63,13 @@ namespace nd4j { REQUIRE_TRUE(((alignCorners && height > 2) || (height > 0)) && ((alignCorners && width > 1) || (width > 0)), 0, "resize_nearest_neighbor: Wrong input or output size to resize (width = %d, height = %d)", width, height); auto source = 
inRank == 4?*image:image->reshape(image->ordering(), {1, image->sizeAt(0), image->sizeAt(1), image->sizeAt(2)}); - auto target = inRank == 4?*output:output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}); + auto target = inRank == 4 ? *output : output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}, false); return helpers::resizeNeighborFunctor(block.launchContext(), inRank==4?image:&source, width, height, alignCorners, halfPixelCenter, inRank == 4 ? output : &target); } DECLARE_SHAPE_FN(resize_nearest_neighbor) { - auto shapeList = SHAPELIST(); + auto shapeList = SHAPELIST(); auto in = inputShape->at(0); auto inRank = shape::rank(in); Nd4jLong* outputShape; diff --git a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp index 3b158ff3a..085d7f09c 100644 --- a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp @@ -36,14 +36,14 @@ namespace nd4j { int _a = INT_ARG(e); if (_a < 0) _a += input->rankOf(); - + axis.emplace_back(_a); } else if (block.width() > 1) { auto a = INPUT_VARIABLE(1); for (Nd4jLong e = 0; e < a->lengthOf(); e++) { int _a = a->e(e); - + if (_a < 0) _a += input->rankOf(); @@ -71,7 +71,7 @@ namespace nd4j { } if (block.isInplace()) { - output->reshapei(input->ordering(), shape); + output->reshapei(input->ordering(), shape, false); } else { auto tmp = input->reshape(input->ordering(), shape); output->assign(tmp); @@ -106,20 +106,20 @@ namespace nd4j { int _a = INT_ARG(e); if (_a < 0) _a += rank; - + axis.emplace_back(_a); } else if (block.width() > 1) { auto a = INPUT_VARIABLE(1); for (int e = 0; e < a->lengthOf(); e++) { int _a = a->e(e); - + if (_a < 0) _a += rank; axis.emplace_back(_a); } - + } auto order = shape::order(in); diff --git a/libnd4j/include/ops/declarable/headers/blas.h b/libnd4j/include/ops/declarable/headers/blas.h index 08f8f79a7..d94d365dd 100644 --- a/libnd4j/include/ops/declarable/headers/blas.h +++ b/libnd4j/include/ops/declarable/headers/blas.h @@ -57,7 +57,8 @@ namespace nd4j { * IArgs[1]... 
axes values for second array */ #if NOT_EXCLUDED(OP_tensormmul) - DECLARE_CUSTOM_OP(tensormmul, 2, 1, false, 0, -1); + DECLARE_CUSTOM_OP(tensormmul, 2, 1, false, 0, -1); + DECLARE_CUSTOM_OP(tensormmul_bp, 3, 2, false, 0, -1); #endif /** diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index db09f0d3c..51ddc0369 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -432,7 +432,7 @@ namespace nd4j { ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext()); - NDArray outputReshaped = output->reshape(output->ordering(), outReShape); + NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false); helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext())); // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC] @@ -505,7 +505,7 @@ namespace nd4j { if(gradB) { NDArray* gradBR = gradB; if(gradB->rankOf() == 2) - gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); + gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}, false)); gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1}); // sum over bS, oH, oW if(gradBR != gradB) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp index 0adb0e249..c12b1ce4f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp @@ -30,7 +30,7 @@ namespace helpers { void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { auto _a = a->reshape(a->ordering(), {-1, 3}); auto _b = b->reshape(b->ordering(), {-1, 3}); - auto _o = o->reshape(o->ordering(), {-1, 3}); + auto _o = o->reshape(o->ordering(), {-1, 3}, false); auto tadsA = _a.allTensorsAlongDimension({1}); auto tadsB = _b.allTensorsAlongDimension({1}); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 5422d04c1..01e346136 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -244,14 +244,14 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC] - NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), output.sizeAt(3)}); + NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), output.sizeAt(3)}, false); outputRearranged0.permutei({2, 3,0, 4,1, 5}); if(input.lengthOf() == output.lengthOf()) { outputRearranged0.assign(input); } else { - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, output.sizeAt(3)}); + NDArray 
outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, output.sizeAt(3)}, false); BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatch_, (input, outputRearranged1, padBottom, padTop, padLeft, padRight), LIBND4J_TYPES); if(output.getBuffer() != outputRearranged1.getBuffer()) @@ -352,7 +352,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND for(int j = 1; j < rank; ++i, ++j) temp[i] = output.sizeAt(j); - NDArray outputRearranged0 = output.reshape(output.ordering(), temp); + NDArray outputRearranged0 = output.reshape(output.ordering(), temp, false); //*** construct permuting std::vector for permutation of output array ***// @@ -382,7 +382,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND for(i = 1; i < rank; ++i) temp[i] = (i <= numOfSpatialDims) ? output.sizeAt(i) * blockShape.e(i - 1) : output.sizeAt(i); - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp); + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp, false); BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatchND_, (input, padding, outputRearranged1, numOfSpatialDims), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 31b386e7e..02b7e8467 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -59,7 +59,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND void FORCEINLINE _crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { auto a_ = a->reshape(a->ordering(), {-1, 3}); auto b_ = b->reshape(b->ordering(), {-1, 3}); - auto o_ = o->reshape(o->ordering(), {-1, 3}); + auto o_ = o->reshape(o->ordering(), {-1, 3}, false); auto tadsA = a_.allTensorsAlongDimension({1}); auto tadsB = b_.allTensorsAlongDimension({1}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu index 4f77b2e7c..39732b024 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu @@ -322,7 +322,7 @@ static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext()); - NDArray outputReshaped = output->reshape(output->ordering(), outReShape); + NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false); helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext())); // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC] @@ -1228,7 +1228,7 @@ static void conv2dBP_(nd4j::graph::Context& block, const NDArray* input, const N NDArray* gradBR = gradB; if(gradB->rankOf() == 2) gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); - gradO->reduceAlongDimension(reduce::Sum, *gradBR, gradOaxesForDot); // sum over bS, oH, oW + gradO->reduceAlongDimension(reduce::Sum, *gradBR, gradOaxesForDot, false); // sum over bS, 
oH, oW if(gradBR != gradB) delete gradBR; } @@ -1310,7 +1310,7 @@ static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, con NDArray* gradBR = gradB; if(gradB->rankOf() == 2) gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); - gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1}); // sum over bS, oH, oW + gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1}, false); // sum over bS, oH, oW if(gradBR != gradB) delete gradBR; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu index 82f421fdd..f3bee349b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu @@ -313,7 +313,7 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC] - NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), input.sizeAt(3)}); + NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), input.sizeAt(3)}, false); outputRearranged0.permutei({2, 3,0, 4,1, 5}); if(input.lengthOf() == output.lengthOf()) { @@ -322,7 +322,7 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o } else { - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, input.sizeAt(3)}); + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, input.sizeAt(3)}, false); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; @@ -439,7 +439,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND for(int j = 1; j < rank; ++i, ++j) temp[i] = output.sizeAt(j); - NDArray outputRearranged0 = output.reshape(output.ordering(), temp); + NDArray outputRearranged0 = output.reshape(output.ordering(), temp, false); //*** construct permuting std::vector for permutation of output array ***// @@ -469,7 +469,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND for(i = 1; i < rank; ++i) temp[i] = (i <= numOfSpatialDims) ? 
output.sizeAt(i) * blockShape.e(i - 1) : output.sizeAt(i); - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp); + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp, false); const int threadsPerBlock = MAX_NUM_THREADS / 4; const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp index 3371b16ad..26aeacaa3 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp @@ -471,9 +471,9 @@ PLATFORM_IMPL(lstmLayer, ENGINE_CPU) { if(cI) cIR = new NDArray(cI->reshape(cI->ordering(), {1,dirDim,bS,nOut})); if(hL) - hLR = new NDArray(hL->reshape(hL->ordering(), {1,dirDim,bS,nOut})); + hLR = new NDArray(hL->reshape(hL->ordering(), {1,dirDim,bS,nOut}, false)); if(cL) - cLR = new NDArray(cL->reshape(cL->ordering(), {1,dirDim,bS,nOut})); + cLR = new NDArray(cL->reshape(cL->ordering(), {1,dirDim,bS,nOut}, false)); lstmLayerMKLDNN(xP, WxR, WrR, bR, hIR, cIR, params, hP, hLR, cLR); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index 9df949267..a7258b01c 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -321,6 +321,280 @@ TEST_F(DeclarableOpsTests1, TestTensorDot4) { delete results; } +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot5) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {2,4,2,4}, {44,110,160, 66,132, 38, 88,154, 68,170,224,102,204, 82,136,238, 92,230,288,138,276,126,184,322, 116,290,352,174,348,170,232,406, 76,190,160,114,228,182,152,266, 100,250,224,150,300,226,200,350, 124,310,288,186,372,270,248,434, 148,370,352,222,444,314,296,518}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot6) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {2,4,2,4}, {22, 66,110,154, 44, 88,132,176, 34,102,170,238, 68,136,204,272, 46,138,230,322, 92,184,276,368, 58,174,290,406,116,232,348,464, 38,114,190,266, 76,152,228,304, 50,150,250,350,100,200,300,400, 62,186,310,434,124,248,372,496, 74,222,370,518,148,296,444,592}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, 
TestTensorDot7) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {2,4,2,4}, {76,166,112,106,196, 62,136,226, 60,174,208, 98,212,230,136,250, 76,214,336,122,260,174,168,306, 124,286,240,178,340,150,232,394, 100,226,176,142,268,106,184,310, 84,234,272,134,284,274,184,334, 100,274,400,158,332,218,216,390, 148,346,304,214,412,194,280,478}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot8) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {2,4,2,4}, {30, 90,150,210, 60,120,180,240, 38,114,190,266, 76,152,228,304, 46,138,230,322, 92,184,276,368, 54,162,270,378,108,216,324,432, 42,126,210,294, 84,168,252,336, 50,150,250,350,100,200,300,400, 58,174,290,406,116,232,348,464, 66,198,330,462,132,264,396,528}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot9) { + + // NDArray z('f',{2,2,3}, nd4j::DataType::DOUBLE); + // z.linspace(1); + // z.printShapeInfo(); + // z.printIndexedBuffer(); + // z.reshapei('c', {4,3}); + // z.printShapeInfo(); + // z.printIndexedBuffer(); + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {3,4,4,3}, {14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,0,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; +} + + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot10) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, 
{2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {4,4}, {114,258,402,546, 138,314,490,666, 162,370,578,786, 186,426,666,906}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot11) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {4,4}, {98,218,338,458, 134,302,470,638, 170,386,602,818, 206,470,734,998}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot12) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {4,4}, {272,292,312,332, 368,396,424,452, 464,500,536,572, 560,604,648,692}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot13) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {3,3}, {640,560,640, 576,624,576, 640,560,640}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot14) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {3,3}, {648,600,520, 648,536,648, 520,600,648}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// 
+TEST_F(DeclarableOpsTests1, TestTensorDot15) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {3,3}, {624,624,624, 656,656,656, 624,624,624}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot16) { + + NDArray x('c', {1}, std::vector{2}, nd4j::DataType::FLOAT32); + NDArray y('c', {2,1,2}, {1,2,3,4}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2,2}, {2,4,6,8}, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,0, 1,1}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot17) { + + NDArray x('f', {16,16}, nd4j::DataType::FLOAT32); + NDArray y('f', {1000,16}, nd4j::DataType::FLOAT32); + NDArray z('c', {16,1000}, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul op; + auto status = op.execute({&x, &y}, {&z}, {}, {1,1, 1,1}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, status); +} + ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, DivergentCheck1) { auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation("switch"); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp index 6025216f9..dc672d8e6 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp @@ -708,30 +708,6 @@ TEST_F(DeclarableOpsTests12, multiUnique_2) { ASSERT_TRUE(nd4j::ops::helpers::multiUnique(arrayList)); } -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests12, tensormmul_6) { - - NDArray x('c', {1}, std::vector{2}, nd4j::DataType::FLOAT32); - NDArray y('c', {2,1,2}, {1,2,3,4}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,2}, {2,4,6,8}, nd4j::DataType::FLOAT32); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,0, 1,1}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - // exp.printShapeInfo(); - // result->printShapeInfo(); - // result->printIndexedBuffer(); - - ASSERT_TRUE(exp.isSameShape(result)); - ASSERT_TRUE(exp.equalsTo(result)); - - delete results; - -} - //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, reduceMeanBp_4) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index d154039f3..ee8691bbb 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -1560,3 +1560,447 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test11) { delete resultsB; } +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, 
TestTensorMmul_BP1) { + + NDArray A('c', { 1, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 1, 2, 4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.1 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 1, 2, 3 }, { 3.3, 8.5, 13.36, 3.7, 9.54, 15. }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 1, 2, 4 }, { 3.38, 4.04, 4.7, 5.13, 3.83, 4.58, 5.33, 5.82 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,0,1, 2,0,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP2) { + + NDArray A('c', { 1, 2, 3 }, { 2,2,2, 2,2,2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 1, 2, 3 }, { 3,3,3,3, 3,3 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 1 }, { 1 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(B.isSameShape(*dLdAbp)); + ASSERT_TRUE(B.equalsTo(*dLdAbp)); + + ASSERT_TRUE(A.isSameShape(*dLdBbp)); + ASSERT_TRUE(A.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP3) { + + NDArray A('c', { 3, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 4, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + + NDArray dA('c', { 3, 2, 2 }, { 3.9, 4., 4.1, 4.2, 9.82, 10.08, 10.34, 10.6, 15.74, 16.16, 16.58, 17. 
}, nd4j::DataType::FLOAT32); + NDArray dB('c', { 4, 2, 2 }, { 4.07, 4.22, 4.37, 4.52, 4.82, 5., 5.18, 5.36, 5.57, 5.78, 5.99, 6.2, 6.32, 6.56, 6.8, 7.04 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP4) { + + NDArray A('c', { 3, 4, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 2 }, { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 3, 4, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84 , 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP5) { + + NDArray A('c', { 3, 4, 1, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 2, 1 }, { 1.1,1.2,1.3,1.4,1.5,1.6 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 3, 4, 1, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84, 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP6) { + + NDArray A('c', { 2, 2, 2 }, { 2,2, 2,2, 2,2, 2,2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 2, 2 }, { 3,3, 3,3, 3,3, 3,3 }, nd4j::DataType::FLOAT32); + + auto dLdC = NDArrayFactory::create(1.f); + + nd4j::ops::tensormmul_bp op_bp; + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 3,0,1,2, 3,0,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(B.isSameShape(*dLdAbp)); + ASSERT_TRUE(B.equalsTo(*dLdAbp)); + + ASSERT_TRUE(A.isSameShape(*dLdBbp)); + 
ASSERT_TRUE(A.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP7) { + + NDArray A('c', { 3, 4, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 2, 1 }, { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 3, 4, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84, 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP8) { + + NDArray A('c', { 1, 1, 4, 3 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 1, 1, 4, 2 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 2 }, { 1.1,1.2,1.3,1.4,1.5,1.6 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 1, 1, 4, 3 }, { 20., 23.4, 26.8, 23.35, 27.25, 31.15, 3.97, 4.67, 5.37, 20.88, 24.66, 28.44 }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 1, 1, 4, 2 }, { 11.84, 12.68, 39.98, 43.192, 20.65, 22.36, 165.7, 178.4 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 3,0,1,2, 3,0,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP9) { + + NDArray A('c', { 3, 2, 2, 1 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 4, 2, 2 ,1 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 4, 1 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + + NDArray dA('c', { 3, 2, 2, 1 }, { 3.9, 4., 4.1, 4.2, 9.82, 10.08, 10.34, 10.6, 15.74, 16.16, 16.58, 17. 
}, nd4j::DataType::FLOAT32); + NDArray dB('c', { 4, 2, 2, 1 }, { 4.07, 4.22, 4.37, 4.52, 4.82, 5., 5.18, 5.36, 5.57, 5.78, 5.99, 6.2, 6.32, 6.56, 6.8, 7.04 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP10) { + + NDArray A('c', { 1, 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 1, 2, 2 ,4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 1, 3, 1, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + + + NDArray dA('c', { 1, 2, 2, 3 }, { 3.3, 8.5, 13.7, 3.7, 9.54, 15.38, 4.1, 10.58, 17.06, 4.5, 11.62, 18.74 }, nd4j::DataType::FLOAT32); + NDArray dB('c', { 1, 2, 2, 4 }, { 3.38, 4.04, 4.7, 5.36, 3.83, 4.58, 5.33, 6.08, 4.28, 5.12, 5.96, 6.8, 4.73, 5.66, 6.59, 7.52 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP11) { + + NDArray A('c', { 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 2 ,4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + + + NDArray dA('c', { 2, 2, 3 }, { 3.3, 8.5, 13.7, 3.7, 9.54, 15.38, 4.1, 10.58, 17.06, 4.5, 11.62, 18.74 }, nd4j::DataType::FLOAT32); + NDArray dB('c', { 2, 2, 4 }, { 3.38, 4.04, 4.7, 5.36, 3.83, 4.58, 5.33, 6.08, 4.28, 5.12, 5.96, 6.8, 4.73, 5.66, 6.59, 7.52 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,0,1, 2,0,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP12) { + + NDArray A('c', { 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 2 ,3 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 2, 3, 2, 3 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2., 2.1, 2.2, 
2.3, 2.4, + 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, nd4j::DataType::FLOAT32); + + NDArray dA('c', { 2, 2, 3 }, { 7.66, 20.26, 32.86, 8.29, 21.97, 35.65, 45.46, 58.06, 70.66, 49.33, 63.01, 76.69 }, nd4j::DataType::FLOAT32); + NDArray dB('c', { 2, 2, 3 }, { 25.86, 27.36, 28.86, 28.74, 30.42, 32.1, 30.36, 31.86, 33.36, 33.78, 35.46, 37.14 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP13) { + + NDArray A('c', { 3, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::DOUBLE); + NDArray B('c', { 3, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2 }, nd4j::DataType::DOUBLE); + NDArray dLdC('c', { 3, 2, 3, 2 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2., 2.1, 2.2, 2.3, 2.4, + 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, nd4j::DataType::DOUBLE); + + NDArray dA('c', { 3, 2, 2 }, { 7.79, 20.57, 8.21, 21.71, 33.35, 46.13, 35.21, 48.71, 58.91, 71.69, 62.21, 75.71 }, nd4j::DataType::DOUBLE); + NDArray dB('c', { 3, 2, 2 }, { 26.49, 28.02, 28.41, 30.06, 29.55, 31.08, 31.71, 33.36, 32.61, 34.14, 35.01, 36.66 }, nd4j::DataType::DOUBLE); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP14) { + + NDArray A('c', { 2, 2, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, nd4j::DataType::DOUBLE); + + NDArray B('c', { 2, 2, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::DOUBLE); + + NDArray dLdC('c', { 2, 2, 2, 2, 2, 2 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::DOUBLE); + + NDArray dA('c', { 2, 2, 2, 2 }, { 13.88, 37.24, 13.88, 37.24, 15.32, 41.24, 15.32, 41.24, 13.88, 37.24, 13.88, 37.24, 15.32, 41.24, 15.32, 41.24 }, nd4j::DataType::DOUBLE); + NDArray dB('c', { 2, 2, 2, 2 }, { 10.76, 12.88, 15., 17.12, 12.36, 14.8, 17.24, 19.68, 19.24, 21.36, 23.48, 25.6, 22.12, 24.56, 27., 29.44 }, nd4j::DataType::DOUBLE); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + 
ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP15) { + + NDArray A('c', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::FLOAT32); + NDArray B('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::FLOAT32); + + NDArray dLdC('f', { 2, 2 }, { 23.0, 24.44, 2.0, 26. }, nd4j::DataType::FLOAT32); + + NDArray dA('c', { 2, 2, 3 }, { 27., 127., 227., 77., 177., 277., 76.44, 278.20001, 479.96002, 177.32, 379.08001, 580.839966 }, nd4j::DataType::FLOAT32); + NDArray dB('f', { 2, 2, 3 }, { 194.08, 184., 336.4, 268., 241.52, 212., 383.839996, 296., 288.96002, 240., 431.27999, 324. }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op; + auto results = op.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2,2,1,2 }); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto* dLdA = results->at(0); + auto* dLdB = results->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdA)); + ASSERT_TRUE(dA.equalsTo(*dLdA)); + + ASSERT_TRUE(dB.isSameShape(*dLdB)); + ASSERT_TRUE(dB.equalsTo(*dLdB)); + + delete results; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP16) { + + NDArray A('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); + NDArray B('c', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); + + NDArray dLdC('c', { 2, 2 }, nd4j::DataType::DOUBLE); + + const OpArgsHolder argsHolderFF({ &A, &B }, {}, { 2,1,2, 2,1,2 }); + const OpArgsHolder argsHolderBP({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }); + + nd4j::ops::tensormmul op; + nd4j::ops::tensormmul_bp op_bp; + + const bool isGradCorrect = GradCheck::checkGrad(op, op_bp, argsHolderFF, argsHolderBP, {1,0}); + ASSERT_TRUE(isGradCorrect); +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP17) { + + NDArray A('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); + NDArray B('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. 
}, nd4j::DataType::DOUBLE); + + NDArray dLdC('c', { 2, 2 }, nd4j::DataType::DOUBLE); + + const OpArgsHolder argsHolderFF({ &A, &B }, {}, { 2,1,2, 2,1,2 }); + const OpArgsHolder argsHolderBP({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }); + + nd4j::ops::tensormmul op; + nd4j::ops::tensormmul_bp op_bp; + + const bool isGradCorrect = GradCheck::checkGrad(op, op_bp, argsHolderFF, argsHolderBP, { 1,0 }); + ASSERT_TRUE(isGradCorrect); +} + diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp index fa129b1af..0cf1cea2b 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp @@ -578,246 +578,6 @@ TEST_F(DeclarableOpsTests2, Test_Concat_BP_1) { } -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot5) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {2,4,2,4}, {44,110,160, 66,132, 38, 88,154, 68,170,224,102,204, 82,136,238, 92,230,288,138,276,126,184,322, 116,290,352,174,348,170,232,406, 76,190,160,114,228,182,152,266, 100,250,224,150,300,226,200,350, 124,310,288,186,372,270,248,434, 148,370,352,222,444,314,296,518}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot6) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {2,4,2,4}, {22, 66,110,154, 44, 88,132,176, 34,102,170,238, 68,136,204,272, 46,138,230,322, 92,184,276,368, 58,174,290,406,116,232,348,464, 38,114,190,266, 76,152,228,304, 50,150,250,350,100,200,300,400, 62,186,310,434,124,248,372,496, 74,222,370,518,148,296,444,592}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot7) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {2,4,2,4}, {76,166,112,106,196, 62,136,226, 60,174,208, 98,212,230,136,250, 76,214,336,122,260,174,168,306, 124,286,240,178,340,150,232,394, 100,226,176,142,268,106,184,310, 84,234,272,134,284,274,184,334, 100,274,400,158,332,218,216,390, 148,346,304,214,412,194,280,478}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - 
ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot8) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {2,4,2,4}, {30, 90,150,210, 60,120,180,240, 38,114,190,266, 76,152,228,304, 46,138,230,322, 92,184,276,368, 54,162,270,378,108,216,324,432, 42,126,210,294, 84,168,252,336, 50,150,250,350,100,200,300,400, 58,174,290,406,116,232,348,464, 66,198,330,462,132,264,396,528}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot9) { - - // NDArray z('f',{2,2,3}, nd4j::DataType::DOUBLE); - // z.linspace(1); - // z.printShapeInfo(); - // z.printIndexedBuffer(); - // z.reshapei('c', {4,3}); - // z.printShapeInfo(); - // z.printIndexedBuffer(); - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {3,4,4,3}, {14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,0,1,0}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; -} - - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot10) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {4,4}, {114,258,402,546, 138,314,490,666, 162,370,578,786, 186,426,666,906}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot11) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = 
NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {4,4}, {98,218,338,458, 134,302,470,638, 170,386,602,818, 206,470,734,998}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot12) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {4,4}, {272,292,312,332, 368,396,424,452, 464,500,536,572, 560,604,648,692}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot13) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {3,3}, {640,560,640, 576,624,576, 640,560,640}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot14) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {3,3}, {648,600,520, 648,536,648, 520,600,648}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot15) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {3,3}, {624,624,624, 656,656,656, 624,624,624}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} //////////////////////////////////////////////////////////////////// 
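// Note on the integer arguments fed to tensormmul / tensormmul_bp in the tests above:
// judging from these cases, the IArgs appear to be laid out as
// { numAxesA, axesA..., numAxesB, axesB... }. For example { 2,0,1, 2,0,1 } contracts
// axes 0 and 1 of A with axes 0 and 1 of B, so A of shape {1,2,3} and B of shape {1,2,4}
// leave a {3,4} result, which is exactly the dLdC shape used in the backprop tests.
// This reading is inferred from the test data itself, not from separate op documentation.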
TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_1) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 2c4655b31..11ebc1229 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -2043,34 +2043,6 @@ TEST_F(DeclarableOpsTests9, cumprod_test1) { const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1}, {1, 1},GradCheck::MEAN); ASSERT_TRUE(isGradCorrect); - - //************************************// -/* exclusive = 1; reverse = 0; - - result = op.execute({&inputC, &axis}, {}, {exclusive, reverse}); - ASSERT_EQ(Status::OK(), result->status()); - z = result->at(0); - ASSERT_TRUE(expTF.equalsTo(z)); - delete result; -*/ - //************************************// -/* exclusive = 0; reverse = 1; - - result = op.execute({&inputC, &axis}, {}, {exclusive, reverse}); - ASSERT_EQ(Status::OK(), result->status()); - z = result->at(0); - ASSERT_TRUE(expFT.equalsTo(z)); - delete result; -*/ - //************************************// -/* exclusive = 1; reverse = 1; - - result = op.execute({&inputC, &axis}, {}, {exclusive, reverse}); - ASSERT_EQ(Status::OK(), result->status()); - z = result->at(0); - ASSERT_TRUE(expTT.equalsTo(z)); - delete result; -*/ } //////////////////////////////////////////////////////////////////////////////// @@ -2079,11 +2051,6 @@ TEST_F(DeclarableOpsTests9, cumprod_test2) { auto inputC = NDArrayFactory::create('c', {2, 2}); auto axis = NDArrayFactory::create(1.); -// auto expFF = NDArrayFactory::create('c', {3, 5}, {1., 2., 6., 24., 120., 6., 42., 336., 3024., 30240.,11., 132.,1716., 24024.,360360.}); -// auto expTF = NDArrayFactory::create('c', {3, 5}, {1, 1, 2, 6, 24,1, 6, 42, 336, 3024,1, 11, 132, 1716, 24024}); - -// auto expFT = NDArrayFactory::create('c', {3, 5}, {120, 120, 60, 20, 5,30240, 5040, 720, 90, 10,360360, 32760, 2730, 210, 15}); //+++ -// auto expTT = NDArrayFactory::create('c', {3, 5}, {120, 60, 20, 5, 1,5040, 720, 90, 10, 1,32760, 2730, 210, 15, 1}); auto gradO = NDArrayFactory::create('c', {2, 2}); int exclusive, reverse; diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/dimensionalityreduction/TestPCA.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/dimensionalityreduction/TestPCA.java index 981495eac..4cfe9f1be 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/dimensionalityreduction/TestPCA.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/dimensionalityreduction/TestPCA.java @@ -61,7 +61,7 @@ public class TestPCA extends BaseNd4jTest { assertEquals("Reconstructed matrix is very different from the original.", 0.0, Diff.getDouble(i), 1.0); } } - + @Test public void testFactorSVDTransposed() { int m = 4; From 3de3cd8277bc6c2d1a3fa23dff511daab44b0b87 Mon Sep 17 00:00:00 2001 From: raver119 Date: Thu, 13 Feb 2020 20:59:35 +0300 Subject: [PATCH 02/19] R119 tests (#238) * one small test Signed-off-by: raver119 * one small test Signed-off-by: raver119 * bert test Signed-off-by: raver119 * Graph FlowPath fix Signed-off-by: raver119 * - GraphProfiler tweaks - NodeProfile now includes shapes Signed-off-by: raver119 * RELU_layer inplace tweak Signed-off-by: raver119 * meh Signed-off-by: raver119 * identity tweaks Signed-off-by: raver119 * bert result validation Signed-off-by: raver119 * - bunch of Shape ops have inplace exec forbidden now - Legacy ops have inplace exec disabled by 
default now Signed-off-by: raver119 * ffast-math enabled Signed-off-by: raver119 * ffast-math enabled Signed-off-by: raver119 * allow some legacy ops to be inplace Signed-off-by: raver119 * disable -fast_math Signed-off-by: raver119 * disable expensive test for cuda Signed-off-by: raver119 --- libnd4j/blas/cpu/GraphExecutioner.cpp | 6 +- libnd4j/include/graph/VariableProxy.h | 1 + libnd4j/include/graph/VariableSpace.h | 1 + libnd4j/include/graph/impl/Variable.cpp | 5 +- libnd4j/include/graph/impl/VariableProxy.cpp | 3 + libnd4j/include/graph/impl/VariableSpace.cpp | 25 ++++- libnd4j/include/graph/profiling/NodeProfile.h | 17 ++- .../graph/profiling/impl/GraphProfile.cpp | 23 +++- .../profiling/impl/GraphProfilingHelper.cpp | 2 +- .../graph/profiling/impl/NodeProfile.cpp | 41 ++++++- libnd4j/include/ops/declarable/OpDescriptor.h | 3 + .../generic/activations/identity.cpp | 12 +- .../ops/declarable/generic/nn/relu_layer.cpp | 13 +-- .../ops/declarable/generic/shape/reshape.cpp | 2 +- .../declarable/generic/shape/reshape_as.cpp | 2 +- .../generic/shape/tile_to_shape.cpp | 2 +- .../declarable/generic/shape/transpose.cpp | 2 +- .../include/ops/declarable/headers/shape.h | 16 +-- .../ops/declarable/impl/DeclarableOp.cpp | 26 ++++- .../include/ops/declarable/impl/LegacyOp.cpp | 4 +- .../impl/LegacyPairwiseTransformOp.cpp | 4 +- .../ops/declarable/impl/LegacyScalarOp.cpp | 10 +- .../declarable/impl/LegacyTransformSameOp.cpp | 4 +- .../impl/LegacyTransformStrictOp.cpp | 4 +- .../ops/declarable/impl/OpDescriptor.cpp | 3 + libnd4j/tests_cpu/layers_tests/CMakeLists.txt | 2 +- .../layers_tests/DeclarableOpsTests10.cpp | 4 + .../layers_tests/DeclarableOpsTests14.cpp | 13 +++ .../layers_tests/DeclarableOpsTests15.cpp | 14 +++ .../layers_tests/PlaygroundTests.cpp | 104 ++++++++++++++++++ 30 files changed, 306 insertions(+), 62 deletions(-) diff --git a/libnd4j/blas/cpu/GraphExecutioner.cpp b/libnd4j/blas/cpu/GraphExecutioner.cpp index 2190afbf1..98b3204cd 100644 --- a/libnd4j/blas/cpu/GraphExecutioner.cpp +++ b/libnd4j/blas/cpu/GraphExecutioner.cpp @@ -179,7 +179,7 @@ namespace graph { nd4j_debug("Embedded graph execution finished. %i variable(s) migrated\n", cnt); } else if (node->hasCustomOp()) { - // if we have something to execute - lets just execute it. + // now, if we have something to execute - lets just execute it. 
auto status = node->getCustomOp()->execute(&context); if (status != ND4J_STATUS_OK) return status; @@ -494,8 +494,10 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) nd4j::memory::MemoryRegistrator::getInstance()->setGraphMemoryFootprintIfGreater(h, m); } - if (tempFlow) + if (tempFlow) { delete flowPath; + __variableSpace->setFlowPath(nullptr); + } return Status::OK(); } diff --git a/libnd4j/include/graph/VariableProxy.h b/libnd4j/include/graph/VariableProxy.h index 1c253e9d8..c2a6e9c62 100644 --- a/libnd4j/include/graph/VariableProxy.h +++ b/libnd4j/include/graph/VariableProxy.h @@ -58,6 +58,7 @@ namespace nd4j { virtual void putVariable(int id, Variable *variable); virtual void putVariable(int id, NDArray *array); virtual void putVariable(int id, int idx, NDArray *array); + virtual void putVariable(int id, int idx, NDArray &array); virtual void putVariable(int id, int idx, Variable *array); virtual void replaceVariable(Variable *variable); diff --git a/libnd4j/include/graph/VariableSpace.h b/libnd4j/include/graph/VariableSpace.h index 9443d34b1..81abaf6e8 100644 --- a/libnd4j/include/graph/VariableSpace.h +++ b/libnd4j/include/graph/VariableSpace.h @@ -100,6 +100,7 @@ namespace nd4j { virtual void putVariable(int id, Variable *variable); virtual void putVariable(int id, NDArray *array); virtual void putVariable(int id, int idx, NDArray *array); + virtual void putVariable(int id, int idx, NDArray &array); virtual void putVariable(int id, int idx, Variable *array); virtual void dropVariable(std::pair &pair); diff --git a/libnd4j/include/graph/impl/Variable.cpp b/libnd4j/include/graph/impl/Variable.cpp index 5b8f00b25..c2c5ff61f 100644 --- a/libnd4j/include/graph/impl/Variable.cpp +++ b/libnd4j/include/graph/impl/Variable.cpp @@ -60,8 +60,11 @@ namespace nd4j { result->_name = this->_name; result->_index = this->_index; - if (this->_ndarray != nullptr) + if (this->_ndarray != nullptr) { result->_ndarray = new NDArray(this->_ndarray->dup(this->_ndarray->ordering())); + result->_readOnly = false; + result->_removable = true; + } if (this->_list != nullptr) result->_list = this->_list->clone(); diff --git a/libnd4j/include/graph/impl/VariableProxy.cpp b/libnd4j/include/graph/impl/VariableProxy.cpp index 85664f24a..e8abf1310 100644 --- a/libnd4j/include/graph/impl/VariableProxy.cpp +++ b/libnd4j/include/graph/impl/VariableProxy.cpp @@ -191,6 +191,9 @@ namespace nd4j { _current->putVariable(id, array); } + void nd4j::graph::VariableProxy::putVariable(int id, int idx, NDArray &array) { + _current->putVariable(id, idx, array); + } void VariableProxy::putVariable(int id, int idx, NDArray *array) { _current->putVariable(id, idx, array); diff --git a/libnd4j/include/graph/impl/VariableSpace.cpp b/libnd4j/include/graph/impl/VariableSpace.cpp index 8318befb0..735f0260a 100644 --- a/libnd4j/include/graph/impl/VariableSpace.cpp +++ b/libnd4j/include/graph/impl/VariableSpace.cpp @@ -263,19 +263,19 @@ namespace nd4j { void nd4j::graph::VariableSpace::putVariable(int id, Variable *variable) { // we don't want to add variables more then once if (_variables.count(id) > 0 || _temporary.count(id) > 0) { - // nd4j_verbose("Trying to update variable for node_%i\n", id); - auto local = id < 0 ? 
_variables.at(id) : _temporary.at(id); if (!local->hasNDArray() && variable->hasNDArray()) { - // nd4j_verbose("Saving variable for node_%i\n", id); local->setNDArray(variable->getNDArray()); + + // we're inheriting this from Variable + local->markReadOnly(variable->isReadOnly()); + local->markRemovable(variable->isRemovable()); } + return; } - //nd4j_debug("Adding Variable to Space: id: %i; Array is null: %i;\n", id, variable->getNDArray() == nullptr); - _varmap.lock(); _handles->emplace_back(variable); @@ -314,6 +314,21 @@ namespace nd4j { } } + void nd4j::graph::VariableSpace::putVariable(int id, int idx, NDArray &array) { + auto *var = new nd4j::graph::Variable(&array, "", id, idx); + var->markRemovable(false); + var->markReadOnly(true); + + // let's see if this op needs + bool d = this->hasVariable(id, idx); + + this->putVariable(id, var); + + // if var for this nodeid already exists - we'll just delete variable + if (d) + delete var; + } + void nd4j::graph::VariableSpace::putVariable(int id, NDArray *array) { auto *var = new nd4j::graph::Variable(array); this->putVariable(id, var); diff --git a/libnd4j/include/graph/profiling/NodeProfile.h b/libnd4j/include/graph/profiling/NodeProfile.h index 51b02326d..62df0c34a 100644 --- a/libnd4j/include/graph/profiling/NodeProfile.h +++ b/libnd4j/include/graph/profiling/NodeProfile.h @@ -24,6 +24,7 @@ #include #include #include +#include namespace nd4j { namespace graph { @@ -65,6 +66,9 @@ namespace nd4j { // total amount of memory used during execution Nd4jLong _memoryTotal = 0L; + + std::vector _inputShapes; + std::vector _outputShapes; public: NodeProfile() = default; ~NodeProfile() = default; @@ -84,10 +88,15 @@ namespace nd4j { void setObjectsSize(Nd4jLong bytes); void setTotalSize(Nd4jLong bytes); - Nd4jLong getActivationsSize(); - Nd4jLong getTemporarySize(); - Nd4jLong getObjectsSize(); - Nd4jLong getTotalSize(); + void addInputShape(Nd4jLong *shapeInfo); + void addOutputShape(Nd4jLong *shapeInfo); + + Nd4jLong getActivationsSize() const; + Nd4jLong getTemporarySize() const; + Nd4jLong getObjectsSize() const; + Nd4jLong getTotalSize() const; + + Nd4jLong getExecutionTime() const; std::string& name(); diff --git a/libnd4j/include/graph/profiling/impl/GraphProfile.cpp b/libnd4j/include/graph/profiling/impl/GraphProfile.cpp index 6c7cccc01..ea8e7bc49 100644 --- a/libnd4j/include/graph/profiling/impl/GraphProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/GraphProfile.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include namespace nd4j { namespace graph { @@ -184,9 +186,26 @@ namespace nd4j { if (_profiles.empty()) nd4j_printf("No nodes in graph\n",""); - for (auto v: _profiles) + // printint out stuff + std::vector sorted; + for (auto v: _profiles) { v->printOut(); - + sorted.emplace_back(v); + } + + if (_profiles.size() > 1) { + // building hot spots + std::sort(sorted.begin(), sorted.end(), [](const NodeProfile *a, const NodeProfile *b) -> bool { + return a->getExecutionTime() > b->getExecutionTime(); + }); + + nd4j_printf("\nTop 30 reports by EXEC:\n", ""); + auto limit = nd4j::math::nd4j_min(30, sorted.size()); + for (int e = 0; e < limit; e++) { + sorted[e]->printOut(); + } + } + nd4j_printf("\nSpecial timers:\n", ""); if (_timings.empty()) nd4j_printf("No special timers were set\n",""); diff --git a/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp b/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp index 025cd8651..cbea09616 100644 --- 
a/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp +++ b/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp @@ -32,7 +32,7 @@ namespace nd4j { // graph->printOut(); // warm up - for (int e = 0; e < 1000; e++) { + for (int e = 0; e < iterations; e++) { FlowPath fp; auto _vs = varSpace->clone(); diff --git a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp index ab5d2a4c4..c8b00e788 100644 --- a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace graph { @@ -35,9 +36,23 @@ namespace nd4j { nd4j_printf(" Memory: ACT: %lld; TMP: %lld; OBJ: %lld; TTL: %lld;\n", _memoryActivations / _merges, _memoryTemporary / _merges, _memoryObjects / _merges, _memoryTotal / _merges); nd4j_printf(" Time: PREP: %lld ns; EXEC: %lld ns; TTL: %lld ns;\n", _preparationTime / _merges, _executionTime / _merges, _totalTime / _merges); nd4j_printf(" PREP: INPUT: %lld ns; SHAPE: %lld ns; ARRAY: %lld ns;\n", _inputTime / _merges, _shapeTime / _merges, _arrayTime / _merges); + + std::string inputs; + std::string outputs; + + int cnt = 0; + for (const auto &v: _inputShapes) + inputs += v + " "; + + for (const auto &v: _outputShapes) + outputs += v + " "; + + + nd4j_printf(" Inputs: %s\n", inputs.c_str()); + nd4j_printf(" Outputs: %s\n", outputs.c_str()); }; - Nd4jLong NodeProfile::getActivationsSize() { + Nd4jLong NodeProfile::getActivationsSize() const { return _memoryActivations; } @@ -53,15 +68,15 @@ namespace nd4j { _inputTime = time; } - Nd4jLong NodeProfile::getTemporarySize() { + Nd4jLong NodeProfile::getTemporarySize() const{ return _memoryTemporary; } - Nd4jLong NodeProfile::getObjectsSize() { + Nd4jLong NodeProfile::getObjectsSize() const{ return _memoryObjects; } - Nd4jLong NodeProfile::getTotalSize() { + Nd4jLong NodeProfile::getTotalSize() const{ return _memoryTotal; } @@ -97,6 +112,18 @@ namespace nd4j { _memoryTotal = bytes; } + Nd4jLong NodeProfile::getExecutionTime() const { + return _executionTime; + } + + void NodeProfile::addInputShape(Nd4jLong *shapeInfo) { + _inputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo)); + } + + void NodeProfile::addOutputShape(Nd4jLong *shapeInfo) { + _outputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo)); + } + void NodeProfile::merge(NodeProfile *other) { _merges += other->_merges; _memoryObjects += other->_memoryObjects; @@ -110,6 +137,9 @@ namespace nd4j { _shapeTime += other->_shapeTime; _arrayTime += other->_arrayTime; _inputTime += other->_inputTime; + + _inputShapes = other->_inputShapes; + _outputShapes = other->_outputShapes; } std::string& NodeProfile::name() { @@ -129,6 +159,9 @@ namespace nd4j { _shapeTime = other->_shapeTime; _arrayTime = other->_arrayTime; _inputTime = other->_inputTime; + + _inputShapes = other->_inputShapes; + _outputShapes = other->_outputShapes; } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/OpDescriptor.h b/libnd4j/include/ops/declarable/OpDescriptor.h index 2c857f3c0..302559ad8 100644 --- a/libnd4j/include/ops/declarable/OpDescriptor.h +++ b/libnd4j/include/ops/declarable/OpDescriptor.h @@ -147,6 +147,9 @@ namespace nd4j { // returns TRUE if this op allows in-place execution bool allowsInplace(); + // this method allows you to enable/disable inplace call for a given op + void allowInplace(bool reallyAllow); + // this method returns opNum (applicable for legacy XYZ ops only) int 
getOpNum(); diff --git a/libnd4j/include/ops/declarable/generic/activations/identity.cpp b/libnd4j/include/ops/declarable/generic/activations/identity.cpp index 5ae5b0690..e424772fc 100644 --- a/libnd4j/include/ops/declarable/generic/activations/identity.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/identity.cpp @@ -27,12 +27,10 @@ namespace nd4j { namespace ops { OP_IMPL(identity, 1, 1, true) { auto first = INPUT_VARIABLE(0); - auto z = this->getZ(block); + auto z = OUTPUT_VARIABLE(0); - // just for lulz - first->applyTransform(nd4j::transform::Identity, *z); - - STORE_RESULT(*z); + if (!block.isInplace()) + first->applyTransform(nd4j::transform::Identity, *z); return Status::OK(); } @@ -60,8 +58,8 @@ namespace nd4j { DECLARE_TYPES(identity_bp) { getOpDescriptor() ->setAllowedInputTypes(0, DataType::ANY) - ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}); } } } diff --git a/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp b/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp index cfc080117..22c7a9137 100644 --- a/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp @@ -31,22 +31,17 @@ namespace nd4j { REQUIRE_TRUE(w->isMatrix(), 0, "relu_layer: weights argument should be a 2D tensor, but got rank %i instead!", w->rankOf()); REQUIRE_TRUE(b->isVector(), 0, "relu_layer: biases argument should be a 1D tensor, but got rank %i instead!", b->rankOf()); REQUIRE_TRUE(b->lengthOf() == w->sizeAt(1), 0, "relu_layer: biases array length should match to columns of weights matrix, however got length = %i and columns = %i!", b->lengthOf(), w->sizeAt(1)); - REQUIRE_TRUE(x->sizeAt(1) == w->sizeAt(0), 0, "relu_layer: number of x columns should match to row number of weights matrix, but got x_columns = %i and weights_rows = %i!", - x->sizeAt(1), w->sizeAt(0)); - + REQUIRE_TRUE(x->sizeAt(1) == w->sizeAt(0), 0, "relu_layer: number of x columns should match to row number of weights matrix, but got x_columns = %i and weights_rows = %i!", x->sizeAt(1), w->sizeAt(0)); auto output = OUTPUT_VARIABLE(0); - //T bound = (T)0.f; - //nd4j_printf("Matrix x(%ix%i), Matrix w(%ix%i), b(1x%i)\n", x->sizeAt(0), x->sizeAt(1), w->sizeAt(0), w->sizeAt(1), b->lengthOf()); nd4j::ops::xw_plus_b op; - std::unique_ptr result(op.evaluate({x, w, b})); - REQUIRE_TRUE(Status::OK() == result->status(), 0, "relu_layer: xw_plus_b op failed on input data."); + auto status = op.execute({x, w, b}, {output}); + REQUIRE_TRUE(Status::OK() == status, 0, "relu_layer: xw_plus_b op failed on input data."); auto scalar = block.numT() > 0 ? 
block.getTArguments()->at(0) : 0.0; - auto xw = result->at(0); - xw->applyScalar(nd4j::scalar::RELU, scalar, *output); + output->applyScalar(nd4j::scalar::RELU, scalar, *output); return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp index 1d76138f2..b8d582481 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp @@ -28,7 +28,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// // here iArgs is a vector with (optional) negative of order as first element: // ({-order, dim1, dim2, dim3, ...}) - CUSTOM_OP_IMPL(reshape, 1, 1, true, 0, -2) { + CUSTOM_OP_IMPL(reshape, 1, 1, false, 0, -2) { auto x = INPUT_VARIABLE(0); if (block.width() == 1) { diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp index 92dc2a146..75aafc06f 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp @@ -28,7 +28,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// - CUSTOM_OP_IMPL(reshapeas, 2, 1, true, 0, 0) { + CUSTOM_OP_IMPL(reshapeas, 2, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); auto y = INPUT_VARIABLE(1); diff --git a/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp b/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp index cc88fb46c..d71fbddd5 100644 --- a/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp @@ -25,7 +25,7 @@ namespace nd4j { namespace ops { - CUSTOM_OP_IMPL(tile_to_shape, 1, 1, true, 0, -1) { + CUSTOM_OP_IMPL(tile_to_shape, 1, 1, false, 0, -1) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp index 5d01b8bbf..15ed67744 100644 --- a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp @@ -28,7 +28,7 @@ namespace nd4j { namespace ops { ////////////////////////////////////////////////////////////////////////// - CUSTOM_OP_IMPL(transpose, 1, 1, true, 0, 0) { + CUSTOM_OP_IMPL(transpose, 1, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); if (block.width() == 1) { if (block.isInplace()) { diff --git a/libnd4j/include/ops/declarable/headers/shape.h b/libnd4j/include/ops/declarable/headers/shape.h index 3d47c24bf..c21cdb84d 100644 --- a/libnd4j/include/ops/declarable/headers/shape.h +++ b/libnd4j/include/ops/declarable/headers/shape.h @@ -26,15 +26,15 @@ namespace nd4j { namespace ops { #if NOT_EXCLUDED(OP_permute) - DECLARE_CUSTOM_OP(permute, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(permute, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_reshapeas) - DECLARE_CUSTOM_OP(reshapeas, 2, 1, true, 0, 0); + DECLARE_CUSTOM_OP(reshapeas, 2, 1, false, 0, 0); #endif #if NOT_EXCLUDED(OP_transpose) - DECLARE_CUSTOM_OP(transpose, 1, 1, true, 0, 0); + DECLARE_CUSTOM_OP(transpose, 1, 1, false, 0, 0); #endif #if NOT_EXCLUDED(OP_shape_of) @@ -46,7 +46,7 @@ namespace nd4j { #endif #if NOT_EXCLUDED(OP_squeeze) - DECLARE_CUSTOM_OP(squeeze, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(squeeze, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_expand_dims) @@ -54,11 +54,11 @@ namespace nd4j 
{ #endif #if NOT_EXCLUDED(OP_reshape) - DECLARE_CUSTOM_OP(reshape, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(reshape, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_size_at) - DECLARE_CUSTOM_OP(size_at, 1, 1, true, 0, 1); + DECLARE_CUSTOM_OP(size_at, 1, 1, false, 0, 1); #endif /** @@ -80,8 +80,8 @@ namespace nd4j { * @tparam T */ #if NOT_EXCLUDED(OP_tile_to_shape) - DECLARE_CUSTOM_OP(tile_to_shape, 1, 1, true, 0, -1); - DECLARE_CUSTOM_OP(tile_to_shape_bp, 2, 1, true, 0, -1); + DECLARE_CUSTOM_OP(tile_to_shape, 1, 1, false, 0, -1); + DECLARE_CUSTOM_OP(tile_to_shape_bp, 2, 1, false, 0, -1); #endif /** diff --git a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp index 46d10b51c..9724b6ba5 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp @@ -150,6 +150,22 @@ namespace nd4j { } if (ctx.isInplace()) { + if (Environment::getInstance()->isProfiling() && node != nullptr) { + if (ctx.isFastPath()) { + // + } else { + for (auto p: *ctx.inputs()) { + auto var = ctx.variable(p); + if (var->variableType() == VariableType::NDARRAY) { + NDArray *array = var->getNDArray(); + + node->addInputShape(array->shapeInfo()); + node->addOutputShape(array->shapeInfo()); + } + } + } + } + // do nothing, getZ result will do the trick return static_cast(ctx.width()); } else { @@ -192,6 +208,10 @@ namespace nd4j { auto inputTime = std::chrono::duration_cast(inputEnd - inputStart).count(); node->setInputTime(inputTime); + // saving output shapes in profile + for (int e = 0; e < inSha.size(); e++) + node->addInputShape(inSha.at(e)); + shapeStart = std::chrono::system_clock::now(); } @@ -204,6 +224,10 @@ namespace nd4j { auto prepTime = std::chrono::duration_cast(shapeEnd - shapeStart).count(); node->setShapeFunctionTime(prepTime); + // saving output shapes in profile + for (int e = 0; e < outSha->size(); e++) + node->addOutputShape(outSha->at(e)); + arrayStart = std::chrono::system_clock::now(); } @@ -562,7 +586,7 @@ namespace nd4j { block->setInnerTime(outerTime); } - if (Environment::getInstance()->isProfiling()) { + if (Environment::getInstance()->isProfiling() && !block->isFastPath()) { auto fp = block->getVariableSpace()->flowPath(); if (fp != nullptr) { auto p = fp->profile(); diff --git a/libnd4j/include/ops/declarable/impl/LegacyOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyOp.cpp index f93df63f1..e9920c409 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyOp.cpp @@ -23,11 +23,11 @@ namespace nd4j { namespace ops { - LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", true) { + LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) { _numInputs = numInputs; } - LegacyOp::LegacyOp(int numInputs, int opNum) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", true) { + LegacyOp::LegacyOp(int numInputs, int opNum) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) { _opNum = opNum; _numInputs = numInputs; } diff --git a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp index 07c7234f5..49f896be1 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp @@ -25,11 +25,11 @@ namespace nd4j { namespace ops { 
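// The LegacyOp base constructor above now registers legacy ops with in-place execution
// disabled by default; the element-wise wrappers that follow (pairwise, scalar,
// transform-same, transform-strict) opt back in via getOpDescriptor()->allowInplace(true),
// presumably because writing the result directly into the input buffer is safe for
// purely element-wise operations.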
LegacyPairwiseTransformOp::LegacyPairwiseTransformOp() : LegacyOp::LegacyOp(2) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyPairwiseTransformOp::LegacyPairwiseTransformOp(int opNum) : LegacyOp::LegacyOp(2, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyPairwiseTransformOp::clone() { diff --git a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp index b1261b37c..856bfdeaf 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyScalarOp::LegacyScalarOp() : LegacyOp::LegacyOp(1) { - // no-op + this->getOpDescriptor()->allowInplace(true); } LegacyScalarOp::LegacyScalarOp(int opNum) : LegacyOp::LegacyOp(1, opNum){ - // no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyScalarOp::clone() { @@ -66,6 +66,7 @@ namespace nd4j { NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y->buffer(), y->shapeInfo(), y->specialBuffer(), y->specialShapeInfo(), extras.argumentsAsT(z->dataType())); + NDArray::registerSpecialUse({z}, {x, y}); } else if (block.getTArguments()->size() > 0) { auto y = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext()); @@ -78,10 +79,9 @@ namespace nd4j { NDArray::prepareSpecialUse({z}, {x, _scalar}); NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), _scalar->buffer(), _scalar->shapeInfo(), _scalar->specialBuffer(), _scalar->specialShapeInfo(), extras.argumentsAsT(z->dataType())); - } - manager.synchronize(); - STORE_RESULT(*z); + NDArray::registerSpecialUse({z}, {x, _scalar}); + } return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp index 49fef3af0..6b097c3af 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyTransformSameOp::LegacyTransformSameOp() : LegacyOp::LegacyOp(1) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyTransformSameOp::LegacyTransformSameOp(int opNum) : LegacyOp::LegacyOp(1, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyTransformSameOp::clone() { diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp index 19a51191a..a390a458c 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyTransformStrictOp::LegacyTransformStrictOp() : LegacyOp::LegacyOp(1) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyTransformStrictOp::LegacyTransformStrictOp(int opNum) : LegacyOp::LegacyOp(1, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyTransformStrictOp::clone() { diff --git 
a/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp b/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp index 5139a95cc..417fc0605 100644 --- a/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp +++ b/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp @@ -50,6 +50,9 @@ namespace nd4j { _scalar = isScalar; } + void OpDescriptor::allowInplace(bool reallyAllow){ + _allowsInplace = reallyAllow; + } bool OpDescriptor::operator==(const OpDescriptor& other) const { if (_hash == -1 && other._hash == -1) diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index f538eb9cd..17ae714cd 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -52,7 +52,7 @@ elseif(WIN32) set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") endif() else() - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index a0722f9d0..484719a45 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -3087,6 +3087,10 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_03_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif NDArray x = NDArrayFactory::create('c', {2,4,5,3}); NDArray exp = NDArrayFactory::create('c', {2,4,5,3},{ diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 600004ec2..7592bee27 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -78,6 +78,11 @@ TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_1) { } TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_2) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {5}, {1, 2, 3, std::numeric_limits::infinity(), 5}); auto y = NDArrayFactory::create('c', {5}, {1, 2, 3, -std::numeric_limits::infinity(), 5}); @@ -332,6 +337,10 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_max_1) { } TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif auto e = NDArrayFactory::create('c', {1, 0}); nd4j::ops::reduce_sum sumOp; @@ -343,6 +352,10 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) { } TEST_F(DeclarableOpsTests14, test_empty_reduce_mean_1) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif auto e = NDArrayFactory::create('c', {1, 0}); nd4j::ops::reduce_mean sumOp; diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index ee8691bbb..199630d4e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -584,6 +584,11 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_1) { } TEST_F(DeclarableOpsTests15, test_check_numeric_2) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {3},{1.f, 2.f, std::numeric_limits::infinity()}); auto y = NDArrayFactory::string("should 
trigger"); auto z = NDArrayFactory::create('c', {3} ); @@ -598,6 +603,11 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_2) { } TEST_F(DeclarableOpsTests15, test_check_numeric_3) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {3},{1.f, 2.f, std::numeric_limits::quiet_NaN()}); auto y = NDArrayFactory::string("should trigger"); auto z = NDArrayFactory::create('c', {3} ); @@ -1530,6 +1540,10 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test10) { } TEST_F(DeclarableOpsTests15, Pow_BP_Test11) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif NDArray xB('c', { 3,2,1 }, { .4, 3, 5, .8, -9, -12 }, nd4j::DataType::FLOAT32); NDArray yB('c', { 1,2,3 }, { 3, -2, .4, -4, 10, .8 }, nd4j::DataType::FLOAT32); diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 9f75beca1..7db7a791a 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -65,6 +65,110 @@ TEST_F(PlaygroundTests, test_avx) { nd4j_printf("Optimal level: %i; Binary level: %i;\n", ::optimalLevel(), ::binaryLevel()); } + +TEST_F(PlaygroundTests, test_bert_1) { + // this test will run ONLY if this model exists + if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb") < 0) + return; + + auto graph = GraphExecutioner::importFromFlatBuffers("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb"); + + auto t = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext.numpy"); + auto u = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_1.numpy"); + auto v = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_4.numpy"); + auto z = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model_output.numpy"); + + //graph->printOut(); + + graph->tagInplaceNodes(); + + graph->getVariableSpace()->putVariable(85,0, t); + graph->getVariableSpace()->putVariable(86,0, u); + graph->getVariableSpace()->putVariable(87,0, v); + +/* + // validating graph now + auto status = GraphExecutioner::execute(graph); + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); + + auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); + ASSERT_EQ(z, *array); +*/ + + nd4j::Environment::getInstance()->setProfiling(true); + auto profile = GraphProfilingHelper::profile(graph, 1); + + profile->printOut(); + + nd4j::Environment::getInstance()->setProfiling(false); + delete profile; + +/* + std::vector values; + + for (int e = 0; e < 1; e++) { + auto timeStart = std::chrono::system_clock::now(); + + GraphExecutioner::execute(graph); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); +*/ + + delete graph; +} + +/* +TEST_F(PlaygroundTests, test_broadcast_1) { + int pool = 10; + std::vector aX(pool); + std::vector aY(pool); + std::vector aZ(pool); + + for (int e = 0; e < pool; e++) { + aX[e] = NDArrayFactory::create_('c', {64, 128, 1}); + aY[e] = NDArrayFactory::create_('c', {768}); + aZ[e] = NDArrayFactory::create_('c', {64, 128, 768}); + + aX[e]->assign(119 * (e+1)); + aY[e]->assign(119 * 
(e+3)); + } + + std::vector values; + + for (int e = 0; e < 1000; e++) { + auto x = aX[e < pool ? e : e % pool]; + auto y = aY[e < pool ? e : e % pool]; + auto z = aZ[e < pool ? e : e % pool]; + + auto timeStart = std::chrono::system_clock::now(); + + x->applyTrueBroadcast(BroadcastOpsTuple::Multiply(), *y, *z); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); + + for (int e = 0; e < pool; e++) { + delete aX[e]; + delete aY[e]; + delete aZ[e]; + } +} +*/ + /* TEST_F(PlaygroundTests, test_s_0) { From c34790f9320844e6119970ef36afac769ed89d36 Mon Sep 17 00:00:00 2001 From: Andrii T <39699084+atuzhykov@users.noreply.github.com> Date: Fri, 14 Feb 2020 02:53:35 +0200 Subject: [PATCH 03/19] =?UTF-8?q?Copied=20and=20pasted=20RegressionTest100?= =?UTF-8?q?b4.java=20to=20RegressionTest100b6.jav=E2=80=A6=20(#215)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Copied and pasted RegressionTest100b4.java to RegressionTest100b6.java with renamed b4->b6 * assertEquals > assertTrue for half dtype Signed-off-by: atuzhykov --- .../regressiontest/RegressionTest100b6.java | 390 ++++++++++++++++++ 1 file changed, 390 insertions(+) create mode 100644 deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java new file mode 100644 index 000000000..637f5860f --- /dev/null +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2015-2019 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.deeplearning4j.regressiontest; + +import org.deeplearning4j.BaseDL4JTest; +import org.deeplearning4j.TestUtils; +import org.deeplearning4j.nn.conf.BackpropType; +import org.deeplearning4j.nn.conf.ConvolutionMode; +import org.deeplearning4j.nn.conf.graph.LayerVertex; +import org.deeplearning4j.nn.conf.layers.*; +import org.deeplearning4j.nn.conf.layers.convolutional.Cropping2D; +import org.deeplearning4j.nn.conf.layers.recurrent.Bidirectional; +import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn; +import org.deeplearning4j.nn.conf.layers.variational.VariationalAutoencoder; +import org.deeplearning4j.nn.graph.ComputationGraph; +import org.deeplearning4j.nn.graph.vertex.impl.MergeVertex; +import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; +import org.deeplearning4j.nn.weights.WeightInitXavier; +import org.deeplearning4j.regressiontest.customlayer100a.CustomLayer; +import org.junit.Test; +import org.nd4j.linalg.activations.impl.*; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.learning.config.Adam; +import org.nd4j.linalg.learning.config.RmsProp; +import org.nd4j.linalg.learning.regularization.L2Regularization; +import org.nd4j.linalg.lossfunctions.impl.LossMAE; +import org.nd4j.linalg.lossfunctions.impl.LossMCXENT; +import org.nd4j.resources.Resources; + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; + +import static org.junit.Assert.*; + +public class RegressionTest100b6 extends BaseDL4JTest { + + @Override + public DataType getDataType() { + return DataType.FLOAT; + } + + @Test + public void testCustomLayer() throws Exception { + + for (DataType dtype : new DataType[]{DataType.DOUBLE, DataType.FLOAT, DataType.HALF}) { + + String dtypeName = dtype.toString().toLowerCase(); + + File f = Resources.asFile("regression_testing/100b6/CustomLayerExample_100b6_" + dtypeName + ".bin"); + MultiLayerNetwork.load(f, true); + + MultiLayerNetwork net = MultiLayerNetwork.load(f, true); +// net = net.clone(); + + DenseLayer l0 = (DenseLayer) net.getLayer(0).conf().getLayer(); + assertEquals(new ActivationTanH(), l0.getActivationFn()); + assertEquals(new L2Regularization(0.03), TestUtils.getL2Reg(l0)); + assertEquals(new RmsProp(0.95), l0.getIUpdater()); + + CustomLayer l1 = (CustomLayer) net.getLayer(1).conf().getLayer(); + assertEquals(new ActivationTanH(), l1.getActivationFn()); + assertEquals(new ActivationSigmoid(), l1.getSecondActivationFunction()); + assertEquals(new RmsProp(0.95), l1.getIUpdater()); + + INDArray outExp; + File f2 = Resources + .asFile("regression_testing/100b6/CustomLayerExample_Output_100b6_" + dtypeName + ".bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/CustomLayerExample_Input_100b6_" + dtypeName + ".bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + assertEquals(dtype, in.dataType()); + assertEquals(dtype, outExp.dataType()); + assertEquals(dtype, net.params().dataType()); + assertEquals(dtype, net.getFlattenedGradients().dataType()); + assertEquals(dtype, net.getUpdater().getStateViewArray().dataType()); + + //System.out.println(Arrays.toString(net.params().data().asFloat())); + + INDArray outAct = net.output(in); + assertEquals(dtype, outAct.dataType()); 
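+            // The tolerance-based equalsWithEps(outAct, 0.01) check below is intentional:
+            // this loop also covers DataType.HALF, where the restored activations only match
+            // the serialized reference values approximately, so an exact assertEquals would fail.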
+ + assertEquals(dtype, net.getLayerWiseConfigurations().getDataType()); + assertEquals(dtype, net.params().dataType()); + boolean eq = outExp.equalsWithEps(outAct, 0.01); + assertTrue(outExp + " vs " + outAct, eq); } + } + + + @Test + public void testLSTM() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/GravesLSTMCharModelingExample_100b6.bin"); + MultiLayerNetwork net = MultiLayerNetwork.load(f, true); + + LSTM l0 = (LSTM) net.getLayer(0).conf().getLayer(); + assertEquals(new ActivationTanH(), l0.getActivationFn()); + assertEquals(200, l0.getNOut()); + assertEquals(new WeightInitXavier(), l0.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l0)); + assertEquals(new Adam(0.005), l0.getIUpdater()); + + LSTM l1 = (LSTM) net.getLayer(1).conf().getLayer(); + assertEquals(new ActivationTanH(), l1.getActivationFn()); + assertEquals(200, l1.getNOut()); + assertEquals(new WeightInitXavier(), l1.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l1)); + assertEquals(new Adam(0.005), l1.getIUpdater()); + + RnnOutputLayer l2 = (RnnOutputLayer) net.getLayer(2).conf().getLayer(); + assertEquals(new ActivationSoftmax(), l2.getActivationFn()); + assertEquals(77, l2.getNOut()); + assertEquals(new WeightInitXavier(), l2.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l2)); + assertEquals(new Adam(0.005), l2.getIUpdater()); + + assertEquals(BackpropType.TruncatedBPTT, net.getLayerWiseConfigurations().getBackpropType()); + assertEquals(50, net.getLayerWiseConfigurations().getTbpttBackLength()); + assertEquals(50, net.getLayerWiseConfigurations().getTbpttFwdLength()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/GravesLSTMCharModelingExample_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/GravesLSTMCharModelingExample_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.output(in); + + assertEquals(outExp, outAct); + } + + @Test + public void testVae() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/VaeMNISTAnomaly_100b6.bin"); + MultiLayerNetwork net = MultiLayerNetwork.load(f, true); + + VariationalAutoencoder l0 = (VariationalAutoencoder) net.getLayer(0).conf().getLayer(); + assertEquals(new ActivationLReLU(), l0.getActivationFn()); + assertEquals(32, l0.getNOut()); + assertArrayEquals(new int[]{256, 256}, l0.getEncoderLayerSizes()); + assertArrayEquals(new int[]{256, 256}, l0.getDecoderLayerSizes()); + assertEquals(new WeightInitXavier(), l0.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l0)); + assertEquals(new Adam(1e-3), l0.getIUpdater()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/VaeMNISTAnomaly_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/VaeMNISTAnomaly_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.output(in); + + assertEquals(outExp, outAct); + } + + + @Test + public void testYoloHouseNumber() throws Exception { + + File f = 
Resources.asFile("regression_testing/100b6/HouseNumberDetection_100b6.bin"); + ComputationGraph net = ComputationGraph.load(f, true); + + int nBoxes = 5; + int nClasses = 10; + + ConvolutionLayer cl = (ConvolutionLayer) ((LayerVertex) net.getConfiguration().getVertices() + .get("convolution2d_9")).getLayerConf().getLayer(); + assertEquals(nBoxes * (5 + nClasses), cl.getNOut()); + assertEquals(new ActivationIdentity(), cl.getActivationFn()); + assertEquals(ConvolutionMode.Same, cl.getConvolutionMode()); + assertEquals(new WeightInitXavier(), cl.getWeightInitFn()); + assertArrayEquals(new int[]{1, 1}, cl.getKernelSize()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/HouseNumberDetection_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/HouseNumberDetection_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.outputSingle(in); + + boolean eq = outExp.equalsWithEps(outAct.castTo(outExp.dataType()), 1e-3); + assertTrue(eq); + } + + @Test + public void testSyntheticCNN() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/SyntheticCNN_100b6.bin"); + MultiLayerNetwork net = MultiLayerNetwork.load(f, true); + + ConvolutionLayer l0 = (ConvolutionLayer) net.getLayer(0).conf().getLayer(); + assertEquals(new ActivationReLU(), l0.getActivationFn()); + assertEquals(4, l0.getNOut()); + assertEquals(new WeightInitXavier(), l0.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l0)); + assertEquals(new Adam(0.005), l0.getIUpdater()); + assertArrayEquals(new int[]{3, 3}, l0.getKernelSize()); + assertArrayEquals(new int[]{2, 1}, l0.getStride()); + assertArrayEquals(new int[]{1, 1}, l0.getDilation()); + assertArrayEquals(new int[]{0, 0}, l0.getPadding()); + + SeparableConvolution2D l1 = (SeparableConvolution2D) net.getLayer(1).conf().getLayer(); + assertEquals(new ActivationReLU(), l1.getActivationFn()); + assertEquals(8, l1.getNOut()); + assertEquals(new WeightInitXavier(), l1.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l1)); + assertEquals(new Adam(0.005), l1.getIUpdater()); + assertArrayEquals(new int[]{3, 3}, l1.getKernelSize()); + assertArrayEquals(new int[]{1, 1}, l1.getStride()); + assertArrayEquals(new int[]{1, 1}, l1.getDilation()); + assertArrayEquals(new int[]{0, 0}, l1.getPadding()); + assertEquals(ConvolutionMode.Same, l1.getConvolutionMode()); + assertEquals(1, l1.getDepthMultiplier()); + + SubsamplingLayer l2 = (SubsamplingLayer) net.getLayer(2).conf().getLayer(); + assertArrayEquals(new int[]{3, 3}, l2.getKernelSize()); + assertArrayEquals(new int[]{2, 2}, l2.getStride()); + assertArrayEquals(new int[]{1, 1}, l2.getDilation()); + assertArrayEquals(new int[]{0, 0}, l2.getPadding()); + assertEquals(PoolingType.MAX, l2.getPoolingType()); + + ZeroPaddingLayer l3 = (ZeroPaddingLayer) net.getLayer(3).conf().getLayer(); + assertArrayEquals(new int[]{4, 4, 4, 4}, l3.getPadding()); + + Upsampling2D l4 = (Upsampling2D) net.getLayer(4).conf().getLayer(); + assertArrayEquals(new int[]{3, 3}, l4.getSize()); + + DepthwiseConvolution2D l5 = (DepthwiseConvolution2D) net.getLayer(5).conf().getLayer(); + assertEquals(new ActivationReLU(), l5.getActivationFn()); + assertEquals(16, l5.getNOut()); + assertEquals(new WeightInitXavier(), 
l5.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l5)); + assertEquals(new Adam(0.005), l5.getIUpdater()); + assertArrayEquals(new int[]{3, 3}, l5.getKernelSize()); + assertArrayEquals(new int[]{1, 1}, l5.getStride()); + assertArrayEquals(new int[]{1, 1}, l5.getDilation()); + assertArrayEquals(new int[]{0, 0}, l5.getPadding()); + assertEquals(2, l5.getDepthMultiplier()); + + SubsamplingLayer l6 = (SubsamplingLayer) net.getLayer(6).conf().getLayer(); + assertArrayEquals(new int[]{2, 2}, l6.getKernelSize()); + assertArrayEquals(new int[]{2, 2}, l6.getStride()); + assertArrayEquals(new int[]{1, 1}, l6.getDilation()); + assertArrayEquals(new int[]{0, 0}, l6.getPadding()); + assertEquals(PoolingType.MAX, l6.getPoolingType()); + + Cropping2D l7 = (Cropping2D) net.getLayer(7).conf().getLayer(); + assertArrayEquals(new int[]{3, 3, 2, 2}, l7.getCropping()); + + ConvolutionLayer l8 = (ConvolutionLayer) net.getLayer(8).conf().getLayer(); + assertEquals(4, l8.getNOut()); + assertEquals(new WeightInitXavier(), l8.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l8)); + assertEquals(new Adam(0.005), l8.getIUpdater()); + assertArrayEquals(new int[]{4, 4}, l8.getKernelSize()); + assertArrayEquals(new int[]{1, 1}, l8.getStride()); + assertArrayEquals(new int[]{1, 1}, l8.getDilation()); + assertArrayEquals(new int[]{0, 0}, l8.getPadding()); + + CnnLossLayer l9 = (CnnLossLayer) net.getLayer(9).conf().getLayer(); + assertEquals(new WeightInitXavier(), l9.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l9)); + assertEquals(new Adam(0.005), l9.getIUpdater()); + assertEquals(new LossMAE(), l9.getLossFn()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/SyntheticCNN_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/SyntheticCNN_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.output(in); + + //19 layers - CPU vs. 
GPU difference accumulates notably, but appears to be correct + if(Nd4j.getBackend().getClass().getName().toLowerCase().contains("native")){ + assertEquals(outExp, outAct); + } else { + boolean eq = outExp.equalsWithEps(outAct, 0.1); + assertTrue(eq); + } + } + + @Test + public void testSyntheticBidirectionalRNNGraph() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/SyntheticBidirectionalRNNGraph_100b6.bin"); + ComputationGraph net = ComputationGraph.load(f, true); + + Bidirectional l0 = (Bidirectional) net.getLayer("rnn1").conf().getLayer(); + + LSTM l1 = (LSTM) l0.getFwd(); + assertEquals(16, l1.getNOut()); + assertEquals(new ActivationReLU(), l1.getActivationFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l1)); + + LSTM l2 = (LSTM) l0.getBwd(); + assertEquals(16, l2.getNOut()); + assertEquals(new ActivationReLU(), l2.getActivationFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l2)); + + Bidirectional l3 = (Bidirectional) net.getLayer("rnn2").conf().getLayer(); + + SimpleRnn l4 = (SimpleRnn) l3.getFwd(); + assertEquals(16, l4.getNOut()); + assertEquals(new ActivationReLU(), l4.getActivationFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l4)); + + SimpleRnn l5 = (SimpleRnn) l3.getBwd(); + assertEquals(16, l5.getNOut()); + assertEquals(new ActivationReLU(), l5.getActivationFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l5)); + + MergeVertex mv = (MergeVertex) net.getVertex("concat"); + + GlobalPoolingLayer gpl = (GlobalPoolingLayer) net.getLayer("pooling").conf().getLayer(); + assertEquals(PoolingType.MAX, gpl.getPoolingType()); + assertArrayEquals(new int[]{2}, gpl.getPoolingDimensions()); + assertTrue(gpl.isCollapseDimensions()); + + OutputLayer outl = (OutputLayer) net.getLayer("out").conf().getLayer(); + assertEquals(3, outl.getNOut()); + assertEquals(new LossMCXENT(), outl.getLossFn()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/SyntheticBidirectionalRNNGraph_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/SyntheticBidirectionalRNNGraph_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.output(in)[0]; + + assertEquals(outExp, outAct); + } +} From f165160edb026a4194b57da08f54772dbeaa99be Mon Sep 17 00:00:00 2001 From: Shams Ul Azeem Date: Fri, 14 Feb 2020 06:06:00 +0500 Subject: [PATCH 04/19] MultiLayerConfiguration should be ComputationGraphConfiguration for validating a cg model conf (#240) --- .../main/java/org/deeplearning4j/util/DL4JModelValidator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/DL4JModelValidator.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/DL4JModelValidator.java index 712b9c12b..8567dc379 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/DL4JModelValidator.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/DL4JModelValidator.java @@ -3,6 +3,7 @@ package org.deeplearning4j.util; import lombok.NonNull; import org.apache.commons.io.IOUtils; import org.deeplearning4j.nn.api.Model; +import org.deeplearning4j.nn.conf.ComputationGraphConfiguration; import 
org.deeplearning4j.nn.conf.MultiLayerConfiguration; import org.deeplearning4j.nn.graph.ComputationGraph; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; @@ -121,7 +122,7 @@ public class DL4JModelValidator { } try{ - MultiLayerConfiguration.fromJson(config); + ComputationGraphConfiguration.fromJson(config); } catch (Throwable t){ return ValidationResult.builder() .formatType("ComputationGraph") From 4206171b703a813a564f4f04b6bed501d72e74e1 Mon Sep 17 00:00:00 2001 From: Alexander Stoyakin Date: Fri, 14 Feb 2020 08:27:46 +0200 Subject: [PATCH 05/19] Ignored tests (#243) --- .../org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java index 210c4b703..3788c434e 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java @@ -123,7 +123,12 @@ public class TFGraphTestAllSameDiff { //Note: Can't extend BaseNd4jTest here a //AB 2020/01/07 - Known issues "bitcast/from_float64_to_int64", "bitcast/from_rank2_float64_to_int64", - "bitcast/from_float64_to_uint64" + "bitcast/from_float64_to_uint64", + + // 2020/02/14 - new ops which are not passing yet + "linear_solve/.*", + "triangular_solve/.*", + "lstsq/.*" }; /* As per TFGraphTestList.printArraysDebugging - this field defines a set of regexes for test cases that should have From 6e6289b6b9feec7094229f08dd11348820a4b56e Mon Sep 17 00:00:00 2001 From: Oleh Date: Fri, 14 Feb 2020 11:04:38 +0200 Subject: [PATCH 06/19] Oleh bert multiply true broad cast (#239) * libnd4j trueBroadcast rank 3 row implementation of special case Signed-off-by: Oleg * libnd4j rule clarify for second special case for all tests pass * libnd4j parallel_tad loop switch on in special case * libnd4j more general case for special case 2, need additional testing Signed-off-by: Oleg * libnd4j more general case for trueBroadcast special cases added * libnd4j minor corrections and clean up * libnd4j one more minor fix Signed-off-by: Oleg * libnd4j fixed check point to support all Y common vector representations in first special case for trueBroadcast Signed-off-by: Oleg Co-authored-by: raver119 --- .../include/loops/cpu/TrueBroadcastHelper.hpp | 476 ++++++++++-------- .../layers_tests/DeclarableOpsTests14.cpp | 120 ++++- 2 files changed, 377 insertions(+), 219 deletions(-) diff --git a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp index 6005c3647..95fe19109 100644 --- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp +++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp @@ -14,9 +14,9 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -// -// @author Yurii Shyrma (iuriish@yahoo.com) -// + // + // @author Yurii Shyrma (iuriish@yahoo.com) + // #include #include @@ -24,226 +24,268 @@ using namespace simdOps; -namespace nd4j { -namespace helpers { +namespace nd4j { + namespace helpers { -//////////////////////////////////////////////////////////////////////// -template -template -void TrueBroadcastHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + 
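+        ////////////////////////////////////////////////////////////////////////
+        // How trueBroadcast resolves coordinates (illustrative summary): for every output
+        // coordinate, the matching x/y coordinate is copied from z when the input dimension
+        // equals the z dimension, and pinned to 0 when that input dimension is 1.
+        // E.g. for the shapes used in the DeclarableOpsTests14 special-case tests,
+        // x{3,5,1} and y{3,1,4} broadcast into z{3,5,4} as
+        //
+        //     z[i][j][k] = op(x[i][j][0], y[i][0][k]);   // i < 3, j < 5, k < 4
+        //
+        // The special cases added below detect contiguous 'c'-ordered inputs (ews == 1)
+        // whose unit dimensions sit on the trailing axes (x ending in 1, y vector-like),
+        // so this mapping collapses into plain inner loops instead of the generic
+        // index2coords/getOffset path.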
//////////////////////////////////////////////////////////////////////// + template + template + void TrueBroadcastHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - const X* x = reinterpret_cast(xArr.getBuffer()); - const Y* y = reinterpret_cast(yArr.getBuffer()); - Z* z = reinterpret_cast(zArr.getBuffer()); + const X* x = reinterpret_cast(xArr.getBuffer()); + const Y* y = reinterpret_cast(yArr.getBuffer()); + Z* z = reinterpret_cast(zArr.getBuffer()); - const auto xShapeInfo = xArr.getShapeInfo(); - const auto yShapeInfo = yArr.getShapeInfo(); - const auto zShapeInfo = zArr.getShapeInfo(); + const auto xShapeInfo = xArr.getShapeInfo(); + const auto yShapeInfo = yArr.getShapeInfo(); + const auto zShapeInfo = zArr.getShapeInfo(); - const int xRank = xArr.rankOf(); - const int yRank = yArr.rankOf(); - const int zRank = zArr.rankOf(); + const int xRank = xArr.rankOf(); + const int yRank = yArr.rankOf(); + const int zRank = zArr.rankOf(); - bool bSpecialCase = (1 == xArr.ews() && 'c' == xArr.ordering() && 1 == yRank && - 1 == yArr.ews() && 'c' == yArr.ordering() && - 1 == zArr.ews() && 'c' == zArr.ordering()); + bool bSpecialCase = (1 == xArr.ews() && 'c' == xArr.ordering() && + 1 == yArr.ews() && 'c' == yArr.ordering() && + 1 == zArr.ews() && 'c' == zArr.ordering()); - if (bSpecialCase) { - auto yLen = (uint32_t)yArr.lengthOf(); - auto func = PRAGMA_THREADS_FOR{ - for (uint32_t i = start; i < stop; i++) { - auto rZ = z + (i * yLen); - auto v = x[i]; - for (uint32_t j = 0; j < yLen; j++) { - rZ[j] = OpType::op(v, y[j]); - } - } - }; - samediff::Threads::parallel_tad(func, 0, xArr.lengthOf()); - return; + if (bSpecialCase && yArr.isColumnVector() && 1 == xArr.sizeAt(-1) ) { + auto yLen = (uint32_t)yArr.lengthOf(); + auto func = PRAGMA_THREADS_FOR{ + for (uint32_t i = start; i < stop; i++) { + auto rZ = z + (i * yLen); + auto v = x[i]; + for (uint32_t j = 0; j < yLen; j++) { + rZ[j] = OpType::op(v, y[j]); + } + } + }; + samediff::Threads::parallel_tad(func, 0, xArr.lengthOf()); + return; + } + + + auto yShapeInt = yArr.getShapeAsVectorInt(); + auto xShapeInt = xArr.getShapeAsVectorInt(); + auto nCountY = std::count_if(yShapeInt.cbegin(), yShapeInt.cend(), [](int i) { return i == 1; }); + auto nCountX = std::count_if(xShapeInt.cbegin(), xShapeInt.cend(), [](int i) { return i == 1; }); + + bool bSpecialCase2 = (xRank == zRank && yRank == zRank && 1 == xArr.sizeAt(-1) && 1 == yArr.sizeAt(-2) && 1 == nCountY && 1 == nCountX); + + if (bSpecialCase && bSpecialCase2) { + + int zDim1 = zArr.sizeAt(-2); + int zDim2 = zArr.sizeAt(-1); + + int nLen = zArr.lengthOf() / yArr.sizeAt(-1); + + auto func = PRAGMA_THREADS_FOR{ + for (uint32_t total = start; total < stop; total += increment) { + + uint32_t i = total / zDim1; + uint32_t j = total % zDim1; + + uint32_t index = (i * zDim1) + j; + auto rZ = z + (index * zDim2); + auto rY = y + (i * zDim2); + auto rX = x[index]; + + for (uint32_t n = 0; n < zDim2; n++) { + rZ[n] = OpType::op(rX, rY[n]); + } + } + }; + samediff::Threads::parallel_tad(func, 0, nLen, 1); + return; + } + + + const Nd4jLong zLen = zArr.lengthOf(); + auto func = PRAGMA_THREADS_FOR{ + std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); + + for (auto i = start; i < stop; ++i) { + + shape::index2coords(i, zShapeInfo, zCoords.data()); + + for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { + + if (ix >= 0) { + if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { + xCoords[ix--] = zCoords[iz]; + } + else { + xCoords[ix--] = 
0; + } + } + + if (iy >= 0) { + if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { + yCoords[iy--] = zCoords[iz]; + } + else { + yCoords[iy--] = 0; + } + } + } + + const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); + const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); + const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); + + z[zOffset] = OpType::op(x[xOffset], y[yOffset]); + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); + } + + template + void TrueBroadcastHelper::exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + DISPATCH_BY_OPNUM_TTT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_OPS); + } + + //////////////////////////////////////////////////////////////////////// + template + template + void TrueBroadcastBoolHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + + const X* x = reinterpret_cast(xArr.getBuffer()); + const X* y = reinterpret_cast(yArr.getBuffer()); + Z* z = reinterpret_cast(zArr.getBuffer()); + + const auto xShapeInfo = xArr.getShapeInfo(); + const auto yShapeInfo = yArr.getShapeInfo(); + const auto zShapeInfo = zArr.getShapeInfo(); + + const int xRank = xArr.rankOf(); + const int yRank = yArr.rankOf(); + const int zRank = zArr.rankOf(); + + const Nd4jLong zLen = zArr.lengthOf(); + + auto func = PRAGMA_THREADS_FOR{ + std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); + + for (auto i = start; i < stop; ++i) { + + shape::index2coords(i, zShapeInfo, zCoords.data()); + + for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { + + if (ix >= 0) { + if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { + xCoords[ix--] = zCoords[iz]; + } + else { + xCoords[ix--] = 0; + } + } + + if (iy >= 0) { + if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { + yCoords[iy--] = zCoords[iz]; + } + else { + yCoords[iy--] = 0; + } + } + } + + const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); + const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); + const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); + + z[zOffset] = OpType::op(x[xOffset], y[yOffset], nullptr); + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); + } + + template + void TrueBroadcastBoolHelper::exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_BOOL_OPS); + } + + //////////////////////////////////////////////////////////////////////// + template + template + void TrueBroadcastIntHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + + const X* x = reinterpret_cast(xArr.getBuffer()); + const X* y = reinterpret_cast(yArr.getBuffer()); + X* z = reinterpret_cast(zArr.getBuffer()); + + const auto xShapeInfo = xArr.getShapeInfo(); + const auto yShapeInfo = yArr.getShapeInfo(); + const auto zShapeInfo = zArr.getShapeInfo(); + + const int xRank = xArr.rankOf(); + const int yRank = yArr.rankOf(); + const int zRank = zArr.rankOf(); + + const Nd4jLong zLen = zArr.lengthOf(); + + auto func = PRAGMA_THREADS_FOR{ + std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); + + for (auto i = start; i < stop; ++i) { + + shape::index2coords(i, zShapeInfo, zCoords.data()); + + for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { + + if (ix >= 0) { + if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { + xCoords[ix--] = zCoords[iz]; + } + else { + xCoords[ix--] = 0; + 
} + } + + if (iy >= 0) { + if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { + yCoords[iy--] = zCoords[iz]; + } + else { + yCoords[iy--] = 0; + } + } + } + + const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); + const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); + const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); + + z[zOffset] = OpType::op(x[xOffset], y[yOffset]); + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); + } + + template + void TrueBroadcastIntHelper::exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + DISPATCH_BY_OPNUM_T(exec, PARAMS(xArr, yArr, zArr), BROADCAST_INT_OPS); + } + + /* + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_0); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_1); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_2); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_3); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_4); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_5); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_6); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_7); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_8); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_9); + + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastBoolHelper, , LIBND4J_TYPES, BOOL_TYPES); + + BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastIntHelper, , INTEGER_TYPES); + */ } - - const Nd4jLong zLen = zArr.lengthOf(); - auto func = PRAGMA_THREADS_FOR { - std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); - - for (auto i = start; i < stop; ++i) { - - shape::index2coords(i, zShapeInfo, zCoords.data()); - - for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { - - if (ix >= 0) { - if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { - xCoords[ix--] = zCoords[iz]; - } else { - xCoords[ix--] = 0; - } - } - - if (iy >= 0) { - if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { - yCoords[iy--] = zCoords[iz]; - } else { - yCoords[iy--] = 0; - } - } - } - - const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); - const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); - - z[zOffset] = OpType::op(x[xOffset], y[yOffset]); - } - }; - - samediff::Threads::parallel_for(func, 0, zLen); } - -template -void TrueBroadcastHelper::exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - DISPATCH_BY_OPNUM_TTT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_OPS); -} - -//////////////////////////////////////////////////////////////////////// -template -template -void TrueBroadcastBoolHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - - const X* x = reinterpret_cast(xArr.getBuffer()); - const X* y = reinterpret_cast(yArr.getBuffer()); - Z* z = reinterpret_cast(zArr.getBuffer()); - - const auto xShapeInfo = xArr.getShapeInfo(); - const auto yShapeInfo = yArr.getShapeInfo(); - const auto zShapeInfo = zArr.getShapeInfo(); - - const int xRank = xArr.rankOf(); - 
const int yRank = yArr.rankOf(); - const int zRank = zArr.rankOf(); - - const Nd4jLong zLen = zArr.lengthOf(); - - auto func = PRAGMA_THREADS_FOR { - std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); - - for (auto i = start; i < stop; ++i) { - - shape::index2coords(i, zShapeInfo, zCoords.data()); - - for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { - - if (ix >= 0) { - if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { - xCoords[ix--] = zCoords[iz]; - } else { - xCoords[ix--] = 0; - } - } - - if (iy >= 0) { - if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { - yCoords[iy--] = zCoords[iz]; - } else { - yCoords[iy--] = 0; - } - } - } - - const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); - const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); - - z[zOffset] = OpType::op(x[xOffset], y[yOffset], nullptr); - } - }; - - samediff::Threads::parallel_for(func, 0, zLen); -} - -template -void TrueBroadcastBoolHelper::exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_BOOL_OPS); -} - -//////////////////////////////////////////////////////////////////////// -template -template -void TrueBroadcastIntHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - - const X* x = reinterpret_cast(xArr.getBuffer()); - const X* y = reinterpret_cast(yArr.getBuffer()); - X* z = reinterpret_cast(zArr.getBuffer()); - - const auto xShapeInfo = xArr.getShapeInfo(); - const auto yShapeInfo = yArr.getShapeInfo(); - const auto zShapeInfo = zArr.getShapeInfo(); - - const int xRank = xArr.rankOf(); - const int yRank = yArr.rankOf(); - const int zRank = zArr.rankOf(); - - const Nd4jLong zLen = zArr.lengthOf(); - - auto func = PRAGMA_THREADS_FOR { - std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); - - for (auto i = start; i < stop; ++i) { - - shape::index2coords(i, zShapeInfo, zCoords.data()); - - for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { - - if (ix >= 0) { - if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { - xCoords[ix--] = zCoords[iz]; - } else { - xCoords[ix--] = 0; - } - } - - if (iy >= 0) { - if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { - yCoords[iy--] = zCoords[iz]; - } else { - yCoords[iy--] = 0; - } - } - } - - const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); - const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); - - z[zOffset] = OpType::op(x[xOffset], y[yOffset]); - } - }; - - samediff::Threads::parallel_for(func, 0, zLen); -} - -template -void TrueBroadcastIntHelper::exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(xArr, yArr, zArr), BROADCAST_INT_OPS); -} - -/* -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_0); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_1); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_2); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_3); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_4); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT 
TrueBroadcastHelper, , PAIRWISE_TYPES_5); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_6); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_7); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_8); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_9); - -BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastBoolHelper, , LIBND4J_TYPES, BOOL_TYPES); - -BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastIntHelper, , INTEGER_TYPES); -*/ -} -} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 7592bee27..7e3fae4af 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -546,13 +546,13 @@ TEST_F(DeclarableOpsTests14, repeat_5) { delete result; } ///////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests14, Test_scalar_broadcast_SpecialCaseTest) { +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest) { auto y = NDArray('c', { 3 }, nd4j::DataType::FLOAT32); auto x = NDArray('c', { 5, 2, 1 }, nd4j::DataType::FLOAT32); auto e = NDArray('c', { 5, 2, 3 }, { 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9., 10., 10., 10., 11., 11., 11. }, nd4j::DataType::FLOAT32); - + y.assign(1.0); x.linspace(1.0); @@ -566,3 +566,119 @@ TEST_F(DeclarableOpsTests14, Test_scalar_broadcast_SpecialCaseTest) { delete result; } +///////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest2) { + + auto y = NDArray('c', { 1, 3 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 5, 2, 1 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('c', { 5, 2, 3 }, { 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9., 10., 10., 10., 11., 11., 11. }, nd4j::DataType::FLOAT32); + + y.assign(1.0); + x.linspace(1.0); + + nd4j::ops::add op; + auto result = op.evaluate({ &x, &y }); + ASSERT_EQ(Status::OK(), result->status()); + + auto res = *result->at(0); + + ASSERT_EQ(e, res); + + delete result; +} + +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest3) { + + auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 3, 5, 4 }, { 10., 11., 12., 13., 20., 22., 24., 26., 30., 33., 36., 39., 40., 44., 48., 52., 50., 55., 60., 65., 84., 90., 96., 102., 98., 105., 112., 119., 112., 120., 128., 136., 126., 135., 144., 153., 140., 150., 160., 170., 198., 209., 220., 231., 216., 228., 240., 252., 234., 247., 260., 273., 252., 266., 280., 294., 270., 285., 300., 315. 
}, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), y, z); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest4) { + + auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4 }, { 10., 11., 12., 13.,20., 22., 24., 26.,30., 33., 36., 39.,40., 44., 48., 52.,50., 55., 60., 65.,84., 90., 96., 102.,98., 105., 112., 119.,112., 120., 128., 136.,126., 135., 144., 153.,140., 150., 160., 170.,198., 209., 220., 231.,216., 228., 240., 252.,234., 247., 260., 273.,252., 266., 280., 294.,270., 285., 300., 315.,352., 368., 384., 400.,374., 391., 408., 425.,396., 414., 432., 450.,418., 437., 456., 475.,440., 460., 480., 500.,546., 567., 588., 609.,572., 594., 616., 638.,598., 621., 644., 667.,624., 648., 672., 696.,650., 675., 700., 725.,780., 806., 832., 858.,810., 837., 864., 891.,840., 868., 896., 924.,870., 899., 928., 957.,900., 930., 960., 990. }, nd4j::DataType::FLOAT32); + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), y, z); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest5) { + + auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 3, 5, 4 }, { 0.1, 0.090909, 0.083333, 0.076923,0.2, 0.181818, 0.166667, 0.153846,0.3, 0.272727, 0.250000, 0.230769,0.4, 0.363636, 0.333333, 0.307692,0.5, 0.454545, 0.416667, 0.384615, 0.428571, 0.400000, 0.375000, 0.352941, 0.500000, 0.466667, 0.437500, 0.411765, 0.571429, 0.533333, 0.500000, 0.470588, 0.642857, 0.600000, 0.562500, 0.529412, 0.714286, 0.666667, 0.625000, 0.588235, 0.611111, 0.578947, 0.550000, 0.523810, 0.666667, 0.631579, 0.600000, 0.571429, 0.722222, 0.684211, 0.650000, 0.619048, 0.777778, 0.736842, 0.700000, 0.666667, 0.833333, 0.789474, 0.750000, 0.714286 }, nd4j::DataType::FLOAT32); + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Divide(), y, z); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest6) { + + auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4 }, { 0.1, 0.090909, 0.083333, 0.076923,0.2, 0.181818, 0.166667, 0.153846,0.3, 0.272727, 0.250000, 0.230769,0.4, 0.363636, 0.333333, 0.307692,0.5, 0.454545, 0.416667, 0.384615, 0.428571, 0.400000, 0.375000, 0.352941, 0.500000, 0.466667, 0.437500, 0.411765, 0.571429, 0.533333, 0.500000, 0.470588, 0.642857, 0.600000, 0.562500, 0.529412, 0.714286, 0.666667, 0.625000, 0.588235,0.611111, 0.578947, 0.550000, 0.523810,0.666667, 0.631579, 0.600000, 0.571429,0.722222, 0.684211, 0.650000, 0.619048,0.777778, 0.736842, 0.700000, 0.666667,0.833333, 0.789474, 0.750000, 0.714286, 0.727273, 
0.695652, 0.666667, 0.64, 0.772727, 0.739130, 0.708333, 0.68, 0.818182, 0.782609, 0.750000, 0.72, 0.863636, 0.826087, 0.791667, 0.76, 0.909091, 0.869565, 0.833333, 0.80, 0.807692, 0.777778, 0.750000, 0.724138, 0.846154, 0.814815, 0.785714, 0.758621, 0.884615, 0.851852, 0.821429, 0.793103, 0.923077, 0.888889, 0.857143, 0.827586, 0.961538, 0.925926, 0.892857, 0.862069, 0.866667, 0.838710, 0.812500, 0.787879, 0.900000, 0.870968, 0.843750, 0.818182, 0.933333, 0.903226, 0.875000, 0.848485, 0.966667, 0.935484, 0.906250, 0.878788, 1.000000, 0.967742, 0.937500, 0.909091 }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Divide(), y, z); + ASSERT_EQ(e, z); +} + +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest7) { + + auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 3, 5, 4 }, { -9., -10., -11., -12.,-8., -9., -10., -11., -7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-8., -9., -10., -11.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-7., -8.000000, -9.000000, -10.00,-6.000000, -7.000000, -8.000000, -9.000,-5.000000, -6.000000, -7.000000, -8.000,-4.000000, -5.000000, -6.000000, -7.000,-3.000000, -4.000000, -5.000000, -6.000 }, nd4j::DataType::FLOAT32); + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Subtract(), y, z); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest8) { + + auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4 }, { -9.0, -10., -11., -12.,-8., -9., -10., -11.0,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-8., -9., -10., -11.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-1., -2., -3., -4.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-1., -2., -3., -4., 0., -1., -2., -3. 
}, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Subtract(), y, z); + ASSERT_EQ(e, z); +} From 9e3c1b02b1648354a700f3e078162d4f197d33c8 Mon Sep 17 00:00:00 2001 From: raver119 Date: Fri, 14 Feb 2020 16:20:31 +0300 Subject: [PATCH 07/19] Perf improvements (#242) * initial commit Signed-off-by: raver119 * meh Signed-off-by: raver119 * better ExpandDims impl Signed-off-by: raver119 * better Squeeze impl Signed-off-by: raver119 * better Softmax impl Signed-off-by: raver119 * one test disabled Signed-off-by: raver119 * more accurate impl Signed-off-by: raver119 * - GraphProfiler now prints full shapeInfo instead of shape - softmax typo fix Signed-off-by: raver119 --- libnd4j/include/graph/impl/Graph.cpp | 15 + .../graph/profiling/impl/NodeProfile.cpp | 4 +- libnd4j/include/helpers/ShapeUtils.h | 2 + libnd4j/include/helpers/impl/ShapeUtils.cpp | 20 + .../declarable/generic/shape/expand_dims.cpp | 11 +- .../ops/declarable/generic/shape/squeeze.cpp | 10 +- .../declarable/helpers/cpu/activations.cpp | 91 +++-- libnd4j/include/templatemath.h | 46 ++- .../layers_tests/DeclarableOpsTests18.cpp | 11 + .../layers_tests/DeclarableOpsTests19.cpp | 10 + .../layers_tests/PlaygroundTests.cpp | 47 +++ .../java/org/nd4j/nativeblas/Nd4jCuda.java | 353 +++++++++++++---- .../java/org/nd4j/nativeblas/Nd4jCpu.java | 374 ++++++++++++++---- 13 files changed, 822 insertions(+), 172 deletions(-) diff --git a/libnd4j/include/graph/impl/Graph.cpp b/libnd4j/include/graph/impl/Graph.cpp index 2acedcea3..4b337dd0d 100644 --- a/libnd4j/include/graph/impl/Graph.cpp +++ b/libnd4j/include/graph/impl/Graph.cpp @@ -1088,8 +1088,23 @@ namespace nd4j { if (e < node->input()->size() - 1) nd4j_printf(", ", ""); } + + if (node->opType() == OpType_CUSTOM) { + auto ctx = node->protoContext(); + if (ctx->getIArguments()->size() > 0) { + printf("]; iArgs: ["); + + for (int e = 0; e < ctx->getIArguments()->size(); e++) { + printf("%i", ctx->getIArguments()->at(e)); + if (e < ctx->getIArguments()->size() - 1) + nd4j_printf(", ", ""); + } + } + } + nd4j_printf("]; \n", ""); + // printf("\n"); fflush(stdout); } diff --git a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp index c8b00e788..a6a990eb8 100644 --- a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp @@ -117,11 +117,11 @@ namespace nd4j { } void NodeProfile::addInputShape(Nd4jLong *shapeInfo) { - _inputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo)); + _inputShapes.emplace_back(ShapeUtils::shapeInfoAsString(shapeInfo)); } void NodeProfile::addOutputShape(Nd4jLong *shapeInfo) { - _outputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo)); + _outputShapes.emplace_back(ShapeUtils::shapeInfoAsString(shapeInfo)); } void NodeProfile::merge(NodeProfile *other) { diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index c99a0b0de..ec31f479a 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -97,6 +97,8 @@ namespace nd4j { static std::string shapeAsString(const int rank, const Nd4jLong* shapeInfo); static std::string strideAsString(const NDArray* array); + static std::string shapeInfoAsString(const Nd4jLong* shapeInfo); + static std::vector shapeAsVector(const Nd4jLong* shapeInfo); // evaluate shapeInfo for diagonal array which is made using input arr elements as diagonal diff --git 
a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 9d002e238..235ab3d10 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -666,6 +666,26 @@ Nd4jLong* ShapeUtils::evalTileShapeInfo(const NDArray& arr, const std::vectorreshape(input->ordering(), shape); - output->assign(tmp); - - STORE_RESULT(output); - + if (input->ews() == 1 && output->ews() == 1 && input->ordering() == output->ordering()) { + output->dataBuffer()->copyBufferFrom(*input->dataBuffer().get(), output->lengthOf() * DataTypeUtils::sizeOfElement(output->dataType()), 0, input->bufferOffset()); + } else { + auto tmp = input->reshape(input->ordering(), shape); + output->assign(tmp); + } return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp index 085d7f09c..22e229643 100644 --- a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp @@ -25,7 +25,7 @@ namespace nd4j { namespace ops { - CUSTOM_OP_IMPL(squeeze, 1, 1, true, 0, -2) { + CUSTOM_OP_IMPL(squeeze, 1, 1, false, 0, -2) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); @@ -73,8 +73,12 @@ namespace nd4j { if (block.isInplace()) { output->reshapei(input->ordering(), shape, false); } else { - auto tmp = input->reshape(input->ordering(), shape); - output->assign(tmp); + if (input->ews() == 1 && output->ews() == 1 && input->ordering() == output->ordering()) { + output->dataBuffer()->copyBufferFrom(*input->dataBuffer().get(), output->lengthOf() * DataTypeUtils::sizeOfElement(output->dataType()), 0, input->bufferOffset()); + } else { + auto tmp = input->reshape(input->ordering(), shape); + output->assign(tmp); + } } return Status::OK(); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index 9a11baf37..56c93b611 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -191,6 +191,70 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr BUILD_SINGLE_SELECTOR(xType, logSoftMaxForVector_, (input.getBuffer(), input.getShapeInfo(), output.buffer(), output.shapeInfo()), FLOAT_TYPES); } + template + void softmax_loop(T *input, T *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen); + + template <> + FORCEINLINE void softmax_loop(float *input, float *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + auto inBuff = input + offsets[i]; + auto outBuff = output + offsets[i]; + + float max = -DataTypeUtils::max(); + float sum = 0.f; + + #pragma omp simd reduction(max:max) + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); + + #pragma omp simd reduction(+:sum) + for (uint j = 0; j < tadLen; ++j) { + float temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + #pragma omp simd + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; + } + }; + + samediff::Threads::parallel_tad(func,0, numOfSubArrs); + } + + + template + FORCEINLINE void softmax_loop(T *input, T *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + auto inBuff = input + offsets[i]; + auto 
outBuff = output + offsets[i]; + + T max = -DataTypeUtils::max(); + T sum(0.f); + + #pragma omp simd reduction(maxT:max) + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); + + #pragma omp simd reduction(sumT:sum) + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + #pragma omp simd + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; + } + }; + + samediff::Threads::parallel_tad(func,0, numOfSubArrs); + } + ////////////////////////////////////////////////////////////////////////// template static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { @@ -213,31 +277,10 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra const uint tadLen = shape::length(tadShapeInfo); if(shape::elementWiseStride(tadShapeInfo) == 1){ + T *inBuff = input.bufferAsT(); + T *outBuff = output.bufferAsT(); - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - - T *inBuff = input.bufferAsT() + tadOffsets[i]; - T *outBuff = output.bufferAsT() + tadOffsets[i]; - - T max = -DataTypeUtils::max(); - T sum = 0; - - for (uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); - - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; - } - }; - - samediff::Threads::parallel_tad(func,0, numOfSubArrs); + softmax_loop(inBuff, outBuff, tadOffsets, numOfSubArrs, tadLen); } else { diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h index b412befd8..48021d734 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/templatemath.h @@ -127,6 +127,32 @@ namespace nd4j { template math_def inline Z nd4j_erfc(T num); + math_def inline int32_t floatToRawIntBits(float d) { + union { + float f; + int32_t i; + } tmp; + tmp.f = d; + return tmp.i; + } + + math_def inline float intBitsToFloat(int32_t i) { + union { + float f; + int32_t i; + } tmp; + tmp.i = i; + return tmp.f; + } + + math_def inline float mulsignf(float x, float y) { + return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31))); + } + + math_def inline float copysignfk(float x, float y) { + return intBitsToFloat((floatToRawIntBits(x) & ~(1 << 31)) ^ (floatToRawIntBits(y) & (1 << 31))); + } + template math_def inline Z nd4j_sigmoid(T val) { return (Z) 1.0f / ((Z) 1.0f + nd4j_exp(-val)); @@ -660,6 +686,11 @@ namespace nd4j { * @param val2 * @return */ + template <> + math_def inline float nd4j_pow(float val, float val2) { + return p_pow(val, val2); + } + template math_def inline Z nd4j_pow(X val, Y val2) { return p_pow(static_cast(val), static_cast(val2)); @@ -767,10 +798,23 @@ namespace nd4j { } + math_def inline float neu_tanh(float val, float sign) { + float e(M_E); + float av = sign * val; + auto p = nd4j::math::nd4j_pow(e, -av * 2.f); + return (1 - p) / (1 + p); + } + + template <> + math_def inline float nd4j_tanh(float val) { + float sign = copysignfk(1.0f, val); + return sign * neu_tanh(val, sign); + } + + template math_def inline Z nd4j_tanh(X val) { return val <= 0 ? 
neg_tanh(val) : pos_tanh(val); - //return p_tanh(static_cast(val)); } template diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp index 93864af8c..2c7737a31 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp @@ -48,5 +48,16 @@ TEST_F(DeclarableOpsTests18, test_bitcast_1) { auto status = op.execute({&x}, {&z}, {}, {(Nd4jLong) nd4j::DataType::INT64}, {}); ASSERT_EQ(Status::OK(), status); + ASSERT_EQ(e, z); +} + +TEST_F(DeclarableOpsTests18, test_tanh_1) { + auto x = NDArrayFactory::create('c', {8}, {0.23f, -0.23f, 0.35f, -0.35f, 0.64f, -0.64f, 100000.f, -100000.f}); + auto z = x.ulike(); + auto e = NDArrayFactory::create('c', {8}, {0.226028f, -0.226028f, 0.336376f, -0.336376f, 0.564900f, -0.564900f, 1.f, -1.f}); + + nd4j::ops::tanh op; + op.execute({&x}, {&z}); + ASSERT_EQ(e, z); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp index 9883a9d79..b0a547a7d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp @@ -66,4 +66,14 @@ TEST_F(DeclarableOpsTests19, test_conv1d_bp_1) { delete result; +} + +TEST_F(DeclarableOpsTests19, test_squeeze_1) { + auto x = NDArrayFactory::create('c', {3, 4, 1}); + auto e = NDArrayFactory::create('c', {3, 4}); + int axis = 2; + + nd4j::ops::squeeze op; + auto status = op.execute({&x}, {&e}, {axis}); + ASSERT_EQ(Status::OK(), status); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 7db7a791a..83d3ee3b8 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -169,6 +169,53 @@ TEST_F(PlaygroundTests, test_broadcast_1) { } */ +/* +TEST_F(PlaygroundTests, test_broadcast_1) { + int pool = 500; + std::vector aX(pool); + std::vector aY(pool); + std::vector aZ(pool); + + for (int e = 0; e < pool; e++) { + aX[e] = NDArrayFactory::create_('c', {512, 3072}); + aY[e] = NDArrayFactory::create_('c', {768}); + aZ[e] = NDArrayFactory::create_('c', {512, 3072}); + + aX[e]->assign( (e+1) / 119); + aY[e]->assign( (e+3) / 119); + } + + + + std::vector values; + + for (int e = 0; e < 1000; e++) { + auto x = aX[e < pool ? e : e % pool]; + auto y = aY[e < pool ? e : e % pool]; + auto z = aZ[e < pool ? 
e : e % pool]; + + auto timeStart = std::chrono::system_clock::now(); + + //x->applyTrueBroadcast(BroadcastOpsTuple::Multiply(), *y, *z); + x->applyTransform(transform::Tanh, *z, nullptr); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); + + for (int e = 0; e < pool; e++) { + delete aX[e]; + delete aY[e]; + delete aZ[e]; + } +} + +*/ /* TEST_F(PlaygroundTests, test_s_0) { diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index e7ddcda11..c8b15c1a2 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -4250,14 +4250,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * set new order and shape in case of suitable array length (in-place operation) * order - order to set * shape - shape to set - * + * copyToNewBuff - if true then old buffer will be copied to new buffer if last one will be allocated after reshaping * if there was permute applied before or there are weird strides, then new buffer is allocated for array */ + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -4267,8 +4273,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer 
shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -6203,6 +6212,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include @Namespace("nd4j::graph") @NoOffset public static class NodeProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -6235,11 +6245,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setObjectsSize(@Cast("Nd4jLong") long bytes); public native void setTotalSize(@Cast("Nd4jLong") long bytes); + public native void addInputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native @Cast("Nd4jLong") long getActivationsSize(); public native @Cast("Nd4jLong") long getTemporarySize(); public native @Cast("Nd4jLong") long getObjectsSize(); public native @Cast("Nd4jLong") long getTotalSize(); + public native @Cast("Nd4jLong") long getExecutionTime(); + public native @StdString @ByRef @Cast({"char*", "std::string*"}) BytePointer name(); public native void merge(NodeProfile other); @@ -6835,9 +6854,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") LongBuffer oldShape, int newRank, @Cast("Nd4jLong*") LongBuffer newShape, @Cast("bool") boolean isFOrder); @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") long[] oldShape, int newRank, @Cast("Nd4jLong*") long[] newShape, @Cast("bool") boolean isFOrder); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongPointer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongBuffer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") long[] oldShapeInfo, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native 
@Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + /** + * newShapeInfo contains rank, shape and order only, no strides/ews/type + */ + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, @Cast("Nd4jLong*") long[] newShapeInfo); /** * Get the shape info buffer @@ -7145,6 +7170,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongPointer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongBuffer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") long[] shapeInfo); + + /** + * shape - input inShape is shape only, not shapeInfo + * returns number of non-unity dimensions in inShape + */ + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongPointer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongBuffer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") long[] inShape); + /** * Returns whether the * given shape is a vector or not @@ -7163,9 +7197,9 @@ public static final int PREALLOC_SIZE = 33554432; * Returns the shape portion of an information * buffer */ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] shapeInfo); /** * Return a copy of a buffer. 
@@ -7903,40 +7937,22 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong*") LongBuffer offsets); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets, byte order/*='c'*/); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") 
@ByPtrRef LongBuffer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); - // deduce element-wise stride - // if array is scalar or unit length vector then ews = 1 - // if array is common vector then ews = stride of non-unity dimension - // if strides are normal set ews = 1, otherwise ews = 0 - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len); - // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); + @Namespace("shape") public static native void 
checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); /** * processes whole set of sub-arrays @@ -7946,7 +7962,7 @@ public static final int PREALLOC_SIZE = 33554432; * numOfSubArrs - number of sub-arrays, size of subArrOffsets is equal to numOfSubArrs * dimsSize - size of dimsToExclude, if dimsSize = array rank or dimsSize = 0 it means sub-array is whole array, copy of wholeShapeInfo and one zero offset will be returned * dimsToExclude - MUST BE SORTED, dimensions to evaluate sub-array along, i.e. when shape is [2,3,4,5] and dimsToExclude={0,2}, then there will be 8 sub-arrays with shape [3,5] - * subArrShapeInfo - output argument, contains shapeInfo common for all sub-arrays + * subArrShapeInfo - output argument, contains shapeInfo (same for all sub-arrays) * subArrOffsets - output argument, contains successive sub-arrays offsets from original this-buffer * keepUnitiesInShape - if false then eliminate unities from sub-array shapeInfo, for example {1,a,1,b} -> {a,b} */ @@ -7957,6 +7973,24 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets, @Cast("bool") boolean keepUnitiesInShape/*=false*/); @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets); + /** + * for example inShapeInfo is {3, 2,1,4, 4,4,1, 16384,1,99} + * then output shapeNoUnities will contain {2,4, 4,1} - that is only shape and strides, no rank/type/ews/order + * stridesNoUnities will point on strides in shapeNoUnities that is on {4,1} + * returns number of non-unity dimensions in inShapeInfo + * if there is no unities in inShapeInfo, then no copy procedure will be performed and shapeNoUnities/stridesNoUnities will point on corresponding places in inShapeInfo + */ + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongPointer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, 
@Cast("Nd4jLong*&") @ByPtrRef long[] shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef long[] stridesNoUnities); + + /** + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, int dimsSize, @Const IntPointer dimsToExclude, @Cast("Nd4jLong*") LongPointer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, int dimsSize, @Const IntBuffer dimsToExclude, @Cast("Nd4jLong*") LongBuffer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] outShapeInfo); @@ -8186,6 +8220,8 @@ public static final int PREALLOC_SIZE = 33554432; * @param rank the rank of the shape */ +////////////////////////////////////////////////////////////////////// + /** * Returns whether the * given shape is a vector or not @@ -8735,69 +8771,60 @@ public static final int PREALLOC_SIZE = 33554432; // return true; // } -// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, const bool isFOrder, Nd4jLong* newShapeInfo) { +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { // // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements // // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo -// const int newOrder = isFOrder ? 
102 : 99; -// const int oldOrder = oldShapeInfo[2 * oldRank + 3]; - // newShapeInfo[0] = newRank; // memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); -// Nd4jLong* newStrides = shape::stride(newShapeInfo); -// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); +// Nd4jLong* newStrides = shape::stride(newShapeInfo); +// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); // const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); -// int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; - +// Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; // while (newStart < newRank && oldStart < oldRank) { // newDim = newShape[newStart]; // oldDim = oldShape[oldStart]; -// while (newDim != oldDim) +// while (newDim != oldDim && newDim > 0 && oldDim > 0) // if (newDim < oldDim) newDim *= newShape[newStop++]; // else oldDim *= oldShape[oldStop++]; // // ------ Check whether the original axes can be combined ------ // -// for (int i = oldStart; i < oldStop - 1; i++) { - -// if(oldShape[i] == 1) { // ignore strides like {...,1,1,...} -// if(oldOrder == 102) ++oldStart; +// for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { +// if(oldShape[i] == 1) // skip unity-dimension and its stride // continue; -// } - -// if(oldOrder == 102 && oldStrides[i + 1] != oldShape[i] * oldStrides[i]) -// return false; // not contiguous enough -// if(oldOrder == 99 && oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) -// return false; // not contiguous enough +// while((i + step) < oldRank && oldShape[i + step] == 1) +// ++step; // skip following unity-dimensions and its strides if such are present +// if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) +// return false; // not contiguous enough // } -// // ------ Calculate new strides for all axes currently worked with ------ // -// if(isFOrder) { -// newStrides[newStart] = oldStrides[oldStart]; -// for (int i = newStart + 1; i < newStop; ++i) -// newStrides[i] = newStrides[i - 1] * newShape[i - 1]; -// } -// else { -// newStrides[newStop - 1] = oldStrides[oldStop - 1]; -// for (int i = newStop - 1; i > newStart; --i) -// newStrides[i - 1] = newStrides[i] * newShape[i]; -// } +// newStrides[newStop - 1] = oldStrides[oldStop - 1]; +// for (int i = newStop - 1; i > newStart; --i) +// newStrides[i - 1] = newStrides[i] * newShape[i]; // newStart = newStop++; // oldStart = oldStop++; // } -// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order -// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews -// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type +// // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) +// for (int i = newStart; i < newRank; ++i) +// newStrides[i] = 1; + +// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order +// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews +// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type // return true; // } +////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// // this function checks the consistence of dimensions with array rank (negative dimensions, too large dimensions, too big number of dimensions) @@ -8838,9 +8865,198 @@ public static final int PREALLOC_SIZE = 33554432; 
////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); +// const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); +// const char zOrder = shape::order(zShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); + +// if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = zOffsets = nullptr; +// } +// else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { +// xOffsets = yOffsets = nullptr; +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { +// xOffsets = zOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { +// yOffsets = zOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// } +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, yOrder); +// } +// } +// } +// else if(zEws == 1) { +// zOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, zOrder); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, zOrder); +// } +// } +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = zOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// yOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// 
PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// zOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); + +// if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = nullptr; +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// } +// } ////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////// @@ -9065,6 +9281,9 @@ public static final int PREALLOC_SIZE = 33554432; // returns TRUE if this op allows in-place execution public native @Cast("bool") boolean allowsInplace(); + // this method allows you to enable/disable inplace call for a given op + public native void allowInplace(@Cast("bool") boolean reallyAllow); + // this method returns opNum (applicable for legacy XYZ ops only) public native int getOpNum(); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index 49d088f27..71614c20f 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -4253,14 +4253,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * set new order and shape in case of suitable array length (in-place operation) * order - order to set * shape - shape to set - * + * copyToNewBuff - if true then old buffer will be copied to new buffer if last one will be allocated after 
reshaping * if there was permute applied before or there are weird strides, then new buffer is allocated for array */ + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -4270,8 +4276,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -6206,6 +6215,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include @Namespace("nd4j::graph") @NoOffset public static class NodeProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ @@ -6238,11 +6248,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setObjectsSize(@Cast("Nd4jLong") long bytes); public native void setTotalSize(@Cast("Nd4jLong") long bytes); + public native void addInputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native @Cast("Nd4jLong") long getActivationsSize(); public native @Cast("Nd4jLong") long getTemporarySize(); public native @Cast("Nd4jLong") long getObjectsSize(); public native @Cast("Nd4jLong") long getTotalSize(); + public native @Cast("Nd4jLong") long getExecutionTime(); + public native @StdString @ByRef @Cast({"char*", "std::string*"}) BytePointer name(); public native void merge(NodeProfile other); @@ -6838,9 +6857,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") LongBuffer oldShape, int newRank, @Cast("Nd4jLong*") LongBuffer newShape, @Cast("bool") boolean isFOrder); @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") long[] oldShape, int newRank, @Cast("Nd4jLong*") long[] newShape, @Cast("bool") boolean isFOrder); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongPointer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongBuffer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") long[] oldShapeInfo, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + /** + * newShapeInfo contains rank, shape and order only, no strides/ews/type + */ + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] 
oldShapeInfo, @Cast("Nd4jLong*") long[] newShapeInfo); /** * Get the shape info buffer @@ -7148,6 +7173,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongPointer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongBuffer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") long[] shapeInfo); + + /** + * shape - input inShape is shape only, not shapeInfo + * returns number of non-unity dimensions in inShape + */ + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongPointer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongBuffer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") long[] inShape); + /** * Returns whether the * given shape is a vector or not @@ -7166,9 +7200,9 @@ public static final int PREALLOC_SIZE = 33554432; * Returns the shape portion of an information * buffer */ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] shapeInfo); /** * Return a copy of a buffer. 
@@ -7906,40 +7940,22 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong*") LongBuffer offsets); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets, byte order/*='c'*/); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") 
@ByPtrRef LongBuffer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); - // deduce element-wise stride - // if array is scalar or unit length vector then ews = 1 - // if array is common vector then ews = stride of non-unity dimension - // if strides are normal set ews = 1, otherwise ews = 0 - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len); - // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); + @Namespace("shape") public static native void 
checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); /** * processes whole set of sub-arrays @@ -7949,7 +7965,7 @@ public static final int PREALLOC_SIZE = 33554432; * numOfSubArrs - number of sub-arrays, size of subArrOffsets is equal to numOfSubArrs * dimsSize - size of dimsToExclude, if dimsSize = array rank or dimsSize = 0 it means sub-array is whole array, copy of wholeShapeInfo and one zero offset will be returned * dimsToExclude - MUST BE SORTED, dimensions to evaluate sub-array along, i.e. when shape is [2,3,4,5] and dimsToExclude={0,2}, then there will be 8 sub-arrays with shape [3,5] - * subArrShapeInfo - output argument, contains shapeInfo common for all sub-arrays + * subArrShapeInfo - output argument, contains shapeInfo (same for all sub-arrays) * subArrOffsets - output argument, contains successive sub-arrays offsets from original this-buffer * keepUnitiesInShape - if false then eliminate unities from sub-array shapeInfo, for example {1,a,1,b} -> {a,b} */ @@ -7960,6 +7976,24 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets, @Cast("bool") boolean keepUnitiesInShape/*=false*/); @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets); + /** + * for example inShapeInfo is {3, 2,1,4, 4,4,1, 16384,1,99} + * then output shapeNoUnities will contain {2,4, 4,1} - that is only shape and strides, no rank/type/ews/order + * stridesNoUnities will point on strides in shapeNoUnities that is on {4,1} + * returns number of non-unity dimensions in inShapeInfo + * if there is no unities in inShapeInfo, then no copy procedure will be performed and shapeNoUnities/stridesNoUnities will point on corresponding places in inShapeInfo + */ + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongPointer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, 
@Cast("Nd4jLong*&") @ByPtrRef long[] shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef long[] stridesNoUnities); + + /** + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, int dimsSize, @Const IntPointer dimsToExclude, @Cast("Nd4jLong*") LongPointer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, int dimsSize, @Const IntBuffer dimsToExclude, @Cast("Nd4jLong*") LongBuffer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] outShapeInfo); @@ -8189,6 +8223,8 @@ public static final int PREALLOC_SIZE = 33554432; * @param rank the rank of the shape */ +////////////////////////////////////////////////////////////////////// + /** * Returns whether the * given shape is a vector or not @@ -8738,69 +8774,60 @@ public static final int PREALLOC_SIZE = 33554432; // return true; // } -// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, const bool isFOrder, Nd4jLong* newShapeInfo) { +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { // // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements // // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo -// const int newOrder = isFOrder ? 
102 : 99; -// const int oldOrder = oldShapeInfo[2 * oldRank + 3]; - // newShapeInfo[0] = newRank; // memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); -// Nd4jLong* newStrides = shape::stride(newShapeInfo); -// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); +// Nd4jLong* newStrides = shape::stride(newShapeInfo); +// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); // const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); -// int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; - +// Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; // while (newStart < newRank && oldStart < oldRank) { // newDim = newShape[newStart]; // oldDim = oldShape[oldStart]; -// while (newDim != oldDim) +// while (newDim != oldDim && newDim > 0 && oldDim > 0) // if (newDim < oldDim) newDim *= newShape[newStop++]; // else oldDim *= oldShape[oldStop++]; // // ------ Check whether the original axes can be combined ------ // -// for (int i = oldStart; i < oldStop - 1; i++) { - -// if(oldShape[i] == 1) { // ignore strides like {...,1,1,...} -// if(oldOrder == 102) ++oldStart; +// for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { +// if(oldShape[i] == 1) // skip unity-dimension and its stride // continue; -// } - -// if(oldOrder == 102 && oldStrides[i + 1] != oldShape[i] * oldStrides[i]) -// return false; // not contiguous enough -// if(oldOrder == 99 && oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) -// return false; // not contiguous enough +// while((i + step) < oldRank && oldShape[i + step] == 1) +// ++step; // skip following unity-dimensions and its strides if such are present +// if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) +// return false; // not contiguous enough // } -// // ------ Calculate new strides for all axes currently worked with ------ // -// if(isFOrder) { -// newStrides[newStart] = oldStrides[oldStart]; -// for (int i = newStart + 1; i < newStop; ++i) -// newStrides[i] = newStrides[i - 1] * newShape[i - 1]; -// } -// else { -// newStrides[newStop - 1] = oldStrides[oldStop - 1]; -// for (int i = newStop - 1; i > newStart; --i) -// newStrides[i - 1] = newStrides[i] * newShape[i]; -// } +// newStrides[newStop - 1] = oldStrides[oldStop - 1]; +// for (int i = newStop - 1; i > newStart; --i) +// newStrides[i - 1] = newStrides[i] * newShape[i]; // newStart = newStop++; // oldStart = oldStop++; // } -// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order -// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews -// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type +// // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) +// for (int i = newStart; i < newRank; ++i) +// newStrides[i] = 1; + +// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order +// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews +// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type // return true; // } +////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// // this function checks the consistence of dimensions with array rank (negative dimensions, too large dimensions, too big number of dimensions) @@ -8841,9 +8868,198 @@ public static final int PREALLOC_SIZE = 33554432; 
////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); +// const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); +// const char zOrder = shape::order(zShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); + +// if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = zOffsets = nullptr; +// } +// else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { +// xOffsets = yOffsets = nullptr; +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { +// xOffsets = zOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { +// yOffsets = zOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// } +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, yOrder); +// } +// } +// } +// else if(zEws == 1) { +// zOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, zOrder); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, zOrder); +// } +// } +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = zOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// yOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// 
PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// zOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); + +// if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = nullptr; +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// } +// } ////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////// @@ -11382,6 +11598,9 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // returns TRUE if this op allows in-place execution public native @Cast("bool") boolean allowsInplace(); + // this method allows you to enable/disable inplace call for a given op + public native void allowInplace(@Cast("bool") boolean reallyAllow); + // this method returns opNum (applicable for legacy XYZ ops only) public native int getOpNum(); @@ -21093,7 +21312,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public permute() { super((Pointer)null); allocate(); } private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); - } + } // #endif // #if NOT_EXCLUDED(OP_reshapeas) @@ -21111,7 +21330,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public reshapeas() { super((Pointer)null); allocate(); } private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); - } + } // #endif // #if NOT_EXCLUDED(OP_transpose) @@ -22222,7 +22441,22 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public tensormmul() { super((Pointer)null); allocate(); } private 
native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); - } + } + @Namespace("nd4j::ops") public static class tensormmul_bp extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public tensormmul_bp(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public tensormmul_bp(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public tensormmul_bp position(long position) { + return (tensormmul_bp)super.position(position); + } + + public tensormmul_bp() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } // #endif /** From 011c272fde110bd88a4f922477499e04bc5b9cf8 Mon Sep 17 00:00:00 2001 From: Yurii Shyrma Date: Mon, 17 Feb 2020 07:04:28 +0200 Subject: [PATCH 08/19] Shyrma transpose (#244) * - provide contiguous strides for ouput in transpose op Signed-off-by: Yurii * - provide contiguous strides for output in permute op Signed-off-by: Yurii * - take into account empty shapes properly in transpose/permute op Signed-off-by: Yurii --- libnd4j/blas/NDArray.hpp | 16 +- libnd4j/include/helpers/ShapeUtils.h | 6 +- libnd4j/include/helpers/impl/ShapeUtils.cpp | 45 +- .../ops/declarable/generic/shape/permute.cpp | 117 ++--- .../ops/declarable/generic/shape/reshape.cpp | 448 +++++++++--------- .../declarable/generic/shape/reshape_as.cpp | 24 +- .../declarable/generic/shape/transpose.cpp | 136 ++---- .../layers_tests/DeclarableOpsTests1.cpp | 109 +---- libnd4j/tests_cpu/layers_tests/EmptyTests.cpp | 17 - .../layers_tests/PlaygroundTests.cpp | 43 +- 10 files changed, 374 insertions(+), 587 deletions(-) diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index f7bad72c3..8abee8d82 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -1976,7 +1976,7 @@ bool NDArray::permutei(const std::initializer_list& dimensions) { ////////////////////////////////////////////////////////////////////////// bool NDArray::permutei(const std::vector& dimensions) { - return permutei(dimensions.data(), dimensions.size()); + return permutei(dimensions.data(), rankOf()); } ////////////////////////////////////////////////////////////////////////// @@ -1998,7 +1998,7 @@ bool NDArray::permutei(const std::vector& dimensions) { for (int e = 0; e < dimensions.size(); e++) ivec[e] = dimensions[e]; - return permutei(ivec.data(), ivec.size()); + return permutei(ivec.data(), rankOf()); } ////////////////////////////////////////////////////////////////////////// @@ -2034,9 +2034,8 @@ NDArray NDArray::permute(const Nd4jLong* dimensions, const int rank) && { ////////////////////////////////////////////////////////////////////////// NDArray NDArray::permute(const std::vector& dimensions) const &{ - auto data = dimensions.data(); - auto size = dimensions.size(); - return permute(data, size); + + return permute(dimensions.data(), rankOf()); } ////////////////////////////////////////////////////////////////////////// @@ -2048,7 +2047,8 @@ NDArray NDArray::permute(const std::vector& dimensions) && { ////////////////////////////////////////////////////////////////////////// NDArray NDArray::permute(const std::vector& dimensions) const & { - return permute(dimensions.data(), dimensions.size()); + + return permute(dimensions.data(), rankOf()); } 
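[Editor's note, not part of the patch] The permute/permutei overloads in the hunk above now pass rankOf() instead of dimensions.size(), i.e. the permutation vector is expected to name every axis of the array exactly once. The sketch below is a minimal, standalone illustration of what such a full permutation does to a shape/strides pair; the helper name permuteShapeAndStrides and the use of std::vector are assumptions for illustration only, not the libnd4j NDArray API. The axis convention (result axis i takes source axis perm[i]) matches the expectation encoded in the updated Permute1 test later in this patch.

#include <cstddef>
#include <cstdint>
#include <vector>
#include <cassert>

// Minimal sketch: apply a full permutation to a shape/strides pair.
// perm[i] names the source axis that becomes axis i of the result.
static void permuteShapeAndStrides(const std::vector<int64_t>& shape,
                                   const std::vector<int64_t>& strides,
                                   const std::vector<int>& perm,
                                   std::vector<int64_t>& outShape,
                                   std::vector<int64_t>& outStrides) {
    const size_t rank = shape.size();
    assert(perm.size() == rank);          // permutation must cover every axis
    outShape.resize(rank);
    outStrides.resize(rank);
    for (size_t i = 0; i < rank; ++i) {
        outShape[i]   = shape[perm[i]];
        outStrides[i] = strides[perm[i]];
    }
}

// e.g. shape {5,10,15} with c-order strides {150,15,1} and perm {2,0,1}
// gives shape {15,5,10} and permuted (non-contiguous) strides {1,150,15};
// producing contiguous output strides instead is what this commit adds.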
////////////////////////////////////////////////////////////////////////// @@ -2111,12 +2111,12 @@ void NDArray::permute(const Nd4jLong *dimensions, const int rank, NDArray& targe ////////////////////////////////////////////////////////////////////////// void NDArray::permute(const std::vector& dimensions, NDArray& target) const { - permute(dimensions.data(), dimensions.size(), target); + permute(dimensions.data(), rankOf(), target); } ////////////////////////////////////////////////////////////////////////// void NDArray::permute(const std::vector& dimensions, NDArray& target) const { - permute(dimensions.data(), dimensions.size(), target); + permute(dimensions.data(), rankOf(), target); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index ec31f479a..ebd61410b 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -50,11 +50,13 @@ namespace nd4j { static std::vector evalRepeatShape(int axis, const std::vector& repeats, const NDArray& arr); // evaluate shapeInfo of permuted array - static Nd4jLong* evalPermShapeInfo(const int* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace); + // if setContigStrides = true, then set contiguous strides in output shapeInfo in accordance with arr order + static Nd4jLong* evalPermShapeInfo(const int* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace, const bool setContigStrides = false); static Nd4jLong* evalPermShapeInfo(const Nd4jLong* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace); // evaluate shapeInfo of transposed array - static Nd4jLong* evalTranspShapeInfo(const NDArray& arr, nd4j::memory::Workspace* workspace); + // if setContigStrides = true, then set contiguous strides in output shapeInfo in accordance with arr order + static Nd4jLong* evalTranspShapeInfo(const NDArray& arr, nd4j::memory::Workspace* workspace, const bool setContigStrides = false); static bool copyVectorPart(std::vector& target, std::vector& source, int rank, int offset); diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 235ab3d10..a2d3f97ef 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -313,32 +313,37 @@ std::vector ShapeUtils::evalRepeatShape(int axis, const std::vectorbufferForShapeInfo(descriptor).primaryAsT(); - } + // copy arr _shapeInfo into new array + memcpy(shapeInfoNew, arr.getShapeInfo(), shape::shapeInfoByteLength(rank)); + // perform buffer permutation + shape::doPermuteShapeInfo(shapeInfoNew, dimensions, arr.lengthOf()); + + if(setContigStrides) + shape::updateStrides(shapeInfoNew, arr.ordering()); + + ShapeDescriptor descriptor(shapeInfoNew); + + RELEASE(shapeInfoNew, workspace); + + return ConstantShapeHelper::getInstance()->bufferForShapeInfo(descriptor).primaryAsT(); +} ////////////////////////////////////////////////////////////////////////// // evaluate shapeInfo of permuted array @@ -350,14 +355,14 @@ std::vector ShapeUtils::evalRepeatShape(int axis, const std::vector dimensions(rank); for (int i = 0; i < rank; ++i) dimensions[i] = rank - 1 - i; - return evalPermShapeInfo(dimensions.data(), dimensions.size(), arr, workspace); + return evalPermShapeInfo(dimensions.data(), dimensions.size(), arr, workspace, setContigStrides); } 
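[Editor's note, not part of the patch] The setContigStrides flag introduced above makes evalPermShapeInfo/evalTranspShapeInfo overwrite the permuted strides with freshly computed contiguous strides for the array's order (via shape::updateStrides in the hunk above). The sketch below shows the usual way such contiguous strides are derived from a shape; it is a simplified illustration using a plain vector, with an assumed helper name contiguousStrides and no shapeInfo packing (rank/ews/type fields), not the shape::updateStrides implementation itself.

#include <cstdint>
#include <vector>

// Minimal sketch: contiguous strides for a given shape and order.
// 'c' (row-major): last axis has stride 1; 'f' (column-major): first axis has stride 1.
static std::vector<int64_t> contiguousStrides(const std::vector<int64_t>& shape, char order) {
    const int rank = static_cast<int>(shape.size());
    std::vector<int64_t> strides(rank, 1);
    if (order == 'c') {
        for (int i = rank - 2; i >= 0; --i)
            strides[i] = strides[i + 1] * shape[i + 1];
    } else { // 'f'
        for (int i = 1; i < rank; ++i)
            strides[i] = strides[i - 1] * shape[i - 1];
    }
    return strides;
}

// e.g. shape {15,5,10} in 'c' order -> strides {50,10,1}, which is exactly
// what the updated Permute1 expectation later in this patch encodes in shapeExp.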
////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/generic/shape/permute.cpp b/libnd4j/include/ops/declarable/generic/shape/permute.cpp index 7e5efaa85..63c20e888 100644 --- a/libnd4j/include/ops/declarable/generic/shape/permute.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/permute.cpp @@ -15,7 +15,8 @@ ******************************************************************************/ // -// Created by raver119 on 29/10/17. +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -29,80 +30,52 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// // here iArgs is int vector of ordered set of dimensions to be permuted - CUSTOM_OP_IMPL(permute, 1, 1, true, 0, -2) { - auto x = INPUT_VARIABLE(0); +CUSTOM_OP_IMPL(permute, 1, 1, true, 0, -2) { - bool replace = false; + auto x = INPUT_VARIABLE(0); + auto z = OUTPUT_VARIABLE(0); - auto origArgs = block.width() > 1 ? INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); - std::vector arguments({}); - if(origArgs.size() > 0){ - for (int e = 0; e < origArgs.size(); e++) { - int ax = origArgs[e]; - if (ax < 0) - ax += x->rankOf(); - - arguments.emplace_back(ax); - } - - replace = true; - } else { - for (int e = x->rankOf() - 1; e >= 0; e--) - arguments.emplace_back(e); - } - - // 0D edge case - if (x->rankOf() == 0) { - REQUIRE_TRUE(arguments.size() == 1, 0, "Permute: only one axis is allowed for scalar"); - auto output = OUTPUT_VARIABLE(0); - if (!block.isInplace()) - output->assign(x); - - return Status::OK(); - } - - if(block.isInplace()) { // in-place - x->permutei(arguments); - STORE_RESULT(x); - } else { - auto output = OUTPUT_VARIABLE(0); - auto result = x->permute(arguments); - output->assign(result); - STORE_RESULT(output); - } - - return Status::OK(); - } - - DECLARE_TYPES(permute) { - getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) - ->setAllowedInputTypes(1, {ALL_INTS}) - ->setSameMode(true); - } - - DECLARE_SHAPE_FN(permute) { - auto shapeList = SHAPELIST(); - auto arguments = block.width() > 1 ? INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); - - if (shape::rank(inputShape->at(0)) == 0) { - shapeList->push_back(ConstantShapeHelper::getInstance()->scalarShapeInfo(ArrayOptions::dataType(inputShape->at(0)))); - } else if (inputShape->size() == 1 && !arguments.empty()) { - shapeList->push_back(ShapeUtils::evalPermShapeInfo(arguments.data(), arguments.size(), *INPUT_VARIABLE(0), block.workspace())); - } else { - if(arguments.size() == 0){ - //Reverse dimensions - int rank = shape::rank(inputShape->at(0)); - for (int e = rank - 1; e >= 0; e--) - arguments.emplace_back(e); - } - - shapeList->push_back(ShapeUtils::evalPermShapeInfo(arguments.data(), arguments.size(), *INPUT_VARIABLE(0), block.workspace())); - } - - return shapeList; - } + if (x->isEmpty()) { + REQUIRE_TRUE(z->isEmpty(), 0, "PERMUTE OP: when input is empty, output must also be empty"); + return Status::OK(); //No op } + + if (block.width() == 1 && block.getIArguments()->size() == 0) { + z->assign(x->transpose()); + return Status::OK(); + } + + std::vector permutationVector = block.width() > 1 ? 
INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); + + z->assign(x->permute(permutationVector)); + + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(permute) { + getOpDescriptor() + ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(1, {ALL_INTS}) + ->setSameMode(true); +} + +////////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(permute) { + + auto x = INPUT_VARIABLE(0); + + if (block.width() == 1 && block.getIArguments()->size() == 0) + return SHAPELIST(ShapeUtils::evalTranspShapeInfo(*x, block.workspace(), true)); + + std::vector permutationVector = block.width() > 1 ? INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); + + auto outputShapeInfo = ShapeUtils::evalPermShapeInfo(permutationVector.data(), x->rankOf(), *x, block.workspace(), true); + + return SHAPELIST(outputShapeInfo); +} + +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp index b8d582481..4a06455eb 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp @@ -24,254 +24,240 @@ #include namespace nd4j { - namespace ops { - ////////////////////////////////////////////////////////////////////////// - // here iArgs is a vector with (optional) negative of order as first element: - // ({-order, dim1, dim2, dim3, ...}) - CUSTOM_OP_IMPL(reshape, 1, 1, false, 0, -2) { - auto x = INPUT_VARIABLE(0); +namespace ops { - if (block.width() == 1) { - auto arguments = block.getIArguments(); - int argsSize = arguments->size(); - - //Special case: empty.reshape() -> return empty - if (x->isEmpty()) { - REQUIRE_TRUE(OUTPUT_VARIABLE(0)->isEmpty(), 0, "Reshape: when input is empty, output must also be empty"); - return ND4J_STATUS_OK; //No op +////////////////////////////////////////////////////////////////////////// +// here iArgs is a vector with (optional) negative of order as first element: +// ({-order, dim1, dim2, dim3, ...}) +CUSTOM_OP_IMPL(reshape, 1, 1, false, 0, -2) { + + auto x = INPUT_VARIABLE(0); + auto z = OUTPUT_VARIABLE(0); + + //Special case: empty.reshape() -> return empty + if (x->isEmpty()) { + REQUIRE_TRUE(z->isEmpty(), 0, "Reshape: when input is empty, output must also be empty"); + return Status::OK(); //No op + } + + if (block.width() == 1) { + + auto arguments = block.getIArguments(); + int argsSize = arguments->size(); + + + + int e = 1; + char order = (char) -(*arguments)[0]; + if (order != 'c' && order != 'f') { + order = 'c'; //x->ordering(); + e = 0; + } + + REQUIRE_TRUE(argsSize - e >= 1, 0, "Reshape arguments should have at least 1 dimension"); + + std::vector shapeNew; + int e2 = e; + for (; e < (int) arguments->size(); e++) { + if (arguments->at(e) == -1){ + Nd4jLong shapeLength = 1; + for(; e2 < e; e2++){ + shapeLength *= arguments->at(e2); } - - int e = 1; - char order = (char) -(*arguments)[0]; - if (order != 'c' && order != 'f') { - order = 'c'; //x->ordering(); - e = 0; - } - - REQUIRE_TRUE(argsSize - e >= 1, 0, "Reshape arguments should have at least 1 dimension"); - - std::vector shapeNew; - int e2 = e; - for (; e < (int) arguments->size(); e++) { - if (arguments->at(e) == -1){ - Nd4jLong shapeLength = 1; - for(; e2 < e; e2++){ - shapeLength *= arguments->at(e2); - } - for(e2 = e + 1; e2 < arguments->size(); e2++){ - shapeLength *= arguments->at(e2); - } - Nd4jLong realShape = 
x->lengthOf() / shapeLength; - shapeNew.push_back(realShape); - } - else{ - shapeNew.push_back(arguments->at(e)); - } - - } - - auto len = shape::prodLong(shapeNew.data(), shapeNew.size()); - REQUIRE_TRUE(len == x->lengthOf(), 0, "Reshape: lengths before and after reshape should match, but got %i vs %i", x->lengthOf(), len); - - if (Environment::getInstance()->isDebugAndVerbose()) { - nd4j_printv("Reshape: new shape", shapeNew); - } - - if (block.isInplace()) { - if (x->reshapei(order, shapeNew)) { - STORE_RESULT(*x); - return ND4J_STATUS_OK; - } - } else { - auto ret = OUTPUT_VARIABLE(0); - auto xr = x->reshape(order, shapeNew); - ret->assign(xr); - STORE_RESULT(*ret); - - return Status::OK(); - } - } else if (block.width() == 2) { - auto s = INPUT_VARIABLE(1); - - //Special case: empty.reshape(-1) -> return empty - if (x->isEmpty()) { - //REQUIRE_TRUE(s->lengthOf() == 1 && s->e(0) == -1, 0, "Reshape: when input is empty, shape must be [-1]"); - REQUIRE_TRUE(OUTPUT_VARIABLE(0)->isEmpty(), 0, "Reshape: when input is empty, output must also be empty"); - return Status::OK(); //No op - } - - char order = 'c'; - if (block.numI() > 0) - order = (char) -INT_ARG(0); - - std::vector shapeNew(s->lengthOf()); - - for (int e = 0; e < (int) s->lengthOf(); e++) { - auto dim = s->e(e); - if (dim == -1){ - Nd4jLong shapeLength = 1; - for(int e2 = 0; e2 < e; e2++){ - shapeLength *= s->e(e2); - } - for(int e2 = e + 1; e2 < (int) s->lengthOf(); e2++){ - REQUIRE_TRUE(s->e(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); - shapeLength *= s->e(e2); - } - Nd4jLong realShape = x->lengthOf() / shapeLength; - shapeNew[e] = realShape; - } - else{ - shapeNew[e] = dim; - } - } - - if (Environment::getInstance()->isDebugAndVerbose()) { - nd4j_printv("Reshape: new shape", shapeNew); - } - - if (block.isInplace()) { - if (x->reshapei(order, shapeNew)) { - STORE_RESULT(*x); - return Status::OK(); - } - } else { - auto ret = OUTPUT_VARIABLE(0); - if (s->isEmpty()) { - // just a scalar - ret->assign(x); - } else { - auto xr = x->reshape(order, shapeNew); - ret->assign(xr); - } - - return Status::OK(); + for(e2 = e + 1; e2 < arguments->size(); e2++){ + shapeLength *= arguments->at(e2); } + Nd4jLong realShape = x->lengthOf() / shapeLength; + shapeNew.push_back(realShape); + } + else{ + shapeNew.push_back(arguments->at(e)); } - return ND4J_STATUS_BAD_INPUT; } + auto len = shape::prodLong(shapeNew.data(), shapeNew.size()); + REQUIRE_TRUE(len == x->lengthOf(), 0, "Reshape: lengths before and after reshape should match, but got %i vs %i", x->lengthOf(), len); - DECLARE_TYPES(reshape) { - getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) - ->setAllowedInputTypes(1, {ALL_INTS}) - ->setSameMode(true); + if (Environment::getInstance()->isDebugAndVerbose()) { + nd4j_printv("Reshape: new shape", shapeNew); } - DECLARE_SHAPE_FN(reshape) { - auto inp = inputShape->at(0); + auto xr = x->reshape(order, shapeNew); + z->assign(xr); + STORE_RESULT(*z); - // we can launch op using Int arguments - if (inputShape->size() == 1) { - REQUIRE_TRUE(block.numI() > 0, 0, "Reshape: new shape should be provided as NDArray or int arguments, but nothing was defined"); - std::vector *arguments = block.getIArguments(); + return Status::OK(); - int e = 1; - char order = (char) -(*arguments)[0]; - if (order != 'c' && order != 'f') { - order = shape::order(inp); - e = 0; + } else if (block.width() == 2) { + + auto s = INPUT_VARIABLE(1); + + char order = 'c'; + if (block.numI() > 0) + order = (char) -INT_ARG(0); + + 
std::vector shapeNew(s->lengthOf()); + + for (int e = 0; e < (int) s->lengthOf(); e++) { + auto dim = s->e(e); + if (dim == -1){ + Nd4jLong shapeLength = 1; + for(int e2 = 0; e2 < e; e2++){ + shapeLength *= s->e(e2); } - - std::vector shapeNew; - - int e2 = e; - for (; e < (int) arguments->size(); e++) { - if ((int) arguments->at(e) == -1){ - - Nd4jLong shapeLength = 1; - for(; e2 < e; e2 ++){ - shapeLength *= arguments->at(e2); - } - for(e2 = e + 1; e2 < arguments->size(); e2++){ - REQUIRE_TRUE(arguments->at(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); - shapeLength *= arguments->at(e2); - } - - if(shapeLength == 0){ - //Edge case for empty: - shapeNew.push_back(0); - } else { - //Standard case - Nd4jLong realShape = shape::length(inp) / shapeLength; - shapeNew.push_back(realShape); - } - } - else{ - shapeNew.push_back(arguments->at(e)); - } + for(int e2 = e + 1; e2 < (int) s->lengthOf(); e2++){ + REQUIRE_TRUE(s->e(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); + shapeLength *= s->e(e2); } - - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(inp), order, shapeNew))); - } else { - // or, with second input "as shape" - auto x = INPUT_VARIABLE(0); - auto y = INPUT_VARIABLE(1); - - // special case here - if (y->isEmpty()) { - REQUIRE_TRUE(x->lengthOf() == 1, 0, "Reshape: new length doesn't match existing array"); - return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(ArrayOptions::dataType(inp))); - } - //Special case: empty.reshape(-1) -> return empty - if (x->isEmpty()) { - //REQUIRE_TRUE(y->lengthOf() == 1 && y->e(0) == -1, 0, "Reshape: when input is empty, shape must be [-1]"); - auto shapeOf = y->getBufferAsVector(); - Nd4jLong prod = 1; - bool hasNegs = false; - for (auto v:shapeOf) { - if (v < 0) { - hasNegs = true; - v = 0; - } - - prod *= v; - } - - REQUIRE_TRUE(prod == 0, 0, "Reshape: in case of empty arrays reshape must return empty array as well"); - - // if there are -1s - we turn them into zeros - if (hasNegs) { - for (int e = 0; e < shapeOf.size(); e++) - if (shapeOf[e] < 0) - shapeOf[e] = 0; - } - - auto newShape = ShapeBuilders::createShapeInfo(ArrayOptions::dataType(inp), shape::order(inp), y->lengthOf(), shapeOf.data()); - return SHAPELIST(CONSTANT(newShape)); - } - - std::vector shapeNew(y->lengthOf()); - - for (int e = 0; e < (int) y->lengthOf(); e++) { - auto dim = y->e(e); - if (dim == -1){ - Nd4jLong shapeLength = 1; - for(int e2 = 0; e2 < e; e2++){ - shapeLength *= y->e(e2); - } - for(int e2 = e + 1; e2 < (int)y->lengthOf(); e2++){ - REQUIRE_TRUE(y->e(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); - shapeLength *= y->e(e2); - } - - if(shapeLength == 0){ - //Edge case for empty: - shapeNew[e] = 0; - } else { - Nd4jLong realShape = shape::length(inp) / shapeLength; - shapeNew[e] = realShape; - } - }else { - shapeNew[e] = dim; - } - } - - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inp), 'c', shapeNew)); + Nd4jLong realShape = x->lengthOf() / shapeLength; + shapeNew[e] = realShape; + } + else{ + shapeNew[e] = dim; } } + + if (Environment::getInstance()->isDebugAndVerbose()) { + nd4j_printv("Reshape: new shape", shapeNew); + } + + if (s->isEmpty()) { + // just a scalar + z->assign(x); + } else { + auto xr = x->reshape(order, shapeNew); + z->assign(xr); + } + + return Status::OK(); + } + + return ND4J_STATUS_BAD_INPUT; +} + + +DECLARE_TYPES(reshape) { + getOpDescriptor() + 
->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(1, {ALL_INTS}) + ->setSameMode(true); +} + +DECLARE_SHAPE_FN(reshape) { + auto inp = inputShape->at(0); + + // we can launch op using Int arguments + if (inputShape->size() == 1) { + REQUIRE_TRUE(block.numI() > 0, 0, "Reshape: new shape should be provided as NDArray or int arguments, but nothing was defined"); + std::vector *arguments = block.getIArguments(); + + int e = 1; + char order = (char) -(*arguments)[0]; + if (order != 'c' && order != 'f') { + order = shape::order(inp); + e = 0; + } + + std::vector shapeNew; + + int e2 = e; + for (; e < (int) arguments->size(); e++) { + if ((int) arguments->at(e) == -1){ + + Nd4jLong shapeLength = 1; + for(; e2 < e; e2 ++){ + shapeLength *= arguments->at(e2); + } + for(e2 = e + 1; e2 < arguments->size(); e2++){ + REQUIRE_TRUE(arguments->at(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); + shapeLength *= arguments->at(e2); + } + + if(shapeLength == 0){ + //Edge case for empty: + shapeNew.push_back(0); + } else { + //Standard case + Nd4jLong realShape = shape::length(inp) / shapeLength; + shapeNew.push_back(realShape); + } + } + else{ + shapeNew.push_back(arguments->at(e)); + } + } + + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(inp), order, shapeNew))); + } else { + // or, with second input "as shape" + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + + // special case here + if (y->isEmpty()) { + REQUIRE_TRUE(x->lengthOf() == 1, 0, "Reshape: new length doesn't match existing array"); + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(ArrayOptions::dataType(inp))); + } + //Special case: empty.reshape(-1) -> return empty + if (x->isEmpty()) { + //REQUIRE_TRUE(y->lengthOf() == 1 && y->e(0) == -1, 0, "Reshape: when input is empty, shape must be [-1]"); + auto shapeOf = y->getBufferAsVector(); + Nd4jLong prod = 1; + bool hasNegs = false; + for (auto v:shapeOf) { + if (v < 0) { + hasNegs = true; + v = 0; + } + + prod *= v; + } + + REQUIRE_TRUE(prod == 0, 0, "Reshape: in case of empty arrays reshape must return empty array as well"); + + // if there are -1s - we turn them into zeros + if (hasNegs) { + for (int e = 0; e < shapeOf.size(); e++) + if (shapeOf[e] < 0) + shapeOf[e] = 0; + } + + auto newShape = ShapeBuilders::createShapeInfo(ArrayOptions::dataType(inp), shape::order(inp), y->lengthOf(), shapeOf.data()); + return SHAPELIST(CONSTANT(newShape)); + } + + std::vector shapeNew(y->lengthOf()); + + for (int e = 0; e < (int) y->lengthOf(); e++) { + auto dim = y->e(e); + if (dim == -1){ + Nd4jLong shapeLength = 1; + for(int e2 = 0; e2 < e; e2++){ + shapeLength *= y->e(e2); + } + for(int e2 = e + 1; e2 < (int)y->lengthOf(); e2++){ + REQUIRE_TRUE(y->e(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); + shapeLength *= y->e(e2); + } + + if(shapeLength == 0){ + //Edge case for empty: + shapeNew[e] = 0; + } else { + Nd4jLong realShape = shape::length(inp) / shapeLength; + shapeNew[e] = realShape; + } + }else { + shapeNew[e] = dim; + } + } + + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inp), 'c', shapeNew)); + } +} +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp index 75aafc06f..3035f104b 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp +++ 
b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp @@ -29,34 +29,26 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// CUSTOM_OP_IMPL(reshapeas, 2, 1, false, 0, 0) { - + auto x = INPUT_VARIABLE(0); auto y = INPUT_VARIABLE(1); auto z = OUTPUT_VARIABLE(0); - std::vector shapeNew(y->shapeOf(), y->shapeOf() + y->rankOf()); - char order = y->ordering(); - if (x->reshapei(order, shapeNew)) { - *z = *x; - STORE_RESULT(*z); + if (x->reshapei(y->ordering(), y->getShapeAsVector())) { + + z->assign(x); return Status::OK(); } return ND4J_STATUS_BAD_INPUT; } DECLARE_SYN(reshape_as, reshapeas); - - DECLARE_SHAPE_FN(reshapeas) { - - auto inputShapeInfo = inputShape->at(1); - int shapeInfoLength = inputShapeInfo[0]*2 + 4; - Nd4jLong* outputShapeInfo(nullptr); - COPY_SHAPE(inputShapeInfo, outputShapeInfo); - - return SHAPELIST(CONSTANT(outputShapeInfo)); -} + DECLARE_SHAPE_FN(reshapeas) { + + return SHAPELIST(ShapeBuilders::copyShapeInfo(INPUT_VARIABLE(1)->getShapeInfo(), false, block.workspace())); + } DECLARE_TYPES(reshapeas) { getOpDescriptor() diff --git a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp index 15ed67744..4ec586370 100644 --- a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp @@ -15,7 +15,8 @@ ******************************************************************************/ // -// Created by raver119 on 29/10/17. +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -25,113 +26,52 @@ #include namespace nd4j { -namespace ops { +namespace ops { - ////////////////////////////////////////////////////////////////////////// - CUSTOM_OP_IMPL(transpose, 1, 1, false, 0, 0) { - auto x = INPUT_VARIABLE(0); - if (block.width() == 1) { - if (block.isInplace()) { - x->transposei(); - STORE_RESULT(*x); - } else { - auto output = OUTPUT_VARIABLE(0); - auto t = x->transpose(); - output->assign(t); - STORE_RESULT(*output); - } - } else { - // this is tf-mode transpose, that's nd4j permute - bool replace = false; - std::vector arguments(*block.getIArguments()); +////////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(transpose, 1, 1, false, 0, 0) { - auto w = block.width(); - auto a = arguments.size(); + auto x = INPUT_VARIABLE(0); + auto z = OUTPUT_VARIABLE(0); - if (w == 2 && a == 0) { - auto axis = INPUT_VARIABLE(1); - for (int e = 0; e < axis->lengthOf(); e++) { - auto ax = axis->e(e); - if (ax < 0) - ax += x->rankOf(); + //Special case: empty.reshape() -> return empty + if (x->isEmpty()) { + REQUIRE_TRUE(z->isEmpty(), 0, "TRANSPOSE OP: when input is empty, output must also be empty"); + return Status::OK(); //No op + } - arguments.emplace_back(ax); - } - - replace = true; - } else if (a == 0) { - for (int e = x->rankOf() - 1; e >= 0; e--) - arguments.emplace_back(e); - } - - // 0D edge case - if (x->rankOf() == 0) { - REQUIRE_TRUE(arguments.size() == 1, 0, "Permute: only one axis is allowed for scalar"); - auto output = OUTPUT_VARIABLE(0); - if (!block.isInplace()) - output->assign(x); - - return Status::OK(); - } - - if(block.isInplace()) { // in-place - x->permutei(arguments); - STORE_RESULT(x); - } else { - auto input = x->permute(arguments); - - auto output = OUTPUT_VARIABLE(0); - output->assign(input); - } - } + if (block.width() == 1 && block.getIArguments()->size() == 0) { + z->assign(x->transpose()); return Status::OK(); } - 
DECLARE_TYPES(transpose) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setSameMode(true); - } + std::vector permutationVector = block.width() > 1 ? INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); - DECLARE_SHAPE_FN(transpose) { - if (block.width() == 1) { - auto outputShapeInfo = ShapeUtils::evalTranspShapeInfo(*INPUT_VARIABLE(0), block.workspace()); - return SHAPELIST(outputShapeInfo); - } else { - // this is basically permute mode - auto shapeList = SHAPELIST(); - auto arguments = block.getIArguments(); - if (shape::rank(inputShape->at(0)) == 0) { - Nd4jLong *newshape; - ALLOCATE(newshape, block.getWorkspace(), shape::shapeInfoLength(inputShape->at(0)), Nd4jLong); - newshape[0] = 0; - newshape[1] = 0; - newshape[2] = 1; - newshape[3] = 99; - ArrayOptions::copyDataType(newshape, inputShape->at(0)); - shapeList->push_back(newshape); - } else if (arguments->size() > 0 || inputShape->size() > 1) { - auto axis = arguments->size() > 0 ? *arguments : (INPUT_VARIABLE(1))->template asVectorT(); - auto outputShapeInfo = ShapeUtils::evalPermShapeInfo(axis.data(), axis.size(), *INPUT_VARIABLE(0), block.workspace()); - shapeList->push_back(outputShapeInfo); - } else if (inputShape->size() == 2) { - // dead end - auto axis = INPUT_VARIABLE(1); - auto axisV = axis->template asVectorT(); - auto newshape = ShapeUtils::evalPermShapeInfo(axisV.data(), axisV.size(), *INPUT_VARIABLE(0), block.workspace()); - shapeList->push_back(newshape); - } else { - int rank = shape::rank(inputShape->at(0)); - for (int e = rank - 1; e >= 0; e--) - arguments->emplace_back(e); + z->assign(x->permute(permutationVector)); - auto outputShapeInfo = ShapeUtils::evalPermShapeInfo(arguments->data(), arguments->size(), *INPUT_VARIABLE(0), block.workspace()); - shapeList->push_back(outputShapeInfo); - } + return Status::OK(); +} + +DECLARE_TYPES(transpose) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setSameMode(true); +} + +DECLARE_SHAPE_FN(transpose) { + + auto x = INPUT_VARIABLE(0); + + if (block.width() == 1 && block.getIArguments()->size() == 0) + return SHAPELIST(ShapeUtils::evalTranspShapeInfo(*x, block.workspace(), true)); + + std::vector permutationVector = block.width() > 1 ? 
INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); + + auto outputShapeInfo = ShapeUtils::evalPermShapeInfo(permutationVector.data(), x->rankOf(), *x, block.workspace(), true); + + return SHAPELIST(outputShapeInfo); +} - return shapeList; - } - } } } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index a7258b01c..dee410a21 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -1882,36 +1882,6 @@ TEST_F(DeclarableOpsTests1, TestGemv1) { #endif -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests1, Reshape1) { - const std::vector xShape = {5,4,3}; - const std::vector yShape = {3,5,4}; - - auto x = NDArrayFactory::create_('f', xShape); - auto y = NDArrayFactory::create_('f', yShape); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - - auto block = new Context(1, variableSpace, true); - block->fillInputs({-1}); - std::vector* arguments = block->getIArguments(); - arguments->push_back(-y->ordering()); - arguments->push_back(3); - arguments->push_back(5); - arguments->push_back(4); - - nd4j::ops::reshape reshape; - - reshape.execute(block); - - ASSERT_TRUE(x->isSameShape(y)); - - delete y; - delete block; - delete variableSpace; -} - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, Reshape2) { const std::vector xShape = {5,4,3}; @@ -2022,37 +1992,8 @@ TEST_F(DeclarableOpsTests1, Reshape7){ ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, Transpose1) { - auto x = NDArrayFactory::create_('c', {3,5,2}); - auto exp = NDArrayFactory::create_('f', {2,5,3}); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - - auto block = new Context(1, variableSpace, true); // in-place - block->fillInputs({-1}); - nd4j::ops::transpose transpose; - - Nd4jStatus status = transpose.execute(block); - ASSERT_EQ(ND4J_STATUS_OK, status); - // ASSERT_TRUE(x.isSameShapeStrict(exp)); - - for (int e = 0; e < x->rankOf() * 2 + 2; e++) { - ASSERT_EQ(x->getShapeInfo()[e], exp->getShapeInfo()[e]); - } -// ASSERT_EQ(x.getShapeInfo()[x.rankOf() * 2 + 2],-exp.getShapeInfo()[x.rankOf() * 2 + 2]); - ASSERT_EQ(x->getShapeInfo()[x->rankOf() * 2 + 3], exp->getShapeInfo()[x->rankOf() * 2 + 3]); - - delete exp; - delete block; - delete variableSpace; - -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests1, Transpose2) { - auto x = NDArrayFactory::create_('c', {3,5,2}); - auto exp = NDArrayFactory::create_('f', {2,5,3}); + auto exp = NDArrayFactory::create_('c', {2,5,3}); auto variableSpace = new VariableSpace(); variableSpace->putVariable(-1, x); @@ -2066,12 +2007,10 @@ TEST_F(DeclarableOpsTests1, Transpose2) { ASSERT_EQ(ND4J_STATUS_OK, status); auto result = variableSpace->getVariable(block->getNodeId())->getNDArray(); - // ASSERT_TRUE(result->isSameShapeStrict(exp)); - for (int e = 0; e < result->rankOf() * 2 + 2; e++) { - ASSERT_EQ(result->getShapeInfo()[e], exp->getShapeInfo()[e]); - } - //ASSERT_EQ(result->getShapeInfo()[x.rankOf() * 2 + 2],-exp.getShapeInfo()[x.rankOf() * 2 + 2]); - ASSERT_EQ(result->getShapeInfo()[x->rankOf() * 2 + 3], exp->getShapeInfo()[x->rankOf() * 2 + 3]); + + ASSERT_TRUE(exp->isSameShape(result)); + ASSERT_TRUE(exp->dataType() == result->dataType()); + ASSERT_TRUE(exp->ordering() == 
result->ordering()); delete exp; delete block; @@ -2079,44 +2018,12 @@ TEST_F(DeclarableOpsTests1, Transpose2) { } -////////////////////////////////////////////////////////////////////// -// in-place -TEST_F(DeclarableOpsTests1, Permute1) { - - Nd4jLong shapeX[] = {3, 5, 10, 15, 150, 15, 1, 0, 1, 99}; - Nd4jLong shapeExp[] = {3, 15, 5, 10, 1, 150, 15, 0, 0, 99}; - const std::vector perm = {2, 0, 1}; - ArrayOptions::setDataType(shapeX, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shapeExp, nd4j::DataType::FLOAT32); - - auto x = new NDArray(shapeX,true); - auto exp = new NDArray(shapeExp,true); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - - auto block = new Context(1, variableSpace, true); // in-place - block->fillInputs({-1}); - std::vector* arguments = block->getIArguments(); - *arguments = perm; // set dimensions to be permuted - - nd4j::ops::permute permute; - Nd4jStatus status = permute.execute(block); - ASSERT_EQ(ND4J_STATUS_OK, status); - - ASSERT_TRUE(x->isSameShapeStrict(*exp)); - - delete exp; - delete block; - delete variableSpace; -} - ////////////////////////////////////////////////////////////////////// // not-in-place -TEST_F(DeclarableOpsTests1, Permute2) { +TEST_F(DeclarableOpsTests1, Permute1) { - Nd4jLong shapeX[] = {3, 5, 10, 15, 150, 15, 1, 0, 1, 99}; - Nd4jLong shapeExp[] = {3, 15, 5, 10, 1, 150, 15, 0, 0, 99}; + Nd4jLong shapeX[] = {3, 5,10,15, 150,15,1, 0,1,99}; + Nd4jLong shapeExp[] = {3, 15,5,10, 50,10,1, 0,1,99}; const std::vector perm = {2, 0, 1}; ArrayOptions::setDataType(shapeX, nd4j::DataType::FLOAT32); diff --git a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp index 3717c488b..a234e6d50 100644 --- a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp @@ -161,23 +161,6 @@ TEST_F(EmptyTests, Test_Reshape_1) { delete result; } -TEST_F(EmptyTests, Test_Reshape_2) { - auto vector = NDArrayFactory::create('c', {1}, {119.0f}); - auto exp = NDArrayFactory::create(119.0f); - auto empty = NDArrayFactory::empty_(); - - nd4j::ops::reshape op; - auto result = op.evaluate({&vector, empty}, {}, {}, {}, {}, true); - - ASSERT_EQ(Status::OK(), result->status()); - - ASSERT_EQ(exp, *result->at(0)); - ASSERT_EQ(exp, vector); - - delete empty; - delete result; -} - TEST_F(EmptyTests, Test_Reshape_3) { auto x = NDArrayFactory::create('c', {1, 0, 0, 2}); auto y = NDArrayFactory::create('c', {2}, {10, 0}); diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 83d3ee3b8..3ceec003e 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -65,7 +65,7 @@ TEST_F(PlaygroundTests, test_avx) { nd4j_printf("Optimal level: %i; Binary level: %i;\n", ::optimalLevel(), ::binaryLevel()); } - +/* TEST_F(PlaygroundTests, test_bert_1) { // this test will run ONLY if this model exists if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb") < 0) @@ -86,15 +86,15 @@ TEST_F(PlaygroundTests, test_bert_1) { graph->getVariableSpace()->putVariable(86,0, u); graph->getVariableSpace()->putVariable(87,0, v); -/* - // validating graph now - auto status = GraphExecutioner::execute(graph); - ASSERT_EQ(Status::OK(), status); - ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); - auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); - ASSERT_EQ(z, *array); -*/ + // validating 
graph now + // auto status = GraphExecutioner::execute(graph); + // ASSERT_EQ(Status::OK(), status); + // ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); + + // auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); + // ASSERT_EQ(z, *array); + nd4j::Environment::getInstance()->setProfiling(true); auto profile = GraphProfilingHelper::profile(graph, 1); @@ -104,28 +104,27 @@ TEST_F(PlaygroundTests, test_bert_1) { nd4j::Environment::getInstance()->setProfiling(false); delete profile; -/* - std::vector values; - for (int e = 0; e < 1; e++) { - auto timeStart = std::chrono::system_clock::now(); + // std::vector values; - GraphExecutioner::execute(graph); + // for (int e = 0; e < 1; e++) { + // auto timeStart = std::chrono::system_clock::now(); - auto timeEnd = std::chrono::system_clock::now(); - auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); - values.emplace_back(outerTime); - } + // GraphExecutioner::execute(graph); - std::sort(values.begin(), values.end()); + // auto timeEnd = std::chrono::system_clock::now(); + // auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + // values.emplace_back(outerTime); + // } - nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); -*/ + // std::sort(values.begin(), values.end()); + + // nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); delete graph; } -/* + TEST_F(PlaygroundTests, test_broadcast_1) { int pool = 10; std::vector aX(pool); From f9d51b72781eac90a3ea4a4edb5a54c7c2843d27 Mon Sep 17 00:00:00 2001 From: raver119 Date: Mon, 17 Feb 2020 10:23:05 +0300 Subject: [PATCH 09/19] More compilation units (#246) * weird edge case Signed-off-by: raver119 * weird edge case Signed-off-by: raver119 * get rid of it Signed-off-by: raver119 * crop and resize reorganized Signed-off-by: raver119 * restore test Signed-off-by: raver119 * remove unwanted unit refs in cmale Signed-off-by: raver119 --- .../generic/parity_ops/crop_and_resize.cpp | 3 +- .../ops/declarable/helpers/cpu/addBias.cpp | 22 ++++ .../compilation_units/crop_and_resize_0.cpp | 30 +++++ .../compilation_units/crop_and_resize_1.cpp | 30 +++++ .../compilation_units/crop_and_resize_2.cpp | 30 +++++ .../compilation_units/crop_and_resize_3.cpp | 30 +++++ .../compilation_units/crop_and_resize_4.cpp | 30 +++++ .../compilation_units/crop_and_resize_5.cpp | 30 +++++ .../compilation_units/crop_and_resize_6.cpp | 30 +++++ .../compilation_units/crop_and_resize_7.cpp | 30 +++++ .../compilation_units/crop_and_resize_8.cpp | 30 +++++ .../compilation_units/crop_and_resize_9.cpp | 30 +++++ .../helpers/cpu/crop_and_resize.cpp | 63 +++++++++ .../helpers/cpu/crop_and_resize.hpp | 123 ++++++++++++++++++ .../declarable/helpers/cpu/image_resize.cpp | 116 +---------------- .../ops/declarable/helpers/crop_and_resize.h | 40 ++++++ .../declarable/helpers/cuda/image_resize.cu | 2 +- .../ops/declarable/helpers/image_resize.h | 3 - libnd4j/include/types/types.h | 32 +++++ .../layers_tests/PlaygroundTests.cpp | 82 ++++++++---- .../tests_cpu/libnd4j_tests/CMakeLists.txt | 2 +- 21 files changed, 641 insertions(+), 147 deletions(-) create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp create mode 
100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp create mode 100644 libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp create mode 100644 libnd4j/include/ops/declarable/helpers/crop_and_resize.h diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp index 7afb24bd7..cdce8a95a 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp @@ -23,7 +23,8 @@ //#include #include -#include +#include + namespace nd4j { namespace ops { CUSTOM_OP_IMPL(crop_and_resize, 4, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index e5242a5be..39e51f6d7 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -374,6 +374,28 @@ namespace nd4j { template static void addBias_(const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { + /* + if (input.rankOf() == 2 && bias.rankOf() == 1 && input.sizeAt(1) == bias.sizeAt(0) && input.ordering() == 'c') { + int rows = input.sizeAt(0); + int biasLen = bias.lengthOf(); + + auto inB = input.bufferAsT(); + auto bB = bias.bufferAsT(); + auto outB = output.bufferAsT(); + + for (int e = 0; e < rows; e++) { + auto row = inB + (e * biasLen); + auto out = outB + (e * biasLen); + + for (int t = 0; t < biasLen; t++) { + out[t] = row[t] + bB[t]; + } + } + + return; + } + */ + Nd4jLong* x_shapeInfo = input.getShapeInfo(); Nd4jLong* z_shapeInfo = output.getShapeInfo(); X* x = input.bufferAsT(); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp new file mode 100644 index 000000000..3177cca34 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_0, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp new file mode 100644 index 000000000..cd9c00dc5 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_1, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp new file mode 100644 index 000000000..3b126d288 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_2, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp new file mode 100644 index 000000000..cca97a1ac --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_3, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp new file mode 100644 index 000000000..568492c08 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_4, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp new file mode 100644 index 000000000..1491c9e1d --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_5, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp new file mode 100644 index 000000000..8517a39e9 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_6, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp new file mode 100644 index 000000000..e12190170 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_7, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp new file mode 100644 index 000000000..c4ddd7066 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_8, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp new file mode 100644 index 000000000..38cf05787 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_9, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp new file mode 100644 index 000000000..233699163 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp @@ -0,0 +1,63 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019-2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
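The per-type crop_and_resize_*.cpp units above are identical apart from the NUMERIC_TYPES_N slice they hand to BUILD_TRIPLE_TEMPLATE (the slices themselves are defined further down, in types.h). Splitting the numeric type list this way spreads the NUMERIC x FLOAT x INTEGER instantiation cross-product over several translation units instead of one, keeping per-file compile time and memory in check. A reduced sketch of the same idea, using a hypothetical copyCast template and made-up file names rather than the real libnd4j macros:

// copy_cast.hpp -- shared template definition (hypothetical name)
template <typename X, typename Z>
void copyCast(const X* in, Z* out, int n) {
    for (int i = 0; i < n; i++)
        out[i] = static_cast<Z>(in[i]);       // trivial body standing in for cropAndResizeFunctor_
}

// copy_cast_0.cpp -- would #include "copy_cast.hpp" and instantiate the first slice of input types
template void copyCast<float,  float>(const float*,  float*,  int);
template void copyCast<double, float>(const double*, float*,  int);

// copy_cast_1.cpp -- would #include "copy_cast.hpp" and instantiate the remaining slice
template void copyCast<int,      float>(const int*,      float*,  int);
template void copyCast<unsigned, float>(const unsigned*, float*,  int);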
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+//
+// @author sgazeos@gmail.com
+//
+
+#include
+#include
+
+namespace nd4j {
+ namespace ops {
+ namespace helpers {
+
+// ------------------------------------------------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------------------------------------------------ //
+// crop and resize helper functor:
+// \@param context - launch context for operation
+// \@param images - batch of images (4D tensor) with shape {batch, height, width, channels} with given type
+// \@param boxes - float crop boxes, one {y1, x1, y2, x2} row per box in normalized coordinates
+// \@param indices - integer batch indices, one per crop box
+// \@param cropSize - integer size (newWidth, newHeight)
+// \@param method - one of bilinear (0) or nearest neighbour (1) interpolation algorithm
+// \@param extrapolationVal - value written to output pixels whose sampling point falls outside the input image
+// \@param crops - output image batch (4D with given type)
+//
+ void
+ cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes,
+ NDArray const *indices, NDArray const *cropSize,
+ int method, double extrapolationVal, NDArray *crops) {
+ BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_, (images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES);
+ }
+ }
+ }
+} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp new file mode 100644 index 000000000..ca30d73bd --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp @@ -0,0 +1,123 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
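Since the BUILD_TRIPLE_SELECTOR dispatch hides the template plumbing, a minimal usage sketch may help. It assumes the standard libnd4j NDArrayFactory API and the header names shown (both assumptions, not part of this patch); shapes and values are illustrative. Box rows are {y1, x1, y2, x2} in normalized coordinates, method 0 selects bilinear and 1 nearest-neighbour interpolation, and extrapolationVal fills output pixels that sample outside the image:

#include <NDArrayFactory.h>                              // assumed header names
#include <ops/declarable/helpers/crop_and_resize.h>

void cropAndResizeExample(nd4j::LaunchContext* context) {
    using namespace nd4j;
    auto images   = NDArrayFactory::create<float>('c', {2, 16, 16, 3});               // {batch, height, width, channels}
    auto boxes    = NDArrayFactory::create<float>('c', {1, 4}, {0.f, 0.f, 1.f, 1.f}); // one full-image box
    auto indices  = NDArrayFactory::create<int>('c', {1}, {0});                       // box 0 crops image 0
    auto cropSize = NDArrayFactory::create<int>('c', {2}, {8, 8});                    // requested crop size
    auto crops    = NDArrayFactory::create<float>('c', {1, 8, 8, 3});                 // {numBoxes, cropH, cropW, channels}

    ops::helpers::cropAndResizeFunctor(context, &images, &boxes, &indices, &cropSize,
                                       0 /*bilinear*/, 0.0 /*extrapolation value*/, &crops);
}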
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include + +namespace nd4j { + namespace ops { + namespace helpers { + template + void cropAndResizeFunctor_(NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { + const int batchSize = images->sizeAt(0); + const int imageHeight = images->sizeAt(1); + const int imageWidth = images->sizeAt(2); + + const int numBoxes = crops->sizeAt(0); + const int cropHeight = crops->sizeAt(1); + const int cropWidth = crops->sizeAt(2); + const int depth = crops->sizeAt(3); + + for (auto b = 0; b < numBoxes; ++b) { + T y1 = boxes->t(b, 0); + T x1 = boxes->t(b, 1); + T y2 = boxes->t(b, 2); + T x2 = boxes->t(b, 3); + + int bIn = indices->e(b); + if (bIn >= batchSize) { + continue; + } + + T heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / (cropHeight - 1) : T(0); + T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); + + auto func = PRAGMA_THREADS_FOR { + for (auto y = start; y < stop; y += increment) { + const float inY = (cropHeight > 1) + ? y1 * (imageHeight - 1) + y * heightScale + : 0.5 * (y1 + y2) * (imageHeight - 1); + + if (inY < 0 || inY > imageHeight - 1) { + for (auto x = 0; x < cropWidth; ++x) { + for (auto d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + } + continue; + } + if (method == 0 /* bilinear */) { + const int topYIndex = nd4j::math::p_floor(inY); + const int bottomYIndex = nd4j::math::p_ceil(inY); + const float y_lerp = inY - topYIndex; + + for (auto x = 0; x < cropWidth; ++x) { + const float in_x = (cropWidth > 1) + ? x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (in_x < 0 || in_x > imageWidth - 1) { + for (auto d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + int left_x_index = math::p_floor(in_x); + int right_x_index = math::p_ceil(in_x); + T x_lerp = in_x - left_x_index; + + for (auto d = 0; d < depth; ++d) { + const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); + const float topRight(images->e(bIn, topYIndex, right_x_index, d)); + const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); + const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); + const float top = topLeft + (topRight - topLeft) * x_lerp; + const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; + crops->p(b, y, x, d, top + (bottom - top) * y_lerp); + } + } + } else { // method is "nearest neighbor" + for (auto x = 0; x < cropWidth; ++x) { + const float inX = (cropWidth > 1) + ? 
x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (inX < 0 || inX > imageWidth - 1) { + for (auto d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + const int closestXIndex = roundf(inX); + const int closestYIndex = roundf(inY); + for (auto d = 0; d < depth; ++d) { + crops->p(b, y, x, d, images->e(bIn, closestYIndex, closestXIndex, d)); + } + } + } + } + }; + + samediff::Threads::parallel_for(func, 0, cropHeight); + } + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index d4089359f..9d30ddcf7 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -366,102 +366,6 @@ namespace helpers { BUILD_SINGLE_SELECTOR(images->dataType(), return resizeNeighborFunctor_, (images, width, height, alignCorners, halfPixelCenter, output), LIBND4J_TYPES); } - - template - static void cropAndResizeFunctor_(NDArray const *images, NDArray const *boxes, NDArray const *indices, - NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { - const int batchSize = images->sizeAt(0); - const int imageHeight = images->sizeAt(1); - const int imageWidth = images->sizeAt(2); - - const int numBoxes = crops->sizeAt(0); - const int cropHeight = crops->sizeAt(1); - const int cropWidth = crops->sizeAt(2); - const int depth = crops->sizeAt(3); - - for (auto b = 0; b < numBoxes; ++b) { - T y1 = boxes->t(b, 0); - T x1 = boxes->t(b, 1); - T y2 = boxes->t(b, 2); - T x2 = boxes->t(b, 3); - - int bIn = indices->e(b); - if (bIn >= batchSize) { - continue; - } - - T heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / (cropHeight - 1) : T(0); - T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); - - auto func = PRAGMA_THREADS_FOR { - for (auto y = start; y < stop; y += increment) { - const float inY = (cropHeight > 1) - ? y1 * (imageHeight - 1) + y * heightScale - : 0.5 * (y1 + y2) * (imageHeight - 1); - - if (inY < 0 || inY > imageHeight - 1) { - for (auto x = 0; x < cropWidth; ++x) { - for (auto d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - } - continue; - } - if (method == 0 /* bilinear */) { - const int topYIndex = nd4j::math::p_floor(inY); - const int bottomYIndex = nd4j::math::p_ceil(inY); - const float y_lerp = inY - topYIndex; - - for (auto x = 0; x < cropWidth; ++x) { - const float in_x = (cropWidth > 1) - ? 
x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - - if (in_x < 0 || in_x > imageWidth - 1) { - for (auto d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - continue; - } - int left_x_index = math::p_floor(in_x); - int right_x_index = math::p_ceil(in_x); - T x_lerp = in_x - left_x_index; - - for (auto d = 0; d < depth; ++d) { - const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); - const float topRight(images->e(bIn, topYIndex, right_x_index, d)); - const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); - const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); - const float top = topLeft + (topRight - topLeft) * x_lerp; - const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; - crops->p(b, y, x, d, top + (bottom - top) * y_lerp); - } - } - } else { // method is "nearest neighbor" - for (auto x = 0; x < cropWidth; ++x) { - const float inX = (cropWidth > 1) - ? x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - - if (inX < 0 || inX > imageWidth - 1) { - for (auto d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - continue; - } - const int closestXIndex = roundf(inX); - const int closestYIndex = roundf(inY); - for (auto d = 0; d < depth; ++d) { - crops->p(b, y, x, d, images->e(bIn, closestYIndex, closestXIndex, d)); - } - } - } - } - }; - - samediff::Threads::parallel_for(func, 0, cropHeight); - } - } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // ------------------------------------------------------------------------------------------------------------------ // // Bicubic interpolation @@ -1105,25 +1009,7 @@ namespace helpers { return ND4J_STATUS_OK; } -// ------------------------------------------------------------------------------------------------------------------ // -// ------------------------------------------------------------------------------------------------------------------ // -// crop and resize helper functor: -// \@param context - launch context for operation -// \@param images - batch of images (4D tensor) with shape {batch, width, height, channels} with given type -// \@param boxes - float boxes for crop -// \@param indices - integer boxes indices for crop -// \@param cropSize - integer size (newWidth, newHeight) -// \@param method - one of bilinear (0) or nearest neighbour (1) interpolation algorithm -// \@param extrapolationVal - radix to increase/decrease image -// \@param crops - output image batch (4D with given type) -// - void - cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes, - NDArray const *indices, NDArray const *cropSize, - int method, double extrapolationVal, NDArray *crops) { - BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_, - (images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES); - } + } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/crop_and_resize.h b/libnd4j/include/ops/declarable/helpers/crop_and_resize.h new file mode 100644 index 000000000..3926dbfb0 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/crop_and_resize.h @@ -0,0 +1,40 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. 
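For reference, the bilinear path removed from image_resize.cpp above (and reinstated in crop_and_resize.hpp) reduces to two horizontal lerps followed by one vertical lerp. A standalone restatement, purely illustrative, with a small worked value in the comment:

// Same arithmetic as the method == 0 branch of cropAndResizeFunctor_
float bilinear(float topLeft, float topRight, float bottomLeft, float bottomRight,
               float xLerp, float yLerp) {
    const float top    = topLeft    + (topRight    - topLeft)    * xLerp;
    const float bottom = bottomLeft + (bottomRight - bottomLeft) * xLerp;
    return top + (bottom - top) * yLerp;   // e.g. (1,3,5,7) with xLerp=0.5, yLerp=0.25 -> 3
}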
+ * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + + +// +// @author sgazeos@gmail.com +// + +#ifndef SD_CROP_AND_RESIZE_H +#define SD_CROP_AND_RESIZE_H + +#include +#include + + +namespace nd4j { + namespace ops { + namespace helpers { + template + void cropAndResizeFunctor_(NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops); + + void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const* images, NDArray const* boxes, NDArray const* indices, NDArray const* cropSize, int method, double extrapolationVal, NDArray* crops); + } + } +} + +#endif //SD_CROP_AND_RESIZE_H diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu index c028daff3..6096f3a85 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu @@ -1326,7 +1326,7 @@ namespace helpers { // crops - output (4D tensor - [batch, outWidth, outHeight, pixels]) // template - static void cropAndResizeFunctor_(nd4j::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices, + void cropAndResizeFunctor_(nd4j::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { const int batchSize = images->sizeAt(0); const int imageHeight = images->sizeAt(1); diff --git a/libnd4j/include/ops/declarable/helpers/image_resize.h b/libnd4j/include/ops/declarable/helpers/image_resize.h index 047b2cf70..decac3db9 100644 --- a/libnd4j/include/ops/declarable/helpers/image_resize.h +++ b/libnd4j/include/ops/declarable/helpers/image_resize.h @@ -50,9 +50,6 @@ namespace helpers { int resizeFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, ImageResizeMethods method, bool preserveAspectRatio, bool antialias, NDArray* output); - - void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const* images, NDArray const* boxes, - NDArray const* indices, NDArray const* cropSize, int method, double extrapolationVal, NDArray* crops); } } } diff --git a/libnd4j/include/types/types.h b/libnd4j/include/types/types.h index 92fada8d3..7322c6bd5 100644 --- a/libnd4j/include/types/types.h +++ b/libnd4j/include/types/types.h @@ -159,6 +159,38 @@ (nd4j::DataType::INT64, Nd4jLong), \ (nd4j::DataType::BFLOAT16, bfloat16) +#define NUMERIC_TYPES_0 \ + (nd4j::DataType::HALF, float16) + +#define NUMERIC_TYPES_1 \ + (nd4j::DataType::FLOAT32, float) + +#define NUMERIC_TYPES_2 \ + (nd4j::DataType::DOUBLE, double) + +#define NUMERIC_TYPES_3 \ + (nd4j::DataType::INT8, int8_t), \ + (nd4j::DataType::BFLOAT16, bfloat16) + +#define NUMERIC_TYPES_4 \ + (nd4j::DataType::UINT8, uint8_t) + +#define NUMERIC_TYPES_5 \ + (nd4j::DataType::UINT16, 
uint16_t) + +#define NUMERIC_TYPES_6 \ + (nd4j::DataType::UINT32, uint32_t) + +#define NUMERIC_TYPES_7 \ + (nd4j::DataType::UINT64, uint64_t) + +#define NUMERIC_TYPES_8 \ + (nd4j::DataType::INT16, int16_t) + +#define NUMERIC_TYPES_9 \ + (nd4j::DataType::INT32, int32_t), \ + (nd4j::DataType::INT64, Nd4jLong) + #define GENERIC_NUMERIC_TYPES \ (nd4j::DataType::HALF, float16), \ diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 3ceec003e..9ce56d463 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -65,7 +65,32 @@ TEST_F(PlaygroundTests, test_avx) { nd4j_printf("Optimal level: %i; Binary level: %i;\n", ::optimalLevel(), ::binaryLevel()); } -/* + +TEST_F(PlaygroundTests, test_biasAdd_1) { + auto x = NDArrayFactory::create('c', {512, 3072}); + auto y = NDArrayFactory::create('c', {3072}); + + std::vector values; + + nd4j::ops::biasadd op; + + for (int e = 0; e < 100; e++) { + auto timeStart = std::chrono::system_clock::now(); + + op.execute({&x, &y}, {&x}); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); +} + + + TEST_F(PlaygroundTests, test_bert_1) { // this test will run ONLY if this model exists if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb") < 0) @@ -86,15 +111,15 @@ TEST_F(PlaygroundTests, test_bert_1) { graph->getVariableSpace()->putVariable(86,0, u); graph->getVariableSpace()->putVariable(87,0, v); - +/* // validating graph now - // auto status = GraphExecutioner::execute(graph); - // ASSERT_EQ(Status::OK(), status); - // ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); - - // auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); - // ASSERT_EQ(z, *array); + auto status = GraphExecutioner::execute(graph); + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); + auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); + ASSERT_EQ(z, *array); +*/ nd4j::Environment::getInstance()->setProfiling(true); auto profile = GraphProfilingHelper::profile(graph, 1); @@ -104,43 +129,47 @@ TEST_F(PlaygroundTests, test_bert_1) { nd4j::Environment::getInstance()->setProfiling(false); delete profile; +/* + std::vector values; - // std::vector values; + for (int e = 0; e < 1; e++) { + auto timeStart = std::chrono::system_clock::now(); - // for (int e = 0; e < 1; e++) { - // auto timeStart = std::chrono::system_clock::now(); + GraphExecutioner::execute(graph); - // GraphExecutioner::execute(graph); + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } - // auto timeEnd = std::chrono::system_clock::now(); - // auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); - // values.emplace_back(outerTime); - // } - - // std::sort(values.begin(), values.end()); - - // nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); + std::sort(values.begin(), values.end()); + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); +*/ delete graph; } +/* TEST_F(PlaygroundTests, test_broadcast_1) { - int pool = 10; + int pool = 1000; std::vector aX(pool); std::vector aY(pool); std::vector 
aZ(pool); for (int e = 0; e < pool; e++) { - aX[e] = NDArrayFactory::create_('c', {64, 128, 1}); - aY[e] = NDArrayFactory::create_('c', {768}); - aZ[e] = NDArrayFactory::create_('c', {64, 128, 768}); + aX[e] = NDArrayFactory::create_('c', {512, 3072}); + aY[e] = NDArrayFactory::create_('c', {3072}); + aZ[e] = NDArrayFactory::create_('c', {512, 3072}); aX[e]->assign(119 * (e+1)); aY[e]->assign(119 * (e+3)); } std::vector values; + Context ctx(1); + + nd4j::ops::biasadd op; for (int e = 0; e < 1000; e++) { auto x = aX[e < pool ? e : e % pool]; @@ -149,7 +178,8 @@ TEST_F(PlaygroundTests, test_broadcast_1) { auto timeStart = std::chrono::system_clock::now(); - x->applyTrueBroadcast(BroadcastOpsTuple::Multiply(), *y, *z); + //op.execute({x, y}, {z}); + nd4j::ops::helpers::addBias(ctx, *x, *y, *z, false); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); @@ -166,7 +196,7 @@ TEST_F(PlaygroundTests, test_broadcast_1) { delete aZ[e]; } } -*/ + /* TEST_F(PlaygroundTests, test_broadcast_1) { diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index a852a0c4c..fbba329e3 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -273,7 +273,7 @@ add_executable(runtests ${LOOPS_SOURCES} ../../blas/cpu/NativeOps.cpp ../../blas ../../include/cnpy/cnpy.cpp ../../include/nd4jmemset.h ../../include/nd4jmalloc.h ../../blas/Environment.cpp ../../blas/Environment.h ${EXEC_SOURCES} ${HELPERS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} - ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES} ../../include/loops/cpu/compilation_units/TrueBroadcastHelper_1.cpp) + ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES}) target_link_libraries(runtests gtest ${MKLDNN} gtest_main ${BLAS_LIBRARIES}) From 2698fbf5417611333d311001051ee44933e1abd1 Mon Sep 17 00:00:00 2001 From: raver119 Date: Mon, 17 Feb 2020 16:25:10 +0300 Subject: [PATCH 10/19] Broadcast perf improvements (#248) * broadcast as scalar edge case Signed-off-by: raver119 * missing return Signed-off-by: raver119 * few fixes Signed-off-by: raver119 * one more fix Signed-off-by: raver119 * no need for lambdas Signed-off-by: raver119 --- libnd4j/blas/cpu/NativeOpExecutioner.cpp | 25 ++++++++++--- libnd4j/include/helpers/LoopKind.h | 35 ++++++++++++++++++- libnd4j/include/loops/broadcasting.h | 3 ++ libnd4j/include/loops/cpu/broadcasting.hpp | 34 ++++++++++++++++-- .../layers_tests/BroadcastableOpsTests.cpp | 1 + .../tests_cpu/layers_tests/BrodcastTests.cpp | 2 +- .../layers_tests/PlaygroundTests.cpp | 10 ++++++ 7 files changed, 102 insertions(+), 8 deletions(-) diff --git a/libnd4j/blas/cpu/NativeOpExecutioner.cpp b/libnd4j/blas/cpu/NativeOpExecutioner.cpp index c155bd781..cbc224838 100644 --- a/libnd4j/blas/cpu/NativeOpExecutioner.cpp +++ b/libnd4j/blas/cpu/NativeOpExecutioner.cpp @@ -163,15 +163,32 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else + auto loopKind = nd4j::LoopKind::deduceKindOfLoopBroadcast(hXShapeInfo, hYShapeInfo, hZShapeInfo); + auto func 
= PRAGMA_THREADS_FOR { - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, loopKind, start, stop), LIBND4J_TYPES); }; - auto xLen = shape::length(hXShapeInfo); - auto yLen = shape::length(hYShapeInfo); - auto numTads = xLen / yLen; + Nd4jLong numTads = 0; + + switch (loopKind) { + case nd4j::LoopKind::BROADCAST_SCALAR_X: { + numTads = shape::length(hXShapeInfo); + } + break; + case nd4j::LoopKind::BROADCAST_SCALAR_Y: { + numTads = shape::length(hYShapeInfo); + } + break; + default: { + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + numTads = xLen / yLen; + } + } samediff::Threads::parallel_tad(func, 0, numTads); + #endif } diff --git a/libnd4j/include/helpers/LoopKind.h b/libnd4j/include/helpers/LoopKind.h index f8f8084c8..ddd1c95e5 100644 --- a/libnd4j/include/helpers/LoopKind.h +++ b/libnd4j/include/helpers/LoopKind.h @@ -37,12 +37,13 @@ namespace nd4j { class ND4J_EXPORT LoopKind { public: - enum Kind {SMALLARR2DX, EWS1, EWSNONZERO, RANK1, RANK2, RANK3, RANK4, RANK5, X_EWSNONZERO, Y_EWSNONZERO, Z_EWSNONZERO, COMMON}; + enum Kind {SMALLARR2DX, EWS1, EWSNONZERO, RANK1, RANK2, RANK3, RANK4, RANK5, X_EWSNONZERO, Y_EWSNONZERO, Z_EWSNONZERO, COMMON, BROADCAST_SCALAR_X, BROADCAST_SCALAR_Y}; static FORCEINLINE Kind deduceKindOfLoopXZ(const Nd4jLong* xShapeInfo, const Nd4jLong* zShapeInfo); static FORCEINLINE Kind deduceKindOfLoopXYZ(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo); static FORCEINLINE Kind deduceKindOfLoopTadXZ(const Nd4jLong* xShapeInfo, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo); static FORCEINLINE Kind deduceKindOfLoopTadXYZ(const Nd4jLong* xTadShapeInfo, const Nd4jLong* yTadShapeInfo, const Nd4jLong* zShapeInfo); + static FORCEINLINE Kind deduceKindOfLoopBroadcast(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo); }; @@ -82,6 +83,38 @@ LoopKind::Kind LoopKind::deduceKindOfLoopXZ(const Nd4jLong* xShapeInfo, const Nd return COMMON; } +LoopKind::Kind LoopKind::deduceKindOfLoopBroadcast(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo) { + auto xRank = shape::rank(xShapeInfo); + auto yRank = shape::rank(yShapeInfo); + auto zRank = shape::rank(zShapeInfo); + + auto xOrder = shape::order(xShapeInfo); + auto yOrder = shape::order(yShapeInfo); + auto zOrder = shape::order(zShapeInfo); + + auto xEws = shape::elementWiseStride(xShapeInfo); + auto yEws = shape::elementWiseStride(yShapeInfo); + auto zEws = shape::elementWiseStride(zShapeInfo); + + if (xRank == yRank && xRank == zRank && xOrder == 'c' && yOrder == 'c' && zOrder == 'c' && xEws == 1 && yEws == 1 && zEws == 1 && xRank >= 2) { + // we validate that shapes are equal till the last dim + for (int e = 0; e < xRank - 1; e++) { + if (xShapeInfo[e+1] != yShapeInfo[e+1]) + return COMMON; + } + + // now, if one of the shapes has 1 as last dim + auto detect = xShapeInfo[xRank] == 1 ? -1 : (yShapeInfo[xRank] == 1) ? 
1 : 0; + + if (detect == 1) + return nd4j::LoopKind::BROADCAST_SCALAR_Y; + else if (detect == -1) + return nd4j::LoopKind::BROADCAST_SCALAR_X; + } + + return nd4j::LoopKind::COMMON; +} + ////////////////////////////////////////////////////////////////////////////// LoopKind::Kind LoopKind::deduceKindOfLoopXYZ(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo) { diff --git a/libnd4j/include/loops/broadcasting.h b/libnd4j/include/loops/broadcasting.h index a38e79c3f..ebf702004 100755 --- a/libnd4j/include/loops/broadcasting.h +++ b/libnd4j/include/loops/broadcasting.h @@ -40,6 +40,7 @@ #endif #include +#include #include "legacy_ops.h" @@ -122,6 +123,7 @@ namespace functions { Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetZ, + nd4j::LoopKind::Kind loopKind, uint64_t start, uint64_t stop); @@ -149,6 +151,7 @@ namespace functions { Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetZ, + nd4j::LoopKind::Kind loopKind, uint64_t start, uint64_t stop); diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 37dbf833f..691b95b83 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -75,6 +75,7 @@ namespace functions { Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffset, + nd4j::LoopKind::Kind loopKind, uint64_t start, uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, @@ -88,7 +89,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset, start, stop), BROADCAST_OPS); + zTadOffset, loopKind, start, stop), BROADCAST_OPS); } template @@ -105,6 +106,7 @@ namespace functions { Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffset, + nd4j::LoopKind::Kind loopKind, uint64_t start, uint64_t stop) { @@ -142,7 +144,7 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); + const nd4j::LoopKind::Kind kindOfLoop = loopKind == nd4j::LoopKind::BROADCAST_SCALAR_X || loopKind == nd4j::LoopKind::BROADCAST_SCALAR_Y ? 
loopKind : nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { for (auto i = start; i < stop; i++) { @@ -163,6 +165,34 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); } + } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_X){ + // this loop effectively turns broadcast into series of scalar ops + auto loopLength = yShapeInfo[shape::rank(yShapeInfo)]; + + for (auto i = start; i < stop; i++) { + auto oY = y + (i * loopLength); + auto oZ = z + (i * loopLength); + + const auto oX = x[i]; + + PRAGMA_OMP_SIMD + for (unsigned int f = 0; f < loopLength; f++) + oZ[f] = OpType::op(oX, oY[f]); + } + } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){ + // this loop effectively turns broadcast into series of scalar ops + auto loopLength = xShapeInfo[shape::rank(xShapeInfo)]; + + for (auto i = start; i < stop; i++) { + auto oX = x + (i * loopLength); + auto oZ = z + (i * loopLength); + + const auto oY = y[i]; + + PRAGMA_OMP_SIMD + for (unsigned int f = 0; f < loopLength; f++) + oZ[f] = OpType::op(oX[f], oY); + } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; diff --git a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp index 1f6000f06..9b6d06ec6 100644 --- a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp @@ -179,6 +179,7 @@ TEST_F(BroadcastableOpsTests, Test_Minimum_1) { auto z = result->at(0); ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); delete result; diff --git a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp index 9a8f09b87..bc2ae2152 100644 --- a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp @@ -54,7 +54,7 @@ TEST_F(BroadcastMultiDimTest,MultimDimTest) { tad->tadOnlyShapeInfo, //tadShapeInfo tad->tadOffsets, //tadOffset tad->tadOnlyShapeInfo, //tadShapeInfoZ - tad->tadOffsets, 0, tad->numTads); //tadOffsetZ + tad->tadOffsets, nd4j::LoopKind::COMMON, 0, tad->numTads); //tadOffsetZ for(int i = 0; i < 30; i++) { ASSERT_EQ(dataAssertion[i],result[i]); } diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 9ce56d463..7cdf40c7f 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -149,6 +149,16 @@ TEST_F(PlaygroundTests, test_bert_1) { delete graph; } +TEST_F(PlaygroundTests, test_one_off_ops_1) { + auto x = NDArrayFactory::create('c', {4, 128, 768}); + auto y = NDArrayFactory::create('c', {4, 128, 1}); + auto z = x.ulike(); + + nd4j::ops::squaredsubtract op; + op.execute({&x, &y}, {&z}); +} + + /* TEST_F(PlaygroundTests, test_broadcast_1) { From c8882cbfa528eb0da1324b3e54ca67832b688a46 Mon Sep 17 00:00:00 2001 From: Alex Black Date: Tue, 18 Feb 2020 10:29:06 +1100 Subject: [PATCH 11/19] Test fixes + cleanup (#245) * Test spam reduction Signed-off-by: Alex Black * Arbiter bad import fixes Signed-off-by: Alex Black * Small spark test tweak Signed-off-by: AlexDBlack * Arbiter test log spam reduction Signed-off-by: Alex Black * More test spam reduction Signed-off-by: Alex Black --- 
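Returning to the broadcast fast path added in commit #248 above: when deduceKindOfLoopBroadcast reports BROADCAST_SCALAR_X or BROADCAST_SCALAR_Y, execBroadcast sets numTads to the number of scalars in the degenerate operand, and the new branches in broadcasting.hpp then apply one scalar per row. A self-contained sketch of the BROADCAST_SCALAR_Y case, with plain loops and a float add standing in for the real templates, threading, and op dispatch:

#include <vector>

// x and z have shape {rows, cols}; y has shape {rows, 1}, i.e. all dimensions match
// except y's last one, which is 1 -- exactly the case the detection above looks for.
void broadcastScalarYAdd(const std::vector<float>& x, const std::vector<float>& y,
                         std::vector<float>& z, int rows, int cols) {
    for (int i = 0; i < rows; i++) {          // one "tad" per row, parallelised in the real code
        const float oY = y[i];                // the single y value broadcast over this row
        for (int f = 0; f < cols; f++)
            z[i * cols + f] = x[i * cols + f] + oY;
    }
}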
.../deeplearning4j/arbiter/util/WebUtils.java | 47 ----------------- .../arbiter/optimize/BraninFunction.java | 2 +- .../arbiter/optimize/TestRandomSearch.java | 2 +- .../optimize/genetic/TestRandomGenerator.java | 10 ++-- .../TwoParentsCrossoverOperatorTests.java | 4 +- .../culling/RatioCullOperatorTests.java | 4 +- .../GeneticSelectionOperatorTests.java | 4 +- .../selection/SelectionOperatorTests.java | 4 +- .../TestComputationGraphSpace.java | 2 +- .../TestGraphLocalExecution.java | 2 +- .../TestGraphLocalExecutionGenetic.java | 2 +- .../multilayernetwork/TestLayerSpace.java | 8 +-- .../TestMultiLayerSpace.java | 4 +- .../arbiter/server/ArbiterCLIRunnerTest.java | 3 +- .../graph/data/TestGraphLoading.java | 6 +-- .../graph/data/TestGraphLoadingWeighted.java | 2 +- .../deeplearning4j/graph/graph/TestGraph.java | 2 +- .../deepwalk/DeepWalkGradientCheck.java | 10 ++-- .../graph/models/deepwalk/TestDeepWalk.java | 16 +++--- .../models/deepwalk/TestGraphHuffman.java | 8 +-- .../ParameterServerParallelWrapperTest.java | 5 +- .../models/word2vec/SparkWord2VecTest.java | 2 +- .../spark/TestEarlyStoppingSpark.java | 10 ++-- .../TestEarlyStoppingSparkCompGraph.java | 10 ++-- .../spark/data/TestShuffleExamples.java | 2 +- .../spark/datavec/TestExport.java | 4 +- .../HashingBalancedPartitionerTest.java | 22 ++++---- .../impl/graph/TestSparkComputationGraph.java | 2 +- .../multilayer/TestSparkDl4jMultiLayer.java | 27 ++++++++-- ...arameterAveragingSparkVsSingleMachine.java | 50 ++++++++++--------- .../spark/impl/paramavg/TestJsonYaml.java | 2 +- ...TestSparkMultiLayerParameterAveraging.java | 28 ++++++----- .../stats/TestTrainingStatsCollection.java | 4 +- .../spark/time/TestTimeSource.java | 4 +- .../spark/ui/TestListeners.java | 10 ++-- .../spark/util/TestRepartitioning.java | 12 ++--- .../org/nd4s/NDArrayExtractionTest.scala | 2 +- .../org/nd4s/samediff/TrainingTest.scala | 2 +- .../rl4j/learning/HistoryProcessorTest.java | 4 +- .../rl4j/network/ac/ActorCriticTest.java | 4 +- .../rl4j/policy/PolicyTest.java | 2 +- 41 files changed, 168 insertions(+), 182 deletions(-) delete mode 100644 arbiter/arbiter-core/src/main/java/org/deeplearning4j/arbiter/util/WebUtils.java diff --git a/arbiter/arbiter-core/src/main/java/org/deeplearning4j/arbiter/util/WebUtils.java b/arbiter/arbiter-core/src/main/java/org/deeplearning4j/arbiter/util/WebUtils.java deleted file mode 100644 index 632ad7474..000000000 --- a/arbiter/arbiter-core/src/main/java/org/deeplearning4j/arbiter/util/WebUtils.java +++ /dev/null @@ -1,47 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -package org.deeplearning4j.arbiter.util; - -import org.slf4j.Logger; - -import java.awt.*; -import java.net.URI; - -/** - * Various utilities for webpages and dealing with browsers - */ -public class WebUtils { - - public static void tryOpenBrowser(String path, Logger log) { - try { - WebUtils.openBrowser(new URI(path)); - } catch (Exception e) { - log.error("Could not open browser", e); - System.out.println("Browser could not be launched automatically.\nUI path: " + path); - } - } - - public static void openBrowser(URI uri) throws Exception { - if (Desktop.isDesktopSupported()) { - Desktop.getDesktop().browse(uri); - } else { - throw new UnsupportedOperationException( - "Cannot open browser on this platform: Desktop.isDesktopSupported() == false"); - } - } - -} diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/BraninFunction.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/BraninFunction.java index f49a8051d..4d507ee7d 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/BraninFunction.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/BraninFunction.java @@ -127,7 +127,7 @@ public class BraninFunction { BraninConfig candidate = (BraninConfig) c.getValue(); double score = scoreFunction.score(candidate, null, (Map) null); - System.out.println(candidate.getX1() + "\t" + candidate.getX2() + "\t" + score); +// System.out.println(candidate.getX1() + "\t" + candidate.getX2() + "\t" + score); Thread.sleep(20); diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/TestRandomSearch.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/TestRandomSearch.java index 34916ebdc..99d2ad8d7 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/TestRandomSearch.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/TestRandomSearch.java @@ -54,7 +54,7 @@ public class TestRandomSearch extends BaseDL4JTest { runner.execute(); - System.out.println("----- Complete -----"); +// System.out.println("----- Complete -----"); } diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/TestRandomGenerator.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/TestRandomGenerator.java index 2055ede57..abeba96e8 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/TestRandomGenerator.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/TestRandomGenerator.java @@ -16,8 +16,8 @@ package org.deeplearning4j.arbiter.optimize.genetic; +import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.math3.random.RandomGenerator; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; public class TestRandomGenerator implements RandomGenerator { private final int[] intRandomNumbers; @@ -63,17 +63,17 @@ public class TestRandomGenerator implements RandomGenerator { @Override public long nextLong() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } @Override public boolean nextBoolean() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } @Override public float nextFloat() { - throw new 
NotImplementedException(); + throw new NotImplementedException("Not implemented"); } @Override @@ -83,6 +83,6 @@ public class TestRandomGenerator implements RandomGenerator { @Override public double nextGaussian() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } } diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/crossover/TwoParentsCrossoverOperatorTests.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/crossover/TwoParentsCrossoverOperatorTests.java index 9efe89620..9bde211f0 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/crossover/TwoParentsCrossoverOperatorTests.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/crossover/TwoParentsCrossoverOperatorTests.java @@ -16,6 +16,7 @@ package org.deeplearning4j.arbiter.optimize.genetic.crossover; +import org.apache.commons.lang3.NotImplementedException; import org.deeplearning4j.BaseDL4JTest; import org.deeplearning4j.arbiter.optimize.generator.genetic.crossover.CrossoverResult; import org.deeplearning4j.arbiter.optimize.generator.genetic.crossover.TwoParentsCrossoverOperator; @@ -26,7 +27,6 @@ import org.deeplearning4j.arbiter.optimize.genetic.TestParentSelection; import org.deeplearning4j.arbiter.optimize.genetic.TestPopulationInitializer; import org.junit.Assert; import org.junit.Test; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; public class TwoParentsCrossoverOperatorTests extends BaseDL4JTest { @@ -42,7 +42,7 @@ public class TwoParentsCrossoverOperatorTests extends BaseDL4JTest { @Override public CrossoverResult crossover() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } } diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/culling/RatioCullOperatorTests.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/culling/RatioCullOperatorTests.java index 093ffd486..c85022dca 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/culling/RatioCullOperatorTests.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/culling/RatioCullOperatorTests.java @@ -16,6 +16,7 @@ package org.deeplearning4j.arbiter.optimize.genetic.culling; +import org.apache.commons.lang3.NotImplementedException; import org.deeplearning4j.BaseDL4JTest; import org.deeplearning4j.arbiter.optimize.generator.genetic.Chromosome; import org.deeplearning4j.arbiter.optimize.generator.genetic.culling.RatioCullOperator; @@ -24,7 +25,6 @@ import org.deeplearning4j.arbiter.optimize.generator.genetic.population.Populati import org.deeplearning4j.arbiter.optimize.genetic.TestPopulationInitializer; import org.junit.Assert; import org.junit.Test; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; import java.util.List; @@ -46,7 +46,7 @@ public class RatioCullOperatorTests extends BaseDL4JTest { @Override public void cullPopulation() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } public double getCullRatio() { diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/GeneticSelectionOperatorTests.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/GeneticSelectionOperatorTests.java index 
1d2b74de9..ddd0ae91e 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/GeneticSelectionOperatorTests.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/GeneticSelectionOperatorTests.java @@ -16,6 +16,7 @@ package org.deeplearning4j.arbiter.optimize.genetic.selection; +import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.math3.random.RandomGenerator; import org.deeplearning4j.BaseDL4JTest; import org.deeplearning4j.arbiter.optimize.generator.genetic.ChromosomeFactory; @@ -33,7 +34,6 @@ import org.deeplearning4j.arbiter.optimize.genetic.TestPopulationInitializer; import org.deeplearning4j.arbiter.optimize.genetic.TestRandomGenerator; import org.junit.Assert; import org.junit.Test; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; import static org.junit.Assert.assertArrayEquals; @@ -55,7 +55,7 @@ public class GeneticSelectionOperatorTests extends BaseDL4JTest { @Override public void cullPopulation() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } @Override diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/SelectionOperatorTests.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/SelectionOperatorTests.java index 3f64279ee..5d8a8b361 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/SelectionOperatorTests.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/SelectionOperatorTests.java @@ -16,6 +16,7 @@ package org.deeplearning4j.arbiter.optimize.genetic.selection; +import org.apache.commons.lang3.NotImplementedException; import org.deeplearning4j.BaseDL4JTest; import org.deeplearning4j.arbiter.optimize.generator.genetic.ChromosomeFactory; import org.deeplearning4j.arbiter.optimize.generator.genetic.population.PopulationInitializer; @@ -24,7 +25,6 @@ import org.deeplearning4j.arbiter.optimize.generator.genetic.selection.Selection import org.deeplearning4j.arbiter.optimize.genetic.TestPopulationInitializer; import org.junit.Assert; import org.junit.Test; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; public class SelectionOperatorTests extends BaseDL4JTest { private class TestSelectionOperator extends SelectionOperator { @@ -39,7 +39,7 @@ public class SelectionOperatorTests extends BaseDL4JTest { @Override public double[] buildNextGenes() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } } diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestComputationGraphSpace.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestComputationGraphSpace.java index 7c4ec38f4..54d73b775 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestComputationGraphSpace.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestComputationGraphSpace.java @@ -158,7 +158,7 @@ public class TestComputationGraphSpace extends BaseDL4JTest { } } - System.out.println("ReLU vs. Tanh: " + reluCount + "\t" + tanhCount); +// System.out.println("ReLU vs. 
Tanh: " + reluCount + "\t" + tanhCount); assertTrue(reluCount > 0); assertTrue(tanhCount > 0); diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecution.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecution.java index 1747b45f9..391139f32 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecution.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecution.java @@ -162,7 +162,7 @@ public class TestGraphLocalExecution extends BaseDL4JTest { List results = runner.getResults(); assertTrue(results.size() > 0); - System.out.println("----- COMPLETE - " + results.size() + " results -----"); +// System.out.println("----- COMPLETE - " + results.size() + " results -----"); } } diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecutionGenetic.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecutionGenetic.java index 2b9c5696d..91daa027f 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecutionGenetic.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecutionGenetic.java @@ -165,7 +165,7 @@ public class TestGraphLocalExecutionGenetic extends BaseDL4JTest { List results = runner.getResults(); assertTrue(results.size() > 0); - System.out.println("----- COMPLETE - " + results.size() + " results -----"); +// System.out.println("----- COMPLETE - " + results.size() + " results -----"); } } diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestLayerSpace.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestLayerSpace.java index 6a5458e65..959cafc35 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestLayerSpace.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestLayerSpace.java @@ -101,7 +101,7 @@ public class TestLayerSpace extends BaseDL4JTest { double l2 = TestUtils.getL2(l); IActivation activation = l.getActivationFn(); - System.out.println(lr + "\t" + l2 + "\t" + activation); +// System.out.println(lr + "\t" + l2 + "\t" + activation); assertTrue(lr >= 0.3 && lr <= 0.4); assertTrue(l2 >= 0.01 && l2 <= 0.1); @@ -190,7 +190,7 @@ public class TestLayerSpace extends BaseDL4JTest { ActivationLayer al = als.getValue(d); IActivation activation = al.getActivationFn(); - System.out.println(activation); +// System.out.println(activation); assertTrue(containsActivationFunction(actFns, activation)); } @@ -228,7 +228,7 @@ public class TestLayerSpace extends BaseDL4JTest { IActivation activation = el.getActivationFn(); long nOut = el.getNOut(); - System.out.println(activation + "\t" + nOut); +// System.out.println(activation + "\t" + nOut); assertTrue(containsActivationFunction(actFns, activation)); assertTrue(nOut >= 10 && nOut <= 20); @@ -295,7 +295,7 @@ public class TestLayerSpace extends BaseDL4JTest { long nOut = el.getNOut(); double forgetGate = el.getForgetGateBiasInit(); - System.out.println(activation + "\t" + nOut + "\t" + forgetGate); +// System.out.println(activation + "\t" + nOut + 
"\t" + forgetGate); assertTrue(containsActivationFunction(actFns, activation)); assertTrue(nOut >= 10 && nOut <= 20); diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestMultiLayerSpace.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestMultiLayerSpace.java index 99dc79f42..d4dbe9a3a 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestMultiLayerSpace.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestMultiLayerSpace.java @@ -293,8 +293,8 @@ public class TestMultiLayerSpace extends BaseDL4JTest { assertTrue(nLayerCounts[i] >= 5); //Expect approx equal (50/3 each), but some variation randomly } - System.out.println("Number of layers: " + Arrays.toString(nLayerCounts)); - System.out.println("ReLU vs. Tanh: " + reluCount + "\t" + tanhCount); +// System.out.println("Number of layers: " + Arrays.toString(nLayerCounts)); +// System.out.println("ReLU vs. Tanh: " + reluCount + "\t" + tanhCount); } diff --git a/arbiter/arbiter-server/src/test/java/org/deeplearning4j/arbiter/server/ArbiterCLIRunnerTest.java b/arbiter/arbiter-server/src/test/java/org/deeplearning4j/arbiter/server/ArbiterCLIRunnerTest.java index 21e4e402a..40e8a3e41 100644 --- a/arbiter/arbiter-server/src/test/java/org/deeplearning4j/arbiter/server/ArbiterCLIRunnerTest.java +++ b/arbiter/arbiter-server/src/test/java/org/deeplearning4j/arbiter/server/ArbiterCLIRunnerTest.java @@ -98,7 +98,8 @@ public class ArbiterCLIRunnerTest extends BaseDL4JTest { assertEquals(configuration,OptimizationConfiguration.fromJson(configuration.toJson())); FileUtils.writeStringToFile(new File(configPath),configuration.toJson()); - System.out.println(configuration.toJson()); +// System.out.println(configuration.toJson()); + configuration.toJson(); log.info("Starting test"); cliRunner.runMain( diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoading.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoading.java index 1a5a27918..f4c970a22 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoading.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoading.java @@ -41,7 +41,7 @@ public class TestGraphLoading extends BaseDL4JTest { IGraph graph = GraphLoader .loadUndirectedGraphEdgeListFile(cpr.getTempFileFromArchive().getAbsolutePath(), 7, ","); - System.out.println(graph); +// System.out.println(graph); assertEquals(graph.numVertices(), 7); int[][] edges = {{1, 2}, {0, 2, 4}, {0, 1, 3, 4}, {2, 4, 5}, {1, 2, 3, 5, 6}, {3, 4, 6}, {4, 5}}; @@ -66,7 +66,7 @@ public class TestGraphLoading extends BaseDL4JTest { edgeLineProcessor, vertexFactory, 10, false); - System.out.println(graph); +// System.out.println(graph); for (int i = 0; i < 10; i++) { List> edges = graph.getEdgesOut(i); @@ -111,7 +111,7 @@ public class TestGraphLoading extends BaseDL4JTest { Graph graph = GraphLoader.loadGraph(verticesCPR.getTempFileFromArchive().getAbsolutePath(), edgesCPR.getTempFileFromArchive().getAbsolutePath(), vertexLoader, edgeLineProcessor, false); - System.out.println(graph); +// System.out.println(graph); for (int i = 0; i < 10; i++) { List> edges = graph.getEdgesOut(i); diff --git 
a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoadingWeighted.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoadingWeighted.java index 94e1a20bf..77903f51e 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoadingWeighted.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoadingWeighted.java @@ -71,7 +71,7 @@ public class TestGraphLoadingWeighted extends BaseDL4JTest { } } - System.out.println(graph); +// System.out.println(graph); } diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/graph/TestGraph.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/graph/TestGraph.java index 0dc456107..b0adf3283 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/graph/TestGraph.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/graph/TestGraph.java @@ -220,7 +220,7 @@ public class TestGraph extends BaseDL4JTest { sum += transitionProb[i][j]; for (int j = 0; j < transitionProb[i].length; j++) transitionProb[i][j] /= sum; - System.out.println(Arrays.toString(transitionProb[i])); +// System.out.println(Arrays.toString(transitionProb[i])); } //Check that transition probs are essentially correct (within bounds of random variation) diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/DeepWalkGradientCheck.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/DeepWalkGradientCheck.java index c1aedd47a..f0343bde9 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/DeepWalkGradientCheck.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/DeepWalkGradientCheck.java @@ -145,8 +145,8 @@ public class DeepWalkGradientCheck extends BaseDL4JTest { if (relError > MAX_REL_ERROR && absErr > MIN_ABS_ERROR) fail(msg); - else - System.out.println(msg); +// else +// System.out.println(msg); } } @@ -333,10 +333,10 @@ public class DeepWalkGradientCheck extends BaseDL4JTest { if (relError > MAX_REL_ERROR && absErr > MIN_ABS_ERROR) fail(msg); - else - System.out.println(msg); +// else +// System.out.println(msg); } - System.out.println(); +// System.out.println(); } } diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestDeepWalk.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestDeepWalk.java index d92c3bec1..97359cf15 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestDeepWalk.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestDeepWalk.java @@ -67,7 +67,7 @@ public class TestDeepWalk extends BaseDL4JTest { for (int i = 0; i < 7; i++) { INDArray vector = deepWalk.getVertexVector(i); assertArrayEquals(new long[] {vectorSize}, vector.shape()); - System.out.println(Arrays.toString(vector.dup().data().asFloat())); +// System.out.println(Arrays.toString(vector.dup().data().asFloat())); } GraphWalkIterator iter = new RandomWalkIterator<>(graph, 8); @@ -77,11 +77,11 @@ public class TestDeepWalk extends BaseDL4JTest { for (int t = 0; t < 5; t++) { iter.reset(); deepWalk.fit(iter); - 
System.out.println("--------------------"); +// System.out.println("--------------------"); for (int i = 0; i < 7; i++) { INDArray vector = deepWalk.getVertexVector(i); assertArrayEquals(new long[] {vectorSize}, vector.shape()); - System.out.println(Arrays.toString(vector.dup().data().asFloat())); +// System.out.println(Arrays.toString(vector.dup().data().asFloat())); } } } @@ -160,7 +160,7 @@ public class TestDeepWalk extends BaseDL4JTest { continue; double sim = deepWalk.similarity(i, nearestTo); - System.out.println(i + "\t" + nearestTo + "\t" + sim); +// System.out.println(i + "\t" + nearestTo + "\t" + sim); assertTrue(sim <= minSimNearest); } } @@ -211,7 +211,7 @@ public class TestDeepWalk extends BaseDL4JTest { Graph graph = GraphLoader .loadUndirectedGraphEdgeListFile(cpr.getTempFileFromArchive().getAbsolutePath(), 13, ","); - System.out.println(graph); +// System.out.println(graph); Nd4j.getRandom().setSeed(12345); @@ -229,11 +229,13 @@ public class TestDeepWalk extends BaseDL4JTest { //Calculate similarity(0,i) for (int i = 0; i < nVertices; i++) { - System.out.println(deepWalk.similarity(0, i)); +// System.out.println(deepWalk.similarity(0, i)); + deepWalk.similarity(0, i); } for (int i = 0; i < nVertices; i++) - System.out.println(deepWalk.getVertexVector(i)); +// System.out.println(deepWalk.getVertexVector(i)); + deepWalk.getVertexVector(i); } @Test(timeout = 60000L) diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestGraphHuffman.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestGraphHuffman.java index 763aae822..76b2af0b5 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestGraphHuffman.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestGraphHuffman.java @@ -38,9 +38,11 @@ public class TestGraphHuffman extends BaseDL4JTest { gh.buildTree(vertexDegrees); - for (int i = 0; i < 7; i++) - System.out.println(i + "\t" + gh.getCodeLength(i) + "\t" + gh.getCodeString(i) + "\t\t" + gh.getCode(i) - + "\t\t" + Arrays.toString(gh.getPathInnerNodes(i))); + for (int i = 0; i < 7; i++) { + String s = i + "\t" + gh.getCodeLength(i) + "\t" + gh.getCodeString(i) + "\t\t" + gh.getCode(i) + + "\t\t" + Arrays.toString(gh.getPathInnerNodes(i)); +// System.out.println(s); + } int[] expectedLengths = {3, 2, 2, 5, 4, 2, 5}; for (int i = 0; i < vertexDegrees.length; i++) { diff --git a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/src/test/java/org/deeplearning4j/parallelism/parameterserver/ParameterServerParallelWrapperTest.java b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/src/test/java/org/deeplearning4j/parallelism/parameterserver/ParameterServerParallelWrapperTest.java index beb9af5b4..ad610739f 100644 --- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/src/test/java/org/deeplearning4j/parallelism/parameterserver/ParameterServerParallelWrapperTest.java +++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/src/test/java/org/deeplearning4j/parallelism/parameterserver/ParameterServerParallelWrapperTest.java @@ -79,8 +79,9 @@ public class ParameterServerParallelWrapperTest extends BaseDL4JTest { model.init(); ParallelWrapper parameterServerParallelWrapper = - new 
ParallelWrapper.Builder(model).trainerFactory(new ParameterServerTrainerContext()) - .workers(Runtime.getRuntime().availableProcessors()) + new ParallelWrapper.Builder(model) + .workers(Math.min(4, Runtime.getRuntime().availableProcessors())) + .trainerFactory(new ParameterServerTrainerContext()) .reportScoreAfterAveraging(true).prefetchBuffer(3).build(); parameterServerParallelWrapper.fit(mnistTrain); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/src/test/java/org/deeplearning4j/spark/models/word2vec/SparkWord2VecTest.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/src/test/java/org/deeplearning4j/spark/models/word2vec/SparkWord2VecTest.java index 55d893d8c..f3b3f974a 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/src/test/java/org/deeplearning4j/spark/models/word2vec/SparkWord2VecTest.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/src/test/java/org/deeplearning4j/spark/models/word2vec/SparkWord2VecTest.java @@ -104,7 +104,7 @@ public class SparkWord2VecTest extends BaseDL4JTest { public void call(ExportContainer v) throws Exception { assertNotNull(v.getElement()); assertNotNull(v.getArray()); - System.out.println(v.getElement() + " - " + v.getArray()); +// System.out.println(v.getElement() + " - " + v.getArray()); } } } diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSpark.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSpark.java index 0983cbd76..1515cf3cf 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSpark.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSpark.java @@ -66,7 +66,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -119,7 +119,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MSE).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); EarlyStoppingModelSaver saver = new InMemoryModelSaver<>(); @@ -155,7 +155,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -198,7 +198,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -231,7 +231,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); 
+ net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSparkCompGraph.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSparkCompGraph.java index 2e35b629c..0c4e2b2f8 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSparkCompGraph.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSparkCompGraph.java @@ -69,7 +69,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -120,7 +120,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MSE).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); EarlyStoppingModelSaver saver = new InMemoryModelSaver<>(); @@ -158,7 +158,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -203,7 +203,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -238,7 +238,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/data/TestShuffleExamples.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/data/TestShuffleExamples.java index 24d58bb17..c26db5642 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/data/TestShuffleExamples.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/data/TestShuffleExamples.java @@ -59,7 +59,7 @@ public class TestShuffleExamples extends BaseSparkTest { int totalExampleCount = 0; for (DataSet ds : shuffledList) { totalExampleCount += ds.getFeatures().length(); - System.out.println(Arrays.toString(ds.getFeatures().data().asFloat())); +// System.out.println(Arrays.toString(ds.getFeatures().data().asFloat())); assertEquals(ds.getFeatures(), ds.getLabels()); } diff --git 
a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/datavec/TestExport.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/datavec/TestExport.java index e93cfeb92..d110a3b98 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/datavec/TestExport.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/datavec/TestExport.java @@ -86,7 +86,7 @@ public class TestExport extends BaseSparkTest { for (File file : files) { if (!file.getPath().endsWith(".bin")) continue; - System.out.println(file); +// System.out.println(file); DataSet ds = new DataSet(); ds.load(file); assertEquals(minibatchSize, ds.numExamples()); @@ -144,7 +144,7 @@ public class TestExport extends BaseSparkTest { for (File file : files) { if (!file.getPath().endsWith(".bin")) continue; - System.out.println(file); +// System.out.println(file); MultiDataSet ds = new org.nd4j.linalg.dataset.MultiDataSet(); ds.load(file); assertEquals(minibatchSize, ds.getFeatures(0).size(0)); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/common/repartition/HashingBalancedPartitionerTest.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/common/repartition/HashingBalancedPartitionerTest.java index 6094ed008..4d2ed4b97 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/common/repartition/HashingBalancedPartitionerTest.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/common/repartition/HashingBalancedPartitionerTest.java @@ -92,9 +92,9 @@ public class HashingBalancedPartitionerTest extends BaseSparkTest { int[][] colorCountsByPartition = new int[3][2]; for (final Tuple2, String> val : testList) { - System.out.println(val); +// System.out.println(val); Integer partition = hbp.getPartition(val._1()); - System.out.println(partition); +// System.out.println(partition); if (val._2().equals("red")) colorCountsByPartition[partition][0] += 1; @@ -102,9 +102,9 @@ public class HashingBalancedPartitionerTest extends BaseSparkTest { colorCountsByPartition[partition][1] += 1; } - for (int i = 0; i < 3; i++) { - System.out.println(Arrays.toString(colorCountsByPartition[i])); - } +// for (int i = 0; i < 3; i++) { +// System.out.println(Arrays.toString(colorCountsByPartition[i])); +// } for (int i = 0; i < 3; i++) { // avg red per partition : 2.33 assertTrue(colorCountsByPartition[i][0] >= 1 && colorCountsByPartition[i][0] < 4); @@ -178,12 +178,12 @@ public class HashingBalancedPartitionerTest extends BaseSparkTest { colorCountsByPartition[partition][1] += 1; } - for (int i = 0; i < numPartitions; i++) { - System.out.println(Arrays.toString(colorCountsByPartition[i])); - } - - System.out.println("Ideal red # per partition: " + avgRed); - System.out.println("Ideal blue # per partition: " + avgBlue); +// for (int i = 0; i < numPartitions; i++) { +// System.out.println(Arrays.toString(colorCountsByPartition[i])); +// } +// +// System.out.println("Ideal red # per partition: " + avgRed); +// System.out.println("Ideal blue # per partition: " + avgBlue); for (int i = 0; i < numPartitions; i++) { // avg red per partition : 2.33 diff --git 
a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/graph/TestSparkComputationGraph.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/graph/TestSparkComputationGraph.java index 0e29386a1..0de7875e2 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/graph/TestSparkComputationGraph.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/graph/TestSparkComputationGraph.java @@ -115,7 +115,7 @@ public class TestSparkComputationGraph extends BaseSparkTest { TrainingMaster tm = new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0); SparkComputationGraph scg = new SparkComputationGraph(sc, cg, tm); - scg.setListeners(Collections.singleton((TrainingListener) new ScoreIterationListener(1))); + scg.setListeners(Collections.singleton((TrainingListener) new ScoreIterationListener(5))); JavaRDD rdd = sc.parallelize(list); scg.fitMultiDataSet(rdd); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java index ecf9b937b..38a15ef8d 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java @@ -31,8 +31,11 @@ import org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster; import org.junit.Test; import org.nd4j.evaluation.classification.Evaluation; import org.nd4j.linalg.activations.Activation; +import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.dataset.DataSet; import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.learning.config.Adam; import org.nd4j.linalg.learning.config.Nesterovs; import org.nd4j.linalg.lossfunctions.LossFunctions; @@ -45,8 +48,24 @@ import static org.junit.Assert.assertTrue; @Slf4j public class TestSparkDl4jMultiLayer extends BaseSparkTest { - @Test(timeout = 120000L) + @Override + public long getTimeoutMilliseconds() { + return 120000L; + } + + @Override + public DataType getDataType() { + return DataType.FLOAT; + } + + @Override + public DataType getDefaultFPDataType() { + return DataType.FLOAT; + } + + @Test public void testEvaluationSimple() throws Exception { + Nd4j.getRandom().setSeed(12345); for( int evalWorkers : new int[]{1, 4, 8}) { //Simple test to validate DL4J issue 4099 is fixed... 
@@ -75,18 +94,18 @@ public class TestSparkDl4jMultiLayer extends BaseSparkTest { //---------------------------------- //Create network configuration and conduct network training MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() + .dataType(DataType.FLOAT) .seed(12345) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) .activation(Activation.LEAKYRELU) .weightInit(WeightInit.XAVIER) - .updater(new Nesterovs(0.02, 0.9)) - .l2(1e-4) + .updater(new Adam(1e-3)) + .l2(1e-5) .list() .layer(0, new DenseLayer.Builder().nIn(28 * 28).nOut(500).build()) .layer(1, new DenseLayer.Builder().nIn(500).nOut(100).build()) .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD) .activation(Activation.SOFTMAX).nIn(100).nOut(10).build()) - .build(); //Configuration for Spark training: see https://deeplearning4j.org/docs/latest/deeplearning4j-scaleout-howto for explanation of these configuration options diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestCompareParameterAveragingSparkVsSingleMachine.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestCompareParameterAveragingSparkVsSingleMachine.java index 0188b15d9..9a6c80000 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestCompareParameterAveragingSparkVsSingleMachine.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestCompareParameterAveragingSparkVsSingleMachine.java @@ -333,15 +333,16 @@ public class TestCompareParameterAveragingSparkVsSingleMachine { sparkNet.fit(rdd); } - System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); +// System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); + sparkNet.getSparkTrainingStats().statsAsString(); INDArray finalSparkParams = sparkNet.getNetwork().params().dup(); - System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); - System.out.println("Initial (Spark) params: " - + Arrays.toString(initialSparkParams.data().asFloat())); - System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); - System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); +// System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); +// System.out.println("Initial (Spark) params: " +// + Arrays.toString(initialSparkParams.data().asFloat())); +// System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); +// System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); assertEquals(initialParams, initialSparkParams); assertNotEquals(initialParams, finalParams); assertEquals(finalParams, finalSparkParams); @@ -405,15 +406,16 @@ public class TestCompareParameterAveragingSparkVsSingleMachine { sparkNet.fit(rdd); } - System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); +// System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); + sparkNet.getSparkTrainingStats().statsAsString(); INDArray finalSparkParams = sparkNet.getNetwork().params().dup(); - System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); - System.out.println("Initial (Spark) params: " - + 
Arrays.toString(initialSparkParams.data().asFloat())); - System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); - System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); +// System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); +// System.out.println("Initial (Spark) params: " +// + Arrays.toString(initialSparkParams.data().asFloat())); +// System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); +// System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); assertArrayEquals(initialParams.data().asFloat(), initialSparkParams.data().asFloat(), 1e-8f); assertArrayEquals(finalParams.data().asFloat(), finalSparkParams.data().asFloat(), 1e-6f); @@ -478,18 +480,19 @@ public class TestCompareParameterAveragingSparkVsSingleMachine { sparkNet.fit(rdd); } - System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); +// System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); + sparkNet.getSparkTrainingStats().statsAsString(); INDArray finalSparkParams = sparkNet.getNetwork().params().dup(); // executioner.addToWatchdog(finalSparkParams, "finalSparkParams"); float[] fp = finalParams.data().asFloat(); float[] fps = finalSparkParams.data().asFloat(); - System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); - System.out.println("Initial (Spark) params: " - + Arrays.toString(initialSparkParams.data().asFloat())); - System.out.println("Final (Local) params: " + Arrays.toString(fp)); - System.out.println("Final (Spark) params: " + Arrays.toString(fps)); +// System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); +// System.out.println("Initial (Spark) params: " +// + Arrays.toString(initialSparkParams.data().asFloat())); +// System.out.println("Final (Local) params: " + Arrays.toString(fp)); +// System.out.println("Final (Spark) params: " + Arrays.toString(fps)); assertEquals(initialParams, initialSparkParams); assertNotEquals(initialParams, finalParams); @@ -551,14 +554,15 @@ public class TestCompareParameterAveragingSparkVsSingleMachine { sparkNet.fit(rdd); } - System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); +// System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); + sparkNet.getSparkTrainingStats().statsAsString(); INDArray finalSparkParams = sparkNet.getNetwork().params().dup(); - System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); - System.out.println("Initial (Spark) params: " + Arrays.toString(initialSparkParams.data().asFloat())); - System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); - System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); +// System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); +// System.out.println("Initial (Spark) params: " + Arrays.toString(initialSparkParams.data().asFloat())); +// System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); +// System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); assertArrayEquals(initialParams.data().asFloat(), initialSparkParams.data().asFloat(), 1e-8f); assertArrayEquals(finalParams.data().asFloat(), finalSparkParams.data().asFloat(), 1e-6f); 
diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestJsonYaml.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestJsonYaml.java index c43729166..8558878b8 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestJsonYaml.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestJsonYaml.java @@ -37,7 +37,7 @@ public class TestJsonYaml { String json = tm.toJson(); String yaml = tm.toYaml(); - System.out.println(json); +// System.out.println(json); TrainingMaster fromJson = ParameterAveragingTrainingMaster.fromJson(json); TrainingMaster fromYaml = ParameterAveragingTrainingMaster.fromYaml(yaml); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java index ca7a168b2..3b328e210 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java @@ -389,7 +389,7 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { List workerFitStats = stats.getValue("ParameterAveragingWorkerFitTimesMs"); for (EventStats e : workerFitStats) { ExampleCountEventStats eces = (ExampleCountEventStats) e; - System.out.println(eces.getTotalExampleCount()); +// System.out.println(eces.getTotalExampleCount()); } for (EventStats e : workerFitStats) { @@ -457,7 +457,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { assertNotEquals(paramsBefore, paramsAfter); SparkTrainingStats stats = sparkNet.getSparkTrainingStats(); - System.out.println(stats.statsAsString()); +// System.out.println(stats.statsAsString()); + stats.statsAsString(); sparkNet.getTrainingMaster().deleteTempFiles(sc); } @@ -483,7 +484,7 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { i++; } - System.out.println("Saved to: " + tempDirF.getAbsolutePath()); +// System.out.println("Saved to: " + tempDirF.getAbsolutePath()); @@ -527,7 +528,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { SparkTrainingStats stats = sparkNet.getSparkTrainingStats(); //Expect - System.out.println(stats.statsAsString()); +// System.out.println(stats.statsAsString()); + stats.statsAsString(); assertEquals(numSplits, stats.getValue("ParameterAveragingMasterRepartitionTimesMs").size()); List list = stats.getValue("ParameterAveragingWorkerFitTimesMs"); @@ -566,8 +568,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { i++; } - System.out.println("Saved to: " + tempDirF.getAbsolutePath()); - System.out.println("Saved to: " + tempDirF2.getAbsolutePath()); +// System.out.println("Saved to: " + tempDirF.getAbsolutePath()); +// System.out.println("Saved to: " + tempDirF2.getAbsolutePath()); @@ -610,7 +612,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { assertNotEquals(paramsBefore, paramsAfter); SparkTrainingStats stats = 
sparkNet.getSparkTrainingStats(); - System.out.println(stats.statsAsString()); +// System.out.println(stats.statsAsString()); + stats.statsAsString(); //Same thing, buf for MultiDataSet objects: config = new Configuration(); @@ -631,7 +634,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { assertNotEquals(paramsBefore, paramsAfter); stats = sparkNet.getSparkTrainingStats(); - System.out.println(stats.statsAsString()); +// System.out.println(stats.statsAsString()); + stats.statsAsString(); } @@ -730,13 +734,13 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { .build(); for (int avgFreq : new int[] {1, 5, 10}) { - System.out.println("--- Avg freq " + avgFreq + " ---"); +// System.out.println("--- Avg freq " + avgFreq + " ---"); SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf.clone(), new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize) .batchSizePerWorker(batchSizePerExecutor).averagingFrequency(avgFreq) .repartionData(Repartition.Always).build()); - sparkNet.setListeners(new ScoreIterationListener(1)); + sparkNet.setListeners(new ScoreIterationListener(5)); @@ -778,13 +782,13 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { .setOutputs("1").build(); for (int avgFreq : new int[] {1, 5, 10}) { - System.out.println("--- Avg freq " + avgFreq + " ---"); +// System.out.println("--- Avg freq " + avgFreq + " ---"); SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf.clone(), new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize) .batchSizePerWorker(batchSizePerExecutor).averagingFrequency(avgFreq) .repartionData(Repartition.Always).build()); - sparkNet.setListeners(new ScoreIterationListener(1)); + sparkNet.setListeners(new ScoreIterationListener(5)); JavaRDD rdd = sc.parallelize(list); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/stats/TestTrainingStatsCollection.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/stats/TestTrainingStatsCollection.java index 5b49899c8..15d57b0a6 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/stats/TestTrainingStatsCollection.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/stats/TestTrainingStatsCollection.java @@ -107,7 +107,7 @@ public class TestTrainingStatsCollection extends BaseSparkTest { expectedStatNames.addAll(c); } - System.out.println(expectedStatNames); +// System.out.println(expectedStatNames); SparkTrainingStats stats = sparkNet.getSparkTrainingStats(); @@ -119,7 +119,7 @@ public class TestTrainingStatsCollection extends BaseSparkTest { } String statsAsString = stats.statsAsString(); - System.out.println(statsAsString); +// System.out.println(statsAsString); assertEquals(actualKeySet.size(), statsAsString.split("\n").length); //One line per stat diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/time/TestTimeSource.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/time/TestTimeSource.java index e88438766..f4b435d46 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/time/TestTimeSource.java +++ 
b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/time/TestTimeSource.java @@ -35,7 +35,7 @@ public class TestTimeSource { long systemTime = System.currentTimeMillis(); long ntpTime = timeSource.currentTimeMillis(); long offset = ntpTime - systemTime; - System.out.println("System: " + systemTime + "\tNTPTimeSource: " + ntpTime + "\tOffset: " + offset); +// System.out.println("System: " + systemTime + "\tNTPTimeSource: " + ntpTime + "\tOffset: " + offset); Thread.sleep(500); } } @@ -49,7 +49,7 @@ public class TestTimeSource { long systemTime = System.currentTimeMillis(); long ntpTime = timeSource.currentTimeMillis(); long offset = ntpTime - systemTime; - System.out.println("System: " + systemTime + "\tSystemClockTimeSource: " + ntpTime + "\tOffset: " + offset); +// System.out.println("System: " + systemTime + "\tSystemClockTimeSource: " + ntpTime + "\tOffset: " + offset); assertEquals(systemTime, ntpTime, 2); //Should be exact, but we might randomly tick over between one ms and the next Thread.sleep(500); } diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/ui/TestListeners.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/ui/TestListeners.java index 1b3329530..a12b1e460 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/ui/TestListeners.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/ui/TestListeners.java @@ -87,7 +87,7 @@ public class TestListeners extends BaseSparkTest { net.fit(rdd); List sessions = ss.listSessionIDs(); - System.out.println("Sessions: " + sessions); +// System.out.println("Sessions: " + sessions); assertEquals(1, sessions.size()); String sid = sessions.get(0); @@ -95,15 +95,15 @@ public class TestListeners extends BaseSparkTest { List typeIDs = ss.listTypeIDsForSession(sid); List workers = ss.listWorkerIDsForSession(sid); - System.out.println(sid + "\t" + typeIDs + "\t" + workers); +// System.out.println(sid + "\t" + typeIDs + "\t" + workers); List lastUpdates = ss.getLatestUpdateAllWorkers(sid, StatsListener.TYPE_ID); - System.out.println(lastUpdates); +// System.out.println(lastUpdates); - System.out.println("Static info:"); +// System.out.println("Static info:"); for (String wid : workers) { Persistable staticInfo = ss.getStaticInfo(sid, StatsListener.TYPE_ID, wid); - System.out.println(sid + "\t" + wid); +// System.out.println(sid + "\t" + wid); } assertEquals(1, typeIDs.size()); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/util/TestRepartitioning.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/util/TestRepartitioning.java index e0759a549..ad1622966 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/util/TestRepartitioning.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/util/TestRepartitioning.java @@ -63,7 +63,7 @@ public class TestRepartitioning extends BaseSparkTest { assertEquals(10, rdd2.partitions().size()); for (int i = 0; i < 10; i++) { List partition = rdd2.collectPartitions(new int[] {i})[0]; - System.out.println("Partition " + i + " size: " + partition.size()); +// System.out.println("Partition " + i + " size: " + partition.size()); assertEquals(100, 
partition.size()); //Should be exactly 100, for the util method (but NOT spark .repartition) } } @@ -170,7 +170,7 @@ public class TestRepartitioning extends BaseSparkTest { List> partitionCounts = initial.values().mapPartitionsWithIndex(new CountPartitionsFunction(), true).collect(); - System.out.println(partitionCounts); +// System.out.println(partitionCounts); List> initialExpected = Arrays.asList( new Tuple2<>(0,29), @@ -185,7 +185,7 @@ public class TestRepartitioning extends BaseSparkTest { JavaRDD afterRepartition = SparkUtils.repartitionBalanceIfRequired(initial.values(), Repartition.Always, 2, 112); List> partitionCountsAfter = afterRepartition.mapPartitionsWithIndex(new CountPartitionsFunction(), true).collect(); - System.out.println(partitionCountsAfter); +// System.out.println(partitionCountsAfter); for(Tuple2 t2 : partitionCountsAfter){ assertEquals(2, (int)t2._2()); @@ -219,8 +219,8 @@ public class TestRepartitioning extends BaseSparkTest { } } - System.out.println("min: " + min + "\t@\t" + minIdx); - System.out.println("max: " + max + "\t@\t" + maxIdx); +// System.out.println("min: " + min + "\t@\t" + minIdx); +// System.out.println("max: " + max + "\t@\t" + maxIdx); assertEquals(1, min); assertEquals(2, max); @@ -244,7 +244,7 @@ public class TestRepartitioning extends BaseSparkTest { for (int i = 0; i < 10; i++) { List partition = rdd2.collectPartitions(new int[] {i})[0]; - System.out.println("Partition " + i + " size: " + partition.size()); +// System.out.println("Partition " + i + " size: " + partition.size()); assertTrue(partition.size() >= 90 && partition.size() <= 110); } } diff --git a/nd4s/src/test/scala/org/nd4s/NDArrayExtractionTest.scala b/nd4s/src/test/scala/org/nd4s/NDArrayExtractionTest.scala index 02474f771..65a2bddf2 100644 --- a/nd4s/src/test/scala/org/nd4s/NDArrayExtractionTest.scala +++ b/nd4s/src/test/scala/org/nd4s/NDArrayExtractionTest.scala @@ -123,7 +123,7 @@ trait NDArrayExtractionTestBase extends FlatSpec { self: OrderingForTest => val expectedSlice = expectedArray.slice(0) val actualSlice = expectedArray(0, ->) - Console.println(expectedSlice) +// Console.println(expectedSlice) assert(actualSlice == expectedSlice) } diff --git a/nd4s/src/test/scala/org/nd4s/samediff/TrainingTest.scala b/nd4s/src/test/scala/org/nd4s/samediff/TrainingTest.scala index d51707ee1..553e59df2 100644 --- a/nd4s/src/test/scala/org/nd4s/samediff/TrainingTest.scala +++ b/nd4s/src/test/scala/org/nd4s/samediff/TrainingTest.scala @@ -28,7 +28,7 @@ class TrainingTest extends FlatSpec with Matchers { val unused3 = unused1.div(unused2) val loss1 = add.std("l1", true) val loss2 = mmul.mean("l2") - Console.println(sd.summary) +// Console.println(sd.summary) if (i == 0) { sd.setLossVariables("l1", "l2") sd.createGradFunction() diff --git a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/learning/HistoryProcessorTest.java b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/learning/HistoryProcessorTest.java index a1c28ce60..26ec0708f 100644 --- a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/learning/HistoryProcessorTest.java +++ b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/learning/HistoryProcessorTest.java @@ -43,8 +43,8 @@ public class HistoryProcessorTest { hp.add(a); INDArray[] h = hp.getHistory(); assertEquals(4, h.length); - System.out.println(Arrays.toString(a.shape())); - System.out.println(Arrays.toString(h[0].shape())); +// System.out.println(Arrays.toString(a.shape())); +// System.out.println(Arrays.toString(h[0].shape())); assertEquals( 1, 
h[0].shape()[0]); assertEquals(a.shape()[0], h[0].shape()[1]); assertEquals(a.shape()[1], h[0].shape()[2]); diff --git a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/network/ac/ActorCriticTest.java b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/network/ac/ActorCriticTest.java index dc4814220..c43c26d50 100644 --- a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/network/ac/ActorCriticTest.java +++ b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/network/ac/ActorCriticTest.java @@ -100,8 +100,8 @@ public class ActorCriticTest { double error2 = gradient2 - gradient.getDouble(1); double relError1 = error1 / gradient.getDouble(0); double relError2 = error2 / gradient.getDouble(1); - System.out.println(gradient.getDouble(0) + " " + gradient1 + " " + relError1); - System.out.println(gradient.getDouble(1) + " " + gradient2 + " " + relError2); +// System.out.println(gradient.getDouble(0) + " " + gradient1 + " " + relError1); +// System.out.println(gradient.getDouble(1) + " " + gradient2 + " " + relError2); assertTrue(gradient.getDouble(0) < maxRelError || Math.abs(relError1) < maxRelError); assertTrue(gradient.getDouble(1) < maxRelError || Math.abs(relError2) < maxRelError); } diff --git a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/policy/PolicyTest.java b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/policy/PolicyTest.java index f97457a52..2262f1789 100644 --- a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/policy/PolicyTest.java +++ b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/policy/PolicyTest.java @@ -158,7 +158,7 @@ public class PolicyTest { for (int i = 0; i < 100; i++) { count[policy.nextAction(input)]++; } - System.out.println(count[0] + " " + count[1] + " " + count[2] + " " + count[3]); +// System.out.println(count[0] + " " + count[1] + " " + count[2] + " " + count[3]); assertTrue(count[0] < 20); assertTrue(count[1] < 30); assertTrue(count[2] < 40); From 22c7aa9acf863966edf8984323f4b56385d790b1 Mon Sep 17 00:00:00 2001 From: Yurii Shyrma Date: Tue, 18 Feb 2020 07:58:01 +0200 Subject: [PATCH 12/19] Shyrma mkl matmul (#250) * - provide matmul code based on mkl api Signed-off-by: Yurii * - correct typo in mkl matmul op Signed-off-by: Yurii * - take into account empty arrays in mkl matmul op Signed-off-by: Yurii * - fix bug in mkl matmul and group all matmul tests in one file Signed-off-by: Yurii --- libnd4j/CMakeLists.txt.mkldnn.in | 2 +- .../ops/declarable/generic/blas/matmul.cpp | 249 +++--- .../ops/declarable/platform/mkldnn/matmul.cpp | 294 +++++++ .../declarable/platform/mkldnn/mkldnnUtils.h | 2 + .../layers_tests/DeclarableOpsTests1.cpp | 34 - .../layers_tests/DeclarableOpsTests12.cpp | 15 +- .../layers_tests/DeclarableOpsTests14.cpp | 807 ++++++++++++++++++ .../layers_tests/DeclarableOpsTests2.cpp | 21 - .../layers_tests/DeclarableOpsTests3.cpp | 114 --- .../layers_tests/DeclarableOpsTests4.cpp | 71 -- .../layers_tests/DeclarableOpsTests7.cpp | 14 - .../layers_tests/DeclarableOpsTests9.cpp | 521 ----------- .../tests_cpu/layers_tests/MklDnnTests.cpp | 5 +- 13 files changed, 1229 insertions(+), 920 deletions(-) create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp diff --git a/libnd4j/CMakeLists.txt.mkldnn.in b/libnd4j/CMakeLists.txt.mkldnn.in index 3de36dfde..e67b3554b 100644 --- a/libnd4j/CMakeLists.txt.mkldnn.in +++ b/libnd4j/CMakeLists.txt.mkldnn.in @@ -5,7 +5,7 @@ project(mkldnn-download NONE) include(ExternalProject) ExternalProject_Add(mkldnn GIT_REPOSITORY https://github.com/intel/mkl-dnn.git - 
GIT_TAG v1.1.3 + GIT_TAG v1.2 SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build" CONFIGURE_COMMAND "" diff --git a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp index 3dd64a113..a673b1988 100644 --- a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp @@ -20,7 +20,7 @@ // @author Yurii Shyrma (iuriish@yahoo.com), fully rewritten // -#include +#include #if NOT_EXCLUDED(OP_matmul) #include @@ -29,142 +29,128 @@ namespace nd4j { namespace ops { - CUSTOM_OP_IMPL(matmul, 2, 1, false, 0, -2) { - auto x = INPUT_VARIABLE(0); - auto y = INPUT_VARIABLE(1); - auto z = OUTPUT_VARIABLE(0); +////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(matmul, 2, 1, false, 0, -2) { - const int iSize = (int) block.getIArguments()->size(); - int transX = iSize > 0 ? INT_ARG(0) : 0; - int transY = iSize > 1 ? INT_ARG(1) : 0; - const int transZ = iSize > 2 ? INT_ARG(2) : 0; + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + auto z = OUTPUT_VARIABLE(0); - const int xRank = x->rankOf(); - const int yRank = y->rankOf(); - const int zRank = z->rankOf(); + const int iSize = (int) block.getIArguments()->size(); + int transX = iSize > 0 ? INT_ARG(0) : 0; + int transY = iSize > 1 ? INT_ARG(1) : 0; + const int transZ = iSize > 2 ? INT_ARG(2) : 0; - if (transZ) { - x = INPUT_VARIABLE(1); - y = INPUT_VARIABLE(0); - bool temp = transX; - transX = !transY; - transY = !temp; - } + const int xRank = x->rankOf(); + const int yRank = y->rankOf(); + const int zRank = z->rankOf(); - const int xLastDim = transX ? -2 : -1; - const int yLastDim = transY ? -2 : -1; - const int xLastButOneDim = transX ? -1 : -2; - const int yLastButOneDim = transY ? -1 : -2; + if (transZ) { + x = INPUT_VARIABLE(1); + y = INPUT_VARIABLE(0); + bool temp = transX; + transX = !transY; + transY = !temp; + } - // ******* input validation ******* // - REQUIRE_TRUE(xRank > 0 && yRank > 0, 0, - "MATMUL OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", - xRank, yRank); + const int xLastDim = transX ? -2 : -1; + const int yLastDim = transY ? -2 : -1; + const int xLastButOneDim = transX ? -1 : -2; + const int yLastButOneDim = transY ? -1 : -2; - if (xRank == 1 && yRank == 1) { // dot case, output is scalar (or vector with length = 1) - REQUIRE_TRUE(x->lengthOf() == y->lengthOf(), 0, - "MATMUL OP: since input arrays are vectors they must have the same length, but got x length = %i, y length = %i !", - x->lengthOf(), y->lengthOf()); - } else if (xRank == 1 && yRank == 2) { // vector x matrix, i.e. [4] x [4,5] = [5], output is vector - REQUIRE_TRUE(x->lengthOf() == y->sizeAt(yLastButOneDim), 0, - "MATMUL OP: input arrays have inconsistent shapes for vector-matrix product: x %s, y %s !", - ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); - } else if (xRank == 2 && yRank == 1) { // matrix x vector , i.e. 
[4,5] x [5] = [4], output is vector - REQUIRE_TRUE(x->sizeAt(xLastDim) == y->lengthOf(), 0, - "MATMUL OP: input arrays have inconsistent shapes for matrix-vector product: x %s, y %s !", - ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); - } else { - REQUIRE_TRUE(xRank == yRank && yRank == zRank, 0, - "MATMUL OP: input and output arrays must have the same rank, but got instead: x rank = %i, y rank = %i, z rank = %i !", - xRank, yRank, zRank); - REQUIRE_TRUE(x->sizeAt(xLastDim) == y->sizeAt(yLastButOneDim) && - x->sizeAt(xLastButOneDim) == z->sizeAt(-2) && y->sizeAt(yLastDim) == z->sizeAt(-1), 0, - "MATMUL OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", - ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), - ShapeUtils::shapeAsString(z).c_str()); + // ******* input validation ******* // + REQUIRE_TRUE(xRank > 0 && yRank > 0, 0, "MATMUL OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", xRank, yRank); - if (xRank > 2) // outer dims must be the same - for (int i = 0; i < xRank - 2; ++i) - REQUIRE_TRUE(x->sizeAt(i) == y->sizeAt(i) && y->sizeAt(i) == z->sizeAt(i), 0, - "MATMUL OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", - ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), - ShapeUtils::shapeAsString(z).c_str()); - } - // ******* end of input validation ******* // + if (xRank == 1 && yRank == 1) { // dot case, output is scalar (or vector with length = 1) + REQUIRE_TRUE(x->lengthOf() == y->lengthOf(), 0, "MATMUL OP: since input arrays are vectors they must have the same length, but got x length = %i, y length = %i !", x->lengthOf(), y->lengthOf()); + } else if (xRank == 1 && yRank == 2) { // vector x matrix, i.e. [4] x [4,5] = [5], output is vector + REQUIRE_TRUE(x->lengthOf() == y->sizeAt(yLastButOneDim), 0, "MATMUL OP: input arrays have inconsistent shapes for vector-matrix product: x %s, y %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); + } else if (xRank == 2 && yRank == 1) { // matrix x vector , i.e. 
[4,5] x [5] = [4], output is vector + REQUIRE_TRUE(x->sizeAt(xLastDim) == y->lengthOf(), 0, "MATMUL OP: input arrays have inconsistent shapes for matrix-vector product: x %s, y %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); + } else { + REQUIRE_TRUE(xRank == yRank && yRank == zRank, 0, "MATMUL OP: input and output arrays must have the same rank, but got instead: x rank = %i, y rank = %i, z rank = %i !", xRank, yRank, zRank); + REQUIRE_TRUE(x->sizeAt(xLastDim) == y->sizeAt(yLastButOneDim) && x->sizeAt(xLastButOneDim) == z->sizeAt(-2) && y->sizeAt(yLastDim) == z->sizeAt(-1), 0, "MATMUL OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), ShapeUtils::shapeAsString(z).c_str()); - MmulHelper::matmul(x, y, z, transX, transY); + if (xRank > 2) // outer dims must be the same + for (int i = 0; i < xRank - 2; ++i) + REQUIRE_TRUE(x->sizeAt(i) == y->sizeAt(i) && y->sizeAt(i) == z->sizeAt(i), 0, "MATMUL OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), ShapeUtils::shapeAsString(z).c_str()); + } + // ******* end of input validation ******* // - return Status::OK(); - } + MmulHelper::matmul(x, y, z, transX, transY); - DECLARE_SYN(mMul, matmul); + return Status::OK(); +} - DECLARE_SYN(mmul, matmul); +DECLARE_SYN(mMul, matmul); - DECLARE_SYN(gemm, matmul); +DECLARE_SYN(mmul, matmul); - DECLARE_SYN(gemv, matmul); +DECLARE_SYN(gemm, matmul); - DECLARE_SYN(dot, matmul); +DECLARE_SYN(gemv, matmul); +DECLARE_SYN(dot, matmul); - DECLARE_SHAPE_FN(matmul) { +////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(matmul) { - auto xShapeInfo = inputShape->at(0); - auto yShapeInfo = inputShape->at(1); + auto xShapeInfo = inputShape->at(0); + auto yShapeInfo = inputShape->at(1); - const int iSize = (int) block.getIArguments()->size(); - int transX = iSize > 0 ? INT_ARG(0) : 0; - int transY = iSize > 1 ? INT_ARG(1) : 0; - const int transZ = iSize > 2 ? INT_ARG(2) : 0; + const int iSize = (int) block.getIArguments()->size(); + int transX = iSize > 0 ? INT_ARG(0) : 0; + int transY = iSize > 1 ? INT_ARG(1) : 0; + const int transZ = iSize > 2 ? 
INT_ARG(2) : 0; - REQUIRE_TRUE(xShapeInfo[0] > 0 && yShapeInfo[0] > 0, 0, - "MATMUL OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", - xShapeInfo[0], yShapeInfo[0]); + REQUIRE_TRUE(xShapeInfo[0] > 0 && yShapeInfo[0] > 0, 0, + "MATMUL OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", + xShapeInfo[0], yShapeInfo[0]); - if (transZ) { - xShapeInfo = inputShape->at(1); - yShapeInfo = inputShape->at(0); - bool temp = transX; - transX = !transY; - transY = !temp; - } + if (transZ) { + xShapeInfo = inputShape->at(1); + yShapeInfo = inputShape->at(0); + bool temp = transX; + transX = !transY; + transY = !temp; + } - auto zShapeOnly = ShapeUtils::evalShapeForMatmul(xShapeInfo, yShapeInfo, transX, transY); + auto zShapeOnly = ShapeUtils::evalShapeForMatmul(xShapeInfo, yShapeInfo, transX, transY); - auto dtypeX = ArrayOptions::dataType(xShapeInfo); - auto dtypeY = ArrayOptions::dataType(yShapeInfo); + auto dtypeX = ArrayOptions::dataType(xShapeInfo); + auto dtypeY = ArrayOptions::dataType(yShapeInfo); - auto xOrder = shape::order(xShapeInfo); - auto yOrder = shape::order(yShapeInfo); - auto zOrder = xOrder == 'c' && yOrder == 'c' ? 'c' : 'f'; + auto xOrder = shape::order(xShapeInfo); + auto yOrder = shape::order(yShapeInfo); + auto zOrder = xOrder == 'c' && yOrder == 'c' ? 'c' : 'f'; - // we just pick the higher data type out of X and Y - auto dtypeZ = dtypeX > dtypeY ? dtypeX : dtypeY; + // we just pick the higher data type out of X and Y + auto dtypeZ = dtypeX > dtypeY ? dtypeX : dtypeY; - auto newShape = ConstantShapeHelper::getInstance()->createShapeInfo(dtypeZ, zOrder, zShapeOnly); - return SHAPELIST(newShape); - } + auto newShape = ConstantShapeHelper::getInstance()->createShapeInfo(dtypeZ, zOrder, zShapeOnly); + return SHAPELIST(newShape); +} - DECLARE_TYPES(matmul) { - getOpDescriptor() - ->setAllowedInputTypes(0, {ALL_FLOATS}) - ->setAllowedInputTypes(1, {ALL_FLOATS}) - ->setAllowedOutputTypes(0, {ALL_FLOATS}); - } +////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(matmul) { + getOpDescriptor() + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}); +} +////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(matmul_bp, 3, 2, false, 0, -2) { + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + auto eps = INPUT_VARIABLE(2); + auto dldx = OUTPUT_VARIABLE(0); + auto dldy = OUTPUT_VARIABLE(1); - CUSTOM_OP_IMPL(matmul_bp, 3, 2, false, 0, -2) { - auto x = INPUT_VARIABLE(0); - auto y = INPUT_VARIABLE(1); - auto eps = INPUT_VARIABLE(2); - auto dldx = OUTPUT_VARIABLE(0); - auto dldy = OUTPUT_VARIABLE(1); - - const int iSize = (int) block.getIArguments()->size(); - int transX = iSize > 0 ? INT_ARG(0) : 0; - int transY = iSize > 1 ? INT_ARG(1) : 0; - const int transZ = iSize > 2 ? INT_ARG(2) : 0; + const int iSize = (int) block.getIArguments()->size(); + int transX = iSize > 0 ? INT_ARG(0) : 0; + int transY = iSize > 1 ? INT_ARG(1) : 0; + const int transZ = iSize > 2 ? 
INT_ARG(2) : 0; /* In: x=[a,b], y=[b,c] @@ -177,34 +163,35 @@ F F T [a,b] [b,c] [c,a] [c,a] */ - nd4j::ops::matmul op; - op.execute({eps, y}, {dldx}, {}, {transZ, !transY, transX}, {}); - op.execute({x, eps}, {dldy}, {}, {!transX, transZ, transY}, {}); + nd4j::ops::matmul op; + op.execute({eps, y}, {dldx}, {}, {transZ, !transY, transX}, {}); + op.execute({x, eps}, {dldy}, {}, {!transX, transZ, transY}, {}); - return Status::OK(); - } + return Status::OK(); +} +////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(matmul_bp) { + Nd4jLong *xShapeInfo; + Nd4jLong *yShapeInfo; - DECLARE_SHAPE_FN(matmul_bp) { - Nd4jLong *xShapeInfo; - Nd4jLong *yShapeInfo; + COPY_SHAPE(inputShape->at(0), xShapeInfo); + COPY_SHAPE(inputShape->at(1), yShapeInfo); - COPY_SHAPE(inputShape->at(0), xShapeInfo); - COPY_SHAPE(inputShape->at(1), yShapeInfo); + return SHAPELIST(CONSTANT(xShapeInfo), CONSTANT(yShapeInfo)); +} - return SHAPELIST(CONSTANT(xShapeInfo), CONSTANT(yShapeInfo)); - } +////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(matmul_bp) { + getOpDescriptor() + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedInputTypes(2, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}) + ->setAllowedOutputTypes(1, {ALL_FLOATS}); +} - DECLARE_TYPES(matmul_bp) { - getOpDescriptor() - ->setAllowedInputTypes(0, {ALL_FLOATS}) - ->setAllowedInputTypes(1, {ALL_FLOATS}) - ->setAllowedInputTypes(2, {ALL_FLOATS}) - ->setAllowedOutputTypes(0, {ALL_FLOATS}) - ->setAllowedOutputTypes(1, {ALL_FLOATS}); - } - - } +} } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp new file mode 100644 index 000000000..f47d08b7a --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp @@ -0,0 +1,294 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include
+#include
+#include
+
+#include
+#include "mkldnnUtils.h"
+#include
+
+
+namespace nd4j {
+namespace ops {
+namespace platforms {
+
+//////////////////////////////////////////////////////////////////////////
+static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const bool transX, const bool transY) {
+
+    // mkl works with the following
+    // [M,K] x [K,N] = [M,N]
+    // [bS, M,K] x [bS, K,N] = [bS, M,N]
+
+    // possible input cases not supported by mkl; for these we perform permute/reshape procedures in order to fit the requirements
+    // [4] x [4] = [1] --> [1,4] x [4,1] = [1,1]
+    // [4] x [4,5] = [5] --> [1,4] x [4,5] = [1,5]
+    // [4,5] x [5] = [4] --> [4,5] x [5,1] = [4,1]
+    // [2,3, 4,5] x [2,3, 5,4] = [2,3, 4,4] --> [6, 4,5] x [6, 5,4] = [6, 4,4]
+    // [2,2,3, 4,5] x [2,2,3, 5,4] = [2,2,3, 4,4] --> [12, 4,5] x [12, 5,4] = [12, 4,4]
+
+    const auto xRank = x->rankOf();
+    const auto yRank = y->rankOf();
+    const auto zRank = z->rankOf();
+
+    std::vector<int> permut;
+
+    // fill permutation vector appropriately if transposition is required
+    if((transX && xRank > 1) || (transY && yRank > 1)) {
+
+        const int rank = xRank >= yRank ? xRank : yRank;
+        permut.resize(rank);
+        std::iota(std::begin(permut), std::end(permut), 0);
+        permut[rank-2] = rank - 1;
+        permut[rank-1] = rank - 2;
+    }
+
+    const NDArray* xT = (transX && xRank > 1) ? new NDArray(x->permute(permut)) : x;
+    const NDArray* yT = (transY && yRank > 1) ? new NDArray(y->permute(permut)) : y;
+
+    const NDArray* xTR = xRank <= 3 ? xT : new NDArray(xT->reshape(xT->ordering(), {xT->lengthOf() / (xT->sizeAt(-2) * xT->sizeAt(-1)), xT->sizeAt(-2), xT->sizeAt(-1)}));
+    const NDArray* yTR = xRank <= 3 ? yT : new NDArray(yT->reshape(yT->ordering(), {yT->lengthOf() / (yT->sizeAt(-2) * yT->sizeAt(-1)), yT->sizeAt(-2), yT->sizeAt(-1)}));
+    NDArray* zR = xRank <= 3 ? z : new NDArray(z->reshape(z->ordering(), {z->lengthOf() / (z->sizeAt(-2) * z->sizeAt(-1)), z->sizeAt(-2), z->sizeAt(-1)})/*, false*/);
+
+    // [M,K] x [K,N] = [M,N]
+    const int M  = (xRank > 1) ? xTR->sizeAt(-2) : 1;
+    const int K  = (xRank > 1) ? xTR->sizeAt(-1) : xTR->lengthOf();
+    const int N  = (yRank > 1) ? yTR->sizeAt(-1) : 1;
+    const int bS = (xRank > 2) ? xTR->sizeAt(0)  : 1;   // [bS, M,K] x [bS, K,N] = [bS, M,N]
+
+    dnnl::memory::dims xShape = xRank < 3 ? dnnl::memory::dims({M, K}) : dnnl::memory::dims({bS, M, K});
+    dnnl::memory::dims yShape = xRank < 3 ? dnnl::memory::dims({K, N}) : dnnl::memory::dims({bS, K, N});
+    dnnl::memory::dims zShape = xRank < 3 ? dnnl::memory::dims({M, N}) : dnnl::memory::dims({bS, M, N});
+
+    dnnl::memory::format_tag format = xRank < 3 ?
dnnl::memory::format_tag::ab : dnnl::memory::format_tag::abc; + + // x type + dnnl::memory::data_type xType; + if(x->dataType() == DataType::FLOAT32) + xType = dnnl::memory::data_type::f32; + else if(x->dataType() == DataType::HALF) + xType = dnnl::memory::data_type::f16; + else if(x->dataType() == DataType::BFLOAT16) + xType = dnnl::memory::data_type::bf16; + else if(x->dataType() == DataType::UINT8) + xType = dnnl::memory::data_type::u8; + else + xType = dnnl::memory::data_type::s8; + + // y type + dnnl::memory::data_type yType = xType; + if(y->dataType() == DataType::UINT8) + yType = dnnl::memory::data_type::u8; + else if(y->dataType() == DataType::INT8) + yType = dnnl::memory::data_type::s8; + + // z type + dnnl::memory::data_type zType = xType; + if(z->dataType() == DataType::FLOAT32) + zType = dnnl::memory::data_type::f32; + else if(z->dataType() == DataType::INT32) + zType = dnnl::memory::data_type::s32; + else if(z->dataType() == DataType::UINT8) + zType = dnnl::memory::data_type::u8; + else if(z->dataType() == DataType::INT8) + zType = dnnl::memory::data_type::s8; + + // memory descriptors for arrays + + // x + dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, dnnl::memory::format_tag::any); + dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, format); + if(xTR->ews() != 1 || xTR->ordering() != 'c') { + x_user_md.data.format_kind = dnnl_blocked; // overrides format + x_user_md.data.format_desc.blocking.strides[0] = xRank == 1 ? 1 : xTR->strideAt(0); + x_user_md.data.format_desc.blocking.strides[1] = xRank == 1 ? xTR->strideAt(0) : xTR->strideAt(1); + if(xRank > 2) + x_user_md.data.format_desc.blocking.strides[2] = xTR->strideAt(2); + } + + // y + dnnl::memory::desc y_mkl_md = dnnl::memory::desc(yShape, yType, dnnl::memory::format_tag::any); + dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, format); + if(yTR->ews() != 1 || yTR->ordering() != 'c') { + y_user_md.data.format_kind = dnnl_blocked; // overrides format + y_user_md.data.format_desc.blocking.strides[0] = yRank == 1 ? 1 : yTR->strideAt(0); + y_user_md.data.format_desc.blocking.strides[1] = yRank == 1 ? yTR->strideAt(0) : yTR->strideAt(1); + if(yRank > 2) + y_user_md.data.format_desc.blocking.strides[2] = yTR->strideAt(2); + } + + // z + dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, dnnl::memory::format_tag::any); + dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, format); + if(zR->ews() != 1 || zR->ordering() != 'c') { + z_user_md.data.format_kind = dnnl_blocked; // overrides format + z_user_md.data.format_desc.blocking.strides[0] = zRank == 1 ? 1 : zR->strideAt(0); + z_user_md.data.format_desc.blocking.strides[1] = zRank == 1 ? 
zR->strideAt(0) : zR->strideAt(1);
+        if(zRank > 2)
+            z_user_md.data.format_desc.blocking.strides[2] = zR->strideAt(2);
+    }
+
+    auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine());
+
+    // Create attributes (to handle alpha and beta if necessary)
+    dnnl::primitive_attr attr;  // it is empty since we have the usual values for alpha (=1) and beta (=0)
+
+    // operation primitive description
+    dnnl::matmul::desc op_desc(x_mkl_md, y_mkl_md, z_mkl_md);
+    dnnl::matmul::primitive_desc op_prim_desc(op_desc, attr, engine);
+
+    // arguments (memory buffers) necessary for calculations
+    std::unordered_map<int, dnnl::memory> args;
+
+    dnnl::stream stream(engine);
+
+    // provide memory buffers and check whether reorder is required
+
+    // input
+    auto x_user_mem = dnnl::memory(x_user_md, engine, xTR->getBuffer());
+    const bool xReorder = op_prim_desc.src_desc() != x_user_mem.get_desc();
+    auto x_mkl_mem = xReorder ? dnnl::memory(op_prim_desc.src_desc(), engine) : x_user_mem;
+    if (xReorder)
+        dnnl::reorder(x_user_mem, x_mkl_mem).execute(stream, x_user_mem, x_mkl_mem);
+    args[DNNL_ARG_SRC] = x_mkl_mem;
+
+    // y
+    auto y_user_mem = dnnl::memory(y_user_md, engine, yTR->getBuffer());
+    const bool yReorder = op_prim_desc.weights_desc() != y_user_mem.get_desc();
+    auto y_mkl_mem = yReorder ? dnnl::memory(op_prim_desc.weights_desc(), engine) : y_user_mem;
+    if (yReorder)
+        dnnl::reorder(y_user_mem, y_mkl_mem).execute(stream, y_user_mem, y_mkl_mem);
+    args[DNNL_ARG_WEIGHTS] = y_mkl_mem;
+
+    // z
+    auto z_user_mem = dnnl::memory(z_user_md, engine, zR->getBuffer());
+    const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc();
+    auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem;
+    args[DNNL_ARG_DST] = z_mkl_mem;
+
+    // run calculations
+    dnnl::matmul(op_prim_desc).execute(stream, args);
+
+    // reorder outputs if necessary
+    if (zReorder)
+        dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem);
+
+    stream.wait();
+
+    if(zR->getBuffer() != z->getBuffer())
+        z->assign(zR);
+
+    if(zR != z)
+        delete zR;
+    if(xTR != xT)
+        delete xTR;
+    if(xT != x)
+        delete xT;
+    if(yTR != yT)
+        delete yTR;
+    if(yT != y)
+        delete yT;
+
+    // shape::printArray(z_mkl_mem.map_data(),8);
+}
+
+//////////////////////////////////////////////////////////////////////////
+PLATFORM_IMPL(matmul, ENGINE_CPU) {
+
+    auto x = INPUT_VARIABLE(0);
+    auto y = INPUT_VARIABLE(1);
+    auto z = OUTPUT_VARIABLE(0);
+
+    if(x->isEmpty() || y->isEmpty())
+        return Status::OK();
+
+    const int iSize = (int) block.getIArguments()->size();
+    int transX = iSize > 0 ? INT_ARG(0) : 0;
+    int transY = iSize > 1 ? INT_ARG(1) : 0;
+    const int transZ = iSize > 2 ? INT_ARG(2) : 0;
+
+    const int xRank = x->rankOf();
+    const int yRank = y->rankOf();
+    const int zRank = z->rankOf();
+
+    if (transZ) {
+        x = INPUT_VARIABLE(1);
+        y = INPUT_VARIABLE(0);
+        bool temp = transX;
+        transX = !transY;
+        transY = !temp;
+    }
+
+    const int xLastDim = transX ? -2 : -1;
+    const int yLastDim = transY ? -2 : -1;
+    const int xLastButOneDim = transX ? -1 : -2;
+    const int yLastButOneDim = transY ?
-1 : -2; + + // ******* input validation ******* // + REQUIRE_TRUE(xRank > 0 && yRank > 0, 0, "MATMUL MKLDNN OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", xRank, yRank); + + if (xRank == 1 && yRank == 1) { // dot case, output is scalar (or vector with length = 1) + REQUIRE_TRUE(x->lengthOf() == y->lengthOf(), 0,"MATMUL MKLDNN OP: since input arrays are vectors they must have the same length, but got x length = %i, y length = %i !",x->lengthOf(), y->lengthOf()); + } else if (xRank == 1 && yRank == 2) { // vector x matrix, i.e. [4] x [4,5] = [5], output is vector + REQUIRE_TRUE(x->lengthOf() == y->sizeAt(yLastButOneDim), 0, "MATMUL MKLDNN OP: input arrays have inconsistent shapes for vector-matrix product: x %s, y %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); + } else if (xRank == 2 && yRank == 1) { // matrix x vector , i.e. [4,5] x [5] = [4], output is vector + REQUIRE_TRUE(x->sizeAt(xLastDim) == y->lengthOf(), 0, "MATMUL MKLDNN OP: input arrays have inconsistent shapes for matrix-vector product: x %s, y %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); + } else { + REQUIRE_TRUE(xRank == yRank && yRank == zRank, 0, "MATMUL MKLDNN OP: input and output arrays must have the same rank, but got instead: x rank = %i, y rank = %i, z rank = %i !", xRank, yRank, zRank); + REQUIRE_TRUE(x->sizeAt(xLastDim) == y->sizeAt(yLastButOneDim) && x->sizeAt(xLastButOneDim) == z->sizeAt(-2) && y->sizeAt(yLastDim) == z->sizeAt(-1), 0, "MATMUL MKLDNN OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), ShapeUtils::shapeAsString(z).c_str()); + + if (xRank > 2) // outer dims must be the same + for (int i = 0; i < xRank - 2; ++i) + REQUIRE_TRUE(x->sizeAt(i) == y->sizeAt(i) && y->sizeAt(i) == z->sizeAt(i), 0, "MATMUL MKLDNN OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), ShapeUtils::shapeAsString(z).c_str()); + } + // ******* end of input validation ******* // + + matmulMKLDNN(x, y, z, transX, transY); + + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_CHECK(matmul, ENGINE_CPU) { + + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + + auto z = INPUT_VARIABLE(0); + + const DataType xType = x->dataType(); + const DataType yType = y->dataType(); + const DataType zType = z->dataType(); + + + return block.isUseMKLDNN() && + ( + (xType==DataType::FLOAT32 && yType==DataType::FLOAT32 && zType==DataType::FLOAT32) || + (xType==DataType::HALF && yType==DataType::HALF && zType==DataType::FLOAT32) || + (xType==DataType::BFLOAT16 && yType==DataType::BFLOAT16 && zType==DataType::BFLOAT16) || + ((xType==DataType::UINT8 || xType==DataType::INT8) && (yType==DataType::UINT8 || yType==DataType::INT8) && (zType==DataType::UINT8 || zType==DataType::INT8 || zType==DataType::INT32 || zType==DataType::FLOAT32)) + ); +} + + +} +} +} diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h index c8b34a6c0..10adf533d 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h +++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h @@ -84,6 +84,8 @@ namespace nd4j{ DECLARE_PLATFORM(depthwise_conv2d, 
ENGINE_CPU); DECLARE_PLATFORM(depthwise_conv2d_bp, ENGINE_CPU); + + DECLARE_PLATFORM(matmul, ENGINE_CPU); } } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index dee410a21..507a507af 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -1341,40 +1341,6 @@ TEST_F(DeclarableOpsTests1, MultiplyScalarScalar1) { delete exp; } -TEST_F(DeclarableOpsTests1, TestMatMul1) { - auto x = NDArrayFactory::create_('c', {3, 5}); - x->linspace(1); - - auto y = NDArrayFactory::create_('c', {5, 3}); - y->linspace(1); - - float _expB[]{135.0f, 310.0f, 485.0f, 150.0f, 350.0f, 550.0f, 165.0f, 390.0f, 615.0f}; - Nd4jLong _expS[] {2, 3, 3, 1, 3, 0, 1, 102}; // expected shape - ArrayOptions::setDataType(_expS, nd4j::DataType::FLOAT32); - NDArray exp(_expB, _expS); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - variableSpace->putVariable(-2, y); - variableSpace->putVariable(1, new Variable()); - - auto block = new Context(1, variableSpace, false); - block->fillInputs({-1, -2}); - - nd4j::ops::matmul op; - - Nd4jStatus status = op.execute(block); - ASSERT_EQ(ND4J_STATUS_OK, status); - ASSERT_TRUE(variableSpace->hasVariable(1)); - - auto result = variableSpace->getVariable(1)->getNDArray(); - - ASSERT_TRUE(result->equalsTo(&exp)); - - delete block; - delete variableSpace; -} - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestSoftMax_bp_1) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp index dc672d8e6..e5eaa9a6a 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp @@ -2800,16 +2800,9 @@ TEST_F(DeclarableOpsTests12, QR_Test_1_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, QR_Test_2) { - auto in = NDArrayFactory::create('c', {5,3}, { - 12., -51., 4., 6., 167., -68., -4., 24., -41., -1., 1., 0., 2., 0., 3. 
- }); - auto expQ = NDArrayFactory::create('c', {5, 3}, { - 0.8464148, 0.3912908, -0.3431241, -0.42320737, -0.9040873, 0.02927014, 0.28213826, -0.17042054, -0.93285596, 0.07053456, -0.01404065, 0.00109937, -0.14106913, 0.0166551, 0.10577161 - }); - - auto expR = NDArrayFactory::create('c', {3,3}, { - -14.177447, -20.666622, 13.401566, 0., -175.04254, 70.080315, 0., 0., 35.201546 - }); + auto in = NDArrayFactory::create('c', {5,3}, {12., -51., 4., 6., 167., -68., -4., 24., -41., -1., 1., 0., 2., 0., 3.}); + auto expQ = NDArrayFactory::create('c', {5, 3}, {0.8464148,0.3912908,-0.3431241,-0.42320737, -0.9040873,0.02927014,0.28213826, -0.17042054, -0.93285596,0.07053456, -0.01404065,0.00109937,-0.14106913,0.0166551,0.10577161}); + auto expR = NDArrayFactory::create('c', {3,3}, {-14.177447,-20.666622,13.401566,0.,-175.04254,70.080315,0.,0.,35.201546}); nd4j::ops::qr op; auto res = op.evaluate({&in}, {}, {}, {false}); @@ -2819,8 +2812,6 @@ TEST_F(DeclarableOpsTests12, QR_Test_2) { auto r = res->at(1); ASSERT_TRUE(q->isSameShape(expQ)); ASSERT_TRUE(r->isSameShape(expR)); -// q->printIndexedBuffer("Orthogonal 5x5"); -// r->printIndexedBuffer("Upper triangular 5x3"); nd4j::ops::matmul opMul; auto res2 = opMul.evaluate({q, r}); //MmulHelper::matmul(q, r, &in, false, false); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 7e3fae4af..25e2d383d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -682,3 +682,810 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest8) { x.applyTrueBroadcast(BroadcastOpsTuple::Subtract(), y, z); ASSERT_EQ(e, z); } + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test1) { + + auto x = NDArrayFactory::create('c', {3, 4}); + auto y = NDArrayFactory::create('c', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123., 40., 92., 144., 45., 105., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test2) { + + auto x = NDArrayFactory::create('c', {3, 4}); + auto y = NDArrayFactory::create('f', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test3) { + + auto x = NDArrayFactory::create('f', {3, 4}); + auto y = NDArrayFactory::create('c', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + 
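+//////////////////////////////////////////////////////////////////////
+// The integer arguments passed to matmul throughout these tests follow the convention of the op
+// implementation: iArgs = {transX, transY, transZ}. A minimal usage sketch of that convention
+// (illustrative only, assuming the same NDArrayFactory/evaluate API as the surrounding tests):
+//
+//     auto a = NDArrayFactory::create<double>('c', {2, 3});   // a has shape [2,3]
+//     auto b = NDArrayFactory::create<double>('c', {2, 5});   // b has shape [2,5]
+//     a.linspace(1.);
+//     b.linspace(0.5, 0.5);
+//     nd4j::ops::matmul op;
+//     auto res = op.evaluate({&a, &b}, {}, {1, 0});           // transX=1, transY=0 -> computes a^T x b, shape [3,5]
+//     auto z = res->at(0);                                    // z(i,j) = sum_k a(k,i) * b(k,j)
+//     delete res;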
+////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test4) { + + auto x = NDArrayFactory::create ('f', {3, 4}); + auto y = NDArrayFactory::create('f', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test5) { + + auto x = NDArrayFactory::create('c', {4, 3}); + auto y = NDArrayFactory::create('c', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {83., 94., 105., 94., 107., 120., 105., 120., 135.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test6) { + + auto x = NDArrayFactory::create('c', {4, 3}); + auto y = NDArrayFactory::create('f', {3, 4}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 40., 45., 79., 92., 105., 123., 144., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test7) { + + auto x = NDArrayFactory::create('c', {5, 3,4}); + auto y = NDArrayFactory::create('f', {5, 3,4}); + auto exp = NDArrayFactory::create('f',{5, 3,3}, {3. , 84.6, 281.4, 593.4, 1020.6, 7. , 107.8, 323.8, 655. , 1101.4,11. , 131. , 366.2, 716.6, 1182.2, + 7. , 107.8, 323.8, 655. , 1101.4,17.4, 137.4, 372.6, 723. , 1188.6,27.8, 167. , 421.4, 791. , 1275.8, + 11. , 131. , 366.2, 716.6, 1182.2,27.8, 167. , 421.4, 791. , 1275.8,44.6, 203. , 476.6, 865.4, 1369.4,}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test8) { + + auto x = NDArrayFactory::create('c', {2,5, 3,4}); + auto y = NDArrayFactory::create('f', {2,5, 3,4}); + auto exp = NDArrayFactory::create('f',{2,5, 3,3}, {3. , 1563. , 84.6, 2220.6, 281.4, 2993.4, 593.4, 3881.4,1020.6, 4884.6, 7. , 1663. , 107.8, 2339.8, 323.8, 3131.8, 655. , 4039. ,1101.4, 5061.4, + 11. , 1763. , 131. , 2459. , 366.2, 3270.2, 716.6, 4196.6,1182.2, 5238.2, 7. , 1663. , 107.8, 2339.8, 323.8, 3131.8, 655. , 4039. ,1101.4, 5061.4, + 17.4, 1769.4, 137.4, 2465.4, 372.6, 3276.6, 723. , 4203. ,1188.6, 5244.6, 27.8, 1875.8, 167. , 2591. , 421.4, 3421.4, 791. , 4367. ,1275.8, 5427.8, + 11. , 1763. , 131. , 2459. , 366.2, 3270.2, 716.6, 4196.6,1182.2, 5238.2, 27.8, 1875.8, 167. , 2591. , 421.4, 3421.4, 791. , 4367. 
,1275.8, 5427.8, + 44.6, 1988.6, 203. , 2723. , 476.6, 3572.6, 865.4, 4537.4,1369.4, 5617.4}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test9) { + + auto x = NDArrayFactory::create('c', {2,5, 4,3}); + auto y = NDArrayFactory::create('f', {2,5, 3,4}); + auto exp = NDArrayFactory::create('f',{2,5, 3,3}, {7. , 1639. , 103. , 2311. , 314.2, 3098.2, 640.6, 4000.6,1082.2, 5018.2, 8. , 1664. , 108.8, 2340.8, 324.8, 3132.8, 656. , 4040. ,1102.4, 5062.4, + 9. , 1689. , 114.6, 2370.6, 335.4, 3167.4, 671.4, 4079.4,1122.6, 5106.6, 15.8, 1743.8, 131. , 2435. , 361.4, 3241.4, 707. , 4163. ,1167.8, 5199.8, + 18.4, 1770.4, 138.4, 2466.4, 373.6, 3277.6, 724. , 4204. ,1189.6, 5245.6, 21. , 1797. , 145.8, 2497.8, 385.8, 3313.8, 741. , 4245. ,1211.4, 5291.4, + 24.6, 1848.6, 159. , 2559. , 408.6, 3384.6, 773.4, 4325.4,1253.4, 5381.4, 28.8, 1876.8, 168. , 2592. , 422.4, 3422.4, 792. , 4368. ,1276.8, 5428.8, + 33. , 1905. , 177. , 2625. , 436.2, 3460.2, 810.6, 4410.6,1300.2, 5476.2}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +TEST_F(DeclarableOpsTests14, matmul_test10) { + + auto x = NDArrayFactory::create_('c', {3, 5}); + x->linspace(1); + + auto y = NDArrayFactory::create_('c', {5, 3}); + y->linspace(1); + + float _expB[]{135.0f, 310.0f, 485.0f, 150.0f, 350.0f, 550.0f, 165.0f, 390.0f, 615.0f}; + Nd4jLong _expS[] {2, 3, 3, 1, 3, 0, 1, 102}; // expected shape + ArrayOptions::setDataType(_expS, nd4j::DataType::FLOAT32); + NDArray exp(_expB, _expS); + + auto variableSpace = new VariableSpace(); + variableSpace->putVariable(-1, x); + variableSpace->putVariable(-2, y); + variableSpace->putVariable(1, new Variable()); + + auto block = new Context(1, variableSpace, false); + block->fillInputs({-1, -2}); + + nd4j::ops::matmul op; + + Nd4jStatus status = op.execute(block); + ASSERT_EQ(ND4J_STATUS_OK, status); + ASSERT_TRUE(variableSpace->hasVariable(1)); + + auto result = variableSpace->getVariable(1)->getNDArray(); + + ASSERT_TRUE(result->equalsTo(&exp)); + + delete block; + delete variableSpace; +} + +TEST_F(DeclarableOpsTests14, matmul_test11) { + auto A = NDArrayFactory::create('c', {3, 3}); + auto B = NDArrayFactory::create('c', {3, 1}); + auto exp = NDArrayFactory::create('c', {3, 1}, {14.00f, 32.00f, 50.00f}); + + A.linspace(1); + B.linspace(1); + + nd4j::ops::matmul op; + + auto result = op.evaluate({&A, &B}, {}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test12) { + auto x= NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); + auto y= NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); + auto exp= NDArrayFactory::create('f', {4, 4}, {38.0, 44.0, 50.0, 56.0, 83.0, 98.0, 113.0, 128.0, 128.0, 152.0, 176.0, 200.0, 173.0, 206.0, 239.0, 272.0}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {1, 1}); + 
ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + + delete result; +} + + +TEST_F(DeclarableOpsTests14, matmul_test13) { + auto x= NDArrayFactory::create('c', {1, 3}, {1, 2, 3}); + auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); + auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {1, 0}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + //z->printIndexedBuffer("z"); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test14) { + auto x= NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); + auto y= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); + auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {0, 1}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + //z->printIndexedBuffer("z"); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test15) { + auto x= NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); + auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); + auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + //z->printIndexedBuffer("z"); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test16) { + auto x= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); + auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); + auto exp= NDArrayFactory::create('f', {4, 4}, {1,2, 3, 4,2,4, 6, 8,3,6, 9,12,4,8,12,16}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + //z->printIndexedBuffer("z"); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test17) { + auto x = NDArrayFactory::create('c', {1, 2}, {2.0f, 2.0f}); + auto y = NDArrayFactory::create('c', {2, 1}, {2.0f, 2.0f}); + auto exp = NDArrayFactory::create('c', {1, 1}, {8.0f}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + ASSERT_EQ(exp, *result->at(0)); + + delete result; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test18) { + + auto x = NDArrayFactory::create('c', {1, 4, 3}); + auto y = NDArrayFactory::create('f', {1, 3, 4}); + auto exp = NDArrayFactory::create('f', {1, 3, 3}, {35., 40., 45., 79., 92., 105., 123., 144., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test19) { + + auto x = NDArrayFactory::create('c', {4, 
1}); + auto y = NDArrayFactory::create('f', {1, 4}); + auto exp = NDArrayFactory::create('f', {1, 1}, {15}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + ASSERT_EQ(Status::OK(), results->status()); + + auto z = results->at(0); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test20) { + + auto x = NDArrayFactory::create('c', {1, 4, 1}); + auto y = NDArrayFactory::create('f', {1, 1, 4}); + auto exp = NDArrayFactory::create('f', {1, 1, 1}, {15}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + + ASSERT_EQ(Status::OK(), results->status()); + auto z = results->at(0); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test21) { + + auto x = NDArrayFactory::create('c', {2, 3}); + auto y = NDArrayFactory::create('c', {3, 5}); + auto exp = NDArrayFactory::create('f', {5, 2}, {23. , 26. , 29. , 32. , 35., 50. , 57.5, 65. , 72.5, 80.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {0, 0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test22) { + + auto x = NDArrayFactory::create('c', {3, 2}); + auto y = NDArrayFactory::create('c', {3, 5}); + auto exp = NDArrayFactory::create('f', {5, 2}, {37. , 41.5, 46. , 50.5, 55., 46. , 52. , 58. , 64. , 70.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test23) { + + auto x = NDArrayFactory::create('c', {3, 2}); + auto y = NDArrayFactory::create('c', {3, 5}); + auto exp = NDArrayFactory::create('f', {5, 2}, {37. , 41.5, 46. , 50.5, 55., 46. , 52. , 58. , 64. , 70.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test24) { + + auto x = NDArrayFactory::create('c', {2,2, 3,5}); + auto y = NDArrayFactory::create('c', {2,2, 4,3}); + auto exp = NDArrayFactory::create('f',{2,2, 4,5}, {4.6, 281.8, 89.2, 582.4, 10. , 314.2,108.1, 628.3, 15.4, 346.6,127. , 674.2, 20.8, 379. ,145.9, 720.1, 5.2, 289.6, 93.4, 593.8, + 11.5, 322.9,113.2, 640.6, 17.8, 356.2,133. , 687.4, 24.1, 389.5,152.8, 734.2, 5.8, 297.4, 97.6, 605.2, 13. , 331.6,118.3, 652.9, + 20.2, 365.8,139. , 700.6, 27.4, 400. ,159.7, 748.3, 6.4, 305.2,101.8, 616.6, 14.5, 340.3,123.4, 665.2, 22.6, 375.4,145. , 713.8, + 30.7, 410.5,166.6, 762.4, 7. , 313. ,106. , 628. , 16. 
, 349. ,128.5, 677.5, 25. , 385. ,151. , 727. , 34. , 421. ,173.5, 776.5}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test25) { + + auto x = NDArrayFactory::create('f', {4, 3}); + auto y = NDArrayFactory::create('c', {4}); + auto exp = NDArrayFactory::create('f',{3}, {7., 8., 9.}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 0}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test26) { + + auto x = NDArrayFactory::create('f', {3}); + auto y = NDArrayFactory::create('c', {4, 3}); + auto exp = NDArrayFactory::create('f',{4}, {1.4, 3.2, 5., 6.8}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test27) { + + auto x = NDArrayFactory::create('f', {1, 1}); + auto y = NDArrayFactory::create('c', {1, 1}); + auto exp = NDArrayFactory::create('f',{1, 1}, {0.2}); + + x.linspace(2.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test28) { + + auto x = NDArrayFactory::create('f', {1, 1}); + auto y = NDArrayFactory::create('c', {1, 1}); + auto exp = NDArrayFactory::create('f',{1, 1}, {0.2}); + + x.linspace(2.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test29) { + + auto x = NDArrayFactory::create('f', {1}); + auto y = NDArrayFactory::create('c', {1, 1}); + auto exp = NDArrayFactory::create('f',{1}, {0.2}); + + x.linspace(2.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test30) { + + auto x = NDArrayFactory::create('f', {1,1}); + auto y = NDArrayFactory::create('c', {1}); + auto exp = NDArrayFactory::create('f',{1}, {0.2}); + + x.linspace(2.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, 
{}, {1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test31) { + + auto x = NDArrayFactory::create('f', {4}); + auto y = NDArrayFactory::create('c', {4}); + auto exp = NDArrayFactory::create(3.); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test32) { + + auto x = NDArrayFactory::create('f', {1}, {2.}); + auto y = NDArrayFactory::create('c', {1}, {3.}); + auto exp = NDArrayFactory::create(6.); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +TEST_F(DeclarableOpsTests14, matmul_test33) { + auto x = NDArrayFactory::create('c', {4, 3}); + auto y = NDArrayFactory::create('c', {4, 1}); + auto exp = NDArrayFactory::create('c',{ 3, 1}, {70, 80, 90}); + + x.linspace(1); + y.linspace(1); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {1, 0}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + + +TEST_F(DeclarableOpsTests14, matmul_test34) { + auto a = NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); + auto exp = NDArrayFactory::create('c', {3}, {30, 70, 110}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&a, &b}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test35) { + auto a = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); + auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + auto exp = NDArrayFactory::create('c', {3}, {70, 80, 90}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&a, &b}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test36) { + auto a = NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); + auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + auto exp = NDArrayFactory::create('c', {1, 3}, {70, 80, 90}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&a, &b}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test37) { + + NDArray a('c', {32, 12, 128, 64}, nd4j::DataType::FLOAT32); + NDArray b('c', {32, 12, 128, 64}, nd4j::DataType::FLOAT32); + NDArray c('c', {32,12,128,128}, nd4j::DataType::FLOAT32); + NDArray cExp('c', {32,12,128,128}, 
nd4j::DataType::FLOAT32); + + a = 1; + b = 1; + cExp = 64; //Each entry in output c is sum of 64 (1.0 x 1.0) multiplications + + nd4j::ops::matmul op; + auto status = op.execute({&a, &b}, {&c}, {}, {0,1}); + + ASSERT_EQ(ND4J_STATUS_OK, status); + + ASSERT_TRUE(cExp.isSameShape(c)); + ASSERT_TRUE(cExp.equalsTo(c)); +} + +// @Test +// public void testMmulRank4_simple(){ + +// INDArray arr1 = Nd4j.ones(DataType.FLOAT, 32, 12, 128, 64); +// INDArray arr2 = Nd4j.ones(DataType.FLOAT, 32, 12, 128, 64); + +// DynamicCustomOp op = DynamicCustomOp.builder("matmul") +// .addInputs(arr1, arr2) +// .addIntegerArguments(0, 1) //Transpose arr2 only +// .build(); + +// List shapes = op.calculateOutputShape(); +// assertEquals(1, shapes.size()); +// long[] shape = new long[]{32,12,128,128}; +// assertArrayEquals(shape, shapes.get(0).getShape()); + +// INDArray out = Nd4j.create(DataType.FLOAT, shape); + +// op.setOutputArgument(0, out); +// Nd4j.exec(op); +// // System.out.println(out); + +// INDArray exp = Nd4j.valueArrayOf(shape, 64.0, DataType.FLOAT); //Each entry in output is sum of 64 (1.0 x 1.0) multiplications +// assertEquals(exp, out); +// } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp index 0cf1cea2b..029a392f7 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp @@ -397,27 +397,6 @@ TEST_F(DeclarableOpsTests2, NLP_Cbow_Test_1) { delete result; } -TEST_F(DeclarableOpsTests2, YetAnotherMatmulTest_1) { - auto A = NDArrayFactory::create('c', {3, 3}); - auto B = NDArrayFactory::create('c', {3, 1}); - auto exp = NDArrayFactory::create('c', {3, 1}, {14.00f, 32.00f, 50.00f}); - - A.linspace(1); - B.linspace(1); - - nd4j::ops::matmul op; - - auto result = op.evaluate({&A, &B}, {}, {}); - - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - TEST_F(DeclarableOpsTests2, Test_Squeeze_1) { auto x = NDArrayFactory::create('c', {2, 1, 3, 1, 1, 1, 4}); x.linspace(1); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp index 04816b2b2..e7e95afcb 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp @@ -789,120 +789,6 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_Validation_2) { } } -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_1) { - auto x= NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); - auto y= NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); - auto exp= NDArrayFactory::create('f', {4, 4}, {38.0, 44.0, 50.0, 56.0, 83.0, 98.0, 113.0, 128.0, 128.0, 152.0, 176.0, 200.0, 173.0, 206.0, 239.0, 272.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {1, 1}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - - delete result; -} - - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_2) { - auto x= NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); - auto y= NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); - auto exp= NDArrayFactory::create('f', {3, 3}, {70.0, 158.0, 246.0, 80.0, 184.0, 288.0, 90.0, 210.0, 330.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {0, 0}); - 
ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - - delete result; -} - - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_3) { - auto x= NDArrayFactory::create('c', {1, 3}, {1, 2, 3}); - auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {1, 0}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - //z->printIndexedBuffer("z"); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_4) { - auto x= NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); - auto y= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); - auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {0, 1}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - //z->printIndexedBuffer("z"); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_5) { - auto x= NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); - auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - //z->printIndexedBuffer("z"); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_6) { - auto x= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); - auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - auto exp= NDArrayFactory::create('f', {4, 4}, {1,2, 3, 4,2,4, 6, 8,3,6, 9,12,4,8,12,16}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - //z->printIndexedBuffer("z"); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - TEST_F(DeclarableOpsTests3, Test_ReverseDivide_1) { auto x= NDArrayFactory::create('c', {1, 3}, {2, 2, 2}); auto y= NDArrayFactory::create('c', {1, 3}, {4, 6, 8}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp index f04d24395..1fb700779 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp @@ -809,26 +809,6 @@ TEST_F(DeclarableOpsTests4, Test_Reshape_Again) { delete result; } -TEST_F(DeclarableOpsTests4, Test_Gemv_Transpose_1) { - auto x = NDArrayFactory::create('c', {4, 3}); - auto y = NDArrayFactory::create('c', {4, 1}); - auto exp = NDArrayFactory::create('c',{ 3, 1}, {70, 80, 90}); - - x.linspace(1); - y.linspace(1); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {1, 0}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - TEST_F(DeclarableOpsTests4, Test_Split_1) { auto x = NDArrayFactory::create('c', {5, 30}); auto sizes = NDArrayFactory::create('c', {1, 3}, {4, 
15, 11}); @@ -1166,57 +1146,6 @@ TEST_F(DeclarableOpsTests4, Test_Cross_3) { delete result; } -TEST_F(DeclarableOpsTests4, Test_Matmul_YATS_1) { - auto a = NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); - auto exp = NDArrayFactory::create('c', {3}, {30, 70, 110}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&a, &b}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests4, Test_Matmul_YATS_2) { - auto a = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); - auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - auto exp = NDArrayFactory::create('c', {3}, {70, 80, 90}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&a, &b}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests4, Test_Matmul_YATS_3) { - auto a = NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - auto exp = NDArrayFactory::create('c', {1, 3}, {70, 80, 90}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&a, &b}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - TEST_F(DeclarableOpsTests4, Test_Add_119) { auto a = NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index 0a6f8e5e8..7a9bc1648 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -5019,20 +5019,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_7) { delete result; } -TEST_F(DeclarableOpsTests7, Test_Matmul_Once_Again) { - auto x = NDArrayFactory::create('c', {1, 2}, {2.0f, 2.0f}); - auto y = NDArrayFactory::create('c', {2, 1}, {2.0f, 2.0f}); - auto exp = NDArrayFactory::create('c', {1, 1}, {8.0f}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {}); - ASSERT_EQ(Status::OK(), result->status()); - - ASSERT_EQ(exp, *result->at(0)); - - delete result; -} - TYPED_TEST(TypedDeclarableOpsTests7, Test_Pnorm_Once_Again) { auto input = NDArrayFactory::create('c', {1, 1, 5, 5}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f}); auto exp = NDArrayFactory::create('c', {1, 1, 5, 5}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 11ebc1229..77634b052 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -932,208 +932,6 @@ TEST_F(DeclarableOpsTests9, tile_test1) { delete results; } - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test1) { - - auto x = 
NDArrayFactory::create('c', {3, 4}); - auto y = NDArrayFactory::create('c', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123., 40., 92., 144., 45., 105., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test2) { - - auto x = NDArrayFactory::create('c', {3, 4}); - auto y = NDArrayFactory::create('f', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test3) { - - auto x = NDArrayFactory::create('f', {3, 4}); - auto y = NDArrayFactory::create('c', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test4) { - - auto x = NDArrayFactory::create ('f', {3, 4}); - auto y = NDArrayFactory::create('f', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test5) { - - auto x = NDArrayFactory::create('c', {4, 3}); - auto y = NDArrayFactory::create('c', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {83., 94., 105., 94., 107., 120., 105., 120., 135.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test6) { - - auto x = NDArrayFactory::create('c', {4, 3}); - auto y = NDArrayFactory::create('f', {3, 4}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 40., 45., 79., 92., 105., 123., 144., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// 
-TEST_F(DeclarableOpsTests9, matmul_test7) { - - auto x = NDArrayFactory::create('c', {5, 3,4}); - auto y = NDArrayFactory::create('f', {5, 3,4}); - auto exp = NDArrayFactory::create('f',{5, 3,3}, {3. , 84.6, 281.4, 593.4, 1020.6, 7. , 107.8, 323.8, 655. , 1101.4,11. , 131. , 366.2, 716.6, 1182.2, - 7. , 107.8, 323.8, 655. , 1101.4,17.4, 137.4, 372.6, 723. , 1188.6,27.8, 167. , 421.4, 791. , 1275.8, - 11. , 131. , 366.2, 716.6, 1182.2,27.8, 167. , 421.4, 791. , 1275.8,44.6, 203. , 476.6, 865.4, 1369.4,}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test8) { - - auto x = NDArrayFactory::create('c', {2,5, 3,4}); - auto y = NDArrayFactory::create('f', {2,5, 3,4}); - auto exp = NDArrayFactory::create('f',{2,5, 3,3}, {3. , 1563. , 84.6, 2220.6, 281.4, 2993.4, 593.4, 3881.4,1020.6, 4884.6, 7. , 1663. , 107.8, 2339.8, 323.8, 3131.8, 655. , 4039. ,1101.4, 5061.4, - 11. , 1763. , 131. , 2459. , 366.2, 3270.2, 716.6, 4196.6,1182.2, 5238.2, 7. , 1663. , 107.8, 2339.8, 323.8, 3131.8, 655. , 4039. ,1101.4, 5061.4, - 17.4, 1769.4, 137.4, 2465.4, 372.6, 3276.6, 723. , 4203. ,1188.6, 5244.6, 27.8, 1875.8, 167. , 2591. , 421.4, 3421.4, 791. , 4367. ,1275.8, 5427.8, - 11. , 1763. , 131. , 2459. , 366.2, 3270.2, 716.6, 4196.6,1182.2, 5238.2, 27.8, 1875.8, 167. , 2591. , 421.4, 3421.4, 791. , 4367. ,1275.8, 5427.8, - 44.6, 1988.6, 203. , 2723. , 476.6, 3572.6, 865.4, 4537.4,1369.4, 5617.4}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test9) { - - auto x = NDArrayFactory::create('c', {2,5, 4,3}); - auto y = NDArrayFactory::create('f', {2,5, 3,4}); - auto exp = NDArrayFactory::create('f',{2,5, 3,3}, {7. , 1639. , 103. , 2311. , 314.2, 3098.2, 640.6, 4000.6,1082.2, 5018.2, 8. , 1664. , 108.8, 2340.8, 324.8, 3132.8, 656. , 4040. ,1102.4, 5062.4, - 9. , 1689. , 114.6, 2370.6, 335.4, 3167.4, 671.4, 4079.4,1122.6, 5106.6, 15.8, 1743.8, 131. , 2435. , 361.4, 3241.4, 707. , 4163. ,1167.8, 5199.8, - 18.4, 1770.4, 138.4, 2466.4, 373.6, 3277.6, 724. , 4204. ,1189.6, 5245.6, 21. , 1797. , 145.8, 2497.8, 385.8, 3313.8, 741. , 4245. ,1211.4, 5291.4, - 24.6, 1848.6, 159. , 2559. , 408.6, 3384.6, 773.4, 4325.4,1253.4, 5381.4, 28.8, 1876.8, 168. , 2592. , 422.4, 3422.4, 792. , 4368. ,1276.8, 5428.8, - 33. , 1905. , 177. , 2625. 
, 436.2, 3460.2, 810.6, 4410.6,1300.2, 5476.2}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, TestDropout_BP_1) { @@ -1325,325 +1123,6 @@ TEST_F(DeclarableOpsTests9, Test_AlphaDropout_BP_1) { delete ress2; } -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test10) { - - auto x = NDArrayFactory::create('c', {1, 4, 3}); - auto y = NDArrayFactory::create('f', {1, 3, 4}); - auto exp = NDArrayFactory::create('f', {1, 3, 3}, {35., 40., 45., 79., 92., 105., 123., 144., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test11) { - - auto x = NDArrayFactory::create('c', {4, 1}); - auto y = NDArrayFactory::create('f', {1, 4}); - auto exp = NDArrayFactory::create('f', {1, 1}, {15}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - ASSERT_EQ(Status::OK(), results->status()); - - auto z = results->at(0); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test12) { - - auto x = NDArrayFactory::create('c', {1, 4, 1}); - auto y = NDArrayFactory::create('f', {1, 1, 4}); - auto exp = NDArrayFactory::create('f', {1, 1, 1}, {15}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - - ASSERT_EQ(Status::OK(), results->status()); - auto z = results->at(0); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test13) { - - auto x = NDArrayFactory::create('c', {2, 3}); - auto y = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('f', {5, 2}, {23. , 26. , 29. , 32. , 35., 50. , 57.5, 65. , 72.5, 80.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {0, 0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test14) { - - auto x = NDArrayFactory::create('c', {3, 2}); - auto y = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('f', {5, 2}, {37. , 41.5, 46. , 50.5, 55., 46. , 52. , 58. , 64. 
, 70.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test15) { - - auto x = NDArrayFactory::create('c', {3, 2}); - auto y = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('f', {5, 2}, {37. , 41.5, 46. , 50.5, 55., 46. , 52. , 58. , 64. , 70.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test16) { - - auto x = NDArrayFactory::create('c', {2,2, 3,5}); - auto y = NDArrayFactory::create('c', {2,2, 4,3}); - auto exp = NDArrayFactory::create('f',{2,2, 4,5}, {4.6, 281.8, 89.2, 582.4, 10. , 314.2,108.1, 628.3, 15.4, 346.6,127. , 674.2, 20.8, 379. ,145.9, 720.1, 5.2, 289.6, 93.4, 593.8, - 11.5, 322.9,113.2, 640.6, 17.8, 356.2,133. , 687.4, 24.1, 389.5,152.8, 734.2, 5.8, 297.4, 97.6, 605.2, 13. , 331.6,118.3, 652.9, - 20.2, 365.8,139. , 700.6, 27.4, 400. ,159.7, 748.3, 6.4, 305.2,101.8, 616.6, 14.5, 340.3,123.4, 665.2, 22.6, 375.4,145. , 713.8, - 30.7, 410.5,166.6, 762.4, 7. , 313. ,106. , 628. , 16. , 349. ,128.5, 677.5, 25. , 385. ,151. , 727. , 34. , 421. ,173.5, 776.5}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test17) { - - auto x = NDArrayFactory::create('f', {4, 3}); - auto y = NDArrayFactory::create('c', {4}); - auto exp = NDArrayFactory::create('f',{3}, {7., 8., 9.}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 0}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test18) { - - auto x = NDArrayFactory::create('f', {3}); - auto y = NDArrayFactory::create('c', {4, 3}); - auto exp = NDArrayFactory::create('f',{4}, {1.4, 3.2, 5., 6.8}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test19) { - - auto x = NDArrayFactory::create('f', {1, 1}); - auto y = NDArrayFactory::create('c', {1, 1}); - auto exp = NDArrayFactory::create('f',{1, 1}, {0.2}); - - x.linspace(2.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = 
results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test20) { - - auto x = NDArrayFactory::create('f', {1, 1}); - auto y = NDArrayFactory::create('c', {1, 1}); - auto exp = NDArrayFactory::create('f',{1, 1}, {0.2}); - - x.linspace(2.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test21) { - - auto x = NDArrayFactory::create('f', {1}); - auto y = NDArrayFactory::create('c', {1, 1}); - auto exp = NDArrayFactory::create('f',{1}, {0.2}); - - x.linspace(2.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test22) { - - auto x = NDArrayFactory::create('f', {1,1}); - auto y = NDArrayFactory::create('c', {1}); - auto exp = NDArrayFactory::create('f',{1}, {0.2}); - - x.linspace(2.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test23) { - - auto x = NDArrayFactory::create('f', {4}); - auto y = NDArrayFactory::create('c', {4}); - auto exp = NDArrayFactory::create(3.); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test24) { - - auto x = NDArrayFactory::create('f', {1}, {2.}); - auto y = NDArrayFactory::create('c', {1}, {3.}); - auto exp = NDArrayFactory::create(6.); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - TEST_F(DeclarableOpsTests9, test_range_int_1) { auto x0 = NDArrayFactory::create(0); auto x1 = NDArrayFactory::create(2); diff --git a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp index d83e85f67..b01c9f98a 100644 --- a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp @@ -64,8 +64,11 @@ TEST_F(MklDnnTests, helpers_includer) { nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CPU maxpool3d_bp; nd4j::ops::platforms::PLATFORM_lrn_ENGINE_CPU lrn; + nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CPU batchnorm; - printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, 
&avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm}); + nd4j::ops::platforms::PLATFORM_matmul_ENGINE_CPU matmul; + + printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm, &matmul}); #endif } \ No newline at end of file From da39a63c9bcbb49fed5f6a6f02c2613e406293d3 Mon Sep 17 00:00:00 2001 From: raver119 Date: Tue, 18 Feb 2020 11:20:38 +0300 Subject: [PATCH 13/19] one more bert-like test Signed-off-by: raver119 --- .../layers_tests/PlaygroundTests.cpp | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 7cdf40c7f..93fb5d6b3 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -149,6 +149,56 @@ TEST_F(PlaygroundTests, test_bert_1) { delete graph; } +TEST_F(PlaygroundTests, test_bert_2) { + // this test will run ONLY if this model exists + if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_like_ops.fb") < 0) + return; + + auto graph = GraphExecutioner::importFromFlatBuffers("/home/raver119/Downloads/Bert_minimal_model/bert_like_ops.fb"); + + //graph->printOut(); + + graph->tagInplaceNodes(); + + +/* + // validating graph now + auto status = GraphExecutioner::execute(graph); + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); + + auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); + ASSERT_EQ(z, *array); +*/ + + nd4j::Environment::getInstance()->setProfiling(true); + auto profile = GraphProfilingHelper::profile(graph, 1); + + profile->printOut(); + + nd4j::Environment::getInstance()->setProfiling(false); + delete profile; + +/* + std::vector values; + + for (int e = 0; e < 1; e++) { + auto timeStart = std::chrono::system_clock::now(); + + GraphExecutioner::execute(graph); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); +*/ + delete graph; +} + TEST_F(PlaygroundTests, test_one_off_ops_1) { auto x = NDArrayFactory::create('c', {4, 128, 768}); auto y = NDArrayFactory::create('c', {4, 128, 1}); From 72f9cda0191d9b0473580c54cf54aca428b0d285 Mon Sep 17 00:00:00 2001 From: Abdelrauf Date: Tue, 18 Feb 2020 18:01:43 +0400 Subject: [PATCH 14/19] Added missing bfloat16 (#252) Signed-off-by: AbdelRauf --- libnd4j/include/platformmath.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libnd4j/include/platformmath.h b/libnd4j/include/platformmath.h index b7cbe3745..b58e8f7f6 100644 --- a/libnd4j/include/platformmath.h +++ b/libnd4j/include/platformmath.h @@ -326,6 +326,11 @@ namespace nd4j { #endif } + template <> + math_def FORCEINLINE bfloat16 p_floor(bfloat16 value) { + return static_cast(floorf((float)value)); + } + template <> math_def FORCEINLINE double p_floor(double value) { return floor(value); @@ -352,6 +357,11 @@ namespace nd4j { #endif } + template <> + math_def FORCEINLINE bfloat16 p_ceil(bfloat16 value) { + return static_cast(ceilf((float)value)); + } + template <> math_def FORCEINLINE double p_ceil(double value) { return ceil(value); @@ -374,6 +384,12 @@ namespace nd4j { 
return static_cast(roundf((float) val)); } + template <> + math_def FORCEINLINE bfloat16 p_round(bfloat16 value) { + return static_cast(roundf((float)value)); + } + + template <> math_def FORCEINLINE double p_round(double value) { return round(value); From c5193ecb8139261dee58c3b1b6952f6a93a7d47e Mon Sep 17 00:00:00 2001 From: Yurii Shyrma Date: Wed, 19 Feb 2020 08:35:52 +0200 Subject: [PATCH 15/19] Shyrma gather (#254) * - profiling gather op for aurora Signed-off-by: Yurii * - include contiguous memcpy in gather op Signed-off-by: Yurii --- .../ops/declarable/helpers/cpu/gather.cpp | 129 ++++++++++++++---- .../ops/declarable/platform/mkldnn/matmul.cpp | 2 +- 2 files changed, 107 insertions(+), 24 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index 3fb7c290d..09c8c09ea 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include namespace nd4j { namespace ops { @@ -36,7 +38,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* const int numOfIntArgs = intArgs.size(); - if (indices != nullptr) { + if (indices != nullptr) { // first case: indices consist of only one scalar if(indices->isScalar()) { @@ -46,7 +48,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* auto idx = indices->e(0); auto scalarNDArray = input->e(idx); output->assign(scalarNDArray); - } + } else { NDArray inSubArr = (*input)(indices->e(0), {axis}); output->assign(inSubArr); @@ -54,41 +56,122 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* } else { - std::vector dimsOut(indices->rankOf()); - std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... 
axis+indices->rankOf()-1 - const Nd4jLong numOfSubArrs = indices->lengthOf(); + if(input->rankOf() == 1 && output->rankOf() == 1) { - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - NDArray subArrOut = (*output)(i, dimsOut); - NDArray subArrIn = (*input)(indices->e(i), {axis}); - subArrOut.assign(subArrIn); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + output->p(i, input->e(indices->e(i))); + }; + + samediff::Threads::parallel_for(func, 0, output->lengthOf()); + + } + else { + + std::vector dimsOut; + for (int i = 0; i < axis; ++i) + dimsOut.push_back(i); + for (int i = axis+indices->rankOf(); i < output->rankOf(); ++i) + dimsOut.push_back(i); + + std::vector dimsIn = ShapeUtils::evalDimsToExclude(input->rankOf(), {axis}); + + const Nd4jLong numOfSubArrs = indices->lengthOf(); + + auto inTadPack = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimsIn); + auto outTadPack = ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimsOut); + + Nd4jLong* inTadShapeInfo = inTadPack.primaryShapeInfo(); + Nd4jLong* outTadShapeInfo = outTadPack.primaryShapeInfo(); + + if (shape::order(inTadShapeInfo) == shape::order(outTadShapeInfo) && shape::order(inTadShapeInfo) == 'c' && input->dataType() == output->dataType() && shape::elementWiseStride(inTadShapeInfo) == 1 && shape::elementWiseStride(outTadShapeInfo) == 1) { + + auto func = PRAGMA_THREADS_FOR { + + for (auto i = start; i < stop; i += increment) { + + void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]); + void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); + + memcpy(outBuff, inBuff, shape::length(inTadShapeInfo) * input->sizeOfT()); + } + }; + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } - }; + else { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { - samediff::Threads::parallel_tad(func, 0, numOfSubArrs); + void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]); + void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); + + NativeOpExecutioner::execTransformAny(input->getContext(), transform::Assign, + inBuff, inTadShapeInfo, nullptr/*input specialBuffer*/, nullptr/*input specialShapeInfo*/, + outBuff, outTadShapeInfo, nullptr/*output specialBuffer*/, nullptr/*output specialShapeInfo*/, + nullptr, nullptr, nullptr, false/*allowParallelism*/); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); + } + } } - } + } else { - + // we only allow scalar/vector case here if (numOfIntArgs == 2) { // scalar case + output->assign((*input)(intArgs[1], {axis})); } else { // vector case + const Nd4jLong numOfSubArrs = intArgs.size() - 1; - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - NDArray subArrOut = (*output)(i, {axis}); - NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); - subArrOut.assign(subArrIn); - } - }; + std::vector dims = ShapeUtils::evalDimsToExclude(input->rankOf(), {axis}); + + auto inTadPack = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dims); + auto outTadPack = ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dims); + + Nd4jLong* inTadShapeInfo = inTadPack.primaryShapeInfo(); + Nd4jLong* outTadShapeInfo = outTadPack.primaryShapeInfo(); + + if (shape::order(inTadShapeInfo) == shape::order(outTadShapeInfo) && shape::order(inTadShapeInfo) == 'c' && 
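// fast path: together with the rest of this condition (matching dtypes and unit element-wise
// stride on both TAD sets), c-ordered TADs mean every gathered sub-array is one contiguous
// block, so it can be copied with a single memcpy instead of the generic strided assign below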
input->dataType() == output->dataType() && shape::elementWiseStride(inTadShapeInfo) == 1 && shape::elementWiseStride(outTadShapeInfo) == 1) { + + auto func = PRAGMA_THREADS_FOR { + + for (auto i = start; i < stop; i += increment) { + + void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); + void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); + + std::memcpy(outBuff, inBuff, shape::length(inTadShapeInfo) * input->sizeOfT()); + } + }; + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); + + } + else { + + auto func = PRAGMA_THREADS_FOR { + + for (auto i = start; i < stop; i += increment) { + + void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); + void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); + + NativeOpExecutioner::execTransformAny(input->getContext(), transform::Assign, + inBuff, inTadShapeInfo, nullptr/*input specialBuffer*/, nullptr/*input specialShapeInfo*/, + outBuff, outTadShapeInfo, nullptr/*output specialBuffer*/, nullptr/*output specialShapeInfo*/, + nullptr, nullptr, nullptr, false/*allowParallelism*/); + + } + }; + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); + } - samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } - } + } } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp index f47d08b7a..53d18e3cd 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp @@ -279,7 +279,7 @@ PLATFORM_CHECK(matmul, ENGINE_CPU) { const DataType zType = z->dataType(); - return block.isUseMKLDNN() && + return block.isUseMKLDNN() && x->rankOf() < 3 && ( (xType==DataType::FLOAT32 && yType==DataType::FLOAT32 && zType==DataType::FLOAT32) || (xType==DataType::HALF && yType==DataType::HALF && zType==DataType::FLOAT32) || From d9058b469ab73f29d3ef8e8d5de96f040774e0c2 Mon Sep 17 00:00:00 2001 From: Serhii Shepel <9946053+sshepel@users.noreply.github.com> Date: Wed, 19 Feb 2020 15:31:21 +0200 Subject: [PATCH 16/19] Add classifier property for dl4j-test-resources (#249) --- pom.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 9b32f25ae..f02800d9e 100644 --- a/pom.xml +++ b/pom.xml @@ -226,6 +226,7 @@ 1.0.0-SNAPSHOT 1.0.0-SNAPSHOT 1.0.0-SNAPSHOT + 1.9.13 5.1 @@ -561,6 +562,7 @@ org.deeplearning4j dl4j-test-resources ${dl4j-test-resources.version} + ${dl4j-test-resources.classifier} test @@ -854,7 +856,7 @@ arm - + From 215641ea9e643ddfba29eed531d85f4bb1ce69af Mon Sep 17 00:00:00 2001 From: raver119 Date: Thu, 20 Feb 2020 11:43:26 +0300 Subject: [PATCH 17/19] Minor improvements (#255) * static increments in loops Signed-off-by: raver119 * specials and concat split into separate units Signed-off-by: raver119 --- libnd4j/blas/NDArray.hpp | 20 +- libnd4j/blas/cpu/NDArray.cpp | 10 +- libnd4j/blas/cpu/NDArrayLambda.hpp | 30 +- libnd4j/blas/cpu/NativeOps.cpp | 6 +- libnd4j/include/array/DataTypeConversions.h | 8 +- .../helpers/cpu/loops/IndexReductionLoops.hpp | 20 +- .../include/loops/cpu/TrueBroadcastHelper.hpp | 2 +- libnd4j/include/loops/cpu/indexreduce.hpp | 4 +- libnd4j/include/loops/cpu/random.hpp | 16 +- libnd4j/include/loops/cpu/reduce3.hpp | 6 +- .../include/loops/cpu/summarystatsreduce.cpp | 2 +- .../include/loops/impl/type_conversions.cpp | 6 +- .../declarable/helpers/cpu/BarnesHutTsne.cpp | 2 +- .../declarable/helpers/cpu/activations.cpp | 4 +- 
.../ops/declarable/helpers/cpu/adjust_hue.cpp | 2 +- .../helpers/cpu/adjust_saturation.cpp | 2 +- .../declarable/helpers/cpu/batched_gemm.cpp | 2 +- .../ops/declarable/helpers/cpu/batchnorm.cpp | 2 +- .../ops/declarable/helpers/cpu/betaInc.cpp | 2 +- .../ops/declarable/helpers/cpu/col2im.cpp | 2 +- .../ops/declarable/helpers/cpu/concat.cpp | 41 +++ .../ops/declarable/helpers/cpu/confusion.cpp | 2 +- .../helpers/cpu/crop_and_resize.hpp | 2 +- .../ops/declarable/helpers/cpu/cross.cpp | 2 +- .../ops/declarable/helpers/cpu/d_t_s.cpp | 4 +- .../ops/declarable/helpers/cpu/diGamma.cpp | 2 +- .../ops/declarable/helpers/cpu/dropout.cpp | 4 +- .../ops/declarable/helpers/cpu/dynamic.cpp | 4 +- .../helpers/cpu/extract_patches.cpp | 2 +- .../ops/declarable/helpers/cpu/gather.cpp | 10 +- .../ops/declarable/helpers/cpu/hamming.cpp | 6 +- .../ops/declarable/helpers/cpu/hashcode.cpp | 4 +- .../declarable/helpers/cpu/image_resize.cpp | 6 +- .../declarable/helpers/cpu/imagesHelpers.cpp | 10 +- .../ops/declarable/helpers/cpu/ismax.cpp | 2 +- .../ops/declarable/helpers/cpu/lrn.cpp | 6 +- .../ops/declarable/helpers/cpu/lstm.cpp | 2 +- .../ops/declarable/helpers/cpu/lup.cpp | 8 +- .../helpers/cpu/matrix_diag_part.cpp | 2 +- .../declarable/helpers/cpu/nth_element.cpp | 2 +- .../ops/declarable/helpers/cpu/one_hot.cpp | 4 +- .../ops/declarable/helpers/cpu/polyGamma.cpp | 2 +- .../include/ops/declarable/helpers/cpu/qr.cpp | 2 +- .../ops/declarable/helpers/cpu/range.cpp | 2 +- .../ops/declarable/helpers/cpu/reverse.cpp | 18 +- .../ops/declarable/helpers/cpu/s_t_b.cpp | 4 +- .../ops/declarable/helpers/cpu/s_t_d.cpp | 4 +- .../ops/declarable/helpers/cpu/scatter.cpp | 14 +- .../ops/declarable/helpers/cpu/segment.cpp | 18 +- .../ops/declarable/helpers/cpu/sg_cb.cpp | 4 +- .../ops/declarable/helpers/cpu/solve.cpp | 2 +- .../ops/declarable/helpers/cpu/sru.cpp | 4 +- .../ops/declarable/helpers/cpu/stack.cpp | 4 +- .../ops/declarable/helpers/cpu/top_k.cpp | 2 +- .../ops/declarable/helpers/cpu/transforms.cpp | 49 ++-- .../helpers/cpu/triangular_solve.cpp | 4 +- .../ops/declarable/helpers/cpu/zeta.cpp | 2 +- .../include/ops/declarable/helpers/cross.h | 2 +- .../ops/declarable/helpers/impl/unique.cpp | 2 +- .../compilation_units/specials_double_0.cpp | 4 +- .../compilation_units/specials_double_1.cpp | 2 +- .../compilation_units/specials_double_2.cpp | 2 +- .../compilation_units/specials_double_3.cpp | 2 +- .../compilation_units/specials_double_4.cpp | 2 +- .../compilation_units/specials_double_5.cpp | 2 +- .../compilation_units/specials_double_6.cpp | 2 +- .../compilation_units/specials_double_7.cpp | 2 +- .../compilation_units/specials_double_8.cpp | 2 +- .../compilation_units/specials_double_9.cpp | 2 +- .../compilation_units/specials_single_0.cpp | 2 +- .../compilation_units/specials_single_1.cpp | 2 +- .../compilation_units/specials_single_2.cpp | 2 +- .../compilation_units/specials_single_3.cpp | 2 +- .../compilation_units/specials_single_4.cpp | 2 +- .../compilation_units/specials_single_5.cpp | 2 +- .../compilation_units/specials_single_6.cpp | 2 +- .../compilation_units/specials_single_7.cpp | 2 +- .../compilation_units/specials_single_8.cpp | 2 +- .../compilation_units/specials_single_9.cpp | 2 +- libnd4j/include/ops/impl/gemm.cpp | 6 +- libnd4j/include/ops/impl/specials_double.hpp | 270 ++++++++++++++++++ .../{specials.hpp => specials_single.hpp} | 251 +--------------- libnd4j/include/ops/special_random_ops.h | 12 +- 83 files changed, 529 insertions(+), 464 deletions(-) create mode 100644 
libnd4j/include/ops/declarable/helpers/cpu/concat.cpp create mode 100644 libnd4j/include/ops/impl/specials_double.hpp rename libnd4j/include/ops/impl/{specials.hpp => specials_single.hpp} (56%) diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 8abee8d82..6c5f6a8c8 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -501,7 +501,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength); auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dataType == DataType::UTF16) { unicode::utf8to16(string[e], cdata, std::char_traits::length(string[e])); @@ -568,7 +568,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength); auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dataType == DataType::UTF16) { unicode::utf8to16(string[e].data(), cdata, string[e].size()); @@ -635,7 +635,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength); auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dtype == DataType::UTF16) { memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t)); @@ -701,7 +701,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector::length(string[e]) * sizeof(uint16_t)); @@ -767,7 +767,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength); auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dtype == DataType::UTF16) { unicode::utf32to16(string[e].data(), cdata, string[e].size()); @@ -833,7 +833,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength); auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dtype == DataType::UTF16) { unicode::utf32to16(string[e], cdata, std::char_traits::length(string[e])); @@ -2367,7 +2367,7 @@ NDArray NDArray::asS() const { const auto inData = bufferAsT() + offsetsLength; auto func = PRAGMA_THREADS_FOR{ - for (int e = start; e < stop; e += increment) { + for (int e = start; e < stop; e++) { auto cdata = outData + offsets[e]; auto end = nInputoffsets[e + 1]; auto idata = inData + nInputoffsets[e]; @@ -3466,7 +3466,7 @@ NDArray NDArray::dup(const char newOrder) const { std::vector strings(lengthOf()); auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { strings[i] = std::move(this->e(i)); } }; @@ -3479,7 +3479,7 @@ NDArray NDArray::dup(const char newOrder) const { std::vector strings(lengthOf()); auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { strings[i] = std::move(this->e(i)); } }; @@ -3491,7 +3491,7 @@ NDArray NDArray::dup(const char newOrder) const { std::vector strings(lengthOf()); auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { strings[i] = std::move(this->e(i)); } }; diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp index 9bdf41a16..58d4b3c34 
100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/blas/cpu/NDArray.cpp @@ -98,7 +98,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, target.getShapeInfo(), coords); const auto zOffset = shape::getOffset(target.getShapeInfo(), coords); @@ -152,7 +152,7 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) { auto y = reinterpret_cast(yBuffer); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto temp = x[i]; x[i] = y[i]; y[i] = temp; @@ -266,7 +266,7 @@ NDArray NDArray::tile(const std::vector& reps) const { if(result.ordering() == 'c') { // ews == 1 always here auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); } @@ -277,7 +277,7 @@ NDArray NDArray::tile(const std::vector& reps) const { else { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto xOffset = result.getOffset(i); auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); @@ -377,7 +377,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vectorordering() == second.ordering() && this->ordering() == third.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func(f[e], s[e], t[e]); }; @@ -31,7 +31,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std:: if (f == z) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto tOffset = this->getOffset(e); auto uOffset = second.getOffset(e); auto vOffset = third.getOffset(e); @@ -44,7 +44,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std:: } else { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto tOffset = this->getOffset(e); auto uOffset = second.getOffset(e); auto vOffset = third.getOffset(e); @@ -93,7 +93,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functionordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func(f[e], s[e]); }; @@ -102,7 +102,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functiongetOffset(e); auto yOffset = other.getOffset(e); @@ -114,7 +114,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functiongetOffset(e); auto yOffset = other.getOffset(e); auto zOffset = target.getOffset(e); @@ -156,7 +156,7 @@ void 
NDArray::applyLambda(const std::function& func, NDArray& target) { if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func(f[e]); }; @@ -165,7 +165,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) { if (f == z) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto xOffset = this->getOffset(e); f[xOffset] = func(f[xOffset]); @@ -176,7 +176,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) { } else { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto xOffset = this->getOffset(e); auto zOffset = target.getOffset(e); @@ -217,7 +217,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func(e, f[e]); }; @@ -226,7 +226,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr if (f == z) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto xOffset = this->getOffset(e); f[xOffset] = func(e, f[xOffset]); @@ -237,7 +237,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr } else { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto xOffset = this->getOffset(e); auto zOffset = target.getOffset(e); @@ -283,7 +283,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functionordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func((Nd4jLong) e, f[e], s[e]); }; @@ -292,7 +292,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functiongetOffset(e); auto yOffset = other.getOffset(e); @@ -304,7 +304,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functiongetOffset(e); auto yOffset = other.getOffset(e); auto zOffset = target.getOffset(e); diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index b945c5bcf..e82f2224e 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -1291,7 +1291,7 @@ void pullRowsGeneric(void *vx, _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); auto func = PRAGMA_THREADS_FOR { - for (auto idx = start; idx < stop; idx += increment) { + for (auto idx = start; idx < stop; idx++) { auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; auto zTadOffsetForBlock = zTadOffsets[idx]; @@ -1356,7 +1356,7 @@ void tearGeneric(void *vx, auto numTads = shape::length(hXShapeInfo) / tadLength; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto hZ = reinterpret_cast(targets[i]); auto s = hX + tadOffsets[i]; @@ -1478,7 +1478,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS auto dZ = reinterpret_cast(dz); auto func = PRAGMA_THREADS_FOR { - for 
(auto f = start; f < stop; f += increment) { + for (auto f = start; f < stop; f++) { auto hX = reinterpret_cast(dX[f]); //auto hZ = reinterpret_cast(dZ[f]); diff --git a/libnd4j/include/array/DataTypeConversions.h b/libnd4j/include/array/DataTypeConversions.h index 3af77ca39..abc804f5e 100644 --- a/libnd4j/include/array/DataTypeConversions.h +++ b/libnd4j/include/array/DataTypeConversions.h @@ -52,7 +52,7 @@ namespace nd4j { TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); }; @@ -110,7 +110,7 @@ namespace nd4j { TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); }; @@ -138,7 +138,7 @@ namespace nd4j { #else auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); }; @@ -164,7 +164,7 @@ namespace nd4j { TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); }; diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp index 1aaaaebc7..b661d02e7 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp @@ -49,7 +49,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, case nd4j::LoopKind::EWS1: { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -70,7 +70,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, case nd4j::LoopKind::EWSNONZERO: { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -91,7 +91,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, case nd4j::LoopKind::RANK1: { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -114,7 +114,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, shape::updateStrides(2, tadShape, newStride, 'c'); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -141,7 +141,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, shape::updateStrides(3, tadShape, newStride, 'c'); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = 
const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -170,7 +170,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, shape::updateStrides(4, tadShape, newStride, 'c'); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -201,7 +201,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, shape::updateStrides(5, tadShape, newStride, 'c'); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -234,7 +234,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -258,7 +258,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -284,7 +284,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); diff --git a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp index 95fe19109..f047d1136 100644 --- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp +++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp @@ -80,7 +80,7 @@ namespace nd4j { int nLen = zArr.lengthOf() / yArr.sizeAt(-1); auto func = PRAGMA_THREADS_FOR{ - for (uint32_t total = start; total < stop; total += increment) { + for (uint32_t total = start; total < stop; total++) { uint32_t i = total / zDim1; uint32_t j = total % zDim1; diff --git a/libnd4j/include/loops/cpu/indexreduce.hpp b/libnd4j/include/loops/cpu/indexreduce.hpp index 829f60a18..8d3af7eb4 100644 --- a/libnd4j/include/loops/cpu/indexreduce.hpp +++ b/libnd4j/include/loops/cpu/indexreduce.hpp @@ -73,7 +73,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex auto func = PRAGMA_THREADS_FOR { intermediatery[thread_id] = OpType::startingIndexValue(x); - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { IndexValue curr(x[i], i); intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); } @@ -88,7 +88,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex auto func = PRAGMA_THREADS_FOR { intermediatery[thread_id] = OpType::startingIndexValue(x); - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); IndexValue 
curr(x[offset], i); intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); diff --git a/libnd4j/include/loops/cpu/random.hpp b/libnd4j/include/loops/cpu/random.hpp index 35674de36..ab9793694 100644 --- a/libnd4j/include/loops/cpu/random.hpp +++ b/libnd4j/include/loops/cpu/random.hpp @@ -75,7 +75,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } @@ -93,7 +93,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); @@ -111,7 +111,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); @@ -129,7 +129,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); @@ -149,7 +149,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); @@ -197,7 +197,7 @@ namespace functions { else{ auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } @@ -213,7 +213,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); @@ -255,7 +255,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[offset] = OpClass::op(i, length, rng, extraArguments); } diff --git a/libnd4j/include/loops/cpu/reduce3.hpp b/libnd4j/include/loops/cpu/reduce3.hpp index 8d50aedbc..c24a3d474 100644 --- 
a/libnd4j/include/loops/cpu/reduce3.hpp +++ b/libnd4j/include/loops/cpu/reduce3.hpp @@ -88,7 +88,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, if (kindOfLoop == nd4j::LoopKind::EWS1) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); } }; @@ -98,7 +98,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); } @@ -110,7 +110,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index a8f766f6a..2e36b8085 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -158,7 +158,7 @@ namespace functions { const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast); auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; auto tx = x + tadOffsetForBlock; diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp index b12ff5796..36c95e731 100644 --- a/libnd4j/include/loops/impl/type_conversions.cpp +++ b/libnd4j/include/loops/impl/type_conversions.cpp @@ -81,7 +81,7 @@ namespace nd4j { // now we actually apply quantization auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { rz[e] = static_cast(nd4j::math::nd4j_round( 1.0f * static_cast(x[e]) / nd4j::math::nd4j_max(amax, amin) * max_byte)); } }; @@ -177,7 +177,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write) int flimit = limit + 4; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int el = x[e]; int ael = nd4j::math::nd4j_abs(el) - 1; z[ael] += el > 0 ? 
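// x holds signed, 1-based element indices (the sparse threshold encoding); the sign of each
// entry decides whether +threshold or -threshold is accumulated into that slot of the dense output z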
static_cast(threshold) : static_cast(-threshold); @@ -202,7 +202,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write) auto z = reinterpret_cast(dz); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { z[i] = static_cast(static_cast(x[i])); } }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp index f8704d7b0..baf19de10 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp @@ -153,7 +153,7 @@ namespace helpers { auto rowSize = sizeof(T) * colCount; auto func = PRAGMA_THREADS_FOR { - for (auto n = start; n < stop; n += increment) { + for (auto n = start; n < stop; n++) { int s = rowP->e(n); int end = rowP->e(n + 1); int shift = n * colCount; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index 56c93b611..2e63c9d5e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -291,7 +291,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra shape::calcOffsets(tadShapeInfo, offsets); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inBuff = input.bufferAsT() + tadOffsets[i]; auto outBuff = output.bufferAsT() + tadOffsets[i]; @@ -341,7 +341,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { // FIXME: double! 
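// the FIXME above refers to this read: the element is pulled out as double regardless of the
// array's actual dtype, so the comparison and the negative-branch math always run in double precision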
double x = input.e(i); if (x < 0.0) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp index 978c037fa..5a22b02eb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp @@ -67,7 +67,7 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr const Nd4jLong zDimCstride = output->stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T *xTad = x + packX.platformOffsets()[i]; T *zTad = z + packZ.platformOffsets()[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp index d4b0de398..594280ebe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp @@ -66,7 +66,7 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA const Nd4jLong zDimCstride = output->stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T *xTad = x + packX.platformOffsets()[i]; T *zTad = z + packZ.platformOffsets()[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp index b408da720..c63dc3c1c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp @@ -94,7 +94,7 @@ void bgemm_(const std::vector& vA, const std::vector& vB, st int vaSize = vA.size(); auto func = PRAGMA_THREADS_FOR { - for (auto p = start; p < stop; p += increment) { + for (auto p = start; p < stop; p++) { auto A = reinterpret_cast(vA.at(p)->buffer()); auto B = reinterpret_cast(vB.at(p)->buffer()); auto C = reinterpret_cast(vC.at(p)->buffer()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index ad2e29a97..aa9624600 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -141,7 +141,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, input->getShapeInfo(), coords); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp index 83cc966ba..5e80d12fb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp @@ -117,7 +117,7 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con int xLen = x.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp index 5aad38da8..26f82bdd9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp @@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const 
NDArray& input, NDArray& outp auto func = PRAGMA_THREADS_FOR { T *col, *im; - for (uint b = start; b < stop; b += increment) { + for (uint b = start; b < stop; b++) { T *im0 = imBuff + b * imStride0; T *col4 = colBuff + b * colStride0; for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp new file mode 100644 index 000000000..1bdf0a6ad --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018 +// + + +#include +#include + +namespace nd4j { + namespace ops { + namespace helpers { + ////////////////////////////////////////////////////////////////////////// + template + static void concat_(const std::vector& inArrs, NDArray& output, const int axis) { + nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis); + } + + void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { + BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES); + } + + BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector& inArrs, NDArray& output, const int axis), LIBND4J_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp index 4f8989caf..39449c7f8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp @@ -32,7 +32,7 @@ namespace helpers { int lLen = labels->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (int j = start; j < stop; j += increment) { + for (int j = start; j < stop; j++) { auto label = labels->e(j); auto pred = predictions->e(j); T value = (weights == nullptr ? (T) 1.0f : weights->e(j)); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp index ca30d73bd..1f55378c0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp @@ -50,7 +50,7 @@ namespace nd4j { T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); auto func = PRAGMA_THREADS_FOR { - for (auto y = start; y < stop; y += increment) { + for (auto y = start; y < stop; y++) { const float inY = (cropHeight > 1) ? 
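// inY is the crop row mapped back into source-image coordinates; a single-row crop has no
// height to interpolate over, so its sample point falls on the vertical midpoint of the box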
y1 * (imageHeight - 1) + y * heightScale : 0.5 * (y1 + y2) * (imageHeight - 1); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp index c12b1ce4f..6a8523925 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp @@ -39,7 +39,7 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray int tads = tadsA.size(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto a_ = tadsA.at(e); auto b_ = tadsB.at(e); auto o_ = tadsO.at(e); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp index f041452ab..d3e524ff4 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp @@ -46,7 +46,7 @@ namespace helpers { if (isNHWC) { const int total_count = batch_size * output_height * output_width * output_depth; auto func = PRAGMA_THREADS_FOR { - for (auto out_idx = start; out_idx < stop; out_idx += increment) { + for (auto out_idx = start; out_idx < stop; out_idx++) { const int d = out_idx % output_depth; const int out_idx2 = out_idx / output_depth; const int w = out_idx2 % output_width; @@ -70,7 +70,7 @@ namespace helpers { const int total_count = batch_size * input_depth_by_input_area; auto func = PRAGMA_THREADS_FOR { - for (int input_idx = start; input_idx < stop; input_idx += increment) { + for (int input_idx = start; input_idx < stop; input_idx++) { const int n_bY_bX_oC_iY = input_idx / input_width; const int iX = input_idx - n_bY_bX_oC_iY * input_width; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp index 8035f8216..2a51b92a6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp @@ -32,7 +32,7 @@ template static void diGamma_(const NDArray& x, NDArray& z) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) z.p(i, diGammaScalar(x.e(i))); }; samediff::Threads::parallel_for(func, 0, x.lengthOf()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index 9db974b36..a470f140a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -35,7 +35,7 @@ namespace helpers { int inLen = input->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { float val = nodeRng.relativeT(e, T(0.f), T(1.f)); if (val < probValue) @@ -130,7 +130,7 @@ namespace helpers { nd4j::graph::RandomGenerator nodeRng(3019L, seed); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); float xVal = input->e(e); output->p(e, randVal >= probValue ? 
alpha * beta + alpha1 : alpha * xVal + alpha1); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp index 281e6c809..0673a6f2b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp @@ -62,7 +62,7 @@ namespace nd4j { unsigned int outSize = outputList.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { outputs[i].first = outputList[i]; outputs[i].second = 0; for (int e = 0; e < indices->lengthOf(); ++e) @@ -168,7 +168,7 @@ namespace nd4j { unsigned int gradsSize = inputGradientList.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { outputs[i].first = inputGradientList[i]; outputs[i].second = 0; for (int e = 0; e < indices->lengthOf(); ++e) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp index 0a46c995e..b2707ea5c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp @@ -50,7 +50,7 @@ namespace helpers { colCast = 0; auto func = PRAGMA_THREADS_FOR { - for (auto batch = 0; batch < stop; batch += increment) { + for (auto batch = 0; batch < stop; batch++) { auto patch = listOfMatricies.at(batch); auto outMatrix = listOfOutputs.at(batch); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index 09c8c09ea..ed844e84f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -59,7 +59,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* if(input->rankOf() == 1 && output->rankOf() == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) output->p(i, input->e(indices->e(i))); }; @@ -88,7 +88,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]); void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); @@ -100,7 +100,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* } else { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]); void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); @@ -140,7 +140,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); @@ -155,7 +155,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { void* inBuff = 
input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp index 9e3bdf885..fc6fc768b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp @@ -56,7 +56,7 @@ namespace nd4j { if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto _x = static_cast(xBuffer[e]); auto _y = static_cast(yBuffer[e]); @@ -67,7 +67,7 @@ namespace nd4j { maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto _x = static_cast(xBuffer[e * xEws]); auto _y = static_cast(yBuffer[e * yEws]); @@ -78,7 +78,7 @@ namespace nd4j { maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto _x = static_cast(x.e(e)); auto _y = static_cast(y.e(e)); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp index 04df86c36..beb48e382 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp @@ -42,7 +42,7 @@ namespace nd4j { // we divide array into 32 element chunks, and store intermediate results once auto func = PRAGMA_THREADS_FOR { - for (auto b = 0; b < stop; b += increment) { + for (auto b = 0; b < stop; b++) { auto blockBuffer = buffer + b * numBlocks; Nd4jLong r = 1; @@ -64,7 +64,7 @@ namespace nd4j { auto func2 = PRAGMA_THREADS_FOR { - for (auto b = start; b < stop; b += increment) { + for (auto b = start; b < stop; b++) { auto blockBuffer = tempBuffer + b * numBlocks; Nd4jLong r = 1; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index 9d30ddcf7..23acab375 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -280,7 +280,7 @@ namespace helpers { int xsSize = xs.size(); // Scale x interpolation weights to avoid a multiplication during iteration. auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { xs[i]._bottomIndex *= channels; xs[i]._topIndex *= channels; } @@ -906,7 +906,7 @@ namespace helpers { auto outputPtr = output->bufferAsT(); // output is always float. 
TO DO: provide another float types also with template declaration auto batchProcess = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { for (auto y = 0; y < st.outHeight; ++y) { const float inY = y * st.heightScale; const float inY1 = (y + 1) * st.heightScale; @@ -961,7 +961,7 @@ namespace helpers { if (Status::OK() == res) { std::vector xCached(st.outWidth); auto cachingProcedure = PRAGMA_THREADS_FOR { - for (auto x = start; x < stop; x += increment) { + for (auto x = start; x < stop; x++) { auto &xCache = xCached[x]; const float inX = x * st.widthScale; const float inX1 = (x + 1) * st.widthScale; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp index e065174d5..b98e7f026 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp @@ -39,7 +39,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) { 'c' == output.ordering() && 1 == output.ews()){ auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const auto xStep = i*3; z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2]; } @@ -52,7 +52,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) { auto func = PRAGMA_THREADS_FOR{ Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords); @@ -99,7 +99,7 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con const Nd4jLong zDimCstride = output.stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T* xTad = x + packX.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i]; op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); @@ -157,7 +157,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, const Nd4jLong zDimCstride = output->stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T* xTad = x + packX.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i]; op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); @@ -207,7 +207,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, const Nd4jLong zDimCstride = output->stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T* xTad = x + packX.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i]; //simple M*v //tr.T*v diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp index 4bc9d3304..1fea8e4fe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp @@ -146,7 +146,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector int span = (tads / num_threads) + 8; 
auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { auto rX = const_cast(input)->bufferAsT() + tadOffsets[r]; auto rZ = output->bufferAsT() + zOfsets[r]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index c9b833cf5..aeb9e38b0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const T *x = inBuff + inTadOffsets[i]; T *y = outBuff + outTadOffsets[i]; @@ -179,7 +179,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; @@ -247,7 +247,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c else { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 683a82392..634d875d2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto h_ = h->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (uint e = start; e < stop; e += increment) { + for (uint e = start; e < stop; e++) { c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); h_[e] = nd4j::math::nd4j_tanh(c_[e]); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp index 2856e73b9..7d2eb5051 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp @@ -45,7 +45,7 @@ namespace helpers { auto n = shape::sizeAt(matrixShape, -1); auto loop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong theFirstPos[] = {theFirst, i}; Nd4jLong theSecondPos[] = {theSecond, i}; auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0); @@ -203,7 +203,7 @@ namespace helpers { auto result = -1; //auto loop = PRAGMA_THREADS_FOR { auto start = column, stop = rowNum, increment = 1; - for (auto rowCounter = start; rowCounter < stop; rowCounter += increment) { + for (auto rowCounter = start; rowCounter < stop; rowCounter++) { Nd4jLong xPos[] = {rowCounter, column}; auto xIndex = shape::getOffset(compoundShape, xPos, 0); if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { @@ -221,7 +221,7 @@ namespace helpers { Nd4jLong xDiag[] = {currentRow, currentRow}; auto diagIndex = shape::getOffset(compoundShape, xDiag, 0); auto loop = PRAGMA_THREADS_FOR { - for (int j = start; j < stop; j += increment) { + for (auto j = start; j < stop; j++) { Nd4jLong xRow[] = {j, currentRow}; auto rowIndex = shape::getOffset(compoundShape, xRow, 0); compoundBuf[rowIndex] /= 
compoundBuf[diagIndex]; //output->t(i, i); @@ -310,7 +310,7 @@ namespace helpers { permutations = permutationVectors->allTensorsAlongDimension({-1}); auto loop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { luNN_(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n); } }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp index cc43c1866..8a2048263 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp @@ -46,7 +46,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { int lO = listOut.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) for (int j = 0; j < lastDimension; ++j) listOut.at(i)->p(j, listDiag.at(i)->e(j, j)); }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp index dcca5075e..20d8bd34f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp @@ -55,7 +55,7 @@ namespace helpers { Nd4jLong oL = output->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto row = rows.at(e); output->p(e, row->e(n)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index 3e18d6d14..71beed7f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -49,7 +49,7 @@ namespace nd4j { if (tadEws >= 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = 0; e < stop; e += increment) { + for (auto e = 0; e < stop; e++) { auto cO = output + tadPack.primaryOffsets()[e]; auto idx = static_cast(indices[e]); @@ -70,7 +70,7 @@ namespace nd4j { samediff::Threads::parallel_tad(func, 0, numTads); } else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cO = output + tadPack.primaryOffsets()[e]; auto idx = static_cast(indices[e]); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp index fc572677e..df80636ee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp @@ -70,7 +70,7 @@ template static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T order = n.e(i); if(order != static_cast(order)) // if order has fractional part then do not perform calculations and return NAN output.p(i, std::numeric_limits::quiet_NaN()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp index 90b69ca6f..9e1980e54 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp @@ -113,7 +113,7 @@ namespace helpers { ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); ResultSet 
listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); auto batching = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { //qr here qrSingle(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp index bb0e7e24e..a14fb89f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp @@ -39,7 +39,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto auto d = delta.e(0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) buff[i] = s + i * d; }; samediff::Threads::parallel_for(func, 0, len); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 9ee906bd5..4c80e3bf2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -54,7 +54,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inArr == outArr) { if (inEWS == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto idx = sLength - e; swap(inArr, e, idx); } @@ -63,7 +63,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * } else if (inEWS > 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto idx1 = (sLength - e) * inEWS; Nd4jLong idx2 = e * inEWS; swap(inArr, idx1, idx2); @@ -75,7 +75,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); swap(outArr, inOffset, outOffset); @@ -93,14 +93,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { auto func = PRAGMA_THREADS_FOR { - for (Nd4jLong e = start; e < stop; e += increment) + for (Nd4jLong e = start; e < stop; e++) outArr[sLength - e] = inArr[e]; }; samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[e] = inArr[e]; }; samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); @@ -109,14 +109,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; }; samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[e * outEWS] = inArr[e * inEWS]; }; samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); @@ -125,7 +125,7 @@ static 
void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; @@ -136,7 +136,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 01e346136..09a628b84 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -114,7 +114,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& // loop through input array auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); @@ -300,7 +300,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra // loop through output array auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp index fd285ed9c..557d63fd3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp @@ -48,7 +48,7 @@ namespace helpers { const int total_count = batch_size * input_height * input_width * input_depth; auto func = PRAGMA_THREADS_FOR { - for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + for (auto inp_idx = start; inp_idx < stop; inp_idx++) { // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) const int d = inp_idx % input_depth; const int inp_idx2 = inp_idx / input_depth; @@ -74,7 +74,7 @@ namespace helpers { const int total_count = batch_size * output_depth_by_output_area; auto func = PRAGMA_THREADS_FOR { - for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + for (auto inp_idx = start; inp_idx < stop; inp_idx++) { const int n_iC_oY_bY_oX = inp_idx / block_size; const int bX = inp_idx - n_iC_oY_bY_oX * block_size; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index a3f0c01be..2de2b2d22 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -45,7 +45,7 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int Nd4jLong xCoords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, xShapeInfo, xCoords); @@ -79,7 +79,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind if(outRank == 1) { auto func = 
PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong idx = indices.e(i); NDArray out = output({idx, idx + 1}); @@ -99,7 +99,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray outSubArr = output(indices.e(i), std::vector({0})); NDArray updSubArr = updates(i, dimsToExcludeUpd); @@ -121,7 +121,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i if(outRank == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong idx = indices.e(i); NDArray out = output({idx, idx + 1}); @@ -139,7 +139,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i auto func = PRAGMA_THREADS_FOR { std::vector idxRangeOut(2*outRank, 0); - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray indSubArr = indices(i, dimsToExcludeInd); for (Nd4jLong j = 0; j < indLastDim; ++j) { @@ -170,7 +170,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr if(!calcGrad) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto subArr = updates(i, dimsToExclude); output.p(i, subArr.e(indices.e(i))); } @@ -179,7 +179,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr samediff::Threads::parallel_for(func, 0, indicesLen); } else { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto subArr = updates(i, dimsToExclude); auto ind = indices.e(i); subArr.p(ind, subArr.e(ind) - 1.); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index e20145735..08aafc98c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -169,7 +169,7 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { meanV.p(e, meanV.e(e) + listOfTensors.at(i)->e(e)); } }; @@ -223,7 +223,7 @@ namespace helpers { for (int i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { sumT->p(e, sumT->e(e) + listOfTensors.at(i)->e(e)); } }; @@ -272,7 +272,7 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { sumT->p(e, sumT->e(e) * listOfTensors.at(i)->e(e)); } }; @@ -625,7 +625,7 @@ namespace helpers { Nd4jLong loop_size = input->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) <= T(1.e-6)) output->p(e, gradOut->e(classNum)); @@ -645,7 +645,7 @@ namespace helpers { //std::vector> outputs(numOfClasses); auto func 
= PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -675,7 +675,7 @@ namespace helpers { segmentMinFunctor(context, input, indices, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) < 1.e-5) output->p(e, gradOut->e(classNum)); @@ -697,7 +697,7 @@ namespace helpers { int pos = 0; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -887,7 +887,7 @@ namespace helpers { if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.t(classNum) - input->t(e)) < 1.e-6) output->t(e) = gradOut->t(classNum); @@ -1004,7 +1004,7 @@ namespace helpers { unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); output->p(e, gradOut->e(classNum) * tempRes.e(classNum) / input->e(e)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index 59c257c28..05353bf5e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -364,7 +364,7 @@ namespace nd4j { auto func = PRAGMA_THREADS_FOR { T sneu1e[600]; - for (auto t = start; t < stop; t += increment) { + for (auto t = start; t < stop; t++) { T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; memset(neu1e, 0, vectorLength * sizeof(T)); @@ -457,7 +457,7 @@ namespace nd4j { T sneu1[600]; T sneu1e[600]; - for (int e = start; e < stop; e += increment) { + for (int e = start; e < stop; e++) { T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? 
sneu1e : new T[vectorLength]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp index 48f7f0d9a..c8774f028 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp @@ -40,7 +40,7 @@ namespace helpers { output->assign(input); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { for (auto r = 0; r < rows; r++) { for (auto c = 0; c < r; c++) { math::nd4j_swap(outputPart[batch]->t(r, c) , outputPart[batch]->t(c, r)); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp index 642dd37da..d2dd3bf30 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp @@ -143,7 +143,7 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray T* pCt = ct->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto col = start; col < stop; col += increment) { + for (auto col = start; col < stop; col++) { const auto colNum = col % d2; bool flip = colNum >= K; T maskVal = mask ? *(pMask + col) : T(1); @@ -236,7 +236,7 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr T* pGradInit = gradC0->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto col = start; col < stop; col += increment) { + for (auto col = start; col < stop; col++) { T gbF = 0.f; T gbR = 0.f; const auto colNum = col % d2; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp index db9b6afff..a3d27702d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp @@ -37,7 +37,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c int inSize = inArrs.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) outArr->p(i, inArrs[i]->t(0)); }; @@ -50,7 +50,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c int listSize = list.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) list.at(i)->assign(inArrs[i]); }; samediff::Threads::parallel_tad(func, 0, listSize); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index e38232928..c4b45b398 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -150,7 +150,7 @@ namespace helpers { result->assign(0); if (status == ND4J_STATUS_OK) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { bool found = false; for (int j = 0; j < k; j++) { if (target->e(e) == indices->e(e * k + j)) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index ea5e90cd8..1f630e8e0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -43,7 +43,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N int dLen = dOdI.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + 
for (auto i = start; i < stop; i++) { if (dOdI.t(i) != static_cast(0.f)) dOdI.t(i) = static_cast(1.f); } @@ -65,7 +65,7 @@ static void trace_(const NDArray& input, NDArray& output) { auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) output.p(i, setOfSubArrs.at(i)->getTrace()); }; samediff::Threads::parallel_for(func, 0, setOfSubArrs.size()); @@ -189,7 +189,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -220,7 +220,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -566,7 +566,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK * 3]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong *zCoordStart, *xCoordStart; if (yLastDim == xRank) { @@ -650,7 +650,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con else if (input->rankOf() == 1 && indices->isVector()) { // special case auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) output->p(e, input->e(indices->e(e))); }; @@ -663,7 +663,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray subArrOut = (*output)(i, dimsOut); NDArray subArrIn = (*input)(indices->e(i), {axis}); subArrOut.assign(subArrIn); @@ -687,7 +687,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray subArrOut = (*output)(i, {axis}); NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); subArrOut.assign(subArrIn); @@ -710,7 +710,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) arrs.at(i)->setIdentity(); }; @@ -737,7 +737,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat indices.push_back((*intArgs)[e]); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inSubArr = input(indices[i], dimsToExclude, true); auto updSubArr = updates(i, dimsToExclude, true); @@ -786,7 +786,7 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input case 6: { // copy auto func 
= PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inSubArr = input(i, dimensions); inSubArr.p(indices.t(i), updates.e(i)); } @@ -809,7 +809,7 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); Nd4jLong idx = 0; @@ -839,7 +839,7 @@ static void mergeMax_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); for (int i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); @@ -865,7 +865,7 @@ static void mergeAvg_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T sum = 0.; for (int i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); @@ -891,7 +891,7 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T sum = (T) 0.f; for (int i = 0; i < numArgs; i++) sum += inArrs[i]->e(e); @@ -928,7 +928,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T iNormActual = norm2.e(i); if (iNormActual > normClip) *listOfInSubArrs.at(i) *= normClip / iNormActual; @@ -952,7 +952,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inputSubArr = listOfInSubArrs.at(i); auto outputSubArr = listOfOutSubArrs.at(i); outputSubArr->assign(inputSubArr); @@ -1058,7 +1058,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g auto cn = clipNorm.e(0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { T N = norm2.e(i); auto gradOSubArr = gradOSubArrs.at(i); @@ -1190,7 +1190,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o auto func = PRAGMA_THREADS_FOR { Nd4jLong inIdx[MAX_RANK]; Nd4jLong outIdx[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), outIdx); for (int j = 0; j < rank; ++j) { @@ -1225,17 +1225,6 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES); -////////////////////////////////////////////////////////////////////////// -template -static void concat_(const std::vector& inArrs, NDArray& output, const int axis) { - nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis); -} - - void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { - BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, 
output, axis), LIBND4J_TYPES); - } - - BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector& inArrs, NDArray& output, const int axis), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp index ceb228439..c825a8fee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp @@ -90,7 +90,7 @@ namespace helpers { auto outputPart = output->allTensorsAlongDimension({-2, -1}); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { if (lower) { lowerTriangularSolve(context, leftPart[i], rightPart[i], adjoint, outputPart[i]); } else { @@ -112,7 +112,7 @@ namespace helpers { auto rows = input->sizeAt(-2); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { if (!lower) { for (auto r = 0; r < rows; r++) { for (auto c = 0; c <= r; c++) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp index 5d4ed9f2e..90ef634c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp @@ -64,7 +64,7 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray int xLen = x.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) z.p(i, zetaScalar(x.e(i), q.e(i))); }; diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 02b7e8467..3ea80966b 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -68,7 +68,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND int tads = tadsA.size(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto a_ = tadsA.at(e); auto b_ = tadsB.at(e); auto o_ = tadsO.at(e); diff --git a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp index 8ef63101e..3bcdea865 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp @@ -69,7 +69,7 @@ namespace helpers { } auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { values->p(e, static_cast(valuesVector[e])); if (counts != nullptr) counts->p(e, countsMap[valuesVector[e]]); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp index 1a35ecd47..8ef8032bb 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp @@ -19,8 +19,10 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0); + + BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), 
LIBND4J_TYPES, LIBND4J_TYPES); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp index be8edad04..5bb518d76 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp index 915983bb0..27b68e732 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp index d2f59137d..80e2258c7 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp index 29caeae84..e34b0c528 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp index 489d1fc6a..96797cc98 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp index 6f50c4682..70c7f3990 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp index 03a31221f..e2d1df0e9 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp +++ 
b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp index 074f09238..25e14d39f 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp index 8de7c663b..f3b4cbcb6 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp index 3e841dfae..4d1575123 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp index 59a215c20..b50c487b7 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp index 77617173d..972b936dd 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp index 2c19c3bc6..9eb99b238 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp 
b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp index cd6babb61..6558d7284 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp index b54028b42..d89652899 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp index 4ca54e7b1..40c9598ee 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp index 3d843ca4c..e49ace221 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp index d8dc34f1c..973b25edc 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp index 2c12f2803..b3bf0beeb 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9); diff --git a/libnd4j/include/ops/impl/gemm.cpp b/libnd4j/include/ops/impl/gemm.cpp index 2779bdadf..efd57a7c5 100644 --- a/libnd4j/include/ops/impl/gemm.cpp +++ b/libnd4j/include/ops/impl/gemm.cpp @@ -34,7 +34,7 @@ namespace nd4j { // handle transpose in parallel auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { for (int c = 0; c < cols; c++) { int zIdx = orderTarget == CblasRowMajor ? 
linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); @@ -73,7 +73,7 @@ namespace nd4j { C[r] = z; } else { auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) + for (auto r = start; r < stop; r++) C[r] = z; }; samediff::Threads::parallel_for(func, 0, length); @@ -130,7 +130,7 @@ namespace nd4j { auto aT = TRANS == CblasTrans ? reinterpret_cast(nd4j::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { int aIdx = linearIndexC(M, N, r, 0); auto aX = aT + aIdx; diff --git a/libnd4j/include/ops/impl/specials_double.hpp b/libnd4j/include/ops/impl/specials_double.hpp new file mode 100644 index 000000000..73f50c772 --- /dev/null +++ b/libnd4j/include/ops/impl/specials_double.hpp @@ -0,0 +1,270 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com, created on 07.10.2017. 
+// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { + + + template + void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { + auto x = reinterpret_cast(dx); + auto z = reinterpret_cast(dz); + + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + z[i] = static_cast(x[i]); + } + }; + + samediff::Threads::parallel_for(func, 0, N); + }; + + + template + void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { + int i = left, j = right; + X ktmp; + X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; + + Y vtmp; + + { + /* PARTITION PART */ + while (i <= j) { + if (descending) { + while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) + i++; + while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } else { + while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) + i++; + while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } + } + + } + + // + + if ( ((right-left) + void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { + int i = left, j = right; + X ktmp; + Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; + + Y vtmp; + + { + /* PARTITION PART */ + while (i <= j) { + if (descending) { + while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) + i++; + while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } else { + while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) + i++; + while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + 
j--; + } + } + } + + } + + // + + if ( ((right-left) + static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + auto values = reinterpret_cast(yarray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + } + + template + static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + auto values = reinterpret_cast(yarray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + } + + template + void DoubleMethods::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { + quickSort_parallel_key(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void DoubleMethods::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { + quickSort_parallel_value(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void DoubleMethods::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { + auto x = reinterpret_cast(vx); + auto y = reinterpret_cast(vy); + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + + auto xLength = shape::length(xShapeInfo); + auto xTadLength = shape::length(packX.primaryShapeInfo()); + auto numTads = packX.numberOfTads(); + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; + + quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); + } + + template + void DoubleMethods::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { + auto x = reinterpret_cast(vx); + auto y = reinterpret_cast(vy); + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + + auto xLength = shape::length(xShapeInfo); + auto xTadLength = shape::length(packX.primaryShapeInfo()); + auto numTads = packX.numberOfTads(); + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; + + quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); + } +} + diff --git a/libnd4j/include/ops/impl/specials.hpp b/libnd4j/include/ops/impl/specials_single.hpp similarity index 56% rename from 
libnd4j/include/ops/impl/specials.hpp rename to libnd4j/include/ops/impl/specials_single.hpp index 207ca5964..030e9c6d7 100644 --- a/libnd4j/include/ops/impl/specials.hpp +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -64,7 +64,7 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND T* outBuff = output.bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { const Nd4jLong arrLen = inArrs[r]->lengthOf(); const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; @@ -99,7 +99,7 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND } auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto temp = output(indices[i], true); nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); } @@ -143,7 +143,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint auto x = reinterpret_cast(vx); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (auto ar = 0L; ar < n; ar++) { z[i] += x[ar][i]; } @@ -179,7 +179,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint } auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (Nd4jLong ar = 1; ar < n; ar++) { z[i] += x[ar][i] / static_cast(n); } @@ -199,7 +199,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint // aggregation step auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (Nd4jLong ar = 0; ar < n; ar++) { z[i] += x[ar][i] / static_cast(n); } @@ -336,7 +336,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) int numTads = xLength / xTadLength; auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { T *dx = x + tadOffsets[r]; quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); @@ -358,7 +358,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { for (int bitId = 0; bitId < 16; bitId++) { bool hasBit = (x[e] & 1 << (bitId)) != 0; bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; @@ -378,22 +378,6 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) samediff::Threads::parallel_for(func, 4, lim); } - template - void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { - auto x = reinterpret_cast(dx); - auto z = reinterpret_cast(dz); - - - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - z[i] = static_cast(x[i]); - } - }; - - samediff::Threads::parallel_for(func, 0, N); - }; - BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); - template Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { auto dx = reinterpret_cast(vx); @@ -442,226 +426,5 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) }; return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); } - - template - void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, 
int left, int right, int cutoff, bool descending) { - int i = left, j = right; - X ktmp; - X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; - - Y vtmp; - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) - i++; - while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; - values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; - values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } else { - while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) - i++; - while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; - values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; - values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { - int i = left, j = right; - X ktmp; - Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; - - Y vtmp; - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) - i++; - while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; - value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; - value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } else { - while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) - i++; - while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; - value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; - value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - auto values = reinterpret_cast(yarray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - } - - template - static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, 
Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - auto values = reinterpret_cast(yarray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - } - - template - void DoubleMethods::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { - quickSort_parallel_key(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - } - - template - void DoubleMethods::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { - quickSort_parallel_value(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - } - - template - void DoubleMethods::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); - - auto xLength = shape::length(xShapeInfo); - auto xTadLength = shape::length(packX.primaryShapeInfo()); - auto numTads = packX.numberOfTads(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; - - quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } - }; - - samediff::Threads::parallel_tad(func, 0, numTads); - } - - template - void DoubleMethods::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); - - auto xLength = shape::length(xShapeInfo); - auto xTadLength = shape::length(packX.primaryShapeInfo()); - auto numTads = packX.numberOfTads(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; - - quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } - }; - - samediff::Threads::parallel_tad(func, 0, numTads); - } - - //BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES); - //BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index a25aa36ec..354f8e328 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -167,7 +167,7 @@ namespace randomOps { if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T prob = rng->relativeT(e); T cumProb = (T) 0.0f; for (Nd4jLong f = 0; f < yLength; f++) { @@ -330,7 +330,7 @@ namespace randomOps { const T 
epsilon = static_cast(1e-5); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto epm = e + middle; // we need to get random values @@ -440,7 +440,7 @@ namespace randomOps { nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { - for (Nd4jLong e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -549,7 +549,7 @@ namespace randomOps { //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -690,7 +690,7 @@ namespace randomOps { const T epsilon = static_cast(1e-5); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { if (z[e] > mean + ds || z[e] < mean - ds) { z[e] = step(rng, mean, stddev, e, middle, z[e]); @@ -818,7 +818,7 @@ namespace randomOps { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto epm = e + middle; // we need to get random values From f7a919040794a2d69c1e7a10ac6493726124eda6 Mon Sep 17 00:00:00 2001 From: Yurii Shyrma Date: Thu, 20 Feb 2020 20:19:01 +0200 Subject: [PATCH 18/19] profiling of concat op (both cuda and cpu) (#151) * - profiling of concat op (both cuda and cpu) Signed-off-by: Yurii * better comparison for large concat Signed-off-by: raver119 * - further improving of concat op Signed-off-by: Yurii * some loggin Signed-off-by: raver119 * - add possibility to verify presence of trailing unities in shape and set strides/ews correspondingly - restrict second simple case in concat op to c order only Signed-off-by: Yurii * - move concat op to specials_single.cpp file Signed-off-by: Yurii * - get rid of second concat op declaration in transforms.cpp file Signed-off-by: Yurii Co-authored-by: raver119 --- libnd4j/blas/NDArray.hpp | 12 +- libnd4j/include/helpers/shape.h | 90 ++++++--- .../declarable/generic/transforms/concat.cpp | 8 +- .../ops/declarable/helpers/cuda/concat.cu | 108 ++++++++-- libnd4j/include/ops/impl/specials_single.hpp | 190 +++++++++++++----- .../layers_tests/DeclarableOpsTests9.cpp | 10 +- .../tests_cpu/layers_tests/NDArrayTests2.cpp | 2 +- .../tests_cpu/layers_tests/ShapeTests2.cpp | 52 ++--- .../java/org/nd4j/nativeblas/Nd4jCuda.java | 24 +-- .../linalg/shape/concat/ConcatTestsC.java | 6 + 10 files changed, 348 insertions(+), 154 deletions(-) diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 6c5f6a8c8..f7c6d0684 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -4866,7 +4866,7 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni } } - Nd4jLong *shapeInfoNoUnities = newShapeInfo; + Nd4jLong *newShapeInfo2 = newShapeInfo; if(!keepUnitiesInShape) { @@ -4877,18 +4877,18 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni dimsWithUnities.push_back(d); if(!dimsWithUnities.empty()) - shapeInfoNoUnities = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace()); + newShapeInfo2 = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), 
dimsWithUnities.data(), getContext()->getWorkspace()); } // check if there is possibility to set ews = 1 - shape::checkStridesSetEwsAndOrder(shapeInfoNoUnities); + shape::checkStridesEwsAndOrder(newShapeInfo2); - NDArray result(_buffer, ShapeDescriptor(shapeInfoNoUnities), getContext(), offset + getBufferOffset()); + NDArray result(_buffer, ShapeDescriptor(newShapeInfo2), getContext(), offset + getBufferOffset()); result._isView = true; RELEASE(newShapeInfo, getContext()->getWorkspace()); - if(newShapeInfo != shapeInfoNoUnities) - RELEASE(shapeInfoNoUnities, getContext()->getWorkspace()); + if(newShapeInfo != newShapeInfo2) + RELEASE(newShapeInfo2, getContext()->getWorkspace()); return result; } diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index 3d1d96f4b..d4e95c65f 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -900,9 +900,9 @@ namespace shape { * @return the double at the specified index */ - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0); - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *indices, Nd4jLong baseOffset = 0); - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *indices, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0); ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank); @@ -1014,8 +1014,8 @@ namespace shape { // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities); - ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo); + ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities); + ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo); /** * processes whole set of sub-arrays @@ -1041,7 +1041,7 @@ namespace shape { ND4J_EXPORT _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities); /** - * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2 * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} */ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo); @@ -2071,7 +2071,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn shapeInfo[i + 1 + rank] = temp[rearrange[i] + 1 + rank]; } - shape::checkStridesSetEwsAndOrder(shapeInfo); + 
shape::checkStridesEwsAndOrder(shapeInfo); delete[] temp; } @@ -2483,7 +2483,7 @@ INLINEDEF _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape) newShapeBuffer[2 * newRank + 3] = shape::order(shapeBuffer); // correct order and ews if necessary - shape::checkStridesSetEwsAndOrder(newShapeBuffer); + shape::checkStridesEwsAndOrder(newShapeBuffer); delete[] indices; @@ -4092,7 +4092,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap // set ews if(oldEws == 0) - shape::checkStridesSetEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order + shape::checkStridesEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order else { newShapeInfo[2 * newRank + 3] = oldOrder; // order *shape::ews(newShapeInfo) = oldEws; // ews @@ -4642,7 +4642,7 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) { +INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo) { // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities) Nd4jLong tempBuffer[2*MAX_RANK]; @@ -4651,11 +4651,11 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) { // exclude unities from shapeInfo const int numOfNonUnities = shape::excludeUnitiesFromShapeInfo(shapeInfo, shape, strides); - shape::checkStridesSetEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides); + shape::checkStridesEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides); } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) { +INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) { const int rank = shape::rank(shapeInfo); @@ -4673,19 +4673,32 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch bool contiguous = true; - // *** check whether strides are in c contiguous order ***// - if(stridesNoUnities[numOfNonUnities - 1] != 1) // last stride should be always unity for c order - contiguous = false; - else { - for (uint i = 0; i < numOfNonUnities - 1; ++i) { - if(stridesNoUnities[i] != stridesNoUnities[i + 1] * shapeNoUnities[i + 1]) { - contiguous = false; - break; - } + //*** check whether strides are in c contiguous order ***// + for (uint i = 0; i < numOfNonUnities - 1; ++i) { + if(stridesNoUnities[i] != shapeNoUnities[i + 1] * stridesNoUnities[i + 1]) { + contiguous = false; + break; } } + if(contiguous) { - *shape::ews(shapeInfo) = 1; + + // for example we have shapeInfo = {3, 5,1,1, 4,4,1, ...} then we should change it to shapeInfo = {3, 5,1,1, 4,4,4, ...ews=4} + if(numOfNonUnities < rank) { // unities are present in shape + + int indNonUnit = rank - 1; + + while(shape::shapeOf(shapeInfo)[indNonUnit--] == 1) + + for(int j = indNonUnit + 2; j < rank; ++j) + shape::stride(shapeInfo)[j] = stridesNoUnities[numOfNonUnities - 1]; + + for(int j = indNonUnit; j >= 0; --j) + if(shape::shapeOf(shapeInfo)[j] == 1) + 
shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j + 1] * shape::stride(shapeInfo)[j + 1]; + } + + *shape::ews(shapeInfo) = stridesNoUnities[numOfNonUnities - 1]; shapeInfo[rank * 2 + 3] = 99; return; } @@ -4693,18 +4706,31 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch contiguous = true; //*** check whether strides are in f contiguous order ***// - if(stridesNoUnities[0] != 1) // first stride should be always unity for f order - contiguous = false; - else { - for (uint i = 1; i < numOfNonUnities; ++i) { - if(stridesNoUnities[i] != stridesNoUnities[i - 1] * shapeNoUnities[i - 1]) { - contiguous = false; - break; - } + for (uint i = 1; i < numOfNonUnities; ++i) { + if(stridesNoUnities[i] != shapeNoUnities[i - 1] * stridesNoUnities[i - 1]) { + contiguous = false; + break; } } + if(contiguous) { - *shape::ews(shapeInfo) = 1; + + // for example we have shapeInfo = {3, 1,1,5, 1,4,4, ...} then we should change it to shapeInfo = {3, 1,1,5, 4,4,4, ...ews=4} + if(numOfNonUnities < rank) { // unities are present in shape + + int indNonUnit = 0; + + while(shape::shapeOf(shapeInfo)[indNonUnit++] == 1) + + for(int j = 0; j < indNonUnit - 1; ++j) + shape::stride(shapeInfo)[j] = stridesNoUnities[0]; + + for(int j = indNonUnit; j < rank; ++j) + if(shape::shapeOf(shapeInfo)[j] == 1) + shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j - 1] * shape::stride(shapeInfo)[j - 1]; + } + + *shape::ews(shapeInfo) = stridesNoUnities[0]; shapeInfo[rank * 2 + 3] = 102; return; } @@ -4756,7 +4782,7 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo shape::calcOffsets(dimsSize, shape, strides, subArrOffsets); // evaluate ews - shape::checkStridesSetEwsAndOrder(subArrShapeInfo); + shape::checkStridesEwsAndOrder(subArrShapeInfo); delete []strides; delete []shape; diff --git a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp index 2003eef3f..faa59fa6c 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp @@ -42,8 +42,8 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { std::vector arrsToDelete; int index = 0; bool allOfSameType = true; - auto theFirstRank = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0; - auto theFirstDatatype = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType(); + auto rankOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0; + auto typeOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType(); for(int i = 0; i < numOfInArrs; ++i) { auto input = INPUT_VARIABLE(i); @@ -51,10 +51,10 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { // TODO: follow two lines are in accordance to current tf.concat spec. 
Commented for compatibility with legacy // REQUIRE_TRUE(currentRank > 0, 0, "Rank of input variable %i must be greater 0, but is %lld instead.", i, currentRank); -// REQUIRE_TRUE(theFirstRank == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, theFirstRank); +// REQUIRE_TRUE(rankOfFirstArr == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, rankOfFirstArr); if(!input->isEmpty()) { - allOfSameType &= (theFirstDatatype == input->dataType()); + allOfSameType &= (typeOfFirstArr == input->dataType()); if(input->rankOf() == 0) { auto vec = new NDArray('c', {1}, input->dataType(), block.launchContext()); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu index 43c0e4af9..b455ff659 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu @@ -85,38 +85,106 @@ BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int blocksPerGrid ////////////////////////////////////////////////////////////////////////// void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { - const int threadsPerBlock = 256; - const int blocksPerGrid = 512; - const int sharedMem = 512; + const int numOfInArrs = inArrs.size(); + const auto sizeofT = output.sizeOfT(); - const int numOfArrs = inArrs.size(); - - for(int i = 0; i < numOfArrs; ++i) + for(int i = 0; i < numOfInArrs; ++i) inArrs[i]->syncToDevice(); - output.syncToDevice(); - // prepare arrays of pointers on buffers and shapes - std::vector hInBuffers(numOfArrs); - std::vector hInShapeInfo(numOfArrs); + bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1; - for(int i = 0; i < numOfArrs; ++i) { - hInBuffers[i] = inArrs[i]->getSpecialBuffer(); - hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo(); + if(luckCase1) { + for (uint i = 0; i < numOfInArrs; ++i) { + luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1; + if(!luckCase1) + break; + } } - PointersManager manager(context, "helpers::concat"); + if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f - void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*)); - void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*)); + void* z = static_cast(output.getSpecialBuffer()); - BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES); + for (uint i = 0; i < numOfInArrs; ++i) { + const auto memAmountToCopy = inArrs[i]->lengthOf() * sizeofT; + cudaMemcpyAsync(z, static_cast(inArrs[i]->getSpecialBuffer()), memAmountToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream()); + z = static_cast(z) + memAmountToCopy; + } - manager.synchronize(); + if(cudaStreamSynchronize(*context->getCudaStream()) != 0) + throw std::runtime_error("concat cuda: luckCase1 failed!"); - for(int i = 0; i < numOfArrs; ++i) + for(int i = 0; i < numOfInArrs; ++i) + inArrs[i]->tickReadDevice(); + output.tickWriteDevice(); + + return; + } + + const bool 
isZcontin = output.strideAt(axis) == 1; + bool areInputsContin = true; + bool allSameOrder = true; + + if(isZcontin) { + for (uint i = 0; i < inArrs.size(); ++i) { + areInputsContin &= inArrs[i]->strideAt(axis) == 1; + allSameOrder &= output.ordering() == inArrs[i]->ordering(); + if(!areInputsContin || !allSameOrder) + break; + } + } + + const bool luckCase2 = isZcontin && areInputsContin && allSameOrder; + + if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array + + const uint zDim = output.sizeAt(axis); + + for (uint i = 0; i < output.lengthOf() / zDim; ++i) { + + const auto iShift = i * sizeofT; + void* z = static_cast(output.getSpecialBuffer()) + zDim * iShift; + + for (uint j = 0; j < numOfInArrs; ++j) { + const auto xDim = inArrs[j]->sizeAt(axis); + void* x = static_cast(inArrs[j]->getSpecialBuffer()) + xDim * iShift; + const auto memSizeToCopy = xDim * sizeofT; + cudaMemcpyAsync(z, x, memSizeToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream()); + z = static_cast(z) + memSizeToCopy; + } + } + + if(cudaStreamSynchronize(*context->getCudaStream()) != 0) + throw std::runtime_error("concat cuda: luckCase2 failed!"); + } + else { // general (slower) case + + const int threadsPerBlock = 256; + const int blocksPerGrid = 512; + const int sharedMem = 512; + + // prepare arrays of pointers on buffers and shapes + std::vector hInBuffers(numOfInArrs); + std::vector hInShapeInfo(numOfInArrs); + + for(int i = 0; i < numOfInArrs; ++i) { + hInBuffers[i] = inArrs[i]->getSpecialBuffer(); + hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo(); + } + + PointersManager manager(context, "helpers::concat"); + + void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*)); + void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*)); + + BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES); + + manager.synchronize(); + } + + for(int i = 0; i < numOfInArrs; ++i) inArrs[i]->tickReadDevice(); - output.tickWriteDevice(); } diff --git a/libnd4j/include/ops/impl/specials_single.hpp b/libnd4j/include/ops/impl/specials_single.hpp index 030e9c6d7..ad63ee490 100644 --- a/libnd4j/include/ops/impl/specials_single.hpp +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -31,81 +31,170 @@ #include namespace nd4j { - /** * Concatneate multi array of the same shape together * along a particular dimension */ +// template +// void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { +// const uint numOfArrs = inArrs.size(); + +// int outDim; +// const bool isOutputVector = output.isCommonVector(outDim); + +// if(isOutputVector || (axis == 0 && output.ordering() == 'c')) { + +// bool allVectorsOrScalars = true; +// const uint outEws = isOutputVector ? 
output.stridesOf()[outDim] : output.ews(); + +// std::vector nonUnityDim(numOfArrs); +// std::vector zOffset(numOfArrs); + +// for(int i = 0; i < numOfArrs; i++) { +// allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i])); +// if(!allVectorsOrScalars) +// break; +// if(i == 0) zOffset[0] = 0; +// else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf(); +// } + +// if(allVectorsOrScalars) { + +// T* outBuff = output.bufferAsT(); + +// auto func = PRAGMA_THREADS_FOR { +// for (auto r = start; r < stop; r += increment) { +// const Nd4jLong arrLen = inArrs[r]->lengthOf(); +// const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + +// T *z = outBuff + zOffset[r]; +// T *x = inArrs[r]->bufferAsT(); + +// if (outEws == 1 && xEws == 1) +// for (Nd4jLong e = 0; e < arrLen; e++) +// z[e] = x[e]; +// else +// for (Nd4jLong e = 0; e < arrLen; e++) +// z[e * outEws] = x[e * xEws]; +// } +// }; + +// samediff::Threads::parallel_tad(func, 0, numOfArrs); +// return; +// } +// } + +// const int rank = inArrs[0]->rankOf(); +// const int rank2 = 2*rank; +// std::vector> indices(numOfArrs, std::vector(rank2,0)); + +// // take into account indices for first array +// indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis); + +// // loop through the rest of input arrays +// for(int i = 1; i < numOfArrs; ++i) { +// indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from +// indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) +// } + +// auto func = PRAGMA_THREADS_FOR { +// for (auto i = start; i < stop; i += increment) { +// auto temp = output(indices[i], true); +// nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); +// } +// }; + +// samediff::Threads::parallel_tad(func, 0, numOfArrs); +// } + template void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { - const uint numOfArrs = inArrs.size(); - int outDim; - const bool isOutputVector = output.isCommonVector(outDim); + const int numOfInArrs = inArrs.size(); + const auto sizeofT = output.sizeOfT(); - if(isOutputVector || (axis == 0 && output.ordering() == 'c')) { + T* zBuff = output.bufferAsT(); - bool allVectorsOrScalars = true; - const uint outEws = isOutputVector ? 
output.stridesOf()[outDim] : output.ews(); + bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1; - std::vector nonUnityDim(numOfArrs); - std::vector zOffset(numOfArrs); + if(luckCase1) { + for (uint i = 0; i < numOfInArrs; ++i) { + luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1; + if(!luckCase1) + break; + } + } - for(int i = 0; i < numOfArrs; i++) { - allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i])); - if(!allVectorsOrScalars) - break; - if(i == 0) zOffset[0] = 0; - else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf(); - } + if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f - if(allVectorsOrScalars) { + T* z = zBuff; + for (uint i = 0; i < numOfInArrs; ++i) { + const auto memAmountToCopy = inArrs[i]->lengthOf(); + memcpy(z, inArrs[i]->bufferAsT(), memAmountToCopy * sizeofT); + z += memAmountToCopy; + } + return; + } - T* outBuff = output.bufferAsT(); + const bool isZcontin = output.strideAt(axis) == 1 && output.ordering() == 'c'; + bool areInputsContin = true; + bool allSameOrder = true; - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r++) { - const Nd4jLong arrLen = inArrs[r]->lengthOf(); - const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + if(isZcontin) { + for (uint i = 0; i < numOfInArrs; ++i) { + areInputsContin &= inArrs[i]->strideAt(axis) == 1; + allSameOrder &= inArrs[i]->ordering() == output.ordering(); + if(!areInputsContin || !allSameOrder) + break; + } + } - T *z = outBuff + zOffset[r]; - T *x = inArrs[r]->bufferAsT(); + const bool luckCase2 = isZcontin && areInputsContin && allSameOrder; - if (outEws == 1 && xEws == 1) - for (Nd4jLong e = 0; e < arrLen; e++) - z[e] = x[e]; - else - for (Nd4jLong e = 0; e < arrLen; e++) - z[e * outEws] = x[e * xEws]; - } - }; + if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array - samediff::Threads::parallel_tad(func, 0, numOfArrs); - return; + const uint zDim = output.sizeAt(axis); + + for (uint i = 0; i < output.lengthOf() / zDim; ++i) { + T* z = zBuff + zDim * i; + + for (uint j = 0; j < inArrs.size(); ++j) { + const auto xDim = inArrs[j]->sizeAt(axis); + const T* x = inArrs[j]->bufferAsT() + xDim * i; + memcpy(z, x, xDim * sizeofT); + z += xDim; } } - const int rank = inArrs[0]->rankOf(); - const int rank2 = 2*rank; - std::vector> indices(numOfArrs, std::vector(rank2,0)); + return; + } - // take into account indices for first array - indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis); + // general case + auto func = PRAGMA_THREADS_FOR { - // loop through the rest of input arrays - for(int i = 1; i < numOfArrs; ++i) { - indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from - indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) - } + Nd4jLong coords[MAX_RANK]; - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i++) { - auto temp = output(indices[i], true); - nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); + for (auto i = start; i < stop; i += increment) { + + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = 
shape::getOffset(output.getShapeInfo(), coords); + + uint inArrIdx = 0; + uint xDim = inArrs[inArrIdx]->sizeAt(axis); + + while (coords[axis] >= xDim) { + coords[axis] -= xDim; + xDim = inArrs[++inArrIdx]->sizeAt(axis); } - }; - samediff::Threads::parallel_tad(func, 0, numOfArrs); + const T* x = inArrs[inArrIdx]->bufferAsT(); + const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords); + + zBuff[zOffset] = x[xOffset]; + } + }; + + samediff::Threads::parallel_for(func, 0, output.lengthOf()); } /** @@ -128,6 +217,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint delete inputs[i]; } + /** * This kernel accumulates X arrays, and stores result into Z * diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 77634b052..773e1dc18 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -300,6 +300,8 @@ TEST_F(DeclarableOpsTests9, concat_test3) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); + output->printBuffer(); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -620,12 +622,12 @@ TEST_F(DeclarableOpsTests9, concat_test18) { // we crate bunch of arrays, filled with specific values for (int e = 0; e < 2000; e++) { - auto array = NDArrayFactory::create_('c', {1, 300}); + auto array = NDArrayFactory::create_('c', {1, 300}); array->assign(e); context.setInputArray(e, array, true); } - auto z = NDArrayFactory::create('c', {2000, 300}); + auto z = NDArrayFactory::create('c', {2000, 300}); context.setOutputArray(0, &z, false); context.setIArguments(&axis, 1); @@ -633,8 +635,10 @@ TEST_F(DeclarableOpsTests9, concat_test18) { op.execute(&context); for (int e = 0; e < 2000; e++) { + auto exp = NDArrayFactory::create('c', {300}); + exp.assign(e); auto row = z.tensorAlongDimension(e, {1}); - ASSERT_NEAR((float) e, row.e(0), 1e-5f); + ASSERT_EQ(exp, row); } } diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp index e3dc1aefc..6d5366396 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp @@ -956,7 +956,7 @@ TEST_F(NDArrayTest2, subarray_1) { float buffExpX3[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000}; Nd4jLong shapeExpX4[] = {3, 2, 1, 4, 12, 4, 1, 8192, 0, 99}; float buffExpX4[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000}; - Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 0, 99}; + Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 4, 99}; float buffExpX5[] = {4.000000, 8.000000, 12.000000, 16.000000, 20.000000, 24.000000}; Nd4jLong shapeExpY0[] = {1, 2, 1, 8192, 1, 102}; diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp index a8f430fe3..fb0d7991a 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp @@ -43,7 +43,7 @@ public: Nd4jLong shape[3] = {3,4,5}; Nd4jLong *shapeBuffer; ThreeDTest() { - shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); } ~ThreeDTest() { delete[] shapeBuffer; @@ -196,11 +196,11 @@ public: int dimensionLength = 2; int dimension[2] = {2,3}; Nd4jLong tadAssertionC[10] = 
{3,4,4,1,4,1,16,16384,1,99}; - Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,1,16384,1,102}; + Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,16,16384,1,102}; }; -TEST_F(LeadingOnes,OnesTest) { +TEST_F(LeadingOnes,OnesTest) { shape::TAD *cTad = new shape::TAD; cTad->init(shapeBufferC,dimension,dimensionLength); @@ -222,7 +222,7 @@ TEST_F(LeadingOnes,OnesTest) { class NormalThreeFourFive : public testing::Test { public: - Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 0, 102}; + Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 5, 99}; Nd4jLong inputShapeBuffer[10] = {3,3,4,5,20,5,1,16384,1,99}; int dimensionLength = 2; int dimension[2] = {0,1}; @@ -243,7 +243,7 @@ class DimensionWarning : public testing::Test { public: int dimensionLength = 2; int dimensions[2] = {0,1}; - Nd4jLong shape[3] = {1,5,1}; + Nd4jLong shape[3] = {1,5,1}; Nd4jLong *shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); ~DimensionWarning() { @@ -324,7 +324,7 @@ public: int dimensionFour = 0; int dimensionLength = 1; FourDTest() { - threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); + threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); fourDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 4, fourDShape); } ~FourDTest() { @@ -491,7 +491,7 @@ TEST_F(LabelTest,LabelTad) { delete tad; } -TEST_F(ExpectedValuesTest,TadTest) { +TEST_F(ExpectedValuesTest,TadTest) { auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, mainShape); shape::TAD *tad = new shape::TAD; tad->init(shapeBuffer,testDimensions,3); @@ -528,7 +528,7 @@ TEST_F(ThreeDTest,TensorAlongDimensionTest) { } -TEST_F(NumTadTests,TadTest) { +TEST_F(NumTadTests,TadTest) { auto shape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, this->shape); shape::TAD *tad = new shape::TAD; tad->init(shape,&dimension,1); @@ -539,7 +539,7 @@ TEST_F(NumTadTests,TadTest) { } TEST_F(TADStall,TestStall) { - auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); shape::TAD *tad = new shape::TAD; tad->init(0,shapeInfo,this->dimensions,3); tad->createTadOnlyShapeInfo(); @@ -564,7 +564,7 @@ TEST_F(PermuteTest,PermuteShapeBufferTest) { Nd4jLong shapeToPermute[4] = {5,3,2,6}; Nd4jLong permutedOrder[4] = {6,2,3,5}; auto shapeBufferOriginal = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); - auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); + auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); shape::permuteShapeBufferInPlace(shapeBufferOriginal,normalOrder,shapeBufferOriginal); EXPECT_TRUE(arrsEquals(4,assertionShapeBuffer,shapeBufferOriginal)); @@ -585,9 +585,9 @@ TEST_F(ElementWiseStrideTest,ElementWiseStrideTest) { TEST_F(SliceVectorTest,RowColumnVectorTest) { Nd4jLong rowVectorShape[2] = {1,5}; - auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); + auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); Nd4jLong colVectorShape[2] = {5,1}; - auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, 
colVectorShape); + auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, colVectorShape); Nd4jLong *sliceRow = shape::sliceOfShapeBuffer(0,rowVectorShapeInfo); EXPECT_TRUE(arrsEquals(2,rowVectorShapeInfo,sliceRow)); Nd4jLong *scalarSliceInfo = shape::createScalarShapeInfo(); @@ -608,7 +608,7 @@ TEST_F(SliceTensorTest,TestSlice) { Nd4jLong shape[3] = {3,3,2}; auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); Nd4jLong sliceShape[2] = {3,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -619,9 +619,9 @@ TEST_F(SliceTensorTest,TestSlice) { TEST_F(SliceMatrixTest,TestSlice) { Nd4jLong shape[2] = {3,2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); Nd4jLong sliceShape[2] = {1,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -664,13 +664,13 @@ TEST_F(TensorTwoFromFourDDimTest,TadTwoFromFourDimTest) { //Along dimension 1,2: expect matrix with shape [cols,dim2] //Along dimension 1,3: expect matrix with shape [cols,dim3] //Along dimension 2,3: expect matrix with shape [dim2,dim3] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; EXPECT_TRUE(arrsEquals(shape::rank(expectedShapeBuffer),expectedShape,shape::shapeOf(testShapeBuffer))); @@ -687,14 +687,14 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { //Along dimension 0,1: expect matrix with shape [rows,cols] //Along dimension 0,2: expect matrix with shape [rows,dim2] //Along dimension 1,2: expect matrix with shape [cols,dim2] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 
dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; Nd4jLong *expectedStride = expectedStrides[i]; @@ -715,7 +715,7 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { TEST_F(TensorOneDimTest,TadDimensionsForTensor) { Nd4jLong shape[3] = {rows,cols,dim2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); for(int i = 0; i < rank; i++) { //Along dimension 0: expect row vector with length 'dims[i]' @@ -737,14 +737,14 @@ TEST_F(TensorOneDimTest,TadDimensionsForTensor) { TEST_F(MatrixTest,TadDimensionsForMatrix) { Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); shape::TAD *dimZero = new shape::TAD; dimZero->init(shapeBuffer,&dims[0],1); shape::TAD *dimOne = new shape::TAD; dimOne->init(shapeBuffer,&dims[1],1); //Along dimension 0: expect row vector with length 'rows' - Nd4jLong rowVectorShape[2] = {1,rows}; + Nd4jLong rowVectorShape[2] = {1,rows}; auto expectedDimZeroShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); dimZero->createTadOnlyShapeInfo(); Nd4jLong *testDimZero = dimZero->tadOnlyShapeInfo; @@ -753,7 +753,7 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { delete[] expectedDimZeroShape; //Along dimension 1: expect row vector with length 'cols' - Nd4jLong rowVectorColShape[2] {1,cols}; + Nd4jLong rowVectorColShape[2] {1,cols}; auto expectedDimOneShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorColShape); dimOne->createTadOnlyShapeInfo(); Nd4jLong *testDimOneShape = dimOne->tadOnlyShapeInfo; @@ -767,12 +767,12 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { } TEST_F(VectorTest,VectorTadShape) { - Nd4jLong rowVector[2] = {2,2}; + Nd4jLong rowVector[2] = {2,2}; auto rowBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVector); int rowDimension = 1; Nd4jLong columnVector[2] = {2,2}; - auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); + auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); int colDimension = 0; @@ -811,7 +811,7 @@ TEST_F(VectorTest,LinspaceCombinationTest) { int len = rows * cols; double *linspaced = linspace(1,rows * cols,len); Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); delete[] shapeBuffer; delete[] linspaced; diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index c8b15c1a2..db2c941e9 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -7742,18 +7742,18 @@ public static final int PREALLOC_SIZE = 33554432; * @return the double at the specified index */ - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer 
shapeInfo, @Cast("const Nd4jLong*") LongPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long 
getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords); @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer createShapeInfo(@Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer stride, int rank); @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer createShapeInfo(@Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer stride, int rank); diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java index 90e9015b1..bad97296f 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java @@ -16,6 +16,7 @@ package org.nd4j.linalg.shape.concat; +import lombok.extern.slf4j.Slf4j; import lombok.val; import org.junit.Ignore; import org.junit.Test; @@ -43,6 +44,7 @@ import static org.junit.Assert.assertTrue; /** * @author Adam Gibson */ +@Slf4j @RunWith(Parameterized.class) public class ConcatTestsC extends BaseNd4jTest { @@ -309,7 +311,11 @@ public class ConcatTestsC extends BaseNd4jTest { for (int e = 0; e < 20000; e++) list.add(Nd4j.create(DataType.INT, 1, 300).assign(e)); + val timeStart = System.nanoTime(); val result = Nd4j.concat(0, list.toArray(new INDArray[list.size()])); + val timeEnd = System.nanoTime(); + + log.info("Time: {} us", (timeEnd - timeStart) / 1000); for (int e = 0; e < 20000; e++) assertEquals((float) e, result.getRow(e).meanNumber().floatValue(), 1e-5f); From 0748c7e7c2d05e932b5053d11933438d512738f2 Mon Sep 17 00:00:00 2001 From: Oleh Date: Fri, 21 Feb 2020 06:46:05 +0200 Subject: [PATCH 19/19] Oleh broadcast4d (#257) * libnd4j raw implementation of native broadcast for special cases Signed-off-by: Oleg * libnd4j fixed bugs for special case of 4D loop broadcast, add some tests, need more testing and discussion Signed-off-by: Oleg * libnd4j added 3D and 5D cases support and tests, need testing with different orders Signed-off-by: Oleg * libnd4j correctd case selection for broadcast 3,4,5D loops, fixed several places for more stable behavior, clean up Signed-off-by: Oleg * libnd4j minor corrections to avoid some risks in strides selection, added tests and rename some variables Signed-off-by: Oleg * libnd4j optimize usage the stride selection for all loops in separate ShapeUtils method copyCertainStridesFromShapeInfo, merge master Signed-off-by: Oleg * libnd4j remove per request several tests for 3D, 4D and 5D broadcast loops Signed-off-by: Oleg * libnd4j removed some loac changes that had not been sync with serve playground, turn on new loops usage --- libnd4j/blas/cpu/NativeOpExecutioner.cpp | 12 + libnd4j/include/helpers/LoopKind.h | 21 +- libnd4j/include/helpers/ShapeUtils.h | 11 + libnd4j/include/helpers/impl/ShapeUtils.cpp | 23 ++ libnd4j/include/loops/cpu/broadcasting.hpp | 107 ++++++++- 
.../layers_tests/DeclarableOpsTests14.cpp | 219 ++++++++++++++++-- 6 files changed, 378 insertions(+), 15 deletions(-) diff --git a/libnd4j/blas/cpu/NativeOpExecutioner.cpp b/libnd4j/blas/cpu/NativeOpExecutioner.cpp index cbc224838..1fedb0241 100644 --- a/libnd4j/blas/cpu/NativeOpExecutioner.cpp +++ b/libnd4j/blas/cpu/NativeOpExecutioner.cpp @@ -180,6 +180,18 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, numTads = shape::length(hYShapeInfo); } break; + case nd4j::LoopKind::BROADCAST_3D: { + numTads = shape::sizeAt(hZShapeInfo, 0); + } + break; + case nd4j::LoopKind::BROADCAST_4D: { + numTads = shape::sizeAt(hZShapeInfo, 0) * shape::sizeAt(hZShapeInfo, 1); + } + break; + case nd4j::LoopKind::BROADCAST_5D: { + numTads = shape::sizeAt(hZShapeInfo, 0) * shape::sizeAt(hZShapeInfo, 1); + } + break; default: { auto xLen = shape::length(hXShapeInfo); auto yLen = shape::length(hYShapeInfo); diff --git a/libnd4j/include/helpers/LoopKind.h b/libnd4j/include/helpers/LoopKind.h index ddd1c95e5..d97f3b225 100644 --- a/libnd4j/include/helpers/LoopKind.h +++ b/libnd4j/include/helpers/LoopKind.h @@ -37,7 +37,7 @@ namespace nd4j { class ND4J_EXPORT LoopKind { public: - enum Kind {SMALLARR2DX, EWS1, EWSNONZERO, RANK1, RANK2, RANK3, RANK4, RANK5, X_EWSNONZERO, Y_EWSNONZERO, Z_EWSNONZERO, COMMON, BROADCAST_SCALAR_X, BROADCAST_SCALAR_Y}; + enum Kind { SMALLARR2DX, EWS1, EWSNONZERO, RANK1, RANK2, RANK3, RANK4, RANK5, X_EWSNONZERO, Y_EWSNONZERO, Z_EWSNONZERO, COMMON, BROADCAST_SCALAR_X, BROADCAST_SCALAR_Y, BROADCAST_3D, BROADCAST_4D, BROADCAST_5D }; static FORCEINLINE Kind deduceKindOfLoopXZ(const Nd4jLong* xShapeInfo, const Nd4jLong* zShapeInfo); static FORCEINLINE Kind deduceKindOfLoopXYZ(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo); @@ -96,6 +96,25 @@ LoopKind::Kind LoopKind::deduceKindOfLoopBroadcast(const Nd4jLong* xShapeInfo, c auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); + bool bNDLoopsRanks = (xRank == zRank && yRank <= xRank && yRank >= 2); + + int countUnityDimsInY = 0, countUnityDimsInX = 0; + for (int i = 0; i < xRank; i++) { + if (i < yRank) + countUnityDimsInY += (1 == shape::sizeAt(yShapeInfo, i)) ? 1 : 0; + countUnityDimsInX += (1 == shape::sizeAt(xShapeInfo, i)) ? 
1 : 0; + } + + bool bNotCommonVectorCase = (countUnityDimsInY != yRank - 1) && (countUnityDimsInX != xRank - 1); + + if (3 == xRank && bNDLoopsRanks && bNotCommonVectorCase) + return nd4j::LoopKind::BROADCAST_3D; + if (4 == xRank && bNDLoopsRanks && bNotCommonVectorCase) + return nd4j::LoopKind::BROADCAST_4D; + if (5 == xRank && bNDLoopsRanks && bNotCommonVectorCase) + return nd4j::LoopKind::BROADCAST_5D; + + if (xRank == yRank && xRank == zRank && xOrder == 'c' && yOrder == 'c' && zOrder == 'c' && xEws == 1 && yEws == 1 && zEws == 1 && xRank >= 2) { // we validate that shapes are equal till the last dim for (int e = 0; e < xRank - 1; e++) { diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index ebd61410b..39ea3edaa 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -180,6 +180,17 @@ namespace nd4j { return (numStrings + 1) * sizeof(Nd4jLong); } + /** + * This method selects strides based on the dimensions required for broadcasting + * @param const pointer to input (Y) shape info for stride selection + * @param rank of input (X) for broadcasting + * @param size of the dimensions array + * @param const pointer to dimensions for broadcasting + * @param pointer to output strides; must be pre-allocated and zero-initialized + * @return + */ + static void copyCertainStridesFromShapeInfo(const Nd4jLong* inShapeInfo, const int nRank, const int dimsSize, const int* dims, Nd4jLong* outStrides); + /* * check whether arr1/arr2 is sub-array of arr2/arr1, * this method do not evaluate what array is sub-array, it returns true if arr1 is sub-array of arr2 or arr2 is sub-array of arr1 diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index a2d3f97ef..10babeae1 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -1057,6 +1057,29 @@ std::vector ShapeUtils::tadAxesForSimpleBroadcast(const NDArray& max, const return numOfMinTads == 1 ? maxTadDims : std::vector(); } +void ShapeUtils::copyCertainStridesFromShapeInfo(const Nd4jLong* inShapeInfo, const int nRank, const int dimsSize, const int* dims, Nd4jLong* outStrides) { + + int yRank = shape::rank(inShapeInfo); + auto yOrigStride = shape::stride(inShapeInfo); + + if (yRank == nRank) { + for (int i = 0; i < yRank; ++i) { + // x[2,3,4] * y[2,1,4] = z[2,3,4] + outStrides[i] = (1 == shape::sizeAt(inShapeInfo, i)) ? 0 : yOrigStride[i]; + } + } + else { + + auto dimEx = nd4j::ShapeUtils::evalDimsToExclude(nRank, dimsSize, dims); + + for (int i = 0, it = 0; i < nRank; ++i) { + auto nCount = std::count(dimEx.cbegin(), dimEx.cend(), i); + outStrides[i] = (0 == nCount) ? yOrigStride[it++] : 0; + if (it == yRank) + break; + } + } +} //////////////////////////////////////////////////////////////////////////////// /* bool ShapeUtils::isSubArrayCase(const NDArray& arr1, const NDArray& arr2, std::vector& sameDims) { diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 691b95b83..62058bd20 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -25,6 +25,7 @@ #include #include #include +#include using namespace simdOps; @@ -144,7 +145,14 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = loopKind == nd4j::LoopKind::BROADCAST_SCALAR_X || loopKind == nd4j::LoopKind::BROADCAST_SCALAR_Y ?
loopKind : nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); + + const nd4j::LoopKind::Kind kindOfLoop = + (loopKind == nd4j::LoopKind::BROADCAST_SCALAR_X || + loopKind == nd4j::LoopKind::BROADCAST_SCALAR_Y || + loopKind == nd4j::LoopKind::BROADCAST_3D || + loopKind == nd4j::LoopKind::BROADCAST_4D || + loopKind == nd4j::LoopKind::BROADCAST_5D) + ? loopKind : nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { for (auto i = start; i < stop; i++) { @@ -194,6 +202,103 @@ namespace functions { oZ[f] = OpType::op(oX[f], oY); } } + else if (kindOfLoop == nd4j::LoopKind::BROADCAST_3D) { + + int xRank = shape::rank(xShapeInfo); + int yRank = shape::rank(yShapeInfo); + + auto xStrides = shape::stride(xShapeInfo); + auto zStrides = shape::stride(zShapeInfo); + + Nd4jLong yStrides[3] = { 0,0,0 }; + nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + + uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); + + for (uint32_t index0 = start; index0 < stop; index0++) { + + PRAGMA_OMP_SIMD + for (uint32_t index1 = 0; index1 < nSize1; index1++) { + for (uint32_t index2 = 0; index2 < nSize2; index2++) { + auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2); + auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2); + auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2); + *rZ = OpType::op(*rX, *rY); + } + } + + } + + } + else if (kindOfLoop == nd4j::LoopKind::BROADCAST_4D) { + + int xRank = shape::rank(xShapeInfo); + int yRank = shape::rank(yShapeInfo); + + auto xStrides = shape::stride(xShapeInfo); + auto zStrides = shape::stride(zShapeInfo); + + Nd4jLong yStrides[4] = { 0,0,0,0 }; + nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + + uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); + uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); + + for (uint32_t i = start; i < stop; i++) { + + uint32_t index0 = i / nSize1; + uint32_t index1 = i % nSize1; + + PRAGMA_OMP_SIMD + for (uint32_t index2 = 0; index2 < nSize2; index2++) { + for (uint32_t index3 = 0; index3 < nSize3; index3++) { + auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3); + auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3); + auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3); + *rZ = OpType::op(*rX, *rY); + } + } + } + + } + else if (kindOfLoop == nd4j::LoopKind::BROADCAST_5D) { + + int xRank = shape::rank(xShapeInfo); + int yRank = shape::rank(yShapeInfo); + + auto xStrides = shape::stride(xShapeInfo); + auto zStrides = shape::stride(zShapeInfo); + + Nd4jLong yStrides[5] = { 0,0,0,0,0 }; + nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + + uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); + uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); + uint32_t nSize4 = shape::sizeAt(zShapeInfo, 4); + + for (uint32_t i = start; i < stop; i++) { + + uint32_t index0 = i / nSize1; + uint32_t index1 = i % nSize1; + + PRAGMA_OMP_SIMD + for (uint32_t index2 = 0; index2 < nSize2; index2++) { + for 
(uint32_t index3 = 0; index3 < nSize3; index3++) { + for (uint32_t index4 = 0; index4 < nSize4; index4++) { + auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3 + xStrides[4] * index4); + auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3 + yStrides[4] * index4); + auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3 + zStrides[4] * index4); + + *rZ = OpType::op(*rX, *rY); + } + } + } + } + + } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 25e2d383d..3672a4c20 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -1306,8 +1306,6 @@ TEST_F(DeclarableOpsTests14, matmul_test29) { delete results; } - - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test30) { @@ -1328,8 +1326,6 @@ TEST_F(DeclarableOpsTests14, matmul_test30) { delete results; } - - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test31) { @@ -1350,8 +1346,6 @@ TEST_F(DeclarableOpsTests14, matmul_test31) { delete results; } - - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test32) { @@ -1369,8 +1363,7 @@ TEST_F(DeclarableOpsTests14, matmul_test32) { delete results; } - - +///////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test33) { auto x = NDArrayFactory::create('c', {4, 3}); auto y = NDArrayFactory::create('c', {4, 1}); @@ -1390,8 +1383,7 @@ TEST_F(DeclarableOpsTests14, matmul_test33) { delete result; } - - +////////////////////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test34) { auto a = NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); @@ -1408,7 +1400,7 @@ TEST_F(DeclarableOpsTests14, matmul_test34) { delete result; } - +///////////////////////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test35) { auto a = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); @@ -1425,7 +1417,7 @@ TEST_F(DeclarableOpsTests14, matmul_test35) { delete result; } - +//////////////////////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test36) { auto a = NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); @@ -1442,7 +1434,6 @@ TEST_F(DeclarableOpsTests14, matmul_test36) { delete result; } - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test37) { @@ -1463,6 +1454,206 @@ TEST_F(DeclarableOpsTests14, matmul_test37) { ASSERT_TRUE(cExp.isSameShape(c)); ASSERT_TRUE(cExp.equalsTo(c)); } 
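// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only, not part of the patch): the tests added
// below exercise the new LoopKind::BROADCAST_3D/4D/5D paths introduced in
// broadcasting.hpp above. Conceptually, those loops zero Y's stride on every
// axis along which Y is broadcast and then address X, Y and Z with the same
// index tuple via a plain stride dot-product. The standalone function below
// mirrors Test_broadcast_3D_1 (x[2,3,5] * y[2,5] over axes {0,2}) with
// hand-written strides; the name broadcastMul3D is hypothetical and does not
// exist in libnd4j.
#include <cstddef>

static void broadcastMul3D(const float* x, const float* y, float* z) {
    const std::size_t xStr[3] = {15, 5, 1};  // c-order strides of x[2,3,5]
    const std::size_t yStr[3] = { 5, 0, 1};  // y[2,5] viewed as [2,1,5]: broadcast axis 1 gets stride 0
    const std::size_t zStr[3] = {15, 5, 1};  // c-order strides of z[2,3,5]

    for (std::size_t i0 = 0; i0 < 2; ++i0)
        for (std::size_t i1 = 0; i1 < 3; ++i1)
            for (std::size_t i2 = 0; i2 < 5; ++i2)
                z[i0 * zStr[0] + i1 * zStr[1] + i2 * zStr[2]] =
                    x[i0 * xStr[0] + i1 * xStr[1] + i2 * xStr[2]] *
                    y[i0 * yStr[0] + i1 * yStr[1] + i2 * yStr[2]];
}
// ---------------------------------------------------------------------------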
+/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_3D_1) { + + // x[4, 12, 128] * y[4, 128] = z[4, 12, 128] + + auto x = NDArray('c', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 5 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 60.000000, 77.000000, 96.000000, 117.000000, 140.000000, 110.000000, 132.000000, 156.000000, 182.000000, 210.000000, 240.000000, 272.000000, 306.000000, 342.000000, 380.000000, 315.000000, 352.000000, 391.000000, 432.000000, 475.000000, 390.000000, 432.000000, 476.000000, 522.000000, 570.000000 }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Multiply, { 0,2 }, y, z); + //z.printBuffer(); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_3D_2) { + + auto x = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5 }, { 0.100000, 0.181818, 0.250000, 0.307692, 0.357143, 0.600000, 0.636364, 0.666667, 0.692308, 0.714286, 1.100000, 1.090909, 1.083333, 1.076923, 1.071429, 1.066667, 1.062500, 1.058824, 1.055556, 1.052632, 1.400000, 1.375000, 1.352941, 1.333333, 1.315789, 1.733333, 1.687500, 1.647059, 1.611111, 1.578947 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_4D_1) { + + auto x = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 5, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 90.000000, 112.000000, 136.000000, 162.000000, 190.000000, 220.000000, 252.000000, 286.000000, 322.000000, 360.000000, 400.000000, 442.000000, 486.000000, 532.000000, 580.000000, 210.000000, 242.000000, 276.000000, 312.000000, 350.000000, 390.000000, 432.000000, 476.000000, 522.000000, 570.000000, 620.000000, 672.000000, 726.000000, 782.000000, 840.000000, 900.000000, 962.000000, 1026.000000, 1092.000000, 1160.000000, 410.000000, 462.000000, 516.000000, 572.000000, 630.000000, 690.000000, 752.000000, 816.000000, 882.000000, 950.000000, 1020.000000, 1092.000000, 1166.000000, 1242.000000, 1320.000000, 1400.000000, 1482.000000, 1566.000000, 1652.000000, 1740.000000, 1830.000000, 1922.000000, 2016.000000, 2112.000000, 2210.000000, 2310.000000, 2412.000000, 2516.000000, 2622.000000, 2730.000000, 2840.000000, 2952.000000, 3066.000000, 3182.000000, 3300.000000, 3420.000000, 3542.000000, 3666.000000, 3792.000000, 3920.000000, 2430.000000, 2542.000000, 2656.000000, 2772.000000, 2890.000000, 3010.000000, 3132.000000, 3256.000000, 3382.000000, 3510.000000, 3640.000000, 3772.000000, 3906.000000, 4042.000000, 4180.000000, 4320.000000, 4462.000000, 4606.000000, 4752.000000, 
4900.000000, 3030.000000, 3162.000000, 3296.000000, 3432.000000, 3570.000000, 3710.000000, 3852.000000, 3996.000000, 4142.000000, 4290.000000, 4440.000000, 4592.000000, 4746.000000, 4902.000000, 5060.000000, 5220.000000, 5382.000000, 5546.000000, 5712.000000, 5880.000000 }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Multiply, { 0,2,3 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_4D_2) { + + auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000,0.181818,0.250000,0.307692,0.357143,0.400000,0.437500,0.470588,0.500000,0.526316,0.550000,0.571429, 0.590909,0.608696,0.625000,0.640000, 0.653846,0.666667,0.678571,0.689655, 2.100000,2.000000,1.916667, 1.846154, 1.785714, 1.733333,1.687500, 1.647059,1.611111, 1.578947,1.550000, 1.523810,1.500000, 1.478261,1.458333, 1.440000,1.423077, 1.407407,1.392857, 1.379310,4.100000, 3.818182,3.583333, 3.384615, 3.214286, 3.066667,2.937500, 2.823529,2.722222, 2.631579,2.550000, 2.476191,2.409091, 2.347826,2.291667, 2.240000,2.192308, 2.148148,2.107143, 2.068965,2.033333, 2.000000,1.968750, 1.939394,1.911765, 1.885714,1.861111, 1.837838,1.815789, 1.794872,1.775000, 1.756098,1.738095, 1.720930,1.704545, 1.688889,1.673913, 1.659575,1.645833,1.632653,2.700000,2.645161,2.593750,2.545455,2.500000,2.457143,2.416667,2.378378,2.342105,2.307692,2.275000,2.243902,2.214286,2.186047,2.159091,2.133333,2.108696,2.085106,2.062500,2.040816,3.366667,3.290323,3.218750,3.151515,3.088235,3.028571,2.972222,2.918919,2.868421,2.820513,2.775000,2.731707,2.690476,2.651163,2.613636,2.577778,2.543478,2.510638,2.479167,2.448980 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2,3 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_4D_3) { + + auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.454545, 0.545455, 0.636364, 0.727273, 0.750000, 0.833333, 0.916667, 1.000000, 1.000000, 1.076923, 1.153846, 1.230769, 1.214286, 1.285714, 1.357143, 1.428571, 2.100000, 2.200000, 2.300000, 2.400000, 2.272727, 2.363636, 2.454545, 2.545455, 2.416667, 2.500000, 2.583333, 2.666667, 2.538461, 2.615385, 2.692308, 2.769231, 2.642857, 2.714286, 2.785714, 2.857143, 4.100000, 4.200000, 4.300000, 4.400000, 4.090909, 4.181818, 4.272727, 4.363636, 4.083333, 4.166667, 4.250000, 4.333333, 4.076923, 4.153846, 4.230769, 4.307693, 4.071429, 4.142857, 4.214286, 4.285714, 4.066667, 4.133333, 4.200000, 4.266667, 4.062500, 4.125000, 4.187500, 4.250000, 4.058824, 4.117647, 4.176471, 4.235294, 4.055555, 4.111111, 4.166667, 4.222222, 4.052631, 4.105263, 4.157895, 4.210526, 5.400000, 5.466667, 5.533333, 5.600000, 5.312500, 5.375000, 5.437500, 5.500000, 5.235294, 5.294117, 5.352941, 5.411765, 5.166667, 
5.222222, 5.277778, 5.333333, 5.105263, 5.157895, 5.210526, 5.263158, 6.733333, 6.800000, 6.866667, 6.933333, 6.562500, 6.625000, 6.687500, 6.750000, 6.411765, 6.470588, 6.529412, 6.588235, 6.277778, 6.333333, 6.388889, 6.444445, 6.157895, 6.210526, 6.263158, 6.315790 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_4D_4) { + + // x[4, 12, 128, 128] * y[4, 1, 128, 1] = z[4, 12, 128, 128] + + auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 1, 5, 1 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.454545, 0.545455, 0.636364, 0.727273, 0.750000, 0.833333, 0.916667, 1.000000, 1.000000, 1.076923, 1.153846, 1.230769, 1.214286, 1.285714, 1.357143, 1.428571, 2.100000, 2.200000, 2.300000, 2.400000, 2.272727, 2.363636, 2.454545, 2.545455, 2.416667, 2.500000, 2.583333, 2.666667, 2.538461, 2.615385, 2.692308, 2.769231, 2.642857, 2.714286, 2.785714, 2.857143, 4.100000, 4.200000, 4.300000, 4.400000, 4.090909, 4.181818, 4.272727, 4.363636, 4.083333, 4.166667, 4.250000, 4.333333, 4.076923, 4.153846, 4.230769, 4.307693, 4.071429, 4.142857, 4.214286, 4.285714, 4.066667, 4.133333, 4.200000, 4.266667, 4.062500, 4.125000, 4.187500, 4.250000, 4.058824, 4.117647, 4.176471, 4.235294, 4.055555, 4.111111, 4.166667, 4.222222, 4.052631, 4.105263, 4.157895, 4.210526, 5.400000, 5.466667, 5.533333, 5.600000, 5.312500, 5.375000, 5.437500, 5.500000, 5.235294, 5.294117, 5.352941, 5.411765, 5.166667, 5.222222, 5.277778, 5.333333, 5.105263, 5.157895, 5.210526, 5.263158, 6.733333, 6.800000, 6.866667, 6.933333, 6.562500, 6.625000, 6.687500, 6.750000, 6.411765, 6.470588, 6.529412, 6.588235, 6.277778, 6.333333, 6.388889, 6.444445, 6.157895, 6.210526, 6.263158, 6.315790 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Divide(), y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_5D_1) { + // x[4, 12, 128, 128, 128] * y[4, 1, 128, 128, 128] = z[4, 12, 128, 128, 128] + auto x = NDArray('c', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 1, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4, 3 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 90.000000, 112.000000, 136.000000, 162.000000, 190.000000, 220.000000, 252.000000, 286.000000, 322.000000, 360.000000, 400.000000, 442.000000, 486.000000, 532.000000, 580.000000, 630.000000, 682.000000, 736.000000, 792.000000, 850.000000, 910.000000, 972.000000, 1036.000000, 1102.000000, 1170.000000, 1240.000000, 1312.000000, 1386.000000, 1462.000000, 1540.000000, 1620.000000, 1702.000000, 1786.000000, 1872.000000, 1960.000000, 2050.000000, 2142.000000, 2236.000000, 2332.000000, 2430.000000, 2530.000000, 2632.000000, 2736.000000, 2842.000000, 
2950.000000, 3060.000000, 3172.000000, 3286.000000, 3402.000000, 3520.000000, 3640.000000, 3762.000000, 3886.000000, 4012.000000, 4140.000000, 610.000000, 682.000000, 756.000000, 832.000000, 910.000000, 990.000000, 1072.000000, 1156.000000, 1242.000000, 1330.000000, 1420.000000, 1512.000000, 1606.000000, 1702.000000, 1800.000000, 1900.000000, 2002.000000, 2106.000000, 2212.000000, 2320.000000, 2430.000000, 2542.000000, 2656.000000, 2772.000000, 2890.000000, 3010.000000, 3132.000000, 3256.000000, 3382.000000, 3510.000000, 3640.000000, 3772.000000, 3906.000000, 4042.000000, 4180.000000, 4320.000000, 4462.000000, 4606.000000, 4752.000000, 4900.000000, 5050.000000, 5202.000000, 5356.000000, 5512.000000, 5670.000000, 5830.000000, 5992.000000, 6156.000000, 6322.000000, 6490.000000, 6660.000000, 6832.000000, 7006.000000, 7182.000000, 7360.000000, 7540.000000, 7722.000000, 7906.000000, 8092.000000, 8280.000000, 1210.000000, 1342.000000, 1476.000000, 1612.000000, 1750.000000, 1890.000000, 2032.000000, 2176.000000, 2322.000000, 2470.000000, 2620.000000, 2772.000000, 2926.000000, 3082.000000, 3240.000000, 3400.000000, 3562.000000, 3726.000000, 3892.000000, 4060.000000, 4230.000000, 4402.000000, 4576.000000, 4752.000000, 4930.000000, 5110.000000, 5292.000000, 5476.000000, 5662.000000, 5850.000000, 6040.000000, 6232.000000, 6426.000000, 6622.000000, 6820.000000, 7020.000000, 7222.000000, 7426.000000, 7632.000000, 7840.000000, 8050.000000, 8262.000000, 8476.000000, 8692.000000, 8910.000000, 9130.000000, 9352.000000, 9576.000000, 9802.000000, 10030.000000, 10260.000000, 10492.000000, 10726.000000, 10962.000000, 11200.000000, 11440.000000, 11682.000000, 11926.000000, 12172.000000, 12420.000000, 12670.000000, 12922.000000, 13176.000000, 13432.000000, 13690.000000, 13950.000000, 14212.000000, 14476.000000, 14742.000000, 15010.000000, 15280.000000, 15552.000000, 15826.000000, 16102.000000, 16380.000000, 16660.000000, 16942.000000, 17226.000000, 17512.000000, 17800.000000, 18090.000000, 18382.000000, 18676.000000, 18972.000000, 19270.000000, 19570.000000, 19872.000000, 20176.000000, 20482.000000, 20790.000000, 21100.000000, 21412.000000, 21726.000000, 22042.000000, 22360.000000, 22680.000000, 23002.000000, 23326.000000, 23652.000000, 23980.000000, 24310.000000, 24642.000000, 24976.000000, 25312.000000, 25650.000000, 25990.000000, 26332.000000, 26676.000000, 27022.000000, 27370.000000, 27720.000000, 28072.000000, 28426.000000, 28782.000000, 29140.000000, 29500.000000, 29862.000000, 30226.000000, 30592.000000, 30960.000000, 16870.000000, 17182.000000, 17496.000000, 17812.000000, 18130.000000, 18450.000000, 18772.000000, 19096.000000, 19422.000000, 19750.000000, 20080.000000, 20412.000000, 20746.000000, 21082.000000, 21420.000000, 21760.000000, 22102.000000, 22446.000000, 22792.000000, 23140.000000, 23490.000000, 23842.000000, 24196.000000, 24552.000000, 24910.000000, 25270.000000, 25632.000000, 25996.000000, 26362.000000, 26730.000000, 27100.000000, 27472.000000, 27846.000000, 28222.000000, 28600.000000, 28980.000000, 29362.000000, 29746.000000, 30132.000000, 30520.000000, 30910.000000, 31302.000000, 31696.000000, 32092.000000, 32490.000000, 32890.000000, 33292.000000, 33696.000000, 34102.000000, 34510.000000, 34920.000000, 35332.000000, 35746.000000, 36162.000000, 36580.000000, 37000.000000, 37422.000000, 37846.000000, 38272.000000, 38700.000000, 21070.000000, 21442.000000, 21816.000000, 22192.000000, 22570.000000, 22950.000000, 23332.000000, 23716.000000, 24102.000000, 24490.000000, 24880.000000, 
25272.000000, 25666.000000, 26062.000000, 26460.000000, 26860.000000, 27262.000000, 27666.000000, 28072.000000, 28480.000000, 28890.000000, 29302.000000, 29716.000000, 30132.000000, 30550.000000, 30970.000000, 31392.000000, 31816.000000, 32242.000000, 32670.000000, 33100.000000, 33532.000000, 33966.000000, 34402.000000, 34840.000000, 35280.000000, 35722.000000, 36166.000000, 36612.000000, 37060.000000, 37510.000000, 37962.000000, 38416.000000, 38872.000000, 39330.000000, 39790.000000, 40252.000000, 40716.000000, 41182.000000, 41650.000000, 42120.000000, 42592.000000, 43066.000000, 43542.000000, 44020.000000, 44500.000000, 44982.000000, 45466.000000, 45952.000000, 46440.000000 }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), y, z); + // z.printBuffer(); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_5D_2) { + + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.181818, 0.250000, 0.307692, 0.357143, 0.400000, 0.437500, 0.470588, 0.500000, 0.526316, 0.550000, 0.571429, 0.590909, 0.608696, 0.625000, 0.640000, 0.653846, 0.666667, 0.678571, 0.689655, 0.700000, 0.709677, 0.718750, 0.727273, 0.735294, 0.742857, 0.750000, 0.756757, 0.763158, 0.769231, 0.775000, 0.780488, 0.785714, 0.790698, 0.795455, 0.800000, 0.804348, 0.808511, 0.812500, 0.816327, 0.820000, 0.823529, 0.826923, 0.830189, 0.833333, 0.836364, 0.839286, 0.842105, 0.844828, 0.847458, 0.850000, 0.852459, 0.854839, 0.857143, 0.859375, 0.861538, 0.863636, 0.865672, 0.867647, 0.869565, 6.100000, 5.636364, 5.250000, 4.923077, 4.642857, 4.400000, 4.187500, 4.000000, 3.833333, 3.684211, 3.550000, 3.428571, 3.318182, 3.217391, 3.125000, 3.040000, 2.961539, 2.888889, 2.821429, 2.758621, 2.700000, 2.645161, 2.593750, 2.545455, 2.500000, 2.457143, 2.416667, 2.378378, 2.342105, 2.307692, 2.275000, 2.243902, 2.214286, 2.186047, 2.159091, 2.133333, 2.108696, 2.085106, 2.062500, 2.040816, 2.020000, 2.000000, 1.980769, 1.962264, 1.944444, 1.927273, 1.910714, 1.894737, 1.879310, 1.864407, 1.850000, 1.836066, 1.822581, 1.809524, 1.796875, 1.784615, 1.772727, 1.761194, 1.750000, 1.739130, 12.100000, 11.090909, 10.250000, 9.538462, 8.928572, 8.400000, 7.937500, 7.529412, 7.166667, 6.842105, 6.550000, 6.285714, 6.045455, 5.826087, 5.625000, 5.440000, 5.269231, 5.111111, 4.964286, 4.827586, 4.700000, 4.580645, 4.468750, 4.363636, 4.264706, 4.171429, 4.083333, 4.000000, 3.921053, 3.846154, 3.775000, 3.707317, 3.642857, 3.581395, 3.522727, 3.466667, 3.413043, 3.361702, 3.312500, 3.265306, 3.220000, 3.176471, 3.134615, 3.094340, 3.055556, 3.018182, 2.982143, 2.947368, 2.913793, 2.881356, 2.850000, 2.819672, 2.790323, 2.761905, 2.734375, 2.707692, 2.681818, 2.656716, 2.632353, 2.608696, 2.585714, 2.563380, 2.541667, 2.520548, 2.500000, 2.480000, 2.460526, 2.441558, 2.423077, 2.405063, 2.387500, 2.370370, 2.353658, 2.337349, 2.321429, 2.305882, 2.290698, 2.275862, 2.261364, 2.247191, 2.233333, 2.219780, 2.206522, 2.193548, 2.180851, 2.168421, 2.156250, 2.144330, 2.132653, 2.121212, 2.110000, 2.099010, 2.088235, 2.077670, 2.067308, 2.057143, 2.047170, 2.037383, 2.027778, 2.018349, 2.009091, 2.000000, 1.991071, 1.982301, 1.973684, 1.965217, 
1.956897, 1.948718, 1.940678, 1.932773, 1.925000, 1.917355, 1.909836, 1.902439, 1.895161, 1.888000, 1.880952, 1.874016, 1.867188, 1.860465, 3.442857, 3.408451, 3.375000, 3.342466, 3.310811, 3.280000, 3.250000, 3.220779, 3.192308, 3.164557, 3.137500, 3.111111, 3.085366, 3.060241, 3.035714, 3.011765, 2.988372, 2.965517, 2.943182, 2.921348, 2.900000, 2.879121, 2.858696, 2.838710, 2.819149, 2.800000, 2.781250, 2.762887, 2.744898, 2.727273, 2.710000, 2.693069, 2.676471, 2.660194, 2.644231, 2.628572, 2.613208, 2.598131, 2.583333, 2.568807, 2.554545, 2.540540, 2.526786, 2.513274, 2.500000, 2.486957, 2.474138, 2.461539, 2.449152, 2.436975, 2.425000, 2.413223, 2.401639, 2.390244, 2.379032, 2.368000, 2.357143, 2.346457, 2.335938, 2.325581, 4.300000, 4.253521, 4.208333, 4.164383, 4.121622, 4.080000, 4.039474, 4.000000, 3.961539, 3.924051, 3.887500, 3.851852, 3.817073, 3.783133, 3.750000, 3.717647, 3.686047, 3.655172, 3.625000, 3.595506, 3.566667, 3.538461, 3.510870, 3.483871, 3.457447, 3.431579, 3.406250, 3.381443, 3.357143, 3.333333, 3.310000, 3.287129, 3.264706, 3.242718, 3.221154, 3.200000, 3.179245, 3.158879, 3.138889, 3.119266, 3.100000, 3.081081, 3.062500, 3.044248, 3.026316, 3.008696, 2.991379, 2.974359, 2.957627, 2.941176, 2.925000, 2.909091, 2.893443, 2.878049, 2.862903, 2.848000, 2.833333, 2.818898, 2.804688, 2.790698 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2,3,4 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_5D_3) { + + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.500000, 0.600000, 0.700000, 0.800000, 0.900000, 1.000000, 1.100000, 1.200000, 1.181818, 1.272727, 1.363636, 1.454545, 1.545455, 1.636364, 1.727273, 1.818182, 1.909091, 2.000000, 2.090909, 2.181818, 2.083333, 2.166667, 2.250000, 2.333333, 2.416667, 2.500000, 2.583333, 2.666667, 2.750000, 2.833333, 2.916667, 3.000000, 2.846154, 2.923077, 3.000000, 3.076923, 3.153846, 3.230769, 3.307692, 3.384615, 3.461539, 3.538461, 3.615385, 3.692308, 3.500000, 3.571429, 3.642857, 3.714286, 3.785714, 3.857143, 3.928571, 4.000000, 4.071429, 4.142857, 4.214286, 4.285714, 6.100000, 6.200000, 6.300000, 6.400000, 6.500000, 6.600000, 6.700000, 6.800000, 6.900000, 7.000000, 7.100000, 7.200000, 6.636364, 6.727273, 6.818182, 6.909091, 7.000000, 7.090909, 7.181818, 7.272727, 7.363636, 7.454545, 7.545455, 7.636364, 7.083333, 7.166667, 7.250000, 7.333333, 7.416667, 7.500000, 7.583333, 7.666667, 7.750000, 7.833333, 7.916667, 8.000000, 7.461538, 7.538462, 7.615385, 7.692307, 7.769231, 7.846154, 7.923077, 8.000000, 8.076923, 8.153846, 8.230769, 8.307693, 7.785714, 7.857143, 7.928571, 8.000000, 8.071428, 8.142858, 8.214286, 8.285714, 8.357142, 8.428572, 8.500000, 8.571428, 12.100000, 12.200000, 12.300000, 12.400000, 12.500000, 12.600000, 12.700000, 12.800000, 12.900000, 13.000000, 13.100000, 13.200000, 12.090909, 12.181818, 12.272727, 12.363636, 12.454545, 12.545455, 12.636364, 12.727273, 12.818182, 12.909091, 13.000000, 13.090909, 12.083333, 12.166667, 12.250000, 12.333333, 12.416667, 12.500000, 12.583333, 
12.666667, 12.750000, 12.833333, 12.916667, 13.000000, 12.076923, 12.153846, 12.230769, 12.307693, 12.384615, 12.461538, 12.538462, 12.615385, 12.692307, 12.769231, 12.846154, 12.923077, 12.071428, 12.142858, 12.214286, 12.285714, 12.357142, 12.428572, 12.500000, 12.571428, 12.642858, 12.714286, 12.785714, 12.857142, 12.066667, 12.133333, 12.200000, 12.266666, 12.333333, 12.400000, 12.466666, 12.533334, 12.600000, 12.666667, 12.733334, 12.800000, 12.062500, 12.125000, 12.187500, 12.250000, 12.312500, 12.375000, 12.437500, 12.500000, 12.562500, 12.625000, 12.687500, 12.750000, 12.058824, 12.117647, 12.176471, 12.235294, 12.294118, 12.352942, 12.411765, 12.470589, 12.529411, 12.588235, 12.647058, 12.705882, 12.055555, 12.111111, 12.166667, 12.222222, 12.277778, 12.333333, 12.388889, 12.444445, 12.500000, 12.555555, 12.611111, 12.666667, 12.052631, 12.105263, 12.157895, 12.210526, 12.263158, 12.315789, 12.368421, 12.421053, 12.473684, 12.526316, 12.578947, 12.631579, 16.066668, 16.133333, 16.200001, 16.266666, 16.333334, 16.400000, 16.466667, 16.533333, 16.600000, 16.666666, 16.733334, 16.799999, 15.812500, 15.875000, 15.937500, 16.000000, 16.062500, 16.125000, 16.187500, 16.250000, 16.312500, 16.375000, 16.437500, 16.500000, 15.588235, 15.647058, 15.705882, 15.764706, 15.823529, 15.882353, 15.941176, 16.000000, 16.058823, 16.117647, 16.176470, 16.235294, 15.388889, 15.444445, 15.500000, 15.555555, 15.611111, 15.666667, 15.722222, 15.777778, 15.833333, 15.888889, 15.944445, 16.000000, 15.210526, 15.263158, 15.315789, 15.368421, 15.421053, 15.473684, 15.526316, 15.578947, 15.631579, 15.684211, 15.736842, 15.789474, 20.066668, 20.133333, 20.200001, 20.266666, 20.333334, 20.400000, 20.466667, 20.533333, 20.600000, 20.666666, 20.733334, 20.799999, 19.562500, 19.625000, 19.687500, 19.750000, 19.812500, 19.875000, 19.937500, 20.000000, 20.062500, 20.125000, 20.187500, 20.250000, 19.117647, 19.176470, 19.235294, 19.294117, 19.352942, 19.411764, 19.470589, 19.529411, 19.588236, 19.647058, 19.705883, 19.764706, 18.722221, 18.777779, 18.833334, 18.888889, 18.944445, 19.000000, 19.055555, 19.111111, 19.166666, 19.222221, 19.277779, 19.333334, 18.368422, 18.421053, 18.473684, 18.526316, 18.578947, 18.631578, 18.684210, 18.736841, 18.789474, 18.842106, 18.894737, 18.947369 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_5D_4) { + + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 1, 5, 1, 1 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.500000, 0.600000, 0.700000, 0.800000, 0.900000, 1.000000, 1.100000, 1.200000, 1.181818, 1.272727, 1.363636, 1.454545, 1.545455, 1.636364, 1.727273, 1.818182, 1.909091, 2.000000, 2.090909, 2.181818, 2.083333, 2.166667, 2.250000, 2.333333, 2.416667, 2.500000, 2.583333, 2.666667, 2.750000, 2.833333, 2.916667, 3.000000, 2.846154, 2.923077, 3.000000, 3.076923, 3.153846, 3.230769, 3.307692, 3.384615, 3.461539, 3.538461, 3.615385, 3.692308, 3.500000, 3.571429, 3.642857, 3.714286, 3.785714, 3.857143, 3.928571, 4.000000, 4.071429, 4.142857, 
4.214286, 4.285714, 6.100000, 6.200000, 6.300000, 6.400000, 6.500000, 6.600000, 6.700000, 6.800000, 6.900000, 7.000000, 7.100000, 7.200000, 6.636364, 6.727273, 6.818182, 6.909091, 7.000000, 7.090909, 7.181818, 7.272727, 7.363636, 7.454545, 7.545455, 7.636364, 7.083333, 7.166667, 7.250000, 7.333333, 7.416667, 7.500000, 7.583333, 7.666667, 7.750000, 7.833333, 7.916667, 8.000000, 7.461538, 7.538462, 7.615385, 7.692307, 7.769231, 7.846154, 7.923077, 8.000000, 8.076923, 8.153846, 8.230769, 8.307693, 7.785714, 7.857143, 7.928571, 8.000000, 8.071428, 8.142858, 8.214286, 8.285714, 8.357142, 8.428572, 8.500000, 8.571428, 12.100000, 12.200000, 12.300000, 12.400000, 12.500000, 12.600000, 12.700000, 12.800000, 12.900000, 13.000000, 13.100000, 13.200000, 12.090909, 12.181818, 12.272727, 12.363636, 12.454545, 12.545455, 12.636364, 12.727273, 12.818182, 12.909091, 13.000000, 13.090909, 12.083333, 12.166667, 12.250000, 12.333333, 12.416667, 12.500000, 12.583333, 12.666667, 12.750000, 12.833333, 12.916667, 13.000000, 12.076923, 12.153846, 12.230769, 12.307693, 12.384615, 12.461538, 12.538462, 12.615385, 12.692307, 12.769231, 12.846154, 12.923077, 12.071428, 12.142858, 12.214286, 12.285714, 12.357142, 12.428572, 12.500000, 12.571428, 12.642858, 12.714286, 12.785714, 12.857142, 12.066667, 12.133333, 12.200000, 12.266666, 12.333333, 12.400000, 12.466666, 12.533334, 12.600000, 12.666667, 12.733334, 12.800000, 12.062500, 12.125000, 12.187500, 12.250000, 12.312500, 12.375000, 12.437500, 12.500000, 12.562500, 12.625000, 12.687500, 12.750000, 12.058824, 12.117647, 12.176471, 12.235294, 12.294118, 12.352942, 12.411765, 12.470589, 12.529411, 12.588235, 12.647058, 12.705882, 12.055555, 12.111111, 12.166667, 12.222222, 12.277778, 12.333333, 12.388889, 12.444445, 12.500000, 12.555555, 12.611111, 12.666667, 12.052631, 12.105263, 12.157895, 12.210526, 12.263158, 12.315789, 12.368421, 12.421053, 12.473684, 12.526316, 12.578947, 12.631579, 16.066668, 16.133333, 16.200001, 16.266666, 16.333334, 16.400000, 16.466667, 16.533333, 16.600000, 16.666666, 16.733334, 16.799999, 15.812500, 15.875000, 15.937500, 16.000000, 16.062500, 16.125000, 16.187500, 16.250000, 16.312500, 16.375000, 16.437500, 16.500000, 15.588235, 15.647058, 15.705882, 15.764706, 15.823529, 15.882353, 15.941176, 16.000000, 16.058823, 16.117647, 16.176470, 16.235294, 15.388889, 15.444445, 15.500000, 15.555555, 15.611111, 15.666667, 15.722222, 15.777778, 15.833333, 15.888889, 15.944445, 16.000000, 15.210526, 15.263158, 15.315789, 15.368421, 15.421053, 15.473684, 15.526316, 15.578947, 15.631579, 15.684211, 15.736842, 15.789474, 20.066668, 20.133333, 20.200001, 20.266666, 20.333334, 20.400000, 20.466667, 20.533333, 20.600000, 20.666666, 20.733334, 20.799999, 19.562500, 19.625000, 19.687500, 19.750000, 19.812500, 19.875000, 19.937500, 20.000000, 20.062500, 20.125000, 20.187500, 20.250000, 19.117647, 19.176470, 19.235294, 19.294117, 19.352942, 19.411764, 19.470589, 19.529411, 19.588236, 19.647058, 19.705883, 19.764706, 18.722221, 18.777779, 18.833334, 18.888889, 18.944445, 19.000000, 19.055555, 19.111111, 19.166666, 19.222221, 19.277779, 19.333334, 18.368422, 18.421053, 18.473684, 18.526316, 18.578947, 18.631578, 18.684210, 18.736841, 18.789474, 18.842106, 18.894737, 18.947369 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Divide(), y, z); + + ASSERT_EQ(e, z); +} // @Test // public void 
testMmulRank4_simple(){ @@ -1489,3 +1680,5 @@ TEST_F(DeclarableOpsTests14, matmul_test37) { // INDArray exp = Nd4j.valueArrayOf(shape, 64.0, DataType.FLOAT); //Each entry in output is sum of 64 (1.0 x 1.0) multiplications // assertEquals(exp, out); // } + +
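// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only, not part of either patch): the stride
// selection behind ShapeUtils::copyCertainStridesFromShapeInfo, restated
// without libnd4j types for the unequal-rank case. Here `dims` lists the
// x-axes that y participates in (assumed ascending, as passed to
// applyBroadcast in the tests above); every other x-axis gets stride 0, so
// the BROADCAST_3D/4D/5D loops can index y with the same coordinate tuple as
// x and z. The name expandYStrides is hypothetical and used only for
// illustration.
#include <cstddef>
#include <vector>

static std::vector<long long> expandYStrides(const std::vector<long long>& yStrides, // strides of y, one per entry of dims
                                             int xRank,                              // rank of x (and z)
                                             const std::vector<int>& dims) {         // x-axes covered by y, ascending
    std::vector<long long> out(static_cast<std::size_t>(xRank), 0); // broadcast axes keep stride 0
    for (std::size_t k = 0; k < dims.size(); ++k)
        out[static_cast<std::size_t>(dims[k])] = yStrides[k];       // axes y spans keep y's own stride
    return out;
}

// Example matching Test_broadcast_3D_1: y[2,5] has c-order strides {5, 1};
// applied to x[2,3,5] along axes {0, 2} this yields {5, 0, 1}.
// ---------------------------------------------------------------------------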