From 44a8d19ac6982585c3626192b86180667344a05a Mon Sep 17 00:00:00 2001
From: raver119
Date: Tue, 1 Oct 2019 09:10:19 +0300
Subject: [PATCH] [WIP] Broadcast changes (#8257)

* - provide correct call NDArray::applyBroadcast inside of NDArray::applyTrueBroadcast
  Signed-off-by: Yurii
* - provide new trueBroadcast helper
  Signed-off-by: Yurii
* example for yurii
  Signed-off-by: raver119
* - provide new trueBroadcast helper for cpu
  Signed-off-by: Yurii
* - start working on new trueBroadcast helper for cuda
  Signed-off-by: Yurii
* - further work on trueBroadcast for cuda
  Signed-off-by: Yurii
* - fix bugs in cuda helper trueBroadcast
  Signed-off-by: Yurii
---
 libnd4j/blas/NDArray.hpp                        | 342 ++++++------------
 libnd4j/blas/cuda/NDArray.cu                    |   1 -
 libnd4j/include/helpers/ShapeUtils.h            |  15 +-
 libnd4j/include/helpers/TrueBroadcastHelper.h   |  84 +++++
 .../helpers/cpu/TrueBroadcastHelper.cpp         | 218 +++++++++++
 .../helpers/cuda/TrueBroadcastHelper.cu         | 309 ++++++++++++++++
 libnd4j/include/helpers/impl/ShapeUtils.cpp     |  85 ++++-
 7 files changed, 806 insertions(+), 248 deletions(-)
 create mode 100644 libnd4j/include/helpers/TrueBroadcastHelper.h
 create mode 100644 libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp
 create mode 100644 libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu

diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp
index 0f0621a80..2cda1eb6d 100644
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <helpers/TrueBroadcastHelper.h>

 namespace nd4j {

@@ -2519,8 +2520,6 @@ void NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray* othe
     if (isEmpty() || other->isEmpty())
         return;

-    NDArray::prepareSpecialUse({target}, {this, other});
-
     if (isScalar()) {
         target->assign(this);
         target->applyPairwiseTransform(op.p, *other, extraArgs);
@@ -2531,57 +2530,24 @@ void NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray* othe
         return;
     }

-    const NDArray* min(other);
-    const NDArray* max(this);
-
-    if(this->rankOf() < other->rankOf()) {
-        max = other;
-        min = this;
-    }

     if(checkTargetShape) {
         Nd4jLong* newShapeInfo = nullptr;
-        if(!ShapeUtils::evalBroadcastShapeInfo(*max, *min, false, newShapeInfo, getContext()->getWorkspace()))        // the rank of target array must be equal to max->rankOf()
+        if(!ShapeUtils::evalBroadcastShapeInfo(*this, *other, true, newShapeInfo, getContext()->getWorkspace()))      // the rank of target array must be equal to max->rankOf()
             throw std::runtime_error("NDArray::applyTrueBroadcast method: the shapes of this and other arrays are not suitable for broadcast operation !");
         if(!shape::equalsTypesAndShapesSoft(target->getShapeInfo(), newShapeInfo))
             throw std::runtime_error("NDArray::applyTrueBroadcast method: the shape or type of target array is wrong !");
     }

-    NDArray* pTarget = (max->dataType() == target->dataType()) ? target : new NDArray(target->ordering(), target->getShapeAsVector(), max->dataType(), target->getContext());
-
-    // check whether max array has to be tiled
-    if(!max->isSameShape(target)) {
-        // evaluate repeating dimensions for tile operation
-        std::vector<Nd4jLong> repeatMax(max->rankOf());
-        for(int i = 1; i <= max->rankOf(); ++i)
-            repeatMax[i - 1] = (target->_shapeInfo[i] / max->_shapeInfo[i]);
-        max->tile(repeatMax, *pTarget);
-    }
-    else
-        pTarget->assign(max);
-
-    // check whether min array has to be tiled
-    std::vector<Nd4jLong> repeatMin(min->rankOf());
-    int product = 1;
-    for(int i = min->rankOf(); i >= 1; --i) {
-        repeatMin[i-1] = (target->_shapeInfo[target->rankOf() - min->rankOf() + i] / min->_shapeInfo[i]);
-        product *= repeatMin[i-1];
+    if(target->isSameShape(this) || target->isSameShape(other)) {
+        const_cast<NDArray*>(this)->applyBroadcast(op.b, ShapeUtils::getDimsWithSameShape(*this, *other), other, target, extraArgs);
+        return;
     }

-    auto pMin = const_cast<NDArray*>(min);
-    if(product != 1 )
-        pMin = new NDArray(min->tile(repeatMin));
-
-    std::vector<int> sameDims = ShapeUtils::getDimsWithSameShape(*target, *pMin);
-
-    if(max == this)
-        pTarget->applyBroadcast(op.b, sameDims, pMin, target, extraArgs);
-    else
-        pMin->applyBroadcast(op.b, sameDims, pTarget, target, extraArgs);
-
-    if(pMin != min)
-        delete pMin;
-    if(pTarget != target)
-        delete pTarget;
+    #ifdef __ND4J_EXPERIMENTAL__
+        BUILD_PAIRWISE_SELECTOR(dataType(), other->dataType(), target->dataType(), helpers::TrueBroadcastHelper, ::exec(op.b, *this, *other, *target), LIBND4J_TYPES, LIBND4J_TYPES);
+    #else
+        BUILD_SINGLE_SELECTOR_THRICE(dataType(), helpers::TrueBroadcastHelper, ::exec(op.b, *this, *other, *target), LIBND4J_TYPES);
+    #endif
 }

 //////////////////////////////////////////////////////////////////////////
@@ -2594,7 +2560,44 @@ void NDArray::applyTrueBroadcast(nd4j::BroadcastBoolOpsTuple op, const NDArray*
     if (isEmpty() || other->isEmpty())
         return;

-    NDArray::prepareSpecialUse({target}, {this, other});
+    if (isScalar()) {
+        NDArray temp(target->_shapeInfo, dataType(), false, getContext());
+        temp.assign(this);
+        temp.applyPairwiseTransform(op.p, other, target, extraArgs);
+        return;
+    }
+    if (other->isScalar()) {
+        this->applyScalarArr(op.s, other, target, extraArgs);
+        return;
+    }
+
+    if(checkTargetShape) {
+        Nd4jLong* newShapeInfo = nullptr;
+        if(!ShapeUtils::evalBroadcastShapeInfo(*this, *other, true, newShapeInfo, getContext()->getWorkspace()))      // the rank of target array must be equal to max->rankOf()
+            throw std::runtime_error("NDArray::applyTrueBroadcast method: the shapes of this and other arrays are not suitable for broadcast operation !");
+        if(!shape::equalsSoft(target->_shapeInfo, newShapeInfo) || target->dataType() != DataType::BOOL)
+            throw std::runtime_error("NDArray::applyTrueBroadcast bool method: the shape or type of target array is wrong !");
+        if(dataType() != other->dataType())
+            throw std::invalid_argument("NDArray::applyTrueBroadcast bool method: this and other arrays must have the same type !");
+    }
+
+    if(target->isSameShape(this) || target->isSameShape(other)) {
+        const_cast<NDArray*>(this)->applyBroadcast(op.b, ShapeUtils::getDimsWithSameShape(*this, *other), other, target, extraArgs);
+        return;
+    }
+
+    BUILD_DOUBLE_SELECTOR(dataType(), target->dataType(), helpers::TrueBroadcastBoolHelper, ::exec(op.b, *this, *other, *target), LIBND4J_TYPES, BOOL_TYPES);
+}
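////////////////////////////////////////////////////////////////////////
// A minimal standalone sketch (hypothetical names, not libnd4j code) of the
// shape rule that ShapeUtils::evalBroadcastShapeInfo enforces in the checks
// above: shapes are aligned at their trailing axes, and each aligned pair of
// extents must be equal or one of them must be 1.
#include <algorithm>
#include <cstdio>
#include <vector>

static bool evalBroadcastShape(const std::vector<long long>& x,
                               const std::vector<long long>& y,
                               std::vector<long long>& z) {
    const size_t zRank = std::max(x.size(), y.size());
    z.assign(zRank, 1);
    for (size_t i = 0; i < zRank; ++i) {
        // read extents right-to-left; a missing leading axis behaves like 1
        const long long xd = i < x.size() ? x[x.size() - 1 - i] : 1;
        const long long yd = i < y.size() ? y[y.size() - 1 - i] : 1;
        if (xd != yd && xd != 1 && yd != 1)
            return false;                    // not broadcastable
        z[zRank - 1 - i] = std::max(xd, yd);
    }
    return true;
}

int main() {
    std::vector<long long> z;
    // {8,1,3} op {4,3} -> {8,4,3}: the target matches neither input's shape,
    // so the TrueBroadcastHelper path above (not plain applyBroadcast) runs.
    if (evalBroadcastShape({8, 1, 3}, {4, 3}, z))
        std::printf("z = {%lld, %lld, %lld}\n", z[0], z[1], z[2]);
    return 0;
}
////////////////////////////////////////////////////////////////////////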
+
+//////////////////////////////////////////////////////////////////////////
+void NDArray::applyTrueBroadcast(nd4j::BroadcastIntOpsTuple op, const NDArray* other, NDArray* target, const bool checkTargetShape, ExtraArguments *extraArgs) const {
+    if (isS())
+        throw std::runtime_error("NDArray::applyTrueBroadcast bool: you can't use this method on String array!");
+    if(target == nullptr || other == nullptr)
+        throw std::runtime_error("NDArray::applyTrueBroadcast int method: target or other = nullptr !");
+
+    if (isEmpty() || other->isEmpty())
+        return;

     if (isScalar()) {
         NDArray temp(target->_shapeInfo, dataType(), false, getContext());
@@ -2607,143 +2610,24 @@ void NDArray::applyTrueBroadcast(nd4j::BroadcastBoolOpsTuple op, const NDArray*
         return;
     }

-    const NDArray* min(other);
-    const NDArray* max(this);
-
-    if(this->rankOf() < other->rankOf()) {
-        max = other;
-        min = this;
-    }
-
     if(checkTargetShape) {
         Nd4jLong* newShapeInfo = nullptr;
-        if(!ShapeUtils::evalBroadcastShapeInfo(*max, *min, false, newShapeInfo, getContext()->getWorkspace()))        // the rank of target array must be equal to max->rankOf()
+        if(!ShapeUtils::evalBroadcastShapeInfo(*this, *other, false, newShapeInfo, getContext()->getWorkspace()))     // the rank of target array must be equal to max->rankOf()
             throw std::runtime_error("NDArray::applyTrueBroadcast method: the shapes of this and other arrays are not suitable for broadcast operation !");
-        if(!shape::equalsSoft(target->_shapeInfo, newShapeInfo) || target->dataType() != DataType::BOOL)
-            throw std::runtime_error("NDArray::applyTrueBroadcast bool method: the shape or type of target array is wrong !");
+        if(!shape::equalsSoft(target->_shapeInfo, newShapeInfo) || target->dataType() != this->dataType())
+            throw std::runtime_error("NDArray::applyTrueBroadcast int method: the shape or type of target array is wrong !");
         if(dataType() != other->dataType())
-            throw std::invalid_argument("NDArray::applyTrueBroadcast bool method: this and other arrays must have the same type !");
+            throw std::invalid_argument("NDArray::applyTrueBroadcast int method: this and other arrays must have the same type !");
     }

-    NDArray* pTarget = (max->dataType() == target->dataType()) ? target : new NDArray(target->ordering(), target->getShapeAsVector(), max->dataType(), target->getContext());
-    // check whether max array has to be tiled
-    if(!max->isSameShape(target)) {
-        // evaluate repeating dimensions for tile operation
-        std::vector<Nd4jLong> repeatMax(max->rankOf());
-        for(int i = 1; i <= max->rankOf(); ++i)
-            repeatMax[i-1] = (target->_shapeInfo[i] / max->_shapeInfo[i]);
-        max->tile(repeatMax, *pTarget);
-    }
-    else
-        pTarget->assign(max);
-
-    // check whether min array has to be tiled
-    std::vector<Nd4jLong> repeatMin(min->rankOf());
-    int product = 1;
-    for(int i = min->rankOf(); i >= 1; --i) {
-        repeatMin[i-1] = (target->_shapeInfo[target->rankOf() - min->rankOf() + i] / min->_shapeInfo[i]);
-        product *= repeatMin[i-1];
+    if(target->isSameShape(this) || target->isSameShape(other)) {
+        const_cast<NDArray*>(this)->applyBroadcast(op.b, ShapeUtils::getDimsWithSameShape(*this, *other), other, target, extraArgs);
+        return;
     }

-    auto pMin = const_cast<NDArray*>(min);
-    if(product != 1 )
-        pMin = new NDArray(min->tile(repeatMin));
-
-    std::vector<int> sameDims = ShapeUtils::getDimsWithSameShape(*target, *pMin);
-
-    if(max == this)
-        pTarget->applyBroadcast(op.b, sameDims, pMin, target, extraArgs);
-    else
-        pMin->applyBroadcast(op.b, sameDims, pTarget, target, extraArgs);
-
-    if(pMin != min)
-        delete pMin;
-    if(pTarget != target)
-        delete pTarget;
+    BUILD_SINGLE_SELECTOR(dataType(), helpers::TrueBroadcastIntHelper, ::exec(op.b, *this, *other, *target), INTEGER_TYPES);
 }
-
-
-//////////////////////////////////////////////////////////////////////////
-    void NDArray::applyTrueBroadcast(nd4j::BroadcastIntOpsTuple op, const NDArray* other, NDArray* target, const bool checkTargetShape, ExtraArguments *extraArgs) const {
-        if (isS())
-            throw std::runtime_error("NDArray::applyTrueBroadcast bool: you can't use this method on String array!");
-        if(target == nullptr || other == nullptr)
-            throw std::runtime_error("NDArray::applyTrueBroadcast int method: target or other = nullptr !");
-
-        if (isEmpty() || other->isEmpty())
-            return;
-
-        NDArray::prepareSpecialUse({target}, {this, other});
-
-        if (isScalar()) {
-            NDArray temp(target->_shapeInfo, dataType(), false, getContext());
-            temp.assign(this);
-            temp.applyPairwiseTransform(op.p, other, target, extraArgs);
-            return;
-        }
-        if (other->isScalar()) {
-            this->applyScalarArr(op.s, other, target, extraArgs);
-            return;
-        }
-
-        const NDArray* min(other);
-        const NDArray* max(this);
-
-        if(this->rankOf() < other->rankOf()) {
-            max = other;
-            min = this;
-        }
-
-        if(checkTargetShape) {
-            Nd4jLong* newShapeInfo = nullptr;
-            if(!ShapeUtils::evalBroadcastShapeInfo(*max, *min, false, newShapeInfo, getContext()->getWorkspace()))    // the rank of target array must be equal to max->rankOf()
-                throw std::runtime_error("NDArray::applyTrueBroadcast method: the shapes of this and other arrays are not suitable for broadcast operation !");
-            if(!shape::equalsSoft(target->_shapeInfo, newShapeInfo) || target->dataType() != this->dataType())
-                throw std::runtime_error("NDArray::applyTrueBroadcast int method: the shape or type of target array is wrong !");
-            if(dataType() != other->dataType())
-                throw std::invalid_argument("NDArray::applyTrueBroadcast int method: this and other arrays must have the same type !");
-        }
-
-        NDArray* pTarget = (max->dataType() == target->dataType()) ? target : new NDArray(target->ordering(), target->getShapeAsVector(), max->dataType(), target->getContext());
-        // check whether max array has to be tiled
-        if(!max->isSameShape(target)) {
-            // evaluate repeating dimensions for tile operation
-            std::vector<Nd4jLong> repeatMax(max->rankOf());
-            for(int i = 1; i <= max->rankOf(); ++i)
-                repeatMax[i-1] = (target->_shapeInfo[i] / max->_shapeInfo[i]);
-            max->tile(repeatMax, *pTarget);
-        }
-        else
-            pTarget->assign(max);
-
-        // check whether min array has to be tiled
-        std::vector<Nd4jLong> repeatMin(min->rankOf());
-        int product = 1;
-        for(int i = min->rankOf(); i >= 1; --i) {
-            repeatMin[i-1] = (target->_shapeInfo[target->rankOf() - min->rankOf() + i] / min->_shapeInfo[i]);
-            product *= repeatMin[i-1];
-        }
-
-        auto pMin = const_cast<NDArray*>(min);
-        if(product != 1 )
-            pMin = new NDArray(min->tile(repeatMin));
-
-        std::vector<int> sameDims = ShapeUtils::getDimsWithSameShape(*target, *pMin);
-
-        if(max == this)
-            pTarget->applyBroadcast(op.b, sameDims, pMin, target, extraArgs);
-        else
-            pMin->applyBroadcast(op.b, sameDims, pTarget, target, extraArgs);
-
-        if(pMin != min)
-            delete pMin;
-        if(pTarget != target)
-            delete pTarget;
-    }
-
-
 //////////////////////////////////////////////////////////////////////////
 NDArray NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs) const {
     if (isEmpty() || other.isEmpty()) {
@@ -2884,65 +2768,65 @@ void NDArray::applyBroadcast(nd4j::broadcast::BoolOps op, const std::vector<int>

 //////////////////////////////////////////////////////////////////////////
-    void NDArray::applyBroadcast(nd4j::broadcast::IntOps op, const std::vector<int>& dimensions, const NDArray* other, NDArray* target, ExtraArguments* extraArgs) {
-        if (!isZ())
-            throw std::runtime_error("NDArray::applyBroadcast IntOps: you can't use this method on non-Integer array!");
-        if(isEmpty() || other->isEmpty()) {
-            if(!target->isEmpty())
-                throw std::runtime_error("NDArray::applyBroadcast IntOps: when some of input arrays (or both) is empty, target array must be empty as well !");
-            return;
-        }
-
-        if (dimensions.empty())
-            return;
-
-        auto result = target == nullptr ? this : target;
-
-        if (other->lengthOf() == lengthOf() && this->rankOf() == other->rankOf()) {
-            NDArray::prepareSpecialUse({result}, {this, other});
-            NativeOpExecutioner::execPairwiseIntTransform(getContext(), fromBroadcastToPairwiseInt(op), buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), other->getBuffer(), other->getShapeInfo(), other->getSpecialBuffer(), other->getSpecialShapeInfo(), result->buffer(), result->shapeInfo(), result->specialBuffer(), result->specialShapeInfo(), nullptr);
-            NDArray::registerSpecialUse({result}, {this, other});
-            return;
-        }
-
-        NDArray *min(nullptr), *max(nullptr);
-        if((lengthOf() > other->lengthOf()) || (lengthOf() == other->lengthOf() && rankOf() >= other->rankOf())) {
-            max = this;
-            min = const_cast<NDArray*>(other);
-        }
-        else {
-            max = const_cast<NDArray*>(other);
-            min = this;
-        }
-
-        if(result->dataType() != dataType())
-            throw std::invalid_argument("NDArray::applyBroadcast int method: type of target array must be the same as input!");
-        if(!result->isSameShape(max))
-            throw std::invalid_argument("NDArray::applyBroadcast int method: max and target arrays must have the same shape !");
-        if(_dataType != other->_dataType)
-            throw std::invalid_argument("NDArray::applyBroadcast int method: this and other arrays must have the same type !");
-
-        std::vector<int> copy(dimensions);
-
-        if (dimensions.size() > 1)
-            std::sort(copy.begin(), copy.end());
-
-        Nd4jLong tadLength = shape::tadLength(max->shapeInfo(), copy.data(), (int) copy.size());
-        if (tadLength != min->lengthOf())
-            throw std::runtime_error("Tad length mismatch");
-
-        auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(max->shapeInfo(), copy);
-        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(result->shapeInfo(), copy);
-
-        // TODO: eventually we want separate tads here
-        NDArray::prepareSpecialUse({result}, {this, other});
-        if(max == this)
-            NativeOpExecutioner::execBroadcastInt( getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), other->getBuffer(), other->getShapeInfo(), other->getSpecialBuffer(), other->getSpecialShapeInfo(), result->buffer(), result->shapeInfo(), result->specialBuffer(), result->specialShapeInfo(), copy.data(), (int)copy.size(), packX.platformShapeInfo(), packX.platformOffsets(), packZ.platformShapeInfo(), packZ.platformOffsets());
-        else
-            NativeOpExecutioner::execInverseBroadcastInt(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), other->getBuffer(), other->getShapeInfo(), other->getSpecialBuffer(), other->getSpecialShapeInfo(), result->buffer(), result->shapeInfo(), result->specialBuffer(), result->specialShapeInfo(), copy.data(), (int)copy.size(), packX.platformShapeInfo(), packX.platformOffsets(), packZ.platformShapeInfo(), packZ.platformOffsets());
-        registerSpecialUse({result}, {this, other});
+void NDArray::applyBroadcast(nd4j::broadcast::IntOps op, const std::vector<int>& dimensions, const NDArray* other, NDArray* target, ExtraArguments* extraArgs) {
+    if (!isZ())
+        throw std::runtime_error("NDArray::applyBroadcast IntOps: you can't use this method on non-Integer array!");
+    if(isEmpty() || other->isEmpty()) {
+        if(!target->isEmpty())
+            throw std::runtime_error("NDArray::applyBroadcast IntOps: when some of input arrays (or both) is empty, target array must be empty as well !");
+        return;
     }

+    if (dimensions.empty())
+        return;
+
+    auto result = target == nullptr ? this : target;
+
+    if (other->lengthOf() == lengthOf() && this->rankOf() == other->rankOf()) {
+        NDArray::prepareSpecialUse({result}, {this, other});
+        NativeOpExecutioner::execPairwiseIntTransform(getContext(), fromBroadcastToPairwiseInt(op), buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), other->getBuffer(), other->getShapeInfo(), other->getSpecialBuffer(), other->getSpecialShapeInfo(), result->buffer(), result->shapeInfo(), result->specialBuffer(), result->specialShapeInfo(), nullptr);
+        NDArray::registerSpecialUse({result}, {this, other});
+        return;
+    }
+
+    NDArray *min(nullptr), *max(nullptr);
+    if((lengthOf() > other->lengthOf()) || (lengthOf() == other->lengthOf() && rankOf() >= other->rankOf())) {
+        max = this;
+        min = const_cast<NDArray*>(other);
+    }
+    else {
+        max = const_cast<NDArray*>(other);
+        min = this;
+    }
+
+    if(result->dataType() != dataType())
+        throw std::invalid_argument("NDArray::applyBroadcast int method: type of target array must be the same as input!");
+    if(!result->isSameShape(max))
+        throw std::invalid_argument("NDArray::applyBroadcast int method: max and target arrays must have the same shape !");
+    if(_dataType != other->_dataType)
+        throw std::invalid_argument("NDArray::applyBroadcast int method: this and other arrays must have the same type !");
+
+    std::vector<int> copy(dimensions);
+
+    if (dimensions.size() > 1)
+        std::sort(copy.begin(), copy.end());
+
+    Nd4jLong tadLength = shape::tadLength(max->shapeInfo(), copy.data(), (int) copy.size());
+    if (tadLength != min->lengthOf())
+        throw std::runtime_error("Tad length mismatch");
+
+    auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(max->shapeInfo(), copy);
+    auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(result->shapeInfo(), copy);
+
+    // TODO: eventually we want separate tads here
+    NDArray::prepareSpecialUse({result}, {this, other});
+    if(max == this)
+        NativeOpExecutioner::execBroadcastInt( getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), other->getBuffer(), other->getShapeInfo(), other->getSpecialBuffer(), other->getSpecialShapeInfo(), result->buffer(), result->shapeInfo(), result->specialBuffer(), result->specialShapeInfo(), copy.data(), (int)copy.size(), packX.platformShapeInfo(), packX.platformOffsets(), packZ.platformShapeInfo(), packZ.platformOffsets());
+    else
+        NativeOpExecutioner::execInverseBroadcastInt(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), other->getBuffer(), other->getShapeInfo(), other->getSpecialBuffer(), other->getSpecialShapeInfo(), result->buffer(), result->shapeInfo(), result->specialBuffer(), result->specialShapeInfo(), copy.data(), (int)copy.size(), packX.platformShapeInfo(), packX.platformOffsets(), packZ.platformShapeInfo(), packZ.platformOffsets());
+    registerSpecialUse({result}, {this, other});
+}
+
 //////////////////////////////////////////////////////////////////////////
 void NDArray::applyBroadcast(nd4j::broadcast::Ops op, const std::initializer_list<int> dimensions, const NDArray* tadArray, NDArray* target, ExtraArguments* extraArgs) {
     std::vector<int> vec(dimensions);
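////////////////////////////////////////////////////////////////////////
// A hedged standalone sketch (hypothetical names, not the library routine) of
// the invariant applyBroadcast checks above with "Tad length mismatch": the
// larger array is viewed as a batch of tensors-along-dimensions (TADs) over
// the given axes, and each TAD must have exactly as many elements as the
// smaller operand.
#include <cassert>
#include <vector>

static long long tadLength(const std::vector<long long>& shape,
                           const std::vector<int>& dims) {
    long long len = 1;
    for (int d : dims)            // product of extents over the TAD axes
        len *= shape[d];
    return len;
}

int main() {
    const std::vector<long long> maxShape{2, 3, 4};
    const std::vector<long long> minShape{3};      // broadcast along axis 1
    assert(tadLength(maxShape, {1}) == 3);         // equals min length: accepted
    // A TAD over axes {1,2} would hold 12 elements and be rejected for minShape.
    return 0;
}
////////////////////////////////////////////////////////////////////////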
diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/blas/cuda/NDArray.cu
index 1d95fd3c2..f70760f9a 100644
--- a/libnd4j/blas/cuda/NDArray.cu
+++ b/libnd4j/blas/cuda/NDArray.cu
@@ -78,7 +78,6 @@ bool NDArray::isActualOnHostSide() const { return _buffer->isPrimaryActual(); }
 bool NDArray::isActualOnDeviceSide() const { return _buffer->isSpecialActual(); }
 void NDArray::makeBothBuffersActual() const { if(!isActualOnHostSide()) syncToHost(); if(!isActualOnDeviceSide()) syncToDevice(); }

-
 ///////////////////////////////////////////////////////////////////
 template <typename T>
 __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const T val, const int lower, const int upper) {

diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h
index ba0f956a5..f7b17911c 100644
--- a/libnd4j/include/helpers/ShapeUtils.h
+++ b/libnd4j/include/helpers/ShapeUtils.h
@@ -81,7 +81,8 @@ namespace nd4j {
         // check the possibility of broadcast operation for set of arrays, if true then return resulting broadcasted shapeInfo
         static bool evalCommonBroadcastShapeInfo(const std::vector<const NDArray*>& arrays, Nd4jLong*& resultShapeInfo, memory::Workspace* workspace = nullptr);

-        // return sorted vector of dimensions of array with larger dimensions along which two input arrays have same shape
+        // return sorted vector of dimensions common (same) for two arrays; dimension values correspond to the array with the bigger rank
+        // for example if arr1{2,7}, arr2{2,5,4,7} then vector = {3}
         static std::vector<int> getDimsWithSameShape(const NDArray& max, const NDArray& min);

         // evaluate shapeInfo for resulting array of tile operation
@@ -169,6 +170,18 @@ namespace nd4j {
          * @return
          */
         static Nd4jLong stringBufferHeaderRequirements(Nd4jLong numStrings);
+
+        /*
+         * check whether arr1/arr2 is a sub-array of arr2/arr1;
+         * this method does not evaluate which array is the sub-array, it returns true if arr1 is a sub-array of arr2 or arr2 is a sub-array of arr1
+         * sameDims is filled (and sorted) with the dimension values that match in both arr1 and arr2 shapes (unities are ignored)
+         * for example:
+         * if arr1{2,3} and arr2{2,4,3,7} then return true and sameDims contains {0,2}
+         * if arr1{1,1,3,1,3,1,1} and arr2{1,2,3,1,3} then return true and sameDims contains {2,4}
+         * if arr1{2,1,4,1,7,5} and arr2{1,1,4,5} then return true and sameDims contains {2,5}
+
+        static bool isSubArrayCase(const NDArray& arr1, const NDArray& arr2, std::vector<int>& sameDims);
+        */
     };
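////////////////////////////////////////////////////////////////////////
// The getDimsWithSameShape contract documented above, sketched over plain
// shape vectors (hypothetical standalone code, not ShapeUtils itself). Axis
// indices are reported in the coordinate space of the higher-rank array.
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<int> dimsWithSameShape(std::vector<long long> a,
                                          std::vector<long long> b) {
    if (a.size() < b.size()) std::swap(a, b);     // a now has the bigger rank
    const int rankDiff = static_cast<int>(a.size() - b.size());
    std::vector<int> dims;
    for (int i = 0; i < static_cast<int>(b.size()); ++i)
        if (b[i] == a[rankDiff + i])              // trailing-axis alignment
            dims.push_back(rankDiff + i);
    return dims;
}

int main() {
    // min{4,7} against max{2,5,4,7}: extents match at axes 2 and 3
    for (int d : dimsWithSameShape({4, 7}, {2, 5, 4, 7}))
        std::printf("%d ", d);                    // prints: 2 3
    std::printf("\n");
    return 0;
}
////////////////////////////////////////////////////////////////////////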
diff --git a/libnd4j/include/helpers/TrueBroadcastHelper.h b/libnd4j/include/helpers/TrueBroadcastHelper.h
new file mode 100644
index 000000000..4101aa08e
--- /dev/null
+++ b/libnd4j/include/helpers/TrueBroadcastHelper.h
@@ -0,0 +1,84 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#ifndef LIBND4J_TRUEBROADCASTHELPER_H
+#define LIBND4J_TRUEBROADCASTHELPER_H
+
+#include <NDArray.h>
+
+namespace nd4j {
+namespace helpers {
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y, typename Z>
+class TrueBroadcastHelper {
+
+    #ifdef __CUDACC__
+        template <typename OpType>
+        static __host__ void execLauncher(dim3 launchDims, cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo);
+    #else
+        template <typename OpType>
+        static void exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr);
+    #endif
+
+    public:
+        static void exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr);
+};
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename Z>
+class TrueBroadcastBoolHelper {
+
+    #ifdef __CUDACC__
+        template <typename OpType>
+        static __host__ void execLauncher(dim3 launchDims, cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo);
+    #else
+        template <typename OpType>
+        static void exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr);
+    #endif
+
+    public:
+
+        static void exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr);
+};
+
+////////////////////////////////////////////////////////////////////////
+template <typename X>
+class TrueBroadcastIntHelper {
+
+    #ifdef __CUDACC__
+        template <typename OpType>
+        static __host__ void execLauncher(dim3 launchDims, cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo);
+    #else
+        template <typename OpType>
+        static void exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr);
+    #endif
+
+    public:
+
+        static void exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr);
+};
+
+
+}
+}
+
+#endif //LIBND4J_TRUEBROADCASTHELPER_H
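////////////////////////////////////////////////////////////////////////
// The implementations below lean on two shape primitives: index2coords
// (linear index -> per-axis coordinates) and getOffset (coordinates -> buffer
// offset via strides). A row-major standalone sketch of both, with
// hypothetical names; libnd4j's versions read rank/shape/strides out of the
// packed shapeInfo instead.
#include <cassert>
#include <vector>

static void index2coords(long long idx, const std::vector<long long>& shape,
                         std::vector<long long>& coords) {
    for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
        coords[i] = idx % shape[i];   // last axis varies fastest (c-order)
        idx /= shape[i];
    }
}

static long long getOffset(const std::vector<long long>& strides,
                           const std::vector<long long>& coords) {
    long long off = 0;
    for (size_t i = 0; i < strides.size(); ++i)
        off += coords[i] * strides[i];
    return off;
}

int main() {
    const std::vector<long long> shape{2, 3, 4}, strides{12, 4, 1};
    std::vector<long long> c(3);
    index2coords(17, shape, c);              // 17 = 1*12 + 1*4 + 1
    assert(c[0] == 1 && c[1] == 1 && c[2] == 1);
    assert(getOffset(strides, c) == 17);     // contiguous buffers round-trip
    return 0;
}
////////////////////////////////////////////////////////////////////////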
diff --git a/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp b/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp
new file mode 100644
index 000000000..5f8789077
--- /dev/null
+++ b/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp
@@ -0,0 +1,218 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <helpers/TrueBroadcastHelper.h>
+
+using namespace simdOps;
+
+namespace nd4j {
+namespace helpers {
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y, typename Z>
+template <typename OpType>
+void TrueBroadcastHelper<X,Y,Z>::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+
+    const X* x = reinterpret_cast<const X*>(xArr.getBuffer());
+    const Y* y = reinterpret_cast<const Y*>(yArr.getBuffer());
+          Z* z = reinterpret_cast<Z*>(zArr.getBuffer());
+
+    const auto xShapeInfo = xArr.getShapeInfo();
+    const auto yShapeInfo = yArr.getShapeInfo();
+    const auto zShapeInfo = zArr.getShapeInfo();
+
+    const int xRank = xArr.rankOf();
+    const int yRank = yArr.rankOf();
+    const int zRank = zArr.rankOf();
+
+    const Nd4jLong zLen = zArr.lengthOf();
+
+    std::vector<Nd4jLong> xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf());
+
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(xCoords, yCoords, zCoords))
+    for (Nd4jLong i = 0; i < zLen; ++i) {
+
+        shape::index2coords(i, zShapeInfo, zCoords.data());
+
+        for(int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) {
+
+            if(ix >= 0) {
+                if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) {
+                    xCoords[ix--] = zCoords[iz];
+                } else {
+                    xCoords[ix--] = 0;
+                }
+            }
+
+            if(iy >= 0) {
+                if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) {
+                    yCoords[iy--] = zCoords[iz];
+                } else {
+                    yCoords[iy--] = 0;
+                }
+            }
+        }
+
+        const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data());
+        const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data());
+        const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data());
+
+        z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
+    }
+}
+
+template <typename X, typename Y, typename Z>
+void TrueBroadcastHelper<X,Y,Z>::exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+    DISPATCH_BY_OPNUM_TTT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_OPS);
+}
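////////////////////////////////////////////////////////////////////////
// The coordinate walk used by exec above, shown end to end on small c-order
// buffers (a hedged standalone sketch, not the library routine): for every
// output coordinate, an input keeps the coordinate where its extent matches
// z and is pinned to 0 on the axes where it was broadcast.
#include <cstdio>

int main() {
    const long long xShape[2] = {3, 1}, yShape[1] = {4}, zShape[2] = {3, 4};
    const float x[3] = {10.f, 20.f, 30.f};       // shape {3,1}
    const float y[4] = {1.f, 2.f, 3.f, 4.f};     // shape {4}
    float z[12];                                 // shape {3,4}

    for (long long i = 0; i < 12; ++i) {
        const long long zc[2] = {i / 4, i % 4};  // coords of i in {3,4}
        long long xc[2], yc[1];
        // walk axes right-to-left, consuming z coords where extents match
        for (int ix = 1, iy = 0, iz = 1; iz >= 0; --iz) {
            if (ix >= 0) { xc[ix] = (xShape[ix] == zShape[iz]) ? zc[iz] : 0; --ix; }
            if (iy >= 0) { yc[iy] = (yShape[iy] == zShape[iz]) ? zc[iz] : 0; --iy; }
        }
        z[i] = x[xc[0]] + y[yc[0]];              // x strides {1,1}, y stride {1}
    }
    std::printf("z[0]=%g z[5]=%g z[11]=%g\n", z[0], z[5], z[11]); // 11 22 34
    return 0;
}
////////////////////////////////////////////////////////////////////////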
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename Z>
+template <typename OpType>
+void TrueBroadcastBoolHelper<X,Z>::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+
+    const X* x = reinterpret_cast<const X*>(xArr.getBuffer());
+    const X* y = reinterpret_cast<const X*>(yArr.getBuffer());
+          Z* z = reinterpret_cast<Z*>(zArr.getBuffer());
+
+    const auto xShapeInfo = xArr.getShapeInfo();
+    const auto yShapeInfo = yArr.getShapeInfo();
+    const auto zShapeInfo = zArr.getShapeInfo();
+
+    const int xRank = xArr.rankOf();
+    const int yRank = yArr.rankOf();
+    const int zRank = zArr.rankOf();
+
+    const Nd4jLong zLen = zArr.lengthOf();
+
+    std::vector<Nd4jLong> xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf());
+
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(xCoords, yCoords, zCoords))
+    for (Nd4jLong i = 0; i < zLen; ++i) {
+
+        shape::index2coords(i, zShapeInfo, zCoords.data());
+
+        for(int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) {
+
+            if(ix >= 0) {
+                if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) {
+                    xCoords[ix--] = zCoords[iz];
+                } else {
+                    xCoords[ix--] = 0;
+                }
+            }
+
+            if(iy >= 0) {
+                if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) {
+                    yCoords[iy--] = zCoords[iz];
+                } else {
+                    yCoords[iy--] = 0;
+                }
+            }
+        }
+
+        const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data());
+        const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data());
+        const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data());
+
+        z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
+    }
+}
+
+template <typename X, typename Z>
+void TrueBroadcastBoolHelper<X,Z>::exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+    DISPATCH_BY_OPNUM_TT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_BOOL_OPS);
+}
+
+////////////////////////////////////////////////////////////////////////
+template <typename X>
+template <typename OpType>
+void TrueBroadcastIntHelper<X>::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+
+    const X* x = reinterpret_cast<const X*>(xArr.getBuffer());
+    const X* y = reinterpret_cast<const X*>(yArr.getBuffer());
+          X* z = reinterpret_cast<X*>(zArr.getBuffer());
+
+    const auto xShapeInfo = xArr.getShapeInfo();
+    const auto yShapeInfo = yArr.getShapeInfo();
+    const auto zShapeInfo = zArr.getShapeInfo();
+
+    const int xRank = xArr.rankOf();
+    const int yRank = yArr.rankOf();
+    const int zRank = zArr.rankOf();
+
+    const Nd4jLong zLen = zArr.lengthOf();
+
+    std::vector<Nd4jLong> xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf());
+
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(xCoords, yCoords, zCoords))
+    for (Nd4jLong i = 0; i < zLen; ++i) {
+
+        shape::index2coords(i, zShapeInfo, zCoords.data());
+
+        for(int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) {
+
+            if(ix >= 0) {
+                if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) {
+                    xCoords[ix--] = zCoords[iz];
+                } else {
+                    xCoords[ix--] = 0;
+                }
+            }
+
+            if(iy >= 0) {
+                if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) {
+                    yCoords[iy--] = zCoords[iz];
+                } else {
+                    yCoords[iy--] = 0;
+                }
+            }
+        }
+
+        const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data());
+        const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data());
+        const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data());
+
+        z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
+    }
+}
+
+template <typename X>
+void TrueBroadcastIntHelper<X>::exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+    DISPATCH_BY_OPNUM_T(exec, PARAMS(xArr, yArr, zArr), BROADCAST_INT_OPS);
+}
+
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_0);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_1);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_2);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_3);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_4);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_5);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_6);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_7);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_8);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_9);
+
+BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastBoolHelper, , LIBND4J_TYPES, BOOL_TYPES);
+
+BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastIntHelper, , INTEGER_TYPES);
+
+}
+}
\ No newline at end of file
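////////////////////////////////////////////////////////////////////////
// Why the CPU loops above mark xCoords/yCoords/zCoords firstprivate: every
// OpenMP thread needs its own scratch coordinate buffers, since concurrent
// iterations write them. A plain-OpenMP sketch of the same pattern
// (hypothetical shape and op; compiles with or without -fopenmp).
#include <cstdio>
#include <vector>

int main() {
    const long long shape[3] = {16, 64, 64};
    const long long len = 16 * 64 * 64;
    std::vector<float> out(len);
    std::vector<long long> coords(3);      // scratch buffer, copied per thread

    // firstprivate: each thread starts with its own copy of the vector, so
    // iterations running in parallel never clobber each other's coordinates.
    #pragma omp parallel for firstprivate(coords)
    for (long long i = 0; i < len; ++i) {
        long long idx = i;
        for (int d = 2; d >= 0; --d) { coords[d] = idx % shape[d]; idx /= shape[d]; }
        out[i] = static_cast<float>(coords[0] + coords[1] + coords[2]);
    }
    std::printf("out[len-1] = %g\n", out[len - 1]);   // 15+63+63 = 141
    return 0;
}
////////////////////////////////////////////////////////////////////////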
diff --git a/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu b/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu
new file mode 100644
index 000000000..152e74652
--- /dev/null
+++ b/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu
@@ -0,0 +1,309 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+// #include
+#include <helpers/TrueBroadcastHelper.h>
+#include <helpers/PointersManager.h>
+#include
+#include
+#include
+// #include
+// #include
+
+using namespace simdOps;
+
+namespace nd4j {
+namespace helpers {
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y, typename Z, typename OpType>
+__global__ static void trueBroadcastCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo) {
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const Y*>(vy);
+          auto z = reinterpret_cast<Z*>(vz);
+
+    __shared__ int xRank, yRank, zRank;
+    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        xRank = shape::rank(xShapeInfo);
+        yRank = shape::rank(yShapeInfo);
+        zRank = shape::rank(zShapeInfo);
+
+        zLen = shape::length(zShapeInfo);
+        totalThreads = gridDim.x * blockDim.x;
+    }
+    __syncthreads();
+
+    auto xCoords = sharedMem + threadIdx.x * (xRank + yRank + zRank);
+    auto yCoords = xCoords + xRank;
+    auto zCoords = yCoords + yRank;
+
+    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
+
+        shape::index2coords(i, zShapeInfo, zCoords);
+
+        for(int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) {
+
+            if(ix >= 0)
+                if(xShapeInfo[ix + 1] == zShapeInfo[iz + 1])
+                    xCoords[ix--] = zCoords[iz];
+                else
+                    xCoords[ix--] = 0;
+
+            if(iy >= 0)
+                if(yShapeInfo[iy + 1] == zShapeInfo[iz + 1])
+                    yCoords[iy--] = zCoords[iz];
+                else
+                    yCoords[iy--] = 0;
+        }
+
+        const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
+        const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
+        const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
+
+        z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y, typename Z>
+template <typename OpType>
+void TrueBroadcastHelper<X,Y,Z>::execLauncher(dim3 launchDims, cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo) {
+
+    trueBroadcastCuda<X,Y,Z,OpType><<<launchDims.y, launchDims.x, launchDims.z, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y, typename Z>
+void TrueBroadcastHelper<X,Y,Z>::exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+
+    dim3 launchDims;
+
+    launchDims.x = MAX_NUM_THREADS / 8;   // threadsPerBlock
+    launchDims.y = (zArr.lengthOf() + launchDims.x - 1) / launchDims.x;  // blocksPerGrid
+    launchDims.z = sizeof(Nd4jLong) * launchDims.x * (xArr.rankOf() + yArr.rankOf() + zArr.rankOf()) + 128;  // sharedMem
+
+    PointersManager manager(xArr.getContext(), "TrueBroadcastHelper::exec");
+
+    NDArray::prepareSpecialUse({&zArr}, {&xArr, &yArr});
+
+    DISPATCH_BY_OPNUM_TTT(execLauncher, PARAMS(launchDims, xArr.getContext()->getCudaStream(), xArr.getSpecialBuffer(), xArr.getSpecialShapeInfo(), yArr.getSpecialBuffer(), yArr.getSpecialShapeInfo(), zArr.specialBuffer(), zArr.specialShapeInfo()), OPS_A(BROADCAST_OPS));
+
+    NDArray::registerSpecialUse({&zArr}, {&xArr, &yArr});
+
+    manager.synchronize();
+}
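////////////////////////////////////////////////////////////////////////
// The launch arithmetic used by exec above, worked standalone: threads per
// block, a grid that covers zLen, and shared memory sized so every thread in
// the block can hold one x+y+z coordinate set plus a small pad.
// MAX_NUM_THREADS is assumed to be 1024 here for the worked numbers.
#include <cstdio>

int main() {
    const int maxNumThreads = 1024;                 // assumed value
    const long long zLen = 100000;                  // e.g. output length
    const int xRank = 2, yRank = 1, zRank = 2;

    const int threadsPerBlock = maxNumThreads / 8;  // 128
    const long long blocksPerGrid =
        (zLen + threadsPerBlock - 1) / threadsPerBlock;                    // 782
    const long long sharedMem =
        8LL * threadsPerBlock * (xRank + yRank + zRank) + 128;             // sizeof(Nd4jLong)=8 -> 5248 bytes

    std::printf("blocks=%lld threads=%d shared=%lld bytes\n",
                blocksPerGrid, threadsPerBlock, sharedMem);
    return 0;
}
////////////////////////////////////////////////////////////////////////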
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename Z, typename OpType>
+__global__ static void trueBroadcastBoolCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo) {
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const X*>(vy);
+          auto z = reinterpret_cast<Z*>(vz);
+
+    __shared__ int xRank, yRank, zRank;
+    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        xRank = shape::rank(xShapeInfo);
+        yRank = shape::rank(yShapeInfo);
+        zRank = shape::rank(zShapeInfo);
+
+        zLen = shape::length(zShapeInfo);
+        totalThreads = gridDim.x * blockDim.x;
+    }
+    __syncthreads();
+
+    auto xCoords = sharedMem + threadIdx.x * (xRank + yRank + zRank);
+    auto yCoords = xCoords + xRank;
+    auto zCoords = yCoords + yRank;
+
+    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
+
+        shape::index2coords(i, zShapeInfo, zCoords);
+
+        for(int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) {
+
+            if(ix >= 0)
+                if(xShapeInfo[ix + 1] == zShapeInfo[iz + 1])
+                    xCoords[ix--] = zCoords[iz];
+                else
+                    xCoords[ix--] = 0;
+
+            if(iy >= 0)
+                if(yShapeInfo[iy + 1] == zShapeInfo[iz + 1])
+                    yCoords[iy--] = zCoords[iz];
+                else
+                    yCoords[iy--] = 0;
+        }
+
+        const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
+        const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
+        const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
+
+        z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename Z>
+template <typename OpType>
+void TrueBroadcastBoolHelper<X,Z>::execLauncher(dim3 launchDims, cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo) {
+
+    trueBroadcastBoolCuda<X,Z,OpType><<<launchDims.y, launchDims.x, launchDims.z, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Z>
+void TrueBroadcastBoolHelper<X,Z>::exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+
+    dim3 launchDims;
+    launchDims.x = MAX_NUM_THREADS / 8;   // threadsPerBlock
+    launchDims.y = (zArr.lengthOf() + launchDims.x - 1) / launchDims.x;  // blocksPerGrid
+    launchDims.z = sizeof(Nd4jLong) * launchDims.x * (xArr.rankOf() + yArr.rankOf() + zArr.rankOf()) + 128;  // sharedMem
+
+    PointersManager manager(xArr.getContext(), "TrueBroadcastBoolHelper::exec");
+
+    NDArray::prepareSpecialUse({&zArr}, {&xArr, &yArr});
+
+    DISPATCH_BY_OPNUM_TT(execLauncher, PARAMS(launchDims, xArr.getContext()->getCudaStream(), xArr.getSpecialBuffer(), xArr.getSpecialShapeInfo(), yArr.getSpecialBuffer(), yArr.getSpecialShapeInfo(), zArr.specialBuffer(), zArr.specialShapeInfo()), OPS_A(BROADCAST_BOOL_OPS));
+
+    NDArray::registerSpecialUse({&zArr}, {&xArr, &yArr});
+
+    manager.synchronize();
+}
+
+////////////////////////////////////////////////////////////////////////
+template <typename X, typename OpType>
+__global__ static void trueBroadcastIntCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo) {
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const X*>(vy);
+          auto z = reinterpret_cast<X*>(vz);
+
+    __shared__ int xRank, yRank, zRank;
+    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        xRank = shape::rank(xShapeInfo);
+        yRank = shape::rank(yShapeInfo);
+        zRank = shape::rank(zShapeInfo);
+
+        zLen = shape::length(zShapeInfo);
+        totalThreads = gridDim.x * blockDim.x;
+    }
+    __syncthreads();
+
+    auto xCoords = sharedMem + threadIdx.x * (xRank + yRank + zRank);
+    auto yCoords = xCoords + xRank;
+    auto zCoords = yCoords + yRank;
+
+    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
+
+        shape::index2coords(i, zShapeInfo, zCoords);
+
+        for(int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) {
+
+            if(ix >= 0)
+                if(xShapeInfo[ix + 1] == zShapeInfo[iz + 1])
+                    xCoords[ix--] = zCoords[iz];
+                else
+                    xCoords[ix--] = 0;
+
+            if(iy >= 0)
+                if(yShapeInfo[iy + 1] == zShapeInfo[iz + 1])
+                    yCoords[iy--] = zCoords[iz];
+                else
+                    yCoords[iy--] = 0;
+        }
+
+        const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
+        const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
+        const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
+
+        z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////
+template <typename X>
+template <typename OpType>
+void TrueBroadcastIntHelper<X>::execLauncher(dim3 launchDims, cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo) {
+
+    trueBroadcastIntCuda<X,OpType><<<launchDims.y, launchDims.x, launchDims.z, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X>
+void TrueBroadcastIntHelper<X>::exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) {
+
+    dim3 launchDims;
+    launchDims.x = MAX_NUM_THREADS / 8;   // threadsPerBlock
+    launchDims.y = (zArr.lengthOf() + launchDims.x - 1) / launchDims.x;  // blocksPerGrid
+    launchDims.z = sizeof(Nd4jLong) * launchDims.x * (xArr.rankOf() + yArr.rankOf() + zArr.rankOf()) + 128;  // sharedMem
+
+    PointersManager manager(xArr.getContext(), "TrueBroadcastIntHelper::exec");
+
+    NDArray::prepareSpecialUse({&zArr}, {&xArr, &yArr});
+
+    DISPATCH_BY_OPNUM_T(execLauncher, PARAMS(launchDims, xArr.getContext()->getCudaStream(), xArr.getSpecialBuffer(), xArr.getSpecialShapeInfo(), yArr.getSpecialBuffer(), yArr.getSpecialShapeInfo(), zArr.specialBuffer(), zArr.specialShapeInfo()), OPS_A(BROADCAST_INT_OPS));
+
+    NDArray::registerSpecialUse({&zArr}, {&xArr, &yArr});
+
+    manager.synchronize();
+}
+
+
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_0);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_1);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_2);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_3);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_4);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_5);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_6);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_7);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_8);
+BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_9);
+
+BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastBoolHelper, , LIBND4J_TYPES, BOOL_TYPES);
+
+BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastIntHelper, , INTEGER_TYPES);
+
+}
+}
\ No newline at end of file
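////////////////////////////////////////////////////////////////////////
// The DISPATCH_BY_OPNUM_* macros used throughout expand, roughly, to a switch
// that routes a runtime op number to the matching compile-time OpType
// instantiation. A toy sketch of that pattern (hypothetical ops, not the
// actual libnd4j macro output):
#include <cstdio>

struct Add      { static float op(float a, float b) { return a + b; } };
struct Multiply { static float op(float a, float b) { return a * b; } };

enum class BroadcastOp { Add = 0, Multiply = 1 };

template <typename OpType>
static void execTyped(const float* x, const float* y, float* z, int n) {
    for (int i = 0; i < n; ++i)
        z[i] = OpType::op(x[i], y[i]);    // OpType is resolved at compile time
}

static void exec(BroadcastOp opNum, const float* x, const float* y, float* z, int n) {
    switch (opNum) {                      // runtime selector, like the macro
        case BroadcastOp::Add:      execTyped<Add>(x, y, z, n);      break;
        case BroadcastOp::Multiply: execTyped<Multiply>(x, y, z, n); break;
    }
}

int main() {
    const float x[3] = {1, 2, 3}, y[3] = {4, 5, 6};
    float z[3];
    exec(BroadcastOp::Multiply, x, y, z, 3);
    std::printf("%g %g %g\n", z[0], z[1], z[2]);   // 4 10 18
    return 0;
}
////////////////////////////////////////////////////////////////////////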
diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp
index 91ee09123..ed51849f9 100644
--- a/libnd4j/include/helpers/impl/ShapeUtils.cpp
+++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp
@@ -515,21 +515,30 @@ bool ShapeUtils::evalCommonBroadcastShapeInfo(const std::vector<const NDArray*>&

 //////////////////////////////////////////////////////////////////////////
-// return sorted vector of dimensions of array with larger dimensions number along which two input arrays have same shape
-// the array with larger dimensions number has to be passed as first argument
-std::vector<int> ShapeUtils::getDimsWithSameShape(const NDArray& max, const NDArray& min) {
+// return sorted vector of dimensions common (same) for two arrays; dimension values correspond to the array with the bigger rank
+// for example if arr1{2,7}, arr2{2,5,4,7} then vector = {3}
+std::vector<int> ShapeUtils::getDimsWithSameShape(const NDArray& arr1, const NDArray& arr2) {

-    std::vector<int> result;
-    auto maxShapeInfo = max.getShapeInfo();
-    auto minShapeInfo = min.getShapeInfo();
-    int  maxRank      = maxShapeInfo[0];
-    int  minRank      = minShapeInfo[0];
+    const NDArray *min, *max;

-    for (int i = 1; i <= minRank; ++i)
-        if (minShapeInfo[i] == maxShapeInfo[maxRank - minRank + i])
-            result.emplace_back(maxRank - minRank + i - 1);
+    if(arr1.rankOf() >= arr2.rankOf()) {
+        max = &arr1;
+        min = &arr2;
+    }
+    else {
+        max = &arr2;
+        min = &arr1;
+    }

-    return result;
+    const int rankDiff = max->rankOf() - min->rankOf();
+
+    std::vector<int> dims;
+
+    for (int i = 0; i < min->rankOf(); ++i)
+        if (min->sizeAt(i) == max->sizeAt(rankDiff + i))
+            dims.emplace_back(rankDiff + i);
+
+    return dims;
 }

 //////////////////////////////////////////////////////////////////////////
@@ -997,14 +1006,56 @@ std::vector<int> ShapeUtils::tadAxesForSimpleBroadcast(const NDArray& max, const
 }

-    Nd4jLong ShapeUtils::stringBufferHeaderRequirements(Nd4jLong numStrings) {
-        // we store +1 offset
-        auto base = numStrings + 1;
+Nd4jLong ShapeUtils::stringBufferHeaderRequirements(Nd4jLong numStrings) {
+    // we store +1 offset
+    auto base = numStrings + 1;

-        // since we return number of bytes...
-        return base * sizeof(Nd4jLong);
+    // since we return number of bytes...
+    return base * sizeof(Nd4jLong);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/*
+bool ShapeUtils::isSubArrayCase(const NDArray& arr1, const NDArray& arr2, std::vector<int>& sameDims) {
+
+    if(!sameDims.empty())
+        sameDims.clear();
+
+    const NDArray* max = &arr1;
+    const NDArray* min = &arr2;
+
+    if(arr1.lengthOf() < arr2.lengthOf()) {
+        max = &arr2;
+        min = &arr1;
     }

+    int numUnitiesInMin = 0;
+
+    for (int iMax = -1, iMin = -1; iMax >= -max->rankOf() && iMin >= -min->rankOf(); ) {
+
+        if(max->sizeAt(iMax) == 1) {      // ignore unities in shape
+            --iMax;
+            continue;
+        }
+
+        if(min->sizeAt(iMin) == 1) {      // ignore unities in shape
+            ++numUnitiesInMin;
+            --iMin;
+            continue;
+        }
+
+        if(max->sizeAt(iMax) == min->sizeAt(iMin)) {
+            sameDims.insert(sameDims.begin(), iMax + max->rankOf());
+            --iMin;
+        }
+
+        --iMax;
+    }
+
+    return sameDims.size() + numUnitiesInMin == min->rankOf();
+}
+*/
+
 }
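////////////////////////////////////////////////////////////////////////
// The commented-out isSubArrayCase draft above, sketched standalone over
// plain shape vectors so its documented examples can be checked (a hedged
// reading of the draft, not a committed API): unit extents are skipped, and
// the remaining extents must pair up from the trailing axis.
#include <algorithm>
#include <cstdio>
#include <vector>

static long long product(const std::vector<long long>& s) {
    long long p = 1;
    for (long long d : s) p *= d;
    return p;
}

static bool isSubArrayCase(std::vector<long long> a, std::vector<long long> b,
                           std::vector<int>& sameDims) {
    sameDims.clear();
    if (product(a) < product(b)) std::swap(a, b);  // a = bigger array, b = smaller
    int numUnitiesInMin = 0;
    int iMax = static_cast<int>(a.size()) - 1;
    int iMin = static_cast<int>(b.size()) - 1;
    while (iMax >= 0 && iMin >= 0) {
        if (a[iMax] == 1) { --iMax; continue; }                    // ignore unities
        if (b[iMin] == 1) { ++numUnitiesInMin; --iMin; continue; } // ignore unities
        if (a[iMax] == b[iMin]) { sameDims.insert(sameDims.begin(), iMax); --iMin; }
        --iMax;
    }
    // every non-unity axis of the smaller array found a partner
    return sameDims.size() + numUnitiesInMin == b.size();
}

int main() {
    std::vector<int> dims;
    // example from the draft comment: arr1{2,3}, arr2{2,4,3,7} -> true, {0,2}
    if (isSubArrayCase({2, 4, 3, 7}, {2, 3}, dims))
        std::printf("sub-array case, sameDims = {%d, %d}\n", dims[0], dims[1]);
    return 0;
}
////////////////////////////////////////////////////////////////////////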