diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index 3a57fc92b..21eedc665 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -1770,7 +1770,7 @@ NDArray NDArray::operator()(const Nd4jLong i) const { } else { Nd4jLong idx[MAX_RANK]; shape::ind2subC(rankOf(), shapeOf(), i, idx); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), idx); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1801,7 +1801,7 @@ NDArray& NDArray::operator()(const Nd4jLong i) { } else { Nd4jLong idx[MAX_RANK]; shape::ind2subC(rankOf(), shapeOf(), i, idx); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), idx); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1818,7 +1818,7 @@ NDArray NDArray::operator()(const Nd4jLong i, const Nd4jLong j) const { throw std::invalid_argument("NDArray::operator(i,j): one of input indexes is out of array length or rank!=2 !"); Nd4jLong coords[2] = {i, j}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); // TODO: do we really want a view here? auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); @@ -1834,7 +1834,7 @@ NDArray& NDArray::operator()(const Nd4jLong i, const Nd4jLong j) { throw std::invalid_argument("NDArray::operator(i,j): one of input indexes is out of array length or rank!=2 !"); Nd4jLong coords[2] = {i, j}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1853,7 +1853,7 @@ NDArray NDArray::operator()(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k throw std::invalid_argument("NDArray::operator(i,j,k): one of input indexes is out of array length or rank!=3 !"); Nd4jLong coords[3] = {i, j, k}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1870,7 +1870,7 @@ NDArray& NDArray::operator()(const Nd4jLong i, const Nd4jLong j, const Nd4jLong throw std::invalid_argument("NDArray::operator(i,j,k): one of input indexes is out of array length or rank!=3 !"); Nd4jLong coords[3] = {i, j, k}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1886,7 +1886,7 @@ NDArray NDArray::operator()(const Nd4jLong t, const Nd4jLong u, const Nd4jLong v throw std::invalid_argument("NDArray::operator(t,u,v,w): one of input indexes is out of array length or rank!=4 !"); Nd4jLong coords[4] = {t, u, v, w}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, 
rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1900,7 +1900,7 @@ NDArray& NDArray::operator()(const Nd4jLong t, const Nd4jLong u, const Nd4jLong throw std::invalid_argument("NDArray::operator(t,u,v,w): one of input indexes is out of array length or rank!=4 !"); Nd4jLong coords[4] = {t, u, v, w}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); // FIXME auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); @@ -1916,7 +1916,7 @@ NDArray NDArray::operator()(const Nd4jLong* idx) const { if (idx[i] >= sizeAt(i)) throw std::invalid_argument("NDArray::operator(const Nd4jLong* idx): input index is out of dimension length !"); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), idx); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1931,7 +1931,7 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { if (idx[i] >= sizeAt(i)) throw std::invalid_argument("NDArray::operator(const Nd4jLong* idx): input index is out of dimension length !"); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), idx); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -2067,7 +2067,7 @@ T& NDArray::t(const Nd4jLong i, const Nd4jLong j) { syncToHost(); Nd4jLong coords[2] = {i, j}; - auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto offset = shape::getOffset(getShapeInfo(), coords); tickWriteHost(); return *(reinterpret_cast(bufferWithOffset(offset))); } @@ -2084,7 +2084,7 @@ T& NDArray::t(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k) { syncToHost(); Nd4jLong coords[3] = {i, j, k}; - auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto offset = shape::getOffset(getShapeInfo(), coords); tickWriteHost(); return *(reinterpret_cast(bufferWithOffset(offset))); } @@ -2118,7 +2118,7 @@ T NDArray::t(const Nd4jLong i, const Nd4jLong j) const { syncToHost(); Nd4jLong coords[2] = {i, j}; - auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto offset = shape::getOffset(getShapeInfo(), coords); tickReadHost(); return *(reinterpret_cast(bufferWithOffset(offset))); } @@ -2135,7 +2135,7 @@ T NDArray::t(const Nd4jLong i, const Nd4jLong j) const { syncToHost(); Nd4jLong coords[3] = {i, j, k}; - auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto offset = shape::getOffset(getShapeInfo(), coords); tickReadHost(); return *(reinterpret_cast(bufferWithOffset(offset))); } diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 82427f9b9..0f0621a80 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -808,7 +808,7 @@ void NDArray::templatedSet(void *buffer, const Nd4jLong *indices, const void *va auto t = reinterpret_cast(buffer); const auto y = *(reinterpret_cast(value)); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), indices, rankOf()); + auto xOffset = 
shape::getOffset(getShapeInfo(), indices); t[xOffset] = static_cast(y); } BUILD_DOUBLE_TEMPLATE(template void NDArray::templatedSet, (void *buffer, const Nd4jLong *indices, const void *value), LIBND4J_TYPES, LIBND4J_TYPES); @@ -2462,14 +2462,13 @@ double NDArray::getTrace() const { int rank = rankOf(); auto shape = shapeOf(); - auto strides = stridesOf(); int minDim = 100000000; Nd4jLong indices[MAX_RANK]; for(int j = 0; j < rank; ++j) indices[j] = 1; - auto offset = shape::getOffset(0, shape, strides, indices, rank); + auto offset = shape::getOffset(getShapeInfo(), indices); for(int i = 0; i < rank; ++i) if(minDim > shape[i]) @@ -3472,7 +3471,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j) const { throw std::invalid_argument("NDArray::e(i,j): one of input indexes is out of array length or rank!=2 !"); const Nd4jLong coords[2] = {i, j}; - const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + const auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({}, {this}); NDArray::registerPrimaryUse({}, {this}); @@ -3492,7 +3491,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k) const { throw std::invalid_argument("NDArray::e(i,j,k): one of input indexes is out of array length or rank!=3 !"); const Nd4jLong coords[3] = {i, j, k}; - const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + const auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({}, {this}); NDArray::registerPrimaryUse({}, {this}); @@ -3512,7 +3511,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4jLon throw std::invalid_argument("NDArray::e(i,j,k,l): one of input indexes is out of array length or rank!=4 !"); const Nd4jLong coords[4] = {i, j, k, l}; - const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + const auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({}, {this}); NDArray::registerPrimaryUse({}, {this}); @@ -4095,7 +4094,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const T value) { void *p = reinterpret_cast(const_cast(&value)); Nd4jLong coords[2] = {i, j}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({this}, {}, true); BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES); @@ -4127,7 +4126,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const T va void *p = reinterpret_cast(const_cast(&value)); Nd4jLong coords[3] = {i, j, k}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES); NDArray::registerPrimaryUse({this}, {}); } @@ -4154,7 +4153,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4j void *p = reinterpret_cast(const_cast(&value)); Nd4jLong coords[4] = {i, j, k, l}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({this}, {}, true); BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES); @@ -4409,7 +4408,7 @@ Nd4jLong NDArray::getOffset(const Nd4jLong i) const { if (i >= 
lengthOf()) throw std::invalid_argument("NDArray::getOffset: input index is out of array length !"); - return shape::getIndexOffset(i, _shapeInfo, lengthOf()); + return shape::getIndexOffset(i, _shapeInfo); } NDArray NDArray::like() { @@ -4455,7 +4454,7 @@ NDArray* NDArray::diagonal(const char type) const { indices[i] = 1; } - auto step = shape::getOffset(0, shapeOf(), stridesOf(), indices, rank); + auto step = shape::getOffset(getShapeInfo(), indices); if(type == 'c') { outShapeInfo[1] = diagSize; diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp index 24ef100d3..03c7c53e1 100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/blas/cpu/NDArray.cpp @@ -103,8 +103,8 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords)) for (Nd4jLong i = 0; i < zLen; ++i) { - shape::index2coords(zRank, target->shapeOf(), i, zLen, coords.data()); - const auto zOffset = shape::getOffset(0, target->shapeOf(), target->stridesOf(), coords.data(), zRank); + shape::index2coords(i, target->getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(target->getShapeInfo(), coords.data()); // if( (row + upper < col) || (row + lower > col) ) if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) @@ -112,7 +112,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char else if(this != target) { // when this and target are different arrays if(xRank != zRank) coords[0] = coords[1]; - const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(0, shapeOf(), stridesOf(), coords.data(), xRank); + const auto xOffset = areSameOffsets ? 
zOffset : shape::getOffset(getShapeInfo(), coords.data()); z[zOffset] = x[xOffset]; } } @@ -128,13 +128,12 @@ void NDArray::setIdentity() { int rank = rankOf(); auto shape = shapeOf(); - auto strides = stridesOf(); int minDim = MAX_INT; Nd4jLong indices[MAX_RANK]; for(int j = 0; j < rank; ++j) indices[j] = 1; - Nd4jLong offset = shape::getOffset(0, shape, strides, indices, rank); + Nd4jLong offset = shape::getOffset(getShapeInfo(), indices); for(int i = 0; i < rank; ++i) if(minDim > shape[i]) @@ -380,9 +379,9 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector 1) { for (uint j = 0; j < repSize; ++j) { @@ -396,7 +395,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector(hX[offset + oldOffset], hX[offset + newOffset]); } } diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/blas/cuda/NDArray.cu index f6a05c44b..1d95fd3c2 100644 --- a/libnd4j/blas/cuda/NDArray.cu +++ b/libnd4j/blas/cuda/NDArray.cu @@ -106,8 +106,8 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(zRank, shape::shapeOf(const_cast(zShapeInfo)), i, zLen, coords); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast(zShapeInfo)), shape::stride(const_cast(zShapeInfo)), coords, zRank); + shape::index2coords(i, zShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); // if( (row + upper < col) || (row + lower > col) ) if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) @@ -115,7 +115,7 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha else if(vx != vz) { // when x and z are different arrays if(xRank != zRank) coords[0] = coords[1]; - const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coords, xRank); + const auto xOffset = areSameOffsets ? 
zOffset : shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } } @@ -177,8 +177,8 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo, for (Nd4jLong i = tid; i < len; i += totalThreads) { - shape::index2coords(rank, shape::shapeOf(const_cast(xShapeInfo)), i, len, coords); - const auto offset = shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coords, rank); + shape::index2coords(i, xShapeInfo, coords); + const auto offset = shape::getOffset(xShapeInfo, coords); if(coords[rank - 2] == coords[rank - 1]) // row == col -> on diagonal x[offset] = val; @@ -424,9 +424,9 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo, for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); if(repSize > 1) { for (uint j = 0; j < repSize; ++j) { @@ -440,7 +440,7 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo, else coords[axis] /= repeats[0]; - z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] = x[shape::getOffset(xShapeInfo, coords)]; } } diff --git a/libnd4j/blas/cuda/NDArrayLambda.hpp b/libnd4j/blas/cuda/NDArrayLambda.hpp index bf9848981..c27476bfb 100644 --- a/libnd4j/blas/cuda/NDArrayLambda.hpp +++ b/libnd4j/blas/cuda/NDArrayLambda.hpp @@ -23,8 +23,8 @@ #include #include -static Nd4jLong __device__ __noinline__ __getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) { - return shape::getIndexOffset(index, shapeInfo, length); +static Nd4jLong __device__ __noinline__ __getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) { + return shape::getIndexOffset(index, shapeInfo); } static Nd4jLong __device__ __noinline__ __length(Nd4jLong *shapeInfo) { @@ -103,8 +103,8 @@ static _CUDA_G void lambdaKernel(void* vx, Nd4jLong *xShapeInfo, void *vz, Nd4jL z[e * zEws] = lambda(x[e * xEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(x[xOffset]); } @@ -132,8 +132,8 @@ static _CUDA_G void lambdaIndexedKernel(void* vx, Nd4jLong *xShapeInfo, void *vz z[e * zEws] = lambda(e, x[e * xEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(e, x[xOffset]); } @@ -164,9 +164,9 @@ static _CUDA_G void lambdaIndexedPairwiseKernel(void* vx, Nd4jLong *xShapeInfo, z[e * zEws] = lambda(e, x[e * xEws], y[e * yEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto yOffset = __getIndexOffset(e, yShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto yOffset = __getIndexOffset(e, yShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(e, x[xOffset], 
y[yOffset]); } @@ -197,9 +197,9 @@ static _CUDA_G void lambdaPairwiseKernel(void* vx, Nd4jLong *xShapeInfo, void* v z[e * zEws] = lambda(x[e * xEws], y[e * yEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto yOffset = __getIndexOffset(e, yShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto yOffset = __getIndexOffset(e, yShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(x[xOffset], y[yOffset]); } @@ -233,10 +233,10 @@ static _CUDA_G void lambdaTriplewiseKernel(void* vw, Nd4jLong *wShapeInfo, void* z[e * zEws] = lambda(w[e * wEws], x[e * xEws], y[e * yEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto wOffset = __getIndexOffset(e, wShapeInfo, zLength); - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto yOffset = __getIndexOffset(e, yShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto wOffset = __getIndexOffset(e, wShapeInfo); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto yOffset = __getIndexOffset(e, yShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(w[wOffset], x[xOffset], y[yOffset]); } diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index ec88de2e5..6afabfca6 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -3228,8 +3228,8 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArr for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); - const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + const auto yOffset = shape::getIndexOffset(i, yShapeInfo); switch (opCode) { case 0: diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h index d04d3315d..392ed3edf 100644 --- a/libnd4j/include/helpers/Loops.h +++ b/libnd4j/include/helpers/Loops.h @@ -246,9 +246,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, len, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = op(x[xOffset], y[yOffset], extraParams); } } @@ -452,7 +452,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint j = 0; j < tadLen; j++) start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams); - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); z[zOffset] = OpType::postProcess(start, tadLen, extraParams); } } @@ -469,7 +469,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = 
OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad); + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); } @@ -491,11 +491,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, // auto start = OpType::startingValue(tad); // for (uint j = 0; j < tadLen; j++) { - // auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad); + // auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); // start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); // } - // auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + // auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); // z[zOffset] = OpType::postProcess(start, tadLen, extraParams); // } // } @@ -517,7 +517,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint j = 0; j < tadLen; j++) start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); z[zOffset] = OpType::postProcess(start, tadLen, extraParams); } @@ -658,13 +658,13 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, PRAGMA_OMP_SIMD for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, len, canCastX); + const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); zi[i * zEws] = OpType::op(x[xOffset], extraParams); } } else { PRAGMA_OMP_SIMD for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, len, canCastX); + const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); zi[i] = OpType::op(x[xOffset], extraParams); } } @@ -782,8 +782,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, PRAGMA_OMP_SIMD for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], extraParams); } } @@ -1123,7 +1123,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { - const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad); + const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } @@ -1147,8 +1147,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad); - const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, 
tadLen, canCastYTad); + const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } @@ -1423,7 +1423,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = startVal; for (uint j = 0; j < tadLen; ++j) { - const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad); + const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); @@ -1449,8 +1449,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = startVal; for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad); - const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, tadLen, canCastYTad); + const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index 1d991e36a..ba0f956a5 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -15,7 +15,7 @@ ******************************************************************************/ // -// @author iuriish@yahoo.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #ifndef LIBND4J_SHAPEUTILS_H diff --git a/libnd4j/include/helpers/TAD.h b/libnd4j/include/helpers/TAD.h index c49f1047d..9888bb1fd 100644 --- a/libnd4j/include/helpers/TAD.h +++ b/libnd4j/include/helpers/TAD.h @@ -526,7 +526,7 @@ namespace shape { /* int *sub = new int[leftOverIndexLen]; shape::ind2subOrder(tadShape,index,len,sub); */ - shape::index2coords(leftOverIndexLen,tadShape, index,len, sub); + shape::index2coords(index, leftOverIndexLen,tadShape, sub); for(int i = 0; i < leftOverIndexLen; i++) { @@ -609,7 +609,7 @@ namespace shape { if(dimensionLength > 1) { Nd4jLong *tad2Sub = this->tad2Sub(index, ptrManager); - Nd4jLong ret = shape::getOffset(0,shape::shapeOf(shapeInfo),shape::stride(shapeInfo),tad2Sub,shape::rank(shapeInfo)); + Nd4jLong ret = shape::getOffset(shapeInfo, tad2Sub); if(ret < 0) { if (ptrManager == nullptr) @@ -625,7 +625,7 @@ namespace shape { else { Nd4jLong *tad2Sub = this->tad2Sub(index, ptrManager); - Nd4jLong ret = shape::getOffset(0,shape::shapeOf(shapeInfo),shape::stride(shapeInfo),tad2Sub,shape::rank(shapeInfo)); + Nd4jLong ret = shape::getOffset(shapeInfo, tad2Sub); if (ptrManager == nullptr) delete[] tad2Sub; @@ -703,7 +703,7 @@ namespace shape { /* int *sub = new int[leftOverIndexLen]; shape::ind2subOrder(tadShape,index,len,sub); */ - shape::index2coords(leftOverIndexLen,tadShape,index,len, sub); + shape::index2coords(index, leftOverIndexLen,tadShape, sub); for(int i = 0; i < leftOverIndexLen; i++) { ret[leftOverIndexes[i]] = sub[i]; @@ -732,7 +732,7 @@ namespace shape { // return shape::createScalarShapeInfo(); //ensure tad shapes get setup right for vectors - if(dimensionLength > 1 && shape::isVector(shapeInfo)) + 
if(dimensionLength > 1 && shape::isVector(shapeInfo)) return shape::copyOf(shape::shapeInfoLength(shape::rank(shapeInfo)),shapeInfo); // case when tad coincides with whole array diff --git a/libnd4j/include/helpers/benchmark/ParametersBatch.h b/libnd4j/include/helpers/benchmark/ParametersBatch.h index 5a45099c3..4a7119937 100644 --- a/libnd4j/include/helpers/benchmark/ParametersBatch.h +++ b/libnd4j/include/helpers/benchmark/ParametersBatch.h @@ -64,7 +64,7 @@ namespace nd4j { for (int i = 0; i < totalIterations; i++) { - shape::index2coords(xRank, xShape, i, totalIterations, xCoords); + shape::index2coords(i, xRank, xShape, xCoords); Parameters params; for (int j = 0; j < xRank; j++) { diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp index 0a096b65f..22ff3e6b1 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp @@ -226,7 +226,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, indexValue = OpType::update(indexValue, comp, extraParams); } - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); z[zOffset] = (Z) indexValue.index; } } @@ -243,7 +243,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, auto indexValue = OpType::startingIndexValue(tad); for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad); + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); functions::indexreduce::IndexValue comp(tad[tadOffset], j); indexValue = OpType::update(indexValue, comp, extraParams); } @@ -266,12 +266,12 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, auto indexValue = OpType::startingIndexValue(tad); for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad); + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); functions::indexreduce::IndexValue comp(tad[tadOffset], j); indexValue = OpType::update(indexValue, comp, extraParams); } - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); z[zOffset] = (Z) indexValue.index; } } diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 6d93351c3..91ee09123 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -15,7 +15,7 @@ ******************************************************************************/ // -// @author Yurii Shyrma +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -931,7 +931,7 @@ void ShapeUtils::evalIdxRangesForSubArr(const Nd4jLong subArrIdx, const Nd4jLon for(int i = 0; i < subArrRank; ++i) shapeOfSubArr[i] = shapeInfo[dimsToExclude[i] + 1]; - shape::index2coords(subArrRank, shapeOfSubArr.data(), subArrIdx, indexes.data()); + shape::index2coords(subArrIdx, subArrRank, shapeOfSubArr.data(), indexes.data()); memset(idxRanges, 0, 2 * rank * sizeof(Nd4jLong)); diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index 705f06b99..cae5f0fa9 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -887,7 +887,7 
@@ namespace shape { * @param indices the indices to iterate over * @return the double at the specified index */ - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(Nd4jLong baseOffset, const Nd4jLong *shape, const Nd4jLong *stride, const Nd4jLong *indices, const int rank); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0); ND4J_EXPORT Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector& indices); @@ -897,20 +897,19 @@ namespace shape { /** * Convert a linear index to the corresponding coordinates - * for example if shape is {2, 4}, then index 5 corresponds to following coordinates - * -> [1, 1] in case of c order - * -> [1, 2] in case of f order + * for example if shape is {2, 4}, then index 5 corresponds to coordinates [1, 1] */ - ND4J_EXPORT _CUDA_HD void index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong arrLen, Nd4jLong *coords, const char order = 'c'); - ND4J_EXPORT _CUDA_HD void index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong *coords, const char order = 'c'); + ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords); + ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords); + + /** * Convert coordinates to the corresponding linear index (sequence number in other words) - * for example if shape is {2, 4}, then: - * in case of c order and coordinates [1, 1] index 5 is returned - * in case of f order and coordinates [1, 2] index 5 is returned + * for example if shape is {2, 4} and coordinates [1, 1] then index 5 is returned */ - ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords, const char order = 'c'); + ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords); + ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords); /** * increment n-dimensional array by one iteration by changing coord appropriately @@ -921,24 +920,10 @@ namespace shape { */ /* calculates an array buffer offset for given "index" using following formula: offset = coord_0*stride_0 + coord_1*stride_1 + ... + coord_{rank-1}*stride_{rank-1} - * arrLen - array length */ - ND4J_EXPORT _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen); - ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen); - ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOrderOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen, const char order); - ND4J_EXPORT _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, Nd4jLong arrLen, const bool useUnsigned); - - /** - * Compute the real linear indices for the given shape and stride - */ - ND4J_EXPORT _CUDA_HD Nd4jLong *computeIndices(int rank, Nd4jLong *shape, Nd4jLong *stride); - - /** - * Compute the real linear indices for the - * given shape buffer. 
Shape,stride and rank are derived - * from the buffer - */ - ND4J_EXPORT _CUDA_HD Nd4jLong *computeIndices( Nd4jLong *shapeBuffer); + ND4J_EXPORT _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo); + ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo); + ND4J_EXPORT _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, const bool useUnsigned); ND4J_EXPORT _CUDA_HD void printShapeInfo(Nd4jLong *shapeInfo); @@ -1749,57 +1734,34 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { return output; } -/** - * Compute the real linear indices for the given shape and stride - */ - INLINEDEF _CUDA_HD Nd4jLong *computeIndices(int rank, Nd4jLong *shape, Nd4jLong *stride) { - Nd4jLong length = shape::prodLong(shape,rank); +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *indices) { - traceNew(13); + Nd4jLong index, shift = 1;; - Nd4jLong *ret = new Nd4jLong[length]; - for(int i = 0; i < length; i++) { - Nd4jLong *idx = new Nd4jLong[rank]; - shape::index2coords(rank, shape, i, idx, 'f'); - ret[i] = shape::getOffset(0, shape, stride, idx, rank); - delete[] idx; - } - - return ret; - } - -/** -* Compute the real linear indices for the given shape and stride -*/ - INLINEDEF _CUDA_HD Nd4jLong *computeIndices(Nd4jLong *shapeBuffer) { - return computeIndices(shape::rank(shapeBuffer),shape::shapeOf(shapeBuffer),shape::stride(shapeBuffer)); + index = indices[shapeInfo[0] - 1]; + for(uint i = shapeInfo[0]; i > 1; --i) { + shift *= shapeInfo[i]; + index += shift * indices[i - 2]; } + return index; +} ////////////////////////////////////////////////////////////////////// - INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices, const char order) { +INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices) { - Nd4jLong index, shift = 1;; + Nd4jLong index, shift = 1;; - if(order == 'c') { - - index = indices[rank - 1]; - for(int i = rank - 2; i >= 0; --i) { - shift *= shape[i + 1]; - index += shift * indices[i]; - } - } - else { - index = indices[0]; - for(int i = 1; i < rank; ++i) { - shift *= shape[i - 1]; - index += shift * indices[i]; - } - } - - return index; + index = indices[rank - 1]; + for(uint i = rank - 1; i >= 1; --i) { + shift *= shape[i]; + index += shift * indices[i - 1]; } + return index; +} + template INLINEDEF _CUDA_HD void fill(T* buffer, T value, Nd4jLong length) { @@ -1809,85 +1771,110 @@ template } -////////////////////////////////////////////////////////////////////// - INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen) { +// ////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen) { - const Nd4jLong ews = shapeInfo[shapeInfo[0] + shapeInfo[0] + 2]; +// const Nd4jLong ews = shapeInfo[shapeInfo[0] + shapeInfo[0] + 2]; - if(ews > 0 && order(shapeInfo) == 'c') - if (ews == 1) - return index; - else - return ews * index; +// if(ews > 0 && order(shapeInfo) == 'c') +// if (ews == 1) +// return index; +// else +// return ews * index; - Nd4jLong offset = 0; - Nd4jLong rank = shapeInfo[0]; - for(int i = 1; i <= shapeInfo[0]; ++i) { - arrLen /= shapeInfo[i]; - if(arrLen > 0 && shapeInfo[i] > 1) { - offset += (index / 
arrLen) * shapeInfo[i + rank]; - index %= arrLen; - } - } - return offset; - } +// Nd4jLong offset = 0; +// Nd4jLong rank = shapeInfo[0]; +// for(int i = 1; i <= shapeInfo[0]; ++i) { +// arrLen /= shapeInfo[i]; +// if(arrLen > 0 && shapeInfo[i] > 1) { +// offset += (index / arrLen) * shapeInfo[i + rank]; +// index %= arrLen; +// } +// } +// return offset; +// } - INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen) { +// INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen) { - const uint rank = shapeInfo[0]; - const uint ews = shapeInfo[rank + rank + 2]; +// const uint rank = shapeInfo[0]; +// const uint ews = shapeInfo[rank + rank + 2]; - if(ews > 0 && shapeInfo[rank + rank + 3] == 99) - if (ews == 1) - return index; - else - return ews * index; +// if(ews > 0 && shapeInfo[rank + rank + 3] == 99) +// if (ews == 1) +// return index; +// else +// return ews * index; - uint offset = 0; +// uint offset = 0; - for(uint i = 1; i <= rank; ++i) { - arrLen /= shapeInfo[i]; - if(arrLen > 0 && shapeInfo[i] > 1) { - offset += (index / arrLen) * shapeInfo[i + rank]; - index %= arrLen; - } - } - return offset; - } - - INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, Nd4jLong arrLen, const bool useUnsigned) { - - if(useUnsigned) - return getIndexOffset(static_cast(index), uShapeInfo, static_cast(arrLen)); - - return getIndexOffset(index, lShapeInfo, arrLen); - } +// for(uint i = 1; i <= rank; ++i) { +// arrLen /= shapeInfo[i]; +// if(arrLen > 0 && shapeInfo[i] > 1) { +// offset += (index / arrLen) * shapeInfo[i + rank]; +// index %= arrLen; +// } +// } +// return offset; +// } ////////////////////////////////////////////////////////////////////// - INLINEDEF _CUDA_HD Nd4jLong getIndexOrderOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen, const char order) { +INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo) { - Nd4jLong offset = 0; - if(order == 'c') { - for(int i = 1; i <= *shapeInfo; ++i) { - arrLen /= shapeInfo[i]; - if(arrLen > 0 && shapeInfo[i] > 1) { - offset += (index / arrLen) * shapeInfo[i + *shapeInfo]; - index %= arrLen; - } - } - } - else { - for(int i = *shapeInfo; i >= 1 ; --i) { - arrLen /= shapeInfo[i]; - if(arrLen > 0 && shapeInfo[i] > 1) { - offset += (index / arrLen) * shapeInfo[i + *shapeInfo]; - index %= arrLen; - } - } - } - return offset; + if (shapeInfo[2 * shapeInfo[0] + 3] == 99) { + + const Nd4jLong ews = shapeInfo[2 * shapeInfo[0] + 2]; + if (ews == 1) + return index; + else if(ews > 1) + return ews * index; } + Nd4jLong offset = 0; + + for(uint i = shapeInfo[0]; i > 1; --i) { + offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]]; + index /= shapeInfo[i]; + } + + offset += index * shapeInfo[1 + shapeInfo[0]]; // last iteration + + return offset; +} + +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo) { + + if (shapeInfo[2 * shapeInfo[0] + 3] == 99) { + + const Nd4jLong ews = shapeInfo[2 * shapeInfo[0] + 2]; + if (ews == 1) + return index; + else if(ews > 1) + return ews * index; + } + + uint offset = 0; + + for(uint i = shapeInfo[0]; i > 1; --i) { + offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]]; + index /= shapeInfo[i]; + } + + offset += index * shapeInfo[1 + shapeInfo[0]]; // last iteration + + return offset; +} + + 
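For reference, a minimal standalone sketch of the index-to-offset traversal that the two new getIndexOffset overloads above implement. It deliberately omits the ews/order fast path and uses a hypothetical rank-2 shapeInfo prefix (rank, then shape, then strides), so it illustrates the arithmetic rather than reproducing the library function:

#include <cassert>
typedef long long Nd4jLong;

// Peel coordinates off the linear index from the last dimension to the
// first, multiplying each coordinate by its stride (same loop as above;
// the fast path for ews > 0 / 'c'-order arrays is omitted here).
static Nd4jLong indexToOffsetSketch(Nd4jLong index, const Nd4jLong *shapeInfo) {
    Nd4jLong offset = 0;
    for (Nd4jLong i = shapeInfo[0]; i > 1; --i) {
        offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]];
        index /= shapeInfo[i];
    }
    return offset + index * shapeInfo[1 + shapeInfo[0]]; // first dimension
}

int main() {
    const Nd4jLong shapeInfo[] = {2, 2, 3, 3, 1};   // rank 2, shape {2,3}, strides {3,1}
    assert(indexToOffsetSketch(4, shapeInfo) == 4); // index 4 -> coords [1,1] -> 1*3 + 1*1
    return 0;
}

The key behavioral point of the refactoring is visible here: the loop consumes the index itself via % and /, so the old arrLen parameter (and the division chain built on it) is no longer needed.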
+////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, const bool useUnsigned) { + + if(useUnsigned) + return getIndexOffset(static_cast(index), uShapeInfo); + + return getIndexOffset(index, lShapeInfo); +} + /** * * @param length @@ -2394,7 +2381,7 @@ template auto indices = new Nd4jLong[rank]; memset((void *) indices,0,rank * sizeof(Nd4jLong)); indices[0] = sliceIdx; - Nd4jLong offset = shape::getOffset(0,newShape,newStride,indices,rank); + Nd4jLong offset = shape::getOffset(newShapeBuffer, indices); newShapeBuffer[shape::shapeInfoLength(newRank) - 3] = offset; // set current order and ews @@ -3201,30 +3188,30 @@ INLINEDEF _CUDA_HD bool haveSameShapeAndStrides(const Nd4jLong *shapeInfo1, cons * @param indices the indices to iterate over * @return the double at the specified index */ - INLINEDEF _CUDA_HD Nd4jLong getOffset(Nd4jLong baseOffset, const Nd4jLong *shape, const Nd4jLong *stride, const Nd4jLong *indices, const int rank) { - Nd4jLong offset = baseOffset; - for(int i = 0; i < rank; i++) { - if(shape[i] != 1) - offset += indices[i] * stride[i]; - } - return offset; - } +////////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset) { - INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset) { - return shape::getOffset(baseOffset, shape::shapeOf(const_cast(shapeInfo)), shape::stride(const_cast(shapeInfo)), indices, shapeInfo[0]); - } + Nd4jLong offset = baseOffset; - INLINEDEF Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector& indices) { + for(uint i = 1; i <= shapeInfo[0]; ++i) + if(shapeInfo[i] != 1) + offset += indices[i - 1] * shapeInfo[shapeInfo[0] + i]; - Nd4jLong offset = 0; + return offset; +} - for(uint i = 0; i < shapeInfo[0]; ++i) - if(shapeInfo[i + 1] != 1) - offset += indices[i] * shapeInfo[shapeInfo[0] + i + 1]; +////////////////////////////////////////////////////////////////////////// +INLINEDEF Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector& indices) { - return offset; - } + Nd4jLong offset = 0; + + for(uint i = 1; i <= shapeInfo[0]; ++i) + if(shapeInfo[i] != 1) + offset += indices[i - 1] * shapeInfo[shapeInfo[0] + i]; + + return offset; +} @@ -4209,24 +4196,24 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con INLINEDEF _CUDA_HD Nd4jLong subArrayIndex(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) { Nd4jLong maxIdxs[MAX_RANK]; - shape::index2coords(shape::rank(maxShapeInfo), const_cast(maxShapeInfo)+1, const_cast(maxIdx), maxIdxs, shape::order(maxShapeInfo)); + shape::index2coords(const_cast(maxIdx), maxShapeInfo, maxIdxs); Nd4jLong minIdxs[MAX_RANK]; maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen); - return coords2index(shape::rank(minShapeInfo), minShapeInfo + 1, minIdxs); + return shape::coords2index(minShapeInfo, minIdxs); } ////////////////////////////////////////////////////////////////////// INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) { Nd4jLong maxIdxs[MAX_RANK]; - shape::index2coords(shape::rank(maxShapeInfo), const_cast(maxShapeInfo)+1, 
const_cast(maxIdx), maxIdxs, shape::order(maxShapeInfo)); + shape::index2coords(const_cast(maxIdx), maxShapeInfo, maxIdxs); Nd4jLong minIdxs[MAX_RANK]; maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen); - return getOffset(0, minShapeInfo + 1, minShapeInfo + shape::rank(minShapeInfo) + 1, minIdxs, shape::rank(minShapeInfo)); + return getOffset(minShapeInfo, minIdxs); } ////////////////////////////////////////////////////////////////////// @@ -4246,7 +4233,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con int N, minI, maxI; // calculate min per-dim-indices which corresponds to absolute minIdx index - shape::index2coords(rankMin, minShapeInfo + 1, minIdx, indices, order(minShapeInfo)); + shape::index2coords(minIdx, minShapeInfo, indices); // transform storage indices to contain per-dim max indices, purpose - memory saving // fill increment array as well @@ -4277,7 +4264,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con maxI = rankMax-1; N = 0; int step; - maxOffsets[N++] = shape::getOffset(0, maxShapeInfo + 1, maxShapeInfo + rankMax + 1, indices, rankMax); + maxOffsets[N++] = shape::getOffset(maxShapeInfo, indices); // nested loops - producing of absolute indices for max array while(maxI >= 0) { @@ -4290,7 +4277,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con step = -1; } else { - maxOffsets[N++] = shape::getOffset(0, maxShapeInfo + 1, maxShapeInfo + rankMax + 1, indices, rankMax); + maxOffsets[N++] = shape::getOffset(maxShapeInfo, indices); step = rankMax - 1 - maxI; } } @@ -4322,7 +4309,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con int N, minI, maxI; // calculate min per-dim-indices which corresponds to absolute minIdx index - shape::index2coords(rankMin, minShapeInfo + 1, minIdx, indices, order(minShapeInfo)); + shape::index2coords(minIdx, minShapeInfo, indices); // transform storage indices to contain per-dim max indices, purpose - memory saving // fill increment array as well @@ -4353,7 +4340,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con maxI = rankMax-1; N = 0; int step; - maxIdxs[N++] = coords2index(rankMax, maxShapeInfo + 1, indices); + maxIdxs[N++] = shape::coords2index(maxShapeInfo, indices); // nested loops - producing of absolute indices for max array while(maxI >= 0) { @@ -4366,7 +4353,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con step = -1; } else { - maxIdxs[N++] = coords2index(rankMax, maxShapeInfo + 1, indices); + maxIdxs[N++] = shape::coords2index(maxShapeInfo, indices); step = rankMax - 1 - maxI; } } @@ -4693,37 +4680,23 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong *coords, const char order) { - Nd4jLong arrLen = shape::prodLong(shape, rank); - shape::index2coords(rank, shape, index, arrLen, coords, order); +INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords) { + + for(uint i = shapeInfo[0]; i > 1; --i) { + coords[i - 1] = index % shapeInfo[i]; + index /= shapeInfo[i]; + } + coords[0] = index; // last iteration } -INLINEDEF void _CUDA_HD index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong arrLen, Nd4jLong *coords, const char order) { 
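To make the coordinate round-trip concrete, here is a small self-contained sketch mirroring the two C-order helpers of this refactoring: coords2index (the rank/shape overload added earlier in this file) and the rank/shape overload of index2coords added just below. The function names idx2coords/coords2idx and the literal shape {2, 4} are hypothetical, local to this example; the loops copy the patch's arithmetic:

#include <cassert>
typedef long long Nd4jLong;

// index % shape[i] peels off the i-th coordinate; C order only now.
static void idx2coords(Nd4jLong index, int rank, const Nd4jLong *shape, Nd4jLong *coords) {
    for (int i = rank - 1; i > 0; --i) {
        coords[i] = index % shape[i];
        index /= shape[i];
    }
    coords[0] = index;
}

// Inverse: accumulate the coordinates back into a linear index.
static Nd4jLong coords2idx(int rank, const Nd4jLong *shape, const Nd4jLong *coords) {
    Nd4jLong index = coords[rank - 1], shift = 1;
    for (int i = rank - 1; i >= 1; --i) {
        shift *= shape[i];
        index += shift * coords[i - 1];
    }
    return index;
}

int main() {
    const Nd4jLong shape[] = {2, 4};
    Nd4jLong coords[2];
    idx2coords(5, 2, shape, coords);           // 5 -> [1, 1], as the new doc comment says
    assert(coords[0] == 1 && coords[1] == 1);
    assert(coords2idx(2, shape, coords) == 5); // and back again
    return 0;
}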
+////////////////////////////////////////////////////////////////////// +INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords) { - if(order == 'c') { - - for(int i = 0; i < rank; i++) { - arrLen /= shape[i]; - if(arrLen > 0 && shape[i] > 1) { - coords[i] = index / arrLen; - index %= arrLen; - } - else - coords[i] = 0; - } - } - else { - - for(int i = rank - 1; i >= 0; i--) { - arrLen /= shape[i]; - if(arrLen > 0 && shape[i] > 1) { - coords[i] = index / arrLen; - index %= arrLen; - } - else - coords[i] = 0; - } + for(uint i = rank - 1; i > 0; --i) { + coords[i] = index % shape[i]; + index /= shape[i]; } + coords[0] = index; // last iteration } ////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index dce9ca54b..3bd619827 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -170,13 +170,13 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } } @@ -190,14 +190,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } @@ -211,14 +211,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } @@ -232,14 +232,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } @@ -255,15 +255,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < 
tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } } @@ -362,7 +362,7 @@ namespace functions { PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } } @@ -382,8 +382,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } } @@ -403,8 +403,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, lenX, canCastX); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } } @@ -424,8 +424,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -447,9 +447,9 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } } diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.cpp b/libnd4j/include/loops/cpu/broadcasting_bool.cpp index 54950951c..bca423e3e 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp @@ -126,7 +126,7 
@@ namespace functions { if (zTadShapeInfo == nullptr) { zTadShapeInfo = xTadShapeShapeInfo; zTadOffset = tadOffsets; - } + } auto lenZ = shape::length(zTadShapeInfo); auto lenY = shape::length(yShapeInfo); @@ -140,7 +140,7 @@ namespace functions { auto zEws = shape::elementWiseStride(zTadShapeInfo); const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); - + if (kindOfLoop == nd4j::LoopKind::EWS1) { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { @@ -170,15 +170,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - + // TODO: cover this codebranch with tests // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } } @@ -192,14 +192,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } @@ -213,14 +213,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } @@ -234,14 +234,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } @@ -257,15 +257,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, 
tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } } @@ -365,7 +365,7 @@ namespace functions { // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } } @@ -385,8 +385,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } } @@ -406,8 +406,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } } @@ -427,8 +427,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -450,9 +450,9 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } } diff --git a/libnd4j/include/loops/cpu/broadcasting_int.cpp b/libnd4j/include/loops/cpu/broadcasting_int.cpp index c092da50b..375d7577a 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp @@ -126,7 +126,7 @@ namespace functions { if (zTadShapeInfo == nullptr) { zTadShapeInfo = xTadShapeShapeInfo; zTadOffset = tadOffsets; - } + } auto lenZ = shape::length(zTadShapeInfo); auto lenY = shape::length(yShapeInfo); @@ -140,7 +140,7 @@ namespace functions { auto zEws = shape::elementWiseStride(zTadShapeInfo); const nd4j::LoopKind::Kind 
kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); - + if (kindOfLoop == nd4j::LoopKind::EWS1) { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { @@ -170,15 +170,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - + // TODO: cover this codebranch with tests // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } } @@ -192,14 +192,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } @@ -213,14 +213,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } @@ -234,14 +234,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } @@ -257,15 +257,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); 
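// [editorial sketch] Every hunk in these broadcasting files makes the same mechanical change:
// shape::indexOffset() drops its length argument. The offset computation only ever needed the
// shape and strides; the total length is the product of the shape, so carrying it as a separate
// parameter was redundant. A minimal stand-alone illustration of what an indexOffset-style
// helper computes (hypothetical names, plain row-major layout assumed; this is NOT libnd4j's
// actual implementation):
#include <cstdint>
inline int64_t indexToOffset(int64_t index, const int64_t* shape, const int64_t* strides, int rank) {
    int64_t offset = 0;
    for (int d = rank - 1; d >= 0; --d) {   // unflatten from innermost to outermost dimension
        offset += (index % shape[d]) * strides[d];
        index  /= shape[d];
    }
    return offset;
}
// e.g. shape {2,3} with strides {3,1}: index 4 -> coords (1,1) -> offset 1*3 + 1*1 = 4.
// The uint-cast variants above (tadShapeShapeInfoCast etc.) keep the same contract but run the
// coordinate math on 32-bit values whenever castShapeInfo() reports the shape fits.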
} } @@ -365,7 +365,7 @@ namespace functions { // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } } @@ -385,8 +385,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } } @@ -406,8 +406,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } } @@ -427,8 +427,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -450,9 +450,9 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } } diff --git a/libnd4j/include/loops/cpu/indexreduce.cpp b/libnd4j/include/loops/cpu/indexreduce.cpp index 5a7beee24..23286ecd9 100644 --- a/libnd4j/include/loops/cpu/indexreduce.cpp +++ b/libnd4j/include/loops/cpu/indexreduce.cpp @@ -92,7 +92,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex auto ulen = info.getItersPerThread(threadNum); for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, len, canCastX); + auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, canCastX); IndexValue curr(x[offset], threadOffset + i); local = OpType::update(local, curr, extraParams); } diff --git a/libnd4j/include/loops/cpu/pairwise.hpp b/libnd4j/include/loops/cpu/pairwise.hpp index 6b0c8cb49..9dfa129aa 100644 --- a/libnd4j/include/loops/cpu/pairwise.hpp +++ b/libnd4j/include/loops/cpu/pairwise.hpp @@ -137,7 +137,7 @@ namespace 
functions { void *vz, Nd4jLong* zShapeInfo, void *vextraParams) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); @@ -152,13 +152,13 @@ namespace functions { if (shape::isScalar(yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; + uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - + if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); @@ -166,25 +166,25 @@ namespace functions { PRAGMA_OMP_SIMD for(unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[0], extraParams); } } } else { - uint zShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); } } @@ -192,18 +192,18 @@ namespace functions { return; } - + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n); - } + } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); - } - else { + } + else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -211,14 +211,14 @@ namespace functions { bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[offset], extraParams); } } @@ -231,15 +231,15 @@ namespace functions { bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for 
(unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[offset], y[offset], extraParams); } } @@ -252,15 +252,15 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[offset], y[yOffset], extraParams); } } @@ -273,15 +273,15 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[xOffset], y[offset], extraParams); } } @@ -296,16 +296,16 @@ namespace functions { bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } diff --git a/libnd4j/include/loops/cpu/pairwise_bool.cpp b/libnd4j/include/loops/cpu/pairwise_bool.cpp index 30d093bce..8feabb98a 100644 --- a/libnd4j/include/loops/cpu/pairwise_bool.cpp +++ b/libnd4j/include/loops/cpu/pairwise_bool.cpp @@ -61,7 +61,7 @@ namespace functions { Nd4jLong zEws, void *vextraParams, const Nd4jLong n) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); 
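// [editorial sketch] The pairwise kernels in this file all follow the same OmpLaunchHelper
// pattern: each OpenMP thread asks for its starting offset and its iteration count, then runs a
// SIMD loop over its own slice. A simplified stand-alone version of that partitioning
// (hypothetical names; the real OmpLaunchHelper also bounds the thread count by problem size):
#include <algorithm>
#include <cstdint>
struct LaunchSlice {
    int64_t offset;   // first element this thread owns
    int64_t iters;    // how many elements it processes
};
inline LaunchSlice sliceFor(int threadNum, int numThreads, int64_t n) {
    const int64_t chunk = n / numThreads;          // even share per thread
    const int64_t rem   = n % numThreads;          // first `rem` threads take one extra element
    const int64_t offset = threadNum * chunk + std::min<int64_t>(threadNum, rem);
    const int64_t iters  = chunk + (threadNum < rem ? 1 : 0);
    return {offset, iters};
}
// A thread then runs: for (i = 0; i < iters; i++) z[offset + i] = op(x[offset + i], y[offset + i]);
// which is exactly the threadOffset/ulen loop shape repeated throughout the hunks below.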
auto z = reinterpret_cast(vz); @@ -72,9 +72,9 @@ namespace functions { if (xEws == 1 && yEws == 1 && zEws == 1) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); + Nd4jLong threadOffset = info.getThreadOffset(threadNum); auto xi = x + threadOffset; auto yi = y + threadOffset; auto zi = z + threadOffset; @@ -88,9 +88,9 @@ namespace functions { else { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); + Nd4jLong threadOffset = info.getThreadOffset(threadNum); auto xi = x + xEws*threadOffset; auto yi = y + yEws*threadOffset; auto zi = z + zEws*threadOffset; @@ -151,33 +151,33 @@ namespace functions { if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[0], extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); } } @@ -190,11 +190,11 @@ namespace functions { if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n); - } + } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); } - else { + else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -202,83 +202,83 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[offset], extraParams); } } } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = 
nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[offset], y[offset], extraParams); } } } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[offset], y[yOffset], extraParams); } } } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[xOffset], y[offset], extraParams); } } } else { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -287,16 +287,16 @@ namespace functions { const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); - 
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } diff --git a/libnd4j/include/loops/cpu/pairwise_int.cpp b/libnd4j/include/loops/cpu/pairwise_int.cpp index b356adcc2..63b9dc8c8 100644 --- a/libnd4j/include/loops/cpu/pairwise_int.cpp +++ b/libnd4j/include/loops/cpu/pairwise_int.cpp @@ -61,7 +61,7 @@ namespace functions { Nd4jLong zEws, void *vextraParams, const Nd4jLong n) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); @@ -72,9 +72,9 @@ namespace functions { if (xEws == 1 && yEws == 1 && zEws == 1) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); + Nd4jLong threadOffset = info.getThreadOffset(threadNum); auto xi = x + threadOffset; auto yi = y + threadOffset; auto zi = z + threadOffset; @@ -88,9 +88,9 @@ namespace functions { else { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); + Nd4jLong threadOffset = info.getThreadOffset(threadNum); auto xi = x + xEws*threadOffset; auto yi = y + yEws*threadOffset; auto zi = z + zEws*threadOffset; @@ -151,33 +151,33 @@ namespace functions { if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[0], extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); } } @@ -190,11 +190,11 @@ namespace functions { if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n); - } + } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); } - else { + else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && 
shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -202,83 +202,83 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[offset], extraParams); } } } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[offset], y[offset], extraParams); } } } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[offset], y[yOffset], extraParams); } } } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, 
canCastX); + auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[xOffset], y[offset], extraParams); } } } else { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -287,16 +287,16 @@ namespace functions { const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } diff --git a/libnd4j/include/loops/cpu/random.cpp b/libnd4j/include/loops/cpu/random.cpp index 30bab1327..5abc1447a 100644 --- a/libnd4j/include/loops/cpu/random.cpp +++ b/libnd4j/include/loops/cpu/random.cpp @@ -50,27 +50,27 @@ namespace functions { return; } - auto length = shape::length(zShapeInfo); + auto length = shape::length(zShapeInfo); // nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); - + if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } } @@ -79,19 +79,19 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, 
zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } } @@ -100,19 +100,19 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); } } @@ -121,19 +121,19 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY); + for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) { + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); } } @@ -143,21 +143,21 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, 
canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], y[yOffset], i, length, rng, extraArguments); } } @@ -185,18 +185,18 @@ namespace functions { nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); - + if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } } @@ -207,15 +207,15 @@ namespace functions { const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); } } @@ -231,7 +231,7 @@ namespace functions { auto extraArguments = reinterpret_cast(vextraArguments); auto length = shape::length(zShapeInfo); - + //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); @@ -240,14 +240,14 @@ namespace functions { const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[offset] = OpClass::op(i+threadOffset, length, rng, extraArguments); } } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp index a7145846e..246d18ac4 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp @@ -77,7 +77,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, 
xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) @@ -112,7 +112,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < omp_get_max_threads(); e++) start = OpType::update(start, intermediate[e], extraParams); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp index 8d04b7cdb..a94a19b25 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp @@ -81,7 +81,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) @@ -115,7 +115,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < omp_get_max_threads(); e++) start = OpType::update(start, intermediate[e], extraParams); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp index 9069f4198..1a148805e 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp @@ -77,7 +77,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) @@ -113,7 +113,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < omp_get_max_threads(); e++) start = OpType::update(start, intermediate[e], 
extraParams); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp index 676348017..0dfff5e73 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp @@ -79,7 +79,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) @@ -117,7 +117,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) start = OpType::update(start, intermediate[e], extraParams); diff --git a/libnd4j/include/loops/cpu/reduce3.cpp b/libnd4j/include/loops/cpu/reduce3.cpp index eeea227c8..fd09dc0e1 100644 --- a/libnd4j/include/loops/cpu/reduce3.cpp +++ b/libnd4j/include/loops/cpu/reduce3.cpp @@ -95,7 +95,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) for(unsigned int i = 0; i < length; i++) { const auto threadNum = omp_get_thread_num(); - auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX); + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); } } else { @@ -105,8 +105,8 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) for(unsigned int i = 0; i < length; i++) { const auto threadNum = omp_get_thread_num(); - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX); - auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, length, canCastY); + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); } } diff --git a/libnd4j/include/loops/cpu/scalar.hpp b/libnd4j/include/loops/cpu/scalar.hpp index 8f9fd0990..79e53e4a2 100644 --- a/libnd4j/include/loops/cpu/scalar.hpp +++ b/libnd4j/include/loops/cpu/scalar.hpp @@ -33,14 +33,14 @@ namespace scalar { //////////////////////////////////////////////////////////////////////// template template -void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, +void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, + void *vextraParams, + void *vz, 
Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalars = reinterpret_cast(vscalars); @@ -159,37 +159,37 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], scalar, extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); } } - } - } + } + } } //////////////////////////////////////////////////////////////////////// @@ -200,7 +200,7 @@ void ScalarTransform::transform(void *vx, Nd4jLong xEws, void *vscalar, void *vextraParams, const Nd4jLong len, bool allowParallelism) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; diff --git a/libnd4j/include/loops/cpu/scalar_bool.cpp b/libnd4j/include/loops/cpu/scalar_bool.cpp index 1f400119b..b37bdd6ef 100644 --- a/libnd4j/include/loops/cpu/scalar_bool.cpp +++ b/libnd4j/include/loops/cpu/scalar_bool.cpp @@ -33,14 +33,14 @@ namespace functions { template template - void ScalarBoolTransform::transform(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, + void ScalarBoolTransform::transform(void *vx, Nd4jLong *xShapeInfo, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalars = reinterpret_cast(vscalars); @@ -63,7 +63,7 @@ namespace functions { printf("ScalarBoolTransform::transform: super-bad loop visited. 
Shouldn't ever happen\n"); return; } - + int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { @@ -76,7 +76,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); } - } + } else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) for (unsigned int r = 0; r < numTads; r++) { @@ -87,7 +87,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); } - } + } } template @@ -139,7 +139,7 @@ namespace functions { Nd4jLong *zShapeInfo, void *vscalar, void *vextraParams) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; @@ -162,41 +162,41 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); nd4j::OmpLaunchHelper info(len); - + if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], scalar, extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); } } - } + } } @@ -213,7 +213,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; - auto extraParams = reinterpret_cast(vextraParams); + auto extraParams = reinterpret_cast(vextraParams); nd4j::OmpLaunchHelper info(len); @@ -231,7 +231,7 @@ namespace functions { for (unsigned int i = 0; i < ulen; i++) zi[i] = OpType::op(xi[i], scalar, extraParams); } - } + } else { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) diff --git a/libnd4j/include/loops/cpu/scalar_int.cpp b/libnd4j/include/loops/cpu/scalar_int.cpp index 9920cc836..9e73e2756 100644 --- a/libnd4j/include/loops/cpu/scalar_int.cpp +++ b/libnd4j/include/loops/cpu/scalar_int.cpp @@ -34,13 +34,13 @@ namespace functions { template template void ScalarIntTransform::transform(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, 
Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalars = reinterpret_cast(vscalars); @@ -63,7 +63,7 @@ namespace functions { printf("ScalarIntTransform::transform: super-bad loop visited. Shouldn't ever happen\n"); return; } - + int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { @@ -76,7 +76,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); } - } + } else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) for (unsigned int r = 0; r < numTads; r++) { @@ -87,7 +87,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); } - } + } } template @@ -139,7 +139,7 @@ namespace functions { Nd4jLong *zShapeInfo, void *vscalar, void *vextraParams) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; @@ -162,41 +162,41 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); nd4j::OmpLaunchHelper info(len); - + if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], scalar, extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); } } - } + } } @@ -213,7 +213,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; - auto extraParams = reinterpret_cast(vextraParams); + auto extraParams = reinterpret_cast(vextraParams); nd4j::OmpLaunchHelper info(len); @@ -231,7 +231,7 @@ namespace functions { for (unsigned int i = 0; i < ulen; i++) zi[i] = OpType::op(xi[i], scalar, extraParams); } - } + } else { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index ed398db28..1f5a7c339 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -92,7 
+92,7 @@ namespace functions { for (Nd4jLong i = 0; i < length; i++) { - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCast); + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); SummaryStatsData curr; curr.initWithValue(x[xOffset]); @@ -175,7 +175,7 @@ namespace functions { } else { for (int i = 1; i < tadLength; i ++) { - auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCast); + auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); SummaryStatsData indexVal2; indexVal2.initWithValue(tx[xOffset]); diff --git a/libnd4j/include/loops/cuda/broadcasting.chpp b/libnd4j/include/loops/cuda/broadcasting.chpp index d930d8cad..086e216e6 100644 --- a/libnd4j/include/loops/cuda/broadcasting.chpp +++ b/libnd4j/include/loops/cuda/broadcasting.chpp @@ -42,7 +42,7 @@ static __global__ void broadcastSimple( Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::broadcast::Broadcast::template transformCuda(x,xShapeInfo,y,yShapeInfo,z,zShapeInfo,dimension,dimensionLength,tadOnlyShapeInfo,tadOffsets,tadOnlyShapeInfoZ,tadOffsetsZ); } @@ -64,8 +64,8 @@ static __global__ void broadcastInverseSimple( namespace functions { namespace broadcast { - static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) { - return shape::getIndexOffset(index, shapeInfo, length); + static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) { + return shape::getIndexOffset(index, shapeInfo); } static Nd4jLong __device__ __noinline__ _length(Nd4jLong *shapeInfo) { @@ -154,9 +154,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - auto xOffset = _getIndexOffset(i, xShapeInfo, tadLength); - auto yOffset = _getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = _getIndexOffset(i, xShapeInfo); + auto yOffset = _getIndexOffset(i, tadOnlyShapeInfo); + auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]); } } @@ -170,14 +170,14 @@ namespace functions { void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength, + int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { if (tadOnlyShapeInfoZ == nullptr) { tadOnlyShapeInfoZ = tadOnlyShapeInfo; tadOffsetsZ = tadOffsets; } - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); @@ -212,16 +212,16 @@ namespace functions { if(tadEWS > 0 && zEWS > 0 && yEWS > 0 && xOrder == yOrder && xOrder == zOrder) { - for (int i = threadIdx.x; i < tadLength; i+= blockDim.x) - rZ[i * zEWS] = OpType::op(rX[i * tadEWS], y[i * yEWS]); + for (int i = threadIdx.x; i < tadLength; i+= blockDim.x) + rZ[i * zEWS] = OpType::op(rX[i * tadEWS], y[i * yEWS]); } else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - - auto xOffset = _getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto yOffset = _getIndexOffset(i, yShapeInfo, tadLength); - auto zOffset = 
_getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + + auto xOffset = _getIndexOffset(i, tadOnlyShapeInfo); + auto yOffset = _getIndexOffset(i, yShapeInfo); + auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]); } } diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu index 8981790f5..aaec44690 100644 --- a/libnd4j/include/loops/cuda/broadcasting_bool.cu +++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu @@ -42,7 +42,7 @@ static __global__ void broadcastBoolSimple( Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::broadcast::BroadcastBool::template transformCuda(x,xShapeInfo,y,yShapeInfo,z,zShapeInfo,dimension,dimensionLength,tadOnlyShapeInfo,tadOffsets,tadOnlyShapeInfoZ,tadOffsetsZ); } @@ -145,9 +145,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, tadLength); - auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo); + auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]); } @@ -183,13 +183,13 @@ namespace functions { __shared__ int numTads; __shared__ Nd4jLong yEWS; __shared__ Nd4jLong zEWS; - + if (threadIdx.x == 0) { tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); tadEWS = shape::elementWiseStride(tadOnlyShapeInfo); numTads = shape::length(xShapeInfo) / tadLength; yEWS = shape::elementWiseStride(yShapeInfo); - zEWS = shape::elementWiseStride(tadOnlyShapeInfoZ); + zEWS = shape::elementWiseStride(tadOnlyShapeInfoZ); } __syncthreads(); @@ -213,9 +213,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]); } diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu index 38193f35d..fc613a438 100644 --- a/libnd4j/include/loops/cuda/broadcasting_int.cu +++ b/libnd4j/include/loops/cuda/broadcasting_int.cu @@ -42,7 +42,7 @@ static __global__ void broadcastIntSimple( Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::broadcast::BroadcastInt::template transformCuda(x,xShapeInfo,y,yShapeInfo,z,zShapeInfo,dimension,dimensionLength,tadOnlyShapeInfo,tadOffsets,tadOnlyShapeInfoZ,tadOffsetsZ); } @@ -139,9 +139,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= 
blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, tadLength); - auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo); + auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]); } @@ -177,13 +177,13 @@ namespace functions { __shared__ int numTads; __shared__ Nd4jLong yEWS; __shared__ Nd4jLong zEWS; - + if (threadIdx.x == 0) { tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); tadEWS = shape::elementWiseStride(tadOnlyShapeInfo); numTads = shape::length(xShapeInfo) / tadLength; yEWS = shape::elementWiseStride(yShapeInfo); - zEWS = shape::elementWiseStride(tadOnlyShapeInfoZ); + zEWS = shape::elementWiseStride(tadOnlyShapeInfoZ); } __syncthreads(); @@ -207,9 +207,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]); } diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu index 5f0cf07ae..8a560e416 100644 --- a/libnd4j/include/loops/cuda/indexreduce.cu +++ b/libnd4j/include/loops/cuda/indexreduce.cu @@ -246,12 +246,12 @@ namespace functions { if (dimensionLength > 1 || tadEWS < 1) { for (int r = blockIdx.x; r < numTads; r += gridDim.x) { - + auto tadOffsetForBlock = tadOffsets[r]; sPartials[threadIdx.x] = OpType::startingIndexValue(dx); - for(int i = threadIdx.x;i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + for(int i = threadIdx.x;i < tadLength; i += blockDim.x) { + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); IndexValue comp {dx[xOffset], i}; sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], comp, extraParams); } @@ -297,9 +297,9 @@ namespace functions { reduction = OpType::update(reduction, indexVal, extraParams); } } else { - - for(Nd4jLong i = tid;i < n; i += blockDim.x * gridDim.x) { - auto offset = shape::getIndexOffset(i, xShapeInfo, n); + + for(Nd4jLong i = tid;i < n; i += blockDim.x * gridDim.x) { + auto offset = shape::getIndexOffset(i, xShapeInfo); IndexValue indexVal = {dx[offset], i}; reduction = OpType::update(reduction, indexVal, extraParams); } diff --git a/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h index 3c79f443b..5df583e61 100644 --- a/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h @@ -115,7 +115,7 @@ namespace functions { sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], 
OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h index cb87ea461..9e061003d 100644 --- a/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h @@ -73,7 +73,7 @@ namespace functions { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - z[shape::getIndexOffset(i, zShapeInfo, length)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, length)], scalar, params); + z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params); } } } diff --git a/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h index e3f653350..b10b23d09 100644 --- a/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h @@ -72,8 +72,8 @@ namespace functions { for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { - auto xOffset2 = shape::getIndexOffset(i, shapeInfo, length); - auto zOffset2 = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset2 = shape::getIndexOffset(i, shapeInfo); + auto zOffset2 = shape::getIndexOffset(i, zShapeInfo); result[zOffset2] = OpType::op(dy[xOffset2], params); } } diff --git a/libnd4j/include/loops/cuda/legacy/reduce.legacy b/libnd4j/include/loops/cuda/legacy/reduce.legacy index 7b365f9fe..1ae7985de 100644 --- a/libnd4j/include/loops/cuda/legacy/reduce.legacy +++ b/libnd4j/include/loops/cuda/legacy/reduce.legacy @@ -169,7 +169,7 @@ namespace functions { template <> _CUDA_H void ReduceFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, float *x, Nd4jLong *xShapeInfo, float *extraParams, float *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, float *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { - + DISPATCH_SIMPLE(reduceScalarSimple, float, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) failed"); @@ -177,7 +177,7 @@ namespace functions { template <> _CUDA_H void ReduceFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, float16 *x, Nd4jLong *xShapeInfo, float16 *extraParams, float16 *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, float16 *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { - + DISPATCH_SIMPLE(reduceScalarSimple, float16, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarHalf(...) 
failed"); @@ -185,7 +185,7 @@ namespace functions { template <> _CUDA_H void ReduceFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, double *x, Nd4jLong *xShapeInfo, double *extraParams, double *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, double *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { - + DISPATCH_SIMPLE(reduceScalarSimple, double, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarDouble(...) failed"); @@ -294,7 +294,7 @@ namespace functions { for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord); - auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank); + auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams); } @@ -358,7 +358,7 @@ namespace functions { for (int i = tid; i < n; i += blockDim.x * gridDim.x) { shape::ind2subC(rank, xShape, i, n, ind2sub); - auto offset = shape::getOffset(0, xShape, xStride, ind2sub, rank); + auto offset = shape::getOffset(xShapeInfo, ind2sub); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[offset], extraParams), extraParams); } } @@ -461,7 +461,7 @@ namespace functions { for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord); - auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank); + auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams); } @@ -526,7 +526,7 @@ namespace functions { for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord); - auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank); + auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams); } diff --git a/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy b/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy index 73cb9c6ff..7bc30271f 100644 --- a/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy +++ b/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy @@ -88,8 +88,8 @@ static inline __device__ void transformCuda(T scalar, T *dy, int *shapeInfo, T * for (Nd4jLong i = tid; i < length; i+= totalThreads) { shape::ind2sub(xRank, xShape, i, length, xIdx); - int xOffset2 = shape::getOffset(0, xShape, xStride, xIdx, xRank); - int resultOffset = shape::getOffset(0, zShape, zStride, xIdx, zRank); + int xOffset2 = shape::getOffset(shapeInfo, xIdx); + int resultOffset = shape::getOffset(0resultShapeInfo, xIdx); result[resultOffset] = OpType::op(dy[xOffset2],scalar, params); } } diff --git a/libnd4j/include/loops/cuda/legacy/transform.legacy b/libnd4j/include/loops/cuda/legacy/transform.legacy index ed321e79c..6a8344916 100644 --- a/libnd4j/include/loops/cuda/legacy/transform.legacy +++ b/libnd4j/include/loops/cuda/legacy/transform.legacy @@ -111,7 +111,7 @@ __device__ void transformSimpleGeneric( manager->init(sizeof(UnifiedSharedMemory), 0, sizeof(functions::transform::Transform), sizeof(shape::TAD), xRank); } __syncthreads(); - + 
functions::transform::Transform::template transformCuda( dy, xShapeInfo, @@ -161,7 +161,7 @@ namespace functions { template <> _CUDA_H void Transform::executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, float *x, Nd4jLong *xShape, int xRank, float *extraParams, float *z, Nd4jLong *zShape, int zRank, int *allocationPointer, float *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + DISPATCH_SIMPLE(transformShaped, float, PARAMS(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets), OPS_A(TRANSFORM_OPS)) @@ -170,16 +170,16 @@ namespace functions { template <> _CUDA_H void Transform::executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, float16 *x, Nd4jLong *xShape, int xRank, float16 *extraParams, float16 *z, Nd4jLong *zShape, int zRank, int *allocationPointer, float16 *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + DISPATCH_SIMPLE(transformShaped, float16, PARAMS(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets), OPS_A(TRANSFORM_OPS)) - + if (nd4j::Environment::getInstance()->isDebug()) checkCudaErrors(cudaStreamSynchronize(*stream)); } template <> _CUDA_H void Transform::executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, double *x, Nd4jLong *xShape, int xRank, double *extraParams, double *z, Nd4jLong *zShape, int zRank, int *allocationPointer, double *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + DISPATCH_SIMPLE(transformShaped, double, PARAMS(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets), OPS_A(TRANSFORM_OPS)) DEBUG_KERNEL(stream, opNum); @@ -226,13 +226,13 @@ namespace functions { } else { Nd4jLong xCoord[MAX_RANK]; - + for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { shape::ind2sub(xRank,shape::shapeOf(shapeInfo),i, length, xCoord); - - auto xOffset2 = shape::getOffset(0, xShape, xStride, xCoord, xRank); - auto resultOffset2 = shape::getOffset(0,xShape,shape::stride(resultShapeInfo),xCoord,xRank); - + + auto xOffset2 = shape::getOffset(shapeInfo, xCoord); + auto resultOffset2 = shape::getOffset(resultShapeInfo, xCoord); + result[resultOffset2] = OpType::op(dy[xOffset2], params); } } @@ -249,7 +249,7 @@ namespace functions { T *result, Nd4jLong resultStride, int *allocationPointer, T *reductionPointer, UnifiedSharedMemory *manager) { - + int totalThreads = gridDim.x * blockDim.x; Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/libnd4j/include/loops/cuda/pairwise.chpp b/libnd4j/include/loops/cuda/pairwise.chpp index 3f7134887..d3252d862 100644 --- a/libnd4j/include/loops/cuda/pairwise.chpp +++ b/libnd4j/include/loops/cuda/pairwise.chpp @@ -28,11 +28,11 @@ using namespace simdOps; //////////////////////////////////////////////////////////////////////////////// template -__global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, - void *vy, Nd4jLong *yShapeInfo, - void *vz, Nd4jLong *zShapeInfo, +__global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, + void *vy, Nd4jLong *yShapeInfo, + void *vz, Nd4jLong *zShapeInfo, void *vextraParams) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); @@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, } else if (vx == vz) { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - 
auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } else { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } diff --git a/libnd4j/include/loops/cuda/pairwise_bool.cu b/libnd4j/include/loops/cuda/pairwise_bool.cu index 62f040191..414aadd30 100644 --- a/libnd4j/include/loops/cuda/pairwise_bool.cu +++ b/libnd4j/include/loops/cuda/pairwise_bool.cu @@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, } else if (vx == vz) { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } else { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } @@ -105,7 +105,7 @@ void _CUDA_H PairWiseBoolTransform::intermediateShaped(dim3& launchDims, cu template void PairWiseBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vextraParams) { auto xType = nd4j::DataTypeUtils::fromT(); - auto yType = nd4j::DataTypeUtils::fromT(); + auto yType = nd4j::DataTypeUtils::fromT(); DISPATCH_BY_OPNUM_TT(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_BOOL_OPS); } @@ -166,7 +166,7 @@ void PairWiseBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_ } - + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT PairWiseBoolTransform, , LIBND4J_TYPES, BOOL_TYPES); } } diff --git a/libnd4j/include/loops/cuda/pairwise_int.cu b/libnd4j/include/loops/cuda/pairwise_int.cu index 5cc12846c..2bedb4a82 100644 --- a/libnd4j/include/loops/cuda/pairwise_int.cu +++ b/libnd4j/include/loops/cuda/pairwise_int.cu @@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, } else if (vx == vz) { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } else { for (Nd4jLong i = tid; i < len; i += 
gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } @@ -165,7 +165,7 @@ void PairWiseIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t * } - + BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT PairWiseIntTransform, , INTEGER_TYPES); } } diff --git a/libnd4j/include/loops/cuda/random.cu b/libnd4j/include/loops/cuda/random.cu index 727f0868f..3bf06ae91 100644 --- a/libnd4j/include/loops/cuda/random.cu +++ b/libnd4j/include/loops/cuda/random.cu @@ -116,7 +116,7 @@ namespace functions { auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); - + if (OpClass::requiresSpecial) { OpClass::specialOpCuda(state, x, xShapeBuffer, y, yShapeBuffer, z, zShapeBuffer, extraArguments); return; @@ -166,10 +166,10 @@ namespace functions { } } else { for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) { - - auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer, length); - auto yOffset2 = shape::getIndexOffset(i, yShapeBuffer, length); - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length); + + auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer); + auto yOffset2 = shape::getIndexOffset(i, yShapeBuffer); + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); z[zOffset2] = OpClass::op(x[xOffset2], y[yOffset2], i, length, buffer, extraArguments); } @@ -224,11 +224,11 @@ namespace functions { z[e * zEWS] = OpClass::op(x[e * xEWS], e, length, buffer, extraArguments); } } else { - + for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < length; i += blockDim.x * gridDim.x) { - - auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer, length); - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length); + + auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer); + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); z[zOffset2] = OpClass::op(x[xOffset2], i, length, buffer, extraArguments); } @@ -274,9 +274,9 @@ namespace functions { z[i * ews] = OpClass::op(i, length, buffer, extraArguments); } } else { - - for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length); + + for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) { + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); z[zOffset2] = OpClass::op(i, length, buffer, extraArguments); } } @@ -296,7 +296,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaSingle(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -320,7 +320,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaSingle(dim3& launchDims, cudaStream_t *stream, int opNum, Nd4jPointer stateHost, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -332,7 +332,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer 
stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -346,7 +346,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -372,7 +372,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -385,7 +385,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaTriple(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vy, Nd4jLong *yShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -400,7 +400,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaTriple(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vy, Nd4jLong *yShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_bool.cu b/libnd4j/include/loops/cuda/reduce/reduce_bool.cu index fea9bcb63..a785094f1 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_bool.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_bool.cu @@ -129,7 +129,7 @@ __device__ void ReduceBoolFunction::transformCudaXD( void *vx, Nd4jLong *xS for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams); } __syncthreads(); @@ -140,7 +140,7 @@ __device__ void ReduceBoolFunction::transformCudaXD( void *vx, Nd4jLong *xS __syncthreads(); if (threadIdx.x == 0) - z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); + z[isPlainOutput ? 
r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); } } @@ -180,7 +180,7 @@ __device__ void ReduceBoolFunction::execScalarCuda(void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_float.chpp b/libnd4j/include/loops/cuda/reduce/reduce_float.chpp index ffd075715..ef366caf7 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_float.chpp +++ b/libnd4j/include/loops/cuda/reduce/reduce_float.chpp @@ -129,7 +129,7 @@ __device__ void ReduceFloatFunction::transformCudaXD( void *vx, Nd4jLong *x sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock); for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams); } __syncthreads(); @@ -139,7 +139,7 @@ __device__ void ReduceFloatFunction::transformCudaXD( void *vx, Nd4jLong *x __syncthreads(); if (threadIdx.x == 0) - z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); + z[isPlainOutput ? 
r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); } } @@ -179,7 +179,7 @@ __device__ void ReduceFloatFunction::execScalarCuda(void *vx, Nd4jLong *xSh sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_long.cu b/libnd4j/include/loops/cuda/reduce/reduce_long.cu index b989298fe..79ab25280 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_long.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_long.cu @@ -150,7 +150,7 @@ __device__ void ReduceLongFunction::transformCudaXD( void *vx, Nd4jLong *xS sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock); for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams); } __syncthreads(); @@ -160,7 +160,7 @@ __device__ void ReduceLongFunction::transformCudaXD( void *vx, Nd4jLong *xS __syncthreads(); if (threadIdx.x == 0) - z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); + z[isPlainOutput ? 
r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); } } @@ -200,7 +200,7 @@ __device__ void ReduceLongFunction::execScalarCuda(void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_same.cu b/libnd4j/include/loops/cuda/reduce/reduce_same.cu index 1c367653c..bcf5bab7f 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_same.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_same.cu @@ -139,7 +139,7 @@ __device__ void ReduceSameFunction::transformCudaXD( void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock); for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams); } __syncthreads(); @@ -149,7 +149,7 @@ __device__ void ReduceSameFunction::transformCudaXD( void *vx, Nd4jLong *xSha __syncthreads(); if (threadIdx.x == 0) - z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); + z[isPlainOutput ? 
r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); } } @@ -197,7 +197,7 @@ __device__ void ReduceSameFunction::execScalarCuda(void *vx, Nd4jLong *xShape sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/reduce3.chpp b/libnd4j/include/loops/cuda/reduce3.chpp index 01b595da1..fa1ab2e17 100644 --- a/libnd4j/include/loops/cuda/reduce3.chpp +++ b/libnd4j/include/loops/cuda/reduce3.chpp @@ -161,8 +161,8 @@ __device__ void Reduce3::execScalarCuda( void *vx, Nd4jLong *xShapeInfo, sPartials[threadIdx.x] = OpType::startingValue(x); auto threadCount = gridDim.x * blockDim.x; for(Nd4jLong i = tid; i < length; i += threadCount) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::opAtomic(x[xOffset], y[yOffset], extraZ), extraZ); } } @@ -290,7 +290,7 @@ __device__ void Reduce3::transformAll( void *vx, Nd4jLong *xShapeInfo, X *x = dx + xOffsets[r]; if (threadIdx.x < xTadLength && threadIdx.x < maxBlock) { - auto x0 = shape::getIndexOffset(threadIdx.x, xTadShapeInfo, shape::length(xTadShapeInfo)); + auto x0 = shape::getIndexOffset(threadIdx.x, xTadShapeInfo); tempX[threadIdx.x] = x[x0]; } __syncthreads(); @@ -311,12 +311,12 @@ __device__ void Reduce3::transformAll( void *vx, Nd4jLong *xShapeInfo, // we reset tempX IF we have >1 tiles if (t >= 1 || (limit > 1 && g > 0)) if (threadIdx.x + (t * maxBlock) < xTadLength) { - auto x0 = shape::getIndexOffset(threadIdx.x + (t * maxBlock), xTadShapeInfo, xTadLength); + auto x0 = shape::getIndexOffset(threadIdx.x + (t * maxBlock), xTadShapeInfo); tempX[threadIdx.x] = x[x0]; } for (int f = threadIdx.x + (t * maxBlock); f < xTadLength && f < threadIdx.x + ((t + 1) * maxBlock); f += blockDim.x * gridDim.x) { - auto y0 = shape::getIndexOffset(f, yTadShapeInfo, yTadLength); + auto y0 = shape::getIndexOffset(f, yTadShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::opAtomic(tempX[threadIdx.x], y[y0], extraZ), extraZ); } @@ -433,8 +433,8 @@ __device__ void Reduce3::transform(void *vx, Nd4jLong *xShapeInfo, for (int j = threadIdx.x; j < tadLen; j += blockDim.x) { - Nd4jLong xOffset2 = xOffset + shape::getIndexOffset(j, tadOnlyShapeInfo, tadLen); - Nd4jLong yOffset2 = yOffset + shape::getIndexOffset(j, yTadOnlyShapeInfo, tadLen); + Nd4jLong xOffset2 = xOffset + shape::getIndexOffset(j, tadOnlyShapeInfo); + Nd4jLong yOffset2 = yOffset + shape::getIndexOffset(j, yTadOnlyShapeInfo); sPartials[threadIdx.x] = j < blockDim.x ? 
OpType::opAtomic(x[xOffset2], y[yOffset2], extraZ) : OpType::update(sPartials[threadIdx.x], OpType::opAtomic(x[xOffset2], y[yOffset2], extraZ), extraZ); } diff --git a/libnd4j/include/loops/cuda/scalar.chpp b/libnd4j/include/loops/cuda/scalar.chpp index 503e5c97a..7277138f9 100644 --- a/libnd4j/include/loops/cuda/scalar.chpp +++ b/libnd4j/include/loops/cuda/scalar.chpp @@ -33,7 +33,7 @@ using namespace simdOps; //////////////////////////////////////////////////////////////////////////////// template __global__ static void scalarSimpleShaped(void* vx, void *vscalar, Nd4jLong *xShapeInfo, void *vparams, void *vz, Nd4jLong *zShapeInfo, int *allocationBuffer) { - + auto scalar = reinterpret_cast(vscalar)[0]; auto x = reinterpret_cast(vx); auto params = reinterpret_cast(vparams); @@ -61,10 +61,10 @@ __global__ static void scalarSimpleShaped(void* vx, void *vscalar, Nd4jLong *xSh } } else { for (Nd4jLong i = tid; i < length; i += totalThreads) { - z[shape::getIndexOffset(i, zShapeInfo, length)] = OpType::op(x[shape::getIndexOffset(i, xShapeInfo, length)], scalar, params); + z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], scalar, params); } } - + } //////////////////////////////////////////////////////////////////////////////// @@ -76,7 +76,7 @@ __global__ static void scalarAlongDimension(void *vx, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); auto z = reinterpret_cast(vz); @@ -114,7 +114,7 @@ __global__ static void scalarAlongDimension(void *vx, Nd4jLong *xShapeInfo, auto s = scalars[r]; for (int f = threadIdx.x; f < tadLength; f += blockDim.x) - oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams); + oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams); } } } @@ -127,7 +127,7 @@ namespace scalar { template template void _CUDA_H ScalarTransform::intermediateShaped(dim3& launchDims, cudaStream_t *stream, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, void* vscalar, void *vextraParams, int *allocPointer){ - + auto xEws = shape::elementWiseStride(hxShapeInfo); auto xOrder = shape::order(hxShapeInfo); diff --git a/libnd4j/include/loops/cuda/scalar_bool.cu b/libnd4j/include/loops/cuda/scalar_bool.cu index c6563c9ef..37939b9b9 100644 --- a/libnd4j/include/loops/cuda/scalar_bool.cu +++ b/libnd4j/include/loops/cuda/scalar_bool.cu @@ -36,7 +36,7 @@ __global__ void scalarAlongDimension(void *x, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::scalar::ScalarBoolTransform::template transformCuda(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); } @@ -60,10 +60,10 @@ namespace scalar { //////////////////////////////////////////////////////////////////////// template template -__device__ void ScalarBoolTransform::transformCuda(void* vscalar, - void *vy, Nd4jLong *yShapeInfo, - void *vparams, - void *vz, Nd4jLong *zShapeInfo, +__device__ void ScalarBoolTransform::transformCuda(void* vscalar, + void *vy, Nd4jLong *yShapeInfo, + void *vparams, + void *vz, Nd4jLong 
*zShapeInfo, int *allocationBuffer) { auto scalar = reinterpret_cast(vscalar)[0]; auto y = reinterpret_cast(vy); @@ -73,8 +73,8 @@ __device__ void ScalarBoolTransform::transformCuda(void* vscalar, auto yRank = shape::rank(yShapeInfo); auto yEWS = shape::elementWiseStride(yShapeInfo); auto yShape = shape::shapeOf(yShapeInfo); - auto yStride = shape::stride(yShapeInfo); - + auto yStride = shape::stride(yShapeInfo); + auto zRank = shape::rank(zShapeInfo); auto zEWS = shape::elementWiseStride(zShapeInfo); auto zShape = shape::shapeOf(zShapeInfo); @@ -89,22 +89,22 @@ __device__ void ScalarBoolTransform::transformCuda(void* vscalar, __syncthreads(); if(yEWS >= 1 && zEWS >= 1 && shape::order(yShapeInfo) == shape::order(zShapeInfo)) { - transformCuda(len, vscalar, vy, yEWS, vparams, vz, zEWS, allocationBuffer); + transformCuda(len, vscalar, vy, yEWS, vparams, vz, zEWS, allocationBuffer); } else { - for (Nd4jLong i = tid; i < len; i+= totalThreads) - z[shape::getIndexOffset(i, zShapeInfo, len)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, len)], scalar, params); + for (Nd4jLong i = tid; i < len; i+= totalThreads) + z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params); } } //////////////////////////////////////////////////////////////////////// template template -__device__ void ScalarBoolTransform::transformCuda(Nd4jLong len, - void* vx, - void *vy, Nd4jLong yEWS, - void *vparams, - void *vz, Nd4jLong zEWS, +__device__ void ScalarBoolTransform::transformCuda(Nd4jLong len, + void* vx, + void *vy, Nd4jLong yEWS, + void *vparams, + void *vz, Nd4jLong zEWS, int *allocationBuffer) { auto x = reinterpret_cast(vx)[0]; @@ -130,18 +130,18 @@ __device__ void ScalarBoolTransform::transformCuda(Nd4jLong len, //////////////////////////////////////////////////////////////////////// template template -__device__ void ScalarBoolTransform::transformCuda(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, +__device__ void ScalarBoolTransform::transformCuda(void *vx, Nd4jLong *xShapeInfo, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { auto x = reinterpret_cast(vx); auto scalars = reinterpret_cast(vscalars); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - + if (tadShapeInfoZ == nullptr) { tadShapeInfoZ = tadShapeInfo; tadOffsetsZ = tadOffsets; @@ -174,7 +174,7 @@ __device__ void ScalarBoolTransform::transformCuda(void *vx, Nd4jLong *xS auto s = scalars[r]; for (int f = threadIdx.x; f < tadLength; f += blockDim.x) - oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams); + oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams); } } } @@ -184,12 +184,12 @@ __device__ void ScalarBoolTransform::transformCuda(void *vx, Nd4jLong *xS template template _CUDA_H void ScalarBoolTransform::intermediateAlongDimension(dim3& launchDims, cudaStream_t *stream, - void *x, Nd4jLong *xShapeInfo, - void *z, Nd4jLong *zShapeInfo, - void *scalars, - void *extraParams, - int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, + void *x, Nd4jLong *xShapeInfo, + void *z, 
Nd4jLong *zShapeInfo, + void *scalars, + void *extraParams, + int *dimension, int dimensionLength, + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { scalarAlongDimension<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); @@ -200,11 +200,11 @@ _CUDA_H void ScalarBoolTransform::intermediateAlongDimension(dim3& launchD template template void _CUDA_H ScalarBoolTransform::intermediateShaped(dim3& launchDims, cudaStream_t *stream, - void *vx, Nd4jLong *xShapeInfo, - void *vz, Nd4jLong *zShapeInfo, - void* vscalar, + void *vx, Nd4jLong *xShapeInfo, + void *vz, Nd4jLong *zShapeInfo, + void* vscalar, void *vextraParams, int *allocPointer){ - + scalarSimpleShaped<<>>(vx, vscalar, xShapeInfo, vextraParams, vz, zShapeInfo, allocPointer); nd4j::DebugHelper::checkErrorCode(stream, "scalarSimpleShaped(...) failed"); } @@ -212,10 +212,10 @@ void _CUDA_H ScalarBoolTransform::intermediateShaped(dim3& launchDims, cuda //////////////////////////////////////////////////////////////////////// template void ScalarBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, - int opNum, - void *vx, Nd4jLong *xShapeInfo, - void *vz, Nd4jLong *zShapeInfo, - void* vscalar, + int opNum, + void *vx, Nd4jLong *xShapeInfo, + void *vz, Nd4jLong *zShapeInfo, + void* vscalar, void *vextraParams) { if (nd4j::Environment::getInstance()->isDebugAndVerbose()) diff --git a/libnd4j/include/loops/cuda/scalar_int.cu b/libnd4j/include/loops/cuda/scalar_int.cu index 48f141525..44c73fcb4 100644 --- a/libnd4j/include/loops/cuda/scalar_int.cu +++ b/libnd4j/include/loops/cuda/scalar_int.cu @@ -36,7 +36,7 @@ __global__ void scalarAlongDimension(void *x, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::scalar::ScalarIntTransform::template transformCuda(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); } @@ -60,10 +60,10 @@ namespace scalar { //////////////////////////////////////////////////////////////////////// template template -__device__ void ScalarIntTransform::transformCuda(void* vscalar, - void *vy, Nd4jLong *yShapeInfo, - void *vparams, - void *vz, Nd4jLong *zShapeInfo, +__device__ void ScalarIntTransform::transformCuda(void* vscalar, + void *vy, Nd4jLong *yShapeInfo, + void *vparams, + void *vz, Nd4jLong *zShapeInfo, int *allocationBuffer) { auto scalar = reinterpret_cast(vscalar)[0]; auto y = reinterpret_cast(vy); @@ -73,8 +73,8 @@ __device__ void ScalarIntTransform::transformCuda(void* vscalar, auto yRank = shape::rank(yShapeInfo); auto yEWS = shape::elementWiseStride(yShapeInfo); auto yShape = shape::shapeOf(yShapeInfo); - auto yStride = shape::stride(yShapeInfo); - + auto yStride = shape::stride(yShapeInfo); + auto zRank = shape::rank(zShapeInfo); auto zEWS = shape::elementWiseStride(zShapeInfo); auto zShape = shape::shapeOf(zShapeInfo); @@ -89,11 +89,11 @@ __device__ void ScalarIntTransform::transformCuda(void* vscalar, __syncthreads(); if(yEWS >= 1 && zEWS >= 1 && shape::order(yShapeInfo) == shape::order(zShapeInfo)) { - transformCuda(len, vscalar, vy, yEWS, vparams, vz, zEWS, allocationBuffer); + transformCuda(len, vscalar, vy, yEWS, vparams, vz, zEWS, allocationBuffer); } else { - for (Nd4jLong i = tid; i < len; i+= totalThreads) - z[shape::getIndexOffset(i, zShapeInfo, 
len)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, len)], scalar, params); + for (Nd4jLong i = tid; i < len; i+= totalThreads) + z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params); } } @@ -101,10 +101,10 @@ __device__ void ScalarIntTransform::transformCuda(void* vscalar, template template __device__ void ScalarIntTransform::transformCuda(Nd4jLong len, - void* vx, - void *vy, Nd4jLong yEWS, - void *vparams, - void *vz, Nd4jLong zEWS, + void* vx, + void *vy, Nd4jLong yEWS, + void *vparams, + void *vz, Nd4jLong zEWS, int *allocationBuffer) { auto x = reinterpret_cast(vx)[0]; @@ -131,17 +131,17 @@ __device__ void ScalarIntTransform::transformCuda(Nd4jLong len, template template __device__ void ScalarIntTransform::transformCuda(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { auto x = reinterpret_cast(vx); auto scalars = reinterpret_cast(vscalars); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - + if (tadShapeInfoZ == nullptr) { tadShapeInfoZ = tadShapeInfo; tadOffsetsZ = tadOffsets; @@ -174,7 +174,7 @@ __device__ void ScalarIntTransform::transformCuda(void *vx, Nd4jLong *xShape auto s = scalars[r]; for (int f = threadIdx.x; f < tadLength; f += blockDim.x) - oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams); + oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams); } } } @@ -184,12 +184,12 @@ __device__ void ScalarIntTransform::transformCuda(void *vx, Nd4jLong *xShape template template _CUDA_H void ScalarIntTransform::intermediateAlongDimension(dim3& launchDims, cudaStream_t *stream, - void *x, Nd4jLong *xShapeInfo, - void *z, Nd4jLong *zShapeInfo, - void *scalars, - void *extraParams, - int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, + void *x, Nd4jLong *xShapeInfo, + void *z, Nd4jLong *zShapeInfo, + void *scalars, + void *extraParams, + int *dimension, int dimensionLength, + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { scalarAlongDimension<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); @@ -199,21 +199,21 @@ _CUDA_H void ScalarIntTransform::intermediateAlongDimension(dim3& launchDims, template template void _CUDA_H ScalarIntTransform::intermediateShaped(dim3& launchDims, cudaStream_t *stream, - void *vx, Nd4jLong *xShapeInfo, - void *vz, Nd4jLong *zShapeInfo, - void* vscalar, + void *vx, Nd4jLong *xShapeInfo, + void *vz, Nd4jLong *zShapeInfo, + void* vscalar, void *vextraParams, int *allocPointer){ - + scalarSimpleShaped<<>>(vx, vscalar, xShapeInfo, vextraParams, vz, zShapeInfo, allocPointer); } //////////////////////////////////////////////////////////////////////// template void ScalarIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, - int opNum, - void *vx, Nd4jLong *xShapeInfo, - void *vz, Nd4jLong *zShapeInfo, - void* vscalar, + int opNum, + void *vx, Nd4jLong *xShapeInfo, + void *vz, Nd4jLong 
*zShapeInfo, + void* vscalar, void *vextraParams) { if (nd4j::Environment::getInstance()->isDebugAndVerbose()) diff --git a/libnd4j/include/loops/cuda/specials/bitonicArbitraryStep.cu b/libnd4j/include/loops/cuda/specials/bitonicArbitraryStep.cu index 8ee950c25..13ad1d5b4 100644 --- a/libnd4j/include/loops/cuda/specials/bitonicArbitraryStep.cu +++ b/libnd4j/include/loops/cuda/specials/bitonicArbitraryStep.cu @@ -80,8 +80,8 @@ __global__ void bitonicArbitraryStepKernelKey(void *vx, Nd4jLong *xShapeInfo, vo int it = (reverse) ? i + j + half : i + window - j - 1; int ij = i+j; if (it < length && ij < length ) { - int posIT = shape::getIndexOffset(it, xShapeInfo, xLength); - int posIJ = shape::getIndexOffset(ij, xShapeInfo, xLength); + int posIT = shape::getIndexOffset(it, xShapeInfo); + int posIJ = shape::getIndexOffset(ij, xShapeInfo); X v0 = x[posIJ]; X v1 = x[posIT]; @@ -160,8 +160,8 @@ __global__ void execBitonicArbitraryStepKernel(void *vx, Nd4jLong *xShapeInfo, i int it = (reverse) ? i + j + half : i + window - j - 1; int ij = i+j; if (it < length && ij < length ) { - int posIT = shape::getIndexOffset(it, xShapeInfo, xLength); - int posIJ = shape::getIndexOffset(ij, xShapeInfo, xLength); + int posIT = shape::getIndexOffset(it, xShapeInfo); + int posIJ = shape::getIndexOffset(ij, xShapeInfo); shmem[threadIdx.x] = x[posIJ]; shmem[threadIdx.x + blockDim.x] = x[posIT]; diff --git a/libnd4j/include/loops/cuda/specials/bitonicSortStep.cu b/libnd4j/include/loops/cuda/specials/bitonicSortStep.cu index d9b2ec74c..6bd1e8a33 100644 --- a/libnd4j/include/loops/cuda/specials/bitonicSortStep.cu +++ b/libnd4j/include/loops/cuda/specials/bitonicSortStep.cu @@ -46,8 +46,8 @@ __global__ void bitonicSortStepKernelKey(void *vx, Nd4jLong *xShapeInfo, void *v /* The threads with the lowest ids sort the array. */ if ((ixj)>i) { - int posI = shape::getIndexOffset(i, xShapeInfo, xLength); - int posIXJ = shape::getIndexOffset(ixj, xShapeInfo, xLength); + int posI = shape::getIndexOffset(i, xShapeInfo); + int posIXJ = shape::getIndexOffset(ixj, xShapeInfo); if ((i&k)==0) { /* Sort ascending */ @@ -100,8 +100,8 @@ __global__ void bitonicSortStepKernel(void *vx, Nd4jLong *xShapeInfo, int j, int /* The threads with the lowest ids sort the array. 
*/ if ((ixj)>i) { - int posI = shape::getIndexOffset(i, xShapeInfo, xLength); - int posIXJ = shape::getIndexOffset(ixj, xShapeInfo, xLength); + int posI = shape::getIndexOffset(i, xShapeInfo); + int posIXJ = shape::getIndexOffset(ixj, xShapeInfo); if ((i&k)==0) { /* Sort ascending */ diff --git a/libnd4j/include/loops/cuda/specials/concatKernel.cu b/libnd4j/include/loops/cuda/specials/concatKernel.cu index 5d788c4d1..b6ba2f00e 100644 --- a/libnd4j/include/loops/cuda/specials/concatKernel.cu +++ b/libnd4j/include/loops/cuda/specials/concatKernel.cu @@ -139,19 +139,19 @@ namespace nd4j { Nd4jLong sub[MAX_RANK]; - shape::index2coords(shape::rank(zTadShape),shape::shapeOf(zTadShape), arrOffset, sub, shape::order(zTadShape)); - - Nd4jLong baseOffset = shape::getOffset(0,shape::shapeOf(zTadShape),shape::stride(zTadShape), sub, shape::rank(zTadShape)); + shape::index2coords(arrOffset, zTadShape, sub); + + Nd4jLong baseOffset = shape::getOffset(zTadShape, sub); resultTAD += baseOffset; auto yRank = shape::rank(currentTad); auto tadRank = shape::rank(zTadShape); - shape::index2coords(yRank, shape::shapeOf(currentTad), 0, sub); + shape::index2coords(0, currentTad, sub); - auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), sub, yRank); - resultOffset = shape::getOffset(0, shape::shapeOf(zTadShape), shape::stride(zTadShape), sub, tadRank); + auto yOffset = shape::getOffset(currentTad, sub); + resultOffset = shape::getOffset(zTadShape, sub); resultTAD[resultOffset] = dataTAD[yOffset]; } @@ -168,8 +168,8 @@ namespace nd4j { Nd4jLong sub[MAX_RANK]; - shape::index2coords(shape::rank(zTadShape),shape::shapeOf(zTadShape), arrOffset, sub); - Nd4jLong baseOffset = shape::getOffset(0,shape::shapeOf(zTadShape),shape::stride(zTadShape), sub, shape::rank(zTadShape)); + shape::index2coords(arrOffset, zTadShape, sub); + Nd4jLong baseOffset = shape::getOffset(zTadShape, sub); resultTAD += baseOffset; @@ -203,8 +203,8 @@ namespace nd4j { auto yRank = shape::rank(currentTad); for (int i = threadIdx.x; i < yLength; i+= blockDim.x) { - shape::index2coords(yRank, shape::shapeOf(currentTad), i, yIdx); - auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), yIdx, yRank); + shape::index2coords(i, currentTad, yIdx); + auto yOffset = shape::getOffset(currentTad, yIdx); resultTAD[baseIdx + i * tadEWS] = dataTAD[yOffset]; } @@ -220,11 +220,11 @@ namespace nd4j { auto tadRank = shape::rank(zTadShape); for (int i = threadIdx.x; i < yLength; i+= blockDim.x) { - shape::index2coords(yRank, shape::shapeOf(currentTad), i, yIdx); - shape::index2coords(tadRank, shape::shapeOf(zTadShape), i, zIdx); + shape::index2coords(i, currentTad, yIdx); + shape::index2coords(i, zTadShape, zIdx); - auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), yIdx, yRank); - auto resultOffset = shape::getOffset(0, shape::shapeOf(zTadShape), shape::stride(zTadShape), zIdx, tadRank); + auto yOffset = shape::getOffset(currentTad, yIdx); + auto resultOffset = shape::getOffset(zTadShape, zIdx); resultTAD[resultOffset] = dataTAD[yOffset]; } diff --git a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu index 70da24715..e39ff6bec 100644 --- a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu +++ b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu @@ -53,7 +53,7 @@ namespace nd4j { if (dimensionLength > 1 || tadEWS < 1) { for (Nd4jLong e = threadIdx.x; e < tadLength; 
e += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(e, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(e, tadOnlyShapeInfo); dZ[xOffset] = (e == highestElement ? (T) 1 : (T) 0); } } else { diff --git a/libnd4j/include/loops/cuda/specials/fillIsMax.cu b/libnd4j/include/loops/cuda/specials/fillIsMax.cu index 0851968ba..c9ed51d28 100644 --- a/libnd4j/include/loops/cuda/specials/fillIsMax.cu +++ b/libnd4j/include/loops/cuda/specials/fillIsMax.cu @@ -30,7 +30,7 @@ namespace nd4j { int tid = blockIdx.x * blockDim.x + threadIdx.x; for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) - dz[shape::getIndexOffset(i, xShapeInfo, length)] = (i == idx ? (T) 1 : (T) 0); + dz[shape::getIndexOffset(i, xShapeInfo)] = (i == idx ? (T) 1 : (T) 0); } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cuda/specials/flatten.cu b/libnd4j/include/loops/cuda/specials/flatten.cu index b820acae9..faec2ec90 100644 --- a/libnd4j/include/loops/cuda/specials/flatten.cu +++ b/libnd4j/include/loops/cuda/specials/flatten.cu @@ -20,6 +20,7 @@ // #include +#include namespace nd4j { @@ -34,34 +35,26 @@ __global__ void flattenKernel( auto z = reinterpret_cast(vz); auto y = reinterpret_cast(vy); - + __shared__ Nd4jLong lenY, yOrder, zEWS, yEWS; - if (threadIdx.x == 0) { - + if (threadIdx.x == 0) { + yEWS = shape::elementWiseStride(yShapeInfo); zEWS = shape::elementWiseStride(zShapeInfo); lenY = shape::length(yShapeInfo); } __syncthreads(); - Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (zEWS >= 1 && yEWS >= 1 && yOrder == order) { - - for (int i = tid; i < lenY; i += gridDim.x * blockDim.x) - z[i * zEWS + dOffset] = y[i * yEWS]; - } - else { - - for(auto i = tid; i < lenY; i += gridDim.x * blockDim.x) - z[i * zEWS + dOffset] = y[shape::getIndexOrderOffset(i, yShapeInfo, lenY, order)]; - } + Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; + + for(auto i = tid; i < lenY; i += gridDim.x * blockDim.x) + z[i * zEWS + dOffset] = y[ops::helpers::getIndexOffsetOrdered(i, yShapeInfo, order)]; } //////////////////////////////////////////////////////////////////////// template -__host__ void flattenKernelGeneric(dim3& launchDims, cudaStream_t *stream, +__host__ void flattenKernelGeneric(dim3& launchDims, cudaStream_t *stream, Nd4jPointer *extraPointers, int dOffset, char order, diff --git a/libnd4j/include/loops/cuda/specials/oesTad.cu b/libnd4j/include/loops/cuda/specials/oesTad.cu index 8cc6c0766..9f41ffbb9 100644 --- a/libnd4j/include/loops/cuda/specials/oesTad.cu +++ b/libnd4j/include/loops/cuda/specials/oesTad.cu @@ -54,8 +54,8 @@ __global__ void execOesTadKernelKey(void *vx, Nd4jLong *xShapeInfo, for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { auto top = 2 * tid + 1; if (top < xTadLength) { - auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength); - auto t1 = shape::getIndexOffset(top, tadShapeInfo, xTadLength); + auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo); + auto t1 = shape::getIndexOffset(top, tadShapeInfo); if (!descending == (dx[t0] > dx[t1])) { X dt0 = dx[t0]; @@ -72,8 +72,8 @@ __global__ void execOesTadKernelKey(void *vx, Nd4jLong *xShapeInfo, for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { auto top = 2 * tid + 2; if (top < xTadLength) { - auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength); - auto t1 = shape::getIndexOffset(top, tadShapeInfo, xTadLength); + auto t0 = shape::getIndexOffset(top - 1, 
tadShapeInfo); + auto t1 = shape::getIndexOffset(top, tadShapeInfo); if (!descending == (dx[t0] > dx[t1])) { X dt0 = dx[t0]; @@ -126,7 +126,7 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo, int iterations = xTadLength; if (cached) { for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { - auto t0 = shape::getIndexOffset(tid, tadShapeInfo, xTadLength); + auto t0 = shape::getIndexOffset(tid, tadShapeInfo); shmem[tid] = dx[t0]; } @@ -140,8 +140,8 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo, for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { auto top = 2 * tid + 1; if (top < xTadLength) { - auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength); - auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo, xTadLength); + auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo); + auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo); if (!descending == (dx[t0] > dx[t1])) { T dt0 = dx[t0]; @@ -154,8 +154,8 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo, for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { auto top = 2 * tid + 2; if (top < xTadLength) { - auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength); - auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo, xTadLength); + auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo); + auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo); if (!descending == (dx[t0] > dx[t1])) { T dt0 = dx[t0]; @@ -172,7 +172,7 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo, if (cached) { dx = x + tadOffsets[r]; for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { - auto t0 = shape::getIndexOffset(tid, tadShapeInfo, xTadLength); + auto t0 = shape::getIndexOffset(tid, tadShapeInfo); dx[t0] = shmem[tid]; } } diff --git a/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu b/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu index 39db62099..9730565e6 100644 --- a/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu +++ b/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu @@ -53,8 +53,8 @@ namespace nd4j { T *rZ = z + zTadOffsets[idx]; for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = shape::getIndexOffset(i, tadShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, zTadShapeInfo, tadLength); + auto xOffset = shape::getIndexOffset(i, tadShapeInfo); + auto zOffset = shape::getIndexOffset(i, zTadShapeInfo); rZ[zOffset] = rX[xOffset]; } } diff --git a/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu b/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu index adea72bc4..9bf79bedf 100644 --- a/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu +++ b/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu @@ -33,7 +33,7 @@ namespace nd4j { for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) { for (int j = threadIdx.x; j < cols; j += totalThreads) { Nd4jLong coords[2] = {i, j}; - Nd4jLong xOffset = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), coords, rank); + Nd4jLong xOffset = shape::getOffset(shape, coords); if (i + diagonal <= j) array[xOffset] = value; } @@ -48,7 +48,7 @@ namespace nd4j { for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) { for (int j = threadIdx.x; j < cols; j += totalThreads) { Nd4jLong coords[2] = {i, j}; - auto xOffset = shape::getOffset(0, shape::shapeOf(shape), 
shape::stride(shape), coords, rank); + auto xOffset = shape::getOffset(shape, coords); if (i + diagonal >= j) *(reinterpret_cast(buffer) + xOffset) = value; } diff --git a/libnd4j/include/loops/cuda/specials/shuffleKernel.cu b/libnd4j/include/loops/cuda/specials/shuffleKernel.cu index 6e7d4ad3b..c842cad4a 100644 --- a/libnd4j/include/loops/cuda/specials/shuffleKernel.cu +++ b/libnd4j/include/loops/cuda/specials/shuffleKernel.cu @@ -92,7 +92,7 @@ namespace nd4j { } else { for (Nd4jLong i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo[f], tadLength); + auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); auto yOffset = newOffset + xOffset; xOffset += oldOffset; diff --git a/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu b/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu index bf49d788e..fd36eec00 100644 --- a/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu +++ b/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu @@ -34,8 +34,8 @@ namespace nd4j { auto xEws = shape::order(theFirstShape) == 'c'? shape::elementWiseStride(theFirstShape) :1; auto yEws = shape::order(theSecondShape) == 'c'? shape::elementWiseStride(theSecondShape):1; //if (shape::order(theFirstShape) ==) - auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape, resultLength); - auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape, resultLength); + auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape); + auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape); T temp = *(reinterpret_cast(theFirstBuffer) + xOffset); *(reinterpret_cast(theFirstBuffer) + xOffset) = *(reinterpret_cast(theSecondBuffer) + yOffset); *(reinterpret_cast(theSecondBuffer) + yOffset) = temp; diff --git a/libnd4j/include/loops/cuda/specials/tearKernel.cu b/libnd4j/include/loops/cuda/specials/tearKernel.cu index 9f78f14da..e12aa485f 100644 --- a/libnd4j/include/loops/cuda/specials/tearKernel.cu +++ b/libnd4j/include/loops/cuda/specials/tearKernel.cu @@ -61,8 +61,8 @@ namespace nd4j { } else { for (Nd4jLong j = threadIdx.x; j < tadLength; j += blockDim.x) { - auto xOffset = shape::getIndexOffset(j, tadShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(j, zShapeInfo, tadLength); + auto xOffset = shape::getIndexOffset(j, tadShapeInfo); + auto zOffset = shape::getIndexOffset(j, zShapeInfo); z[zOffset] = s[xOffset]; } diff --git a/libnd4j/include/loops/cuda/specials/tileKernel.cu b/libnd4j/include/loops/cuda/specials/tileKernel.cu index 7d2e87e2d..0a62313f0 100644 --- a/libnd4j/include/loops/cuda/specials/tileKernel.cu +++ b/libnd4j/include/loops/cuda/specials/tileKernel.cu @@ -21,8 +21,8 @@ #include namespace nd4j { - static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) { - return shape::getIndexOffset(index, shapeInfo, length); + static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) { + return shape::getIndexOffset(index, shapeInfo); } static Nd4jLong __device__ __noinline__ _subArrayOffset(Nd4jLong index, Nd4jLong *shapeInfoA, Nd4jLong *shapeInfoB) { @@ -50,7 +50,7 @@ namespace nd4j { } } else { for (int i = tid; i < resultLength; i += totalThreads) { - auto xOffset = _getIndexOffset(i, outputShape, resultLength); + auto xOffset = _getIndexOffset(i, outputShape); auto yOffset = _subArrayOffset(i, outputShape, inputShape); *(reinterpret_cast(outputBuffer) + xOffset) = *(reinterpret_cast(inputBuffer) + yOffset); } @@ -89,7 +89,7 @@ 
namespace nd4j { for (int i = tid; i < resultLength; i += totalThreads) { - auto xOffset = _getIndexOffset(i, outputShape, resultLength); + auto xOffset = _getIndexOffset(i, outputShape); auto yOffset = _subArrayOffset(i, outputShape, inputShape); *(reinterpret_cast(outputBuffer) + xOffset) = static_cast(*(reinterpret_cast(inputBuffer) + yOffset)); } diff --git a/libnd4j/include/loops/cuda/summarystatsreduce.cu b/libnd4j/include/loops/cuda/summarystatsreduce.cu index deca80217..4867f5de1 100644 --- a/libnd4j/include/loops/cuda/summarystatsreduce.cu +++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu @@ -40,7 +40,7 @@ namespace functions { template void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *z, Nd4jLong *zShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot,bool biasCorrected,int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { - + functions::summarystats::SummaryStatsReduce::transform(op,dx,xShapeInfo,extraParams,z,zShapeInfo,dimension,dimensionLength,biasCorrected,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets); } @@ -103,12 +103,12 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa */ template template - _CUDA_D void SummaryStatsReduce::transform(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength, - int postProcessOrNot, - int *allocationBuffer, void *vreductionBuffer, + _CUDA_D void SummaryStatsReduce::transform(void *vx, Nd4jLong *xShapeInfo, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + int *dimension, int dimensionLength, + int postProcessOrNot, + int *allocationBuffer, void *vreductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { auto dx = static_cast(vx); @@ -204,7 +204,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa sPartials[threadIdx.x] = val; for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); SummaryStatsData indexVal2; indexVal2.initWithValue(dx[xOffset]); @@ -264,8 +264,8 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa else { for (Nd4jLong i = tid; i < n; i += blockDim.x * gridDim.x) { - - auto offset = shape::getIndexOffset(i, xShapeInfo, n); + + auto offset = shape::getIndexOffset(i, xShapeInfo); SummaryStatsData indexVal2; indexVal2.initWithValue(dx[offset]); reduction = update(reduction, indexVal2, extraParams); @@ -279,7 +279,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa if (gridDim.x > 1) { __shared__ bool amLast; - unsigned int *tc = (unsigned int *)reductionBuffer; + unsigned int *tc = (unsigned int *)reductionBuffer; tid = threadIdx.x; if (threadIdx.x == 0) { SummaryStatsData *pBuffer = (SummaryStatsData*) reductionBuffer; @@ -338,9 +338,9 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa template _CUDA_H void SummaryStatsReduce::execSummaryStatsReduceScalar(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *vextraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer) { - + auto x = static_cast(vx); - auto 
extraParams = static_cast(vextraParams); + auto extraParams = static_cast(vextraParams); auto z = reinterpret_cast(vz); auto reductionPointerA = reinterpret_cast(reductionBuffer); diff --git a/libnd4j/include/loops/cuda/transform/transform_any.cu b/libnd4j/include/loops/cuda/transform/transform_any.cu index 34f56380a..18b53cea7 100644 --- a/libnd4j/include/loops/cuda/transform/transform_any.cu +++ b/libnd4j/include/loops/cuda/transform/transform_any.cu @@ -36,7 +36,7 @@ __global__ void transformAnySimple(void *x, Nd4jLong *xShapeInfo, int xRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + functions::transform::TransformAny::template transformCuda(x,xShapeInfo,params,z,zShapeInfo,allocationPointer,reductionPointer,tadShapeInfo, tadOffsets); } @@ -57,7 +57,7 @@ namespace functions { __device__ void TransformAny::transformCuda(void *vx, Nd4jLong *xShapeInfo, void *vparams, void *vz, Nd4jLong *zShapeInfo, - int *allocationPointer, void *vreductionPointer, + int *allocationPointer, void *vreductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto x = reinterpret_cast(vx); @@ -70,9 +70,9 @@ namespace functions { __shared__ char xOrder; __shared__ char zOrder; __shared__ Nd4jLong length; - + if (threadIdx.x == 0) { - + xEws = shape::elementWiseStride(xShapeInfo); zEws = shape::elementWiseStride(zShapeInfo); xOrder = shape::order(xShapeInfo); @@ -84,26 +84,26 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - + if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + for (int i = tid; i < length; i += totalThreads) - z[i * zEws] = OpType::op(x[i * xEws], params); + z[i * zEws] = OpType::op(x[i * xEws], params); } - else { + else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); - } + } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], params); } } - } + } }; diff --git a/libnd4j/include/loops/cuda/transform/transform_bool.cu b/libnd4j/include/loops/cuda/transform/transform_bool.cu index 52e6b4a10..e88a4274b 100644 --- a/libnd4j/include/loops/cuda/transform/transform_bool.cu +++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu @@ -68,16 +68,16 @@ namespace functions { if(OpType::requiresSpecial) { OpType::execSpecialCuda(x,xShapeInfo,z,zShapeInfo,params, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); return; - } + } else { __shared__ Nd4jLong xEws; __shared__ Nd4jLong zEws; __shared__ char xOrder; __shared__ char zOrder; __shared__ Nd4jLong length; - + if (threadIdx.x == 0) { - + xEws = shape::elementWiseStride(xShapeInfo); zEws = shape::elementWiseStride(zShapeInfo); xOrder = shape::order(xShapeInfo); @@ -87,28 +87,28 @@ namespace functions { __syncthreads(); auto tid = blockIdx.x * blockDim.x + threadIdx.x; - int totalThreads = gridDim.x * blockDim.x; + int totalThreads = gridDim.x * blockDim.x; + + if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - for (int i = tid; i < length; i += 
totalThreads) - z[i * zEws] = OpType::op(x[i * xEws], params); + z[i * zEws] = OpType::op(x[i * xEws], params); } - else { + else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); - } + } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); - z[zOffset] = OpType::op(x[xOffset], params); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); + z[zOffset] = OpType::op(x[xOffset], params); } } - } + } } }; diff --git a/libnd4j/include/loops/cuda/transform/transform_float.cu b/libnd4j/include/loops/cuda/transform/transform_float.cu index 6fe7b18d1..44ddb0246 100644 --- a/libnd4j/include/loops/cuda/transform/transform_float.cu +++ b/libnd4j/include/loops/cuda/transform/transform_float.cu @@ -35,7 +35,7 @@ __global__ void transformFloatSimple(void *x, Nd4jLong *xShapeInfo, int xRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + functions::transform::TransformFloat::template transformCuda( x, xShapeInfo, params, @@ -64,7 +64,7 @@ namespace functions { void *vparams, void *vz, Nd4jLong *zShapeInfo, - int *allocationPointer, void *vreductionPointer, + int *allocationPointer, void *vreductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto x = reinterpret_cast(vx); @@ -75,7 +75,7 @@ namespace functions { if(OpType::requiresSpecial) { OpType::execSpecialCuda(x,xShapeInfo,z,zShapeInfo,params, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); return; - } + } else { __shared__ Nd4jLong xEws; @@ -83,9 +83,9 @@ namespace functions { __shared__ char xOrder; __shared__ char zOrder; __shared__ Nd4jLong length; - + if (threadIdx.x == 0) { - + xEws = shape::elementWiseStride(xShapeInfo); zEws = shape::elementWiseStride(zShapeInfo); xOrder = shape::order(xShapeInfo); @@ -95,24 +95,24 @@ namespace functions { __syncthreads(); auto tid = blockIdx.x * blockDim.x + threadIdx.x; - int totalThreads = gridDim.x * blockDim.x; + int totalThreads = gridDim.x * blockDim.x; + + if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - for (Nd4jLong i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); } - else { + else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); - } + } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], params); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_same.cu b/libnd4j/include/loops/cuda/transform/transform_same.cu index 6c533ac3a..e59381fba 100644 --- a/libnd4j/include/loops/cuda/transform/transform_same.cu +++ b/libnd4j/include/loops/cuda/transform/transform_same.cu @@ -95,14 +95,14 @@ namespace functions { else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto 
xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], params); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_strict.cu b/libnd4j/include/loops/cuda/transform/transform_strict.cu index a0989b0e6..0befdf35f 100644 --- a/libnd4j/include/loops/cuda/transform/transform_strict.cu +++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu @@ -35,7 +35,7 @@ __global__ void transformStrictSimple(void *x, Nd4jLong *xShapeInfo, int xRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + functions::transform::TransformStrict::template transformCuda(x,xShapeInfo,params,z,zShapeInfo,allocationPointer,reductionPointer,tadShapeInfo, tadOffsets); } @@ -97,14 +97,14 @@ namespace functions { else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], params); } } diff --git a/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp b/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp index 6370579d2..d507a5b8d 100644 --- a/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp @@ -24,6 +24,7 @@ #include #include +#include #include namespace nd4j { @@ -162,7 +163,8 @@ CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) { MmulHelper::tensorDot(&columns, weights, output, {1,2,3,4}, {3,0,1,2}, permutForOutput); if(bias) - output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCDHW); if(!isNCDHW) delete input; diff --git a/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp b/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp index 3a7450fc7..f69b6c0f9 100644 --- a/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp +++ b/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp @@ -27,7 +27,7 @@ #include #include #include - +#include namespace nd4j { namespace ops { @@ -80,7 +80,8 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) { //----- add biases if required -----// if(bias) - output->applyBroadcast(broadcast::Add, {1}, bias); + // output->applyBroadcast(broadcast::Add, {1}, bias); + helpers::addBias(block, *output, *bias, *output, true); if(!isNCHW) delete output; diff --git a/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp b/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp index 20d0e991e..f875e4693 100644 --- a/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp @@ -23,6 +23,7 @@ #include #include +#include 
#include namespace nd4j { @@ -79,7 +80,8 @@ CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) { //----- add biases if required -----// if(bias) - output->applyBroadcast(broadcast::Add,{1}, bias); + // output->applyBroadcast(broadcast::Add,{1}, bias); + helpers::addBias(block, *output, *bias, *output, true); if(!isNCDHW) delete output; diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp index 3309c6104..f5a65079a 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp @@ -15,107 +15,111 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include #if NOT_EXCLUDED(OP_biasadd) #include +#include namespace nd4j { - namespace ops { - DECLARE_TYPES(biasadd) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes({ALL_FLOATS}); - } +namespace ops { - CUSTOM_OP_IMPL(biasadd, 2, 1, true, 0, 0) { - //REQUIRE_OK(this->validateInput2D(block)); - auto input = INPUT_VARIABLE(0); - auto bias = INPUT_VARIABLE(1); +//////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(biasadd, 2, 1, true, 0, 0) { - REQUIRE_TRUE(bias->isRowVector(), 0, "Bias array should be a vector"); + auto input = INPUT_VARIABLE(0); + auto bias = INPUT_VARIABLE(1); - auto z = OUTPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); - if (input->isMatrix()) - input->addRowVector(bias, z); - else { - // TODO: we might want to use NDArray::applyTrueBroadcast here, like AddOp does - std::vector shape({-1, bias->lengthOf()}); - //nd4j_debug("Reshaping to: [%i, %i]\n", -1, (int) bias->lengthOf()); - auto tArr = input->reshape(input->ordering(), shape); - auto zArr = z->reshape(z->ordering(), shape); - tArr.addRowVector(bias, &zArr); - } + const bool isNCHW = !block.getBArguments()->empty() ? B_ARG(0) : false; + const int channelDim = isNCHW ? 
1 : input->rankOf() - 1; // second or last - STORE_RESULT(*z); + REQUIRE_TRUE(bias->rankOf() == 1, 0, "BIASADD CUSTOM_OP: bias array should have rank = 1, but got %i instead !", bias->rankOf()); - return Status::OK(); - } - DECLARE_SYN(bias_add, biasadd); + REQUIRE_TRUE(bias->sizeAt(0) == input->sizeAt(channelDim), 0, "BIASADD CUSTOM_OP: shapes of bias %s and input %s arrays are not suitable for broadcast operation along channel dimension %i !", ShapeUtils::shapeAsString(bias).c_str(), ShapeUtils::shapeAsString(input).c_str(), channelDim); - DECLARE_SHAPE_FN(biasadd) { - auto xShape = inputShape->at(0); - auto yShape = inputShape->at(1); + REQUIRE_TRUE(output->isSameShape(input), 0, "BIASADD CUSTOM_OP: wrong shape of output array, expected is %s but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(output).c_str()); - auto dtype = ArrayOptions::dataType(yShape); - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(xShape, dtype))); - } + helpers::addBias(block, *input, *bias, *output, isNCHW); + // input->applyBroadcast(nd4j::broadcast::Add, {channelDim}, bias, output); - DECLARE_TYPES(biasadd_bp) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes({ALL_FLOATS}); - } + return Status::OK(); +} +DECLARE_SYN(bias_add, biasadd); - CUSTOM_OP_IMPL(biasadd_bp, 3, 2, false, 0, 0) { - auto input = INPUT_VARIABLE(0); - auto bias = INPUT_VARIABLE(1); - auto epsilonNext = INPUT_VARIABLE(2); +//////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(biasadd) { + auto xShape = inputShape->at(0); + auto yShape = inputShape->at(1); - auto epsilon = OUTPUT_VARIABLE(0); - auto gradB = OUTPUT_VARIABLE(1); + auto dtype = ArrayOptions::dataType(yShape); + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(xShape, dtype))); +} - epsilon->assign(epsilonNext); +DECLARE_TYPES(biasadd) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedOutputTypes({ALL_FLOATS}); +} - // cnn case - if (input->rankOf() == 4) { - auto epsilonNext2d = epsilonNext->permute({1, 0, 2, 3}); - epsilonNext2d.reshapei('c', {(int) bias->lengthOf(), -1}); +//////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(biasadd_bp, 3, 2, false, 0, 0) { + auto input = INPUT_VARIABLE(0); + auto bias = INPUT_VARIABLE(1); + auto epsilonNext = INPUT_VARIABLE(2); - auto sum = epsilonNext2d.reduceAlongDimension(reduce::Sum, {1}); - gradB->assign(sum); + auto epsilon = OUTPUT_VARIABLE(0); + auto gradB = OUTPUT_VARIABLE(1); - delete sum; - } else if (input->rankOf() == 2) { - // regular fully-connected case - auto sum = epsilonNext->reduceAlongDimension(reduce::Sum, {0}); - gradB->assign(sum); - - delete sum; - } + epsilon->assign(epsilonNext); - return ND4J_STATUS_OK; - } - DECLARE_SYN(BiasAddGrad, biasadd_bp); + // cnn case + if (input->rankOf() == 4) { + auto epsilonNext2d = epsilonNext->permute({1, 0, 2, 3}); + epsilonNext2d.reshapei('c', {(int) bias->lengthOf(), -1}); - DECLARE_SHAPE_FN(biasadd_bp) { - auto input = inputShape->at(0); - auto bias = inputShape->at(1); + auto sum = epsilonNext2d.reduceAlongDimension(reduce::Sum, {1}); + gradB->assign(sum); - Nd4jLong* epsShape; - Nd4jLong* gradShape; + delete sum; + } else if (input->rankOf() == 2) { + // regular fully-connected case + auto sum = epsilonNext->reduceAlongDimension(reduce::Sum, {0}); + gradB->assign(sum); - COPY_SHAPE(input, epsShape); - COPY_SHAPE(bias, 
gradShape); - - return SHAPELIST(CONSTANT(epsShape), CONSTANT(gradShape)); - - } + delete sum; } + + return ND4J_STATUS_OK; +} +DECLARE_SYN(BiasAddGrad, biasadd_bp); + +DECLARE_SHAPE_FN(biasadd_bp) { + auto input = inputShape->at(0); + auto bias = inputShape->at(1); + + Nd4jLong* epsShape; + Nd4jLong* gradShape; + + COPY_SHAPE(input, epsShape); + COPY_SHAPE(bias, gradShape); + + return SHAPELIST(CONSTANT(epsShape), CONSTANT(gradShape)); +} + +DECLARE_TYPES(biasadd_bp) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedOutputTypes({ALL_FLOATS}); +} + + +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp index c430fd4d2..c3e73da84 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp @@ -43,14 +43,15 @@ DECLARE_SHAPE_FN(matrix_diag) { auto in = inputShape->at(0); int inRank = shape::rank(in); + // if for example diagonal array has shape [A,B,C] then output array has shape [A,B,C,C] + int outRank = inRank + 1; - auto lastDimension = shape::sizeAt(in, -1); ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong); outShapeInfo[0] = outRank; for(int i = 0; i < inRank; ++i) outShapeInfo[i + 1] = shape::sizeAt(in, i); - outShapeInfo[outRank] = lastDimension; + outShapeInfo[outRank] = shape::sizeAt(in, -1); ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in)); diff --git a/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp b/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp index 8ab5fa32f..06656b9de 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp @@ -23,7 +23,7 @@ #include #include - +#include namespace nd4j { namespace ops { @@ -59,7 +59,8 @@ namespace ops { output->applyBroadcast(nd4j::broadcast::Multiply, {dimC}, gain); if(bias != nullptr) { // output->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), bias, output); - output->applyBroadcast(nd4j::broadcast::Add, {dimC}, bias); + // output->applyBroadcast(nd4j::broadcast::Add, {dimC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCHW); } return Status::OK(); diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h index bb7f306bd..e30ff86a5 100644 --- a/libnd4j/include/ops/declarable/headers/parity_ops.h +++ b/libnd4j/include/ops/declarable/headers/parity_ops.h @@ -79,36 +79,44 @@ namespace nd4j { * Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array * * Input arrays: - * input: input array, considered as batch of matrices - * diagonal: array containing elements to be inserted into input array, - * following rank condition should be satisfied: diagonal_rank = input_rank - 1, - * the shapes of diagonal and input arrays must be equal except last dimension of input array, - * for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C], - * also last dimension of diagonal array should be equal to smaller of last and last but one input dimensions - * that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2]) + * 0: input array, considered as batch of matrices + * 1: diagonal array containing elements to be inserted into input array, + * following rank condition should be 
satisfied: diagonal_rank = input_rank - 1, + * the shapes of diagonal and input arrays must be equal except last dimension of input array, + * for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C], + * also last dimension of diagonal array should be equal to smaller of last and last but one input dimensions + * that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2]) * * Output array: - * has the same shape as input, corresponding diagonal elements are substituted + * 0: has the same shape as input, corresponding diagonal elements are substituted */ #if NOT_EXCLUDED(OP_matrix_set_diag) DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0); #endif /** - * Returns a batched matrix tensor with diagonal values given (as TF.matrix_diag). - */ + * Inserts elements provided by diagonal array into the main diagonal of innermost matrices of output array, + * the remaining output elements are set to zero + * + * Input array: + * diagonal: array containing elements to be inserted into output array, + * following rank condition must be satisfied: diagonal_rank = output_rank - 1 + * + * Output array: + * 0: is considered as batch of matrices, if for example diagonal array has shape [A,B,C] then output array has shape [A,B,C,C] + */ DECLARE_CUSTOM_OP(matrix_diag, 1, 1, false, 0, 0); /** * This op calculates regularized incomplete beta integral Ix(a, b). * Implementation is based on two algorithms depending on input values of a and b: - * - when a and b are both > maxValue (3000.), then apply Gauss-Legendre quadrature method - * - when a and b are both <= maxValue (3000.), then apply modified Lentz’s algorithm for continued fractions + * - when a and b are both > maxValue (3000.), then Gauss-Legendre quadrature method is applied + * - when a and b are both <= maxValue (3000.), then modified Lentz’s algorithm for continued fractions is applied * * Input arrays: - * a: define power t^{a-1}, must be > 0, type float. - * b: define power (1-t)^{b-1}, must be > 0, type float. - * x: define upper limit of integration, must be within (0 <= x <= 1) range, type float. + * a: defines power t^{a-1}, must be > 0, type float. + * b: defines power (1-t)^{b-1}, must be > 0, type float. + * x: defines upper limit of integration, must be within (0 <= x <= 1) range, type float
* * Output array: * 0: values of regularized incomplete beta integral that corresponds to variable upper limit x, type float diff --git a/libnd4j/include/ops/declarable/helpers/addBias.h b/libnd4j/include/ops/declarable/helpers/addBias.h index 3d9fdec88..c754c07de 100644 --- a/libnd4j/include/ops/declarable/helpers/addBias.h +++ b/libnd4j/include/ops/declarable/helpers/addBias.h @@ -22,14 +22,15 @@ #define LIBND4J_ADDBIAS_H #include +#include namespace nd4j { namespace ops { namespace helpers { - void addBias(NDArray& input, const NDArray& bias, const bool isNCHW); - + void addBias(graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW); + } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index fbc395fc6..bd29094ec 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -91,19 +91,19 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max)) for (int i = 0; i < length; i++) { - const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); + const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); max = nd4j::math::nd4j_max(max, inBuff[offset]); } PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum)) for (int i = 0; i < length; i++) { - const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); + const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] = nd4j::math::nd4j_exp(inBuff[offset] - max); sum += outBuff[offset]; } PRAGMA_OMP_SIMD for (int i = 0; i < length; i++) { - const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); + const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] /= sum; outBuff[offset] *= (1.f - outBuff[offset]); // derivative } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index 09cdf5d4e..0e6e1f777 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -28,70 +28,116 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// template -static void addBias_(NDArray& input, const NDArray& bias, const bool isNCHW) { +static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const bool isNCHW) { - // input [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW) - // bias [oC] + // bias [oC] - X* inBuff = input.bufferAsT(); - const Y* biasBuff = bias.bufferAsT(); + // if(input_rank == 4) + // input and output have same shapes: [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW) + // if(input_rank == 5) + // input and output have same shapes: [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW) + // else + // apply applyBroadcast - int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; - bS = input.sizeAt(0); - const Nd4jLong stride0 = input.stridesOf()[0]; - const Nd4jLong stride1 = input.stridesOf()[1]; - const Nd4jLong stride2 = input.stridesOf()[2]; - uint biasShapeInfoCast[MAX_RANK]; - bool canCastBias = nd4j::DataTypeUtils::castShapeInfo(bias.getShapeInfo(), biasShapeInfoCast); - - if(isNCHW) { - - oC = input.sizeAt(1); - oH = input.sizeAt(2); - oW = input.sizeAt(3); + const X* x = input.bufferAsT(); + const Y* y =
bias.bufferAsT(); + X* z = output.bufferAsT(); - const int oHoW = oH*oW; + const bool inOutAreSame = x == z; - PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) - for (int i = 0; i < bS; ++i) { - for (int c = 0; c < oC; ++c) { - - auto biasOffset = shape::indexOffset(c, bias.getShapeInfo(), biasShapeInfoCast, oC, canCastBias); - auto inOffset = i * stride0 + c * stride1; + const uint bS = output.sizeAt(0); // batch size + const Nd4jLong yStrideC = bias.stridesOf()[0]; + const Nd4jLong zStrideB = output.stridesOf()[0]; - PRAGMA_OMP_SIMD - for (uint k = 0; k < oHoW; ++k) - inBuff[inOffset + k] += static_cast(biasBuff[biasOffset]); - } + if(output.rankOf() == 4) { + + const uint C = isNCHW ? output.sizeAt(1) : output.sizeAt(3); // channels + const uint oH = isNCHW ? output.sizeAt(2) : output.sizeAt(1); // height + const uint oW = isNCHW ? output.sizeAt(3) : output.sizeAt(2); // width + + const Nd4jLong zStrideC = isNCHW ? output.stridesOf()[1] : output.stridesOf()[3]; + const Nd4jLong zStrideH = isNCHW ? output.stridesOf()[2] : output.stridesOf()[1]; + const Nd4jLong zStrideW = isNCHW ? output.stridesOf()[3] : output.stridesOf()[2]; + + if(inOutAreSame) { + + PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) + for(uint b = 0; b < bS; ++b) + for(uint c = 0; c < C; ++c) + for(uint h = 0; h < oH ; ++h) + for(uint w = 0; w < oW ; ++w) + z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + } + else { + + const Nd4jLong xStrideB = input.stridesOf()[0]; + const Nd4jLong xStrideC = isNCHW ? input.stridesOf()[1] : input.stridesOf()[3]; + const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; + const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2]; + + PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) + for(uint b = 0; b < bS; ++b) + for(uint c = 0; c < C; ++c) + for(uint h = 0; h < oH ; ++h) + for(uint w = 0; w < oW ; ++w) + z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); + } + } + else if(output.rankOf() == 5) { + + const uint C = isNCHW ? output.sizeAt(1) : output.sizeAt(4); // channels + const uint oD = isNCHW ? output.sizeAt(2) : output.sizeAt(1); // depth + const uint oH = isNCHW ? output.sizeAt(3) : output.sizeAt(2); // height + const uint oW = isNCHW ? output.sizeAt(4) : output.sizeAt(3); // width + + const Nd4jLong zStrideC = isNCHW ? output.stridesOf()[1] : output.stridesOf()[4]; + const Nd4jLong zStrideD = isNCHW ? output.stridesOf()[2] : output.stridesOf()[1]; + const Nd4jLong zStrideH = isNCHW ? output.stridesOf()[3] : output.stridesOf()[2]; + const Nd4jLong zStrideW = isNCHW ? output.stridesOf()[4] : output.stridesOf()[3]; + + if(inOutAreSame) { + + PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) + for(uint b = 0; b < bS; ++b) + for(uint c = 0; c < C; ++c) + for(uint d = 0; d < oD ; ++d) + for(uint h = 0; h < oH ; ++h) + for(uint w = 0; w < oW ; ++w) + z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + } + else { + + const Nd4jLong xStrideB = input.stridesOf()[0]; + const Nd4jLong xStrideC = isNCHW ? input.stridesOf()[1] : input.stridesOf()[4]; + const Nd4jLong xStrideD = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; + const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2]; + const Nd4jLong xStrideW = isNCHW ? 
input.stridesOf()[4] : input.stridesOf()[3]; + + PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) + for(uint b = 0; b < bS; ++b) + for(uint c = 0; c < C; ++c) + for(uint d = 0; d < oD ; ++d) + for(uint h = 0; h < oH ; ++h) + for(uint w = 0; w < oW ; ++w) + z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + d*xStrideD + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); } } else { - - oC = input.sizeAt(3); - oH = input.sizeAt(1); - oW = input.sizeAt(2); - - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < bS*oH*oW; ++i) { - - PRAGMA_OMP_SIMD - for (int c = 0; c < oC; ++c) { - auto biasOffset = shape::indexOffset(c, bias.getShapeInfo(), biasShapeInfoCast, oC, canCastBias); - inBuff[i * oC + c] += static_cast(biasBuff[biasOffset]); - } - } - } + const int channelDim = isNCHW ? 1 : input.rankOf() - 1; // second or last + const_cast(input).applyBroadcast(nd4j::broadcast::Add, {channelDim}, &bias, &output); + } } ////////////////////////////////////////////////////////////////////////// -void addBias(NDArray& input, const NDArray& bias, const bool isNCHW) { +void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { - BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, isNCHW), FLOAT_TYPES, FLOAT_TYPES); + // bias.rankOf() == 1 ? bias : bias.reshape(bias.ordering(), {bias.lengthOf()}) + BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, output, isNCHW), FLOAT_TYPES, FLOAT_TYPES); } -BUILD_DOUBLE_TEMPLATE(template void addBias_, (NDArray& input, const NDArray& bias, const bool isNCHW), FLOAT_TYPES, FLOAT_TYPES); +BUILD_DOUBLE_TEMPLATE(template void addBias_, (const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW), FLOAT_TYPES, FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index 0c12a2896..d6c4da4a1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -84,7 +84,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* const Nd4jLong end = start + step; // calculate offset for mean, variance, gamma, beta (all of them have the same shape) - auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, lenSmall, canCastMean); + auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, canCastMean); // calculate offset for input and output (all of them have the same shape) shape::outerArrayOffsets(inOffsets, j, inShapeInfo, meanShapeInfo, memBuff, dimsToExclude.data()); @@ -114,7 +114,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* const Nd4jLong end = start + step; // calculate offset for mean, variance, gamma, beta (all of them have the same shape) - auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, lenSmall, canCastMean); + auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, canCastMean); // calculate offset for input and output (all of them have the same shape) shape::outerArrayOffsets(inOffsets, j, inShapeInfo, meanShapeInfo, memBuff, dimsToExclude.data()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp index 681b4eb63..bba3e8acb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp @@ -29,7 +29,7 @@ namespace helpers { /////////////////////////////////////////////////////////////////// // modified Lentz’s algorithm for continued fractions, -// reference: Lentz, W.J. 1976, “Generating Bessel Functions in Mie Scattering Calculations Using Continued Fractions,” +// reference: Lentz, W.J. 1976, “Generating Bessel Functions in Mie Scattering Calculations Using Continued Fractions” template static T continuedFraction(const T a, const T b, const T x) { @@ -122,9 +122,8 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con int xLen = x.lengthOf(); PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < xLen; ++i) { - output.p(i, betaIncCore(a.e(i), b.e(i), x.e(i))); - } + for(int i = 0; i < xLen; ++i) + output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); } /////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index 3d04bc129..f096d5bfa 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -648,7 +648,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( //----- add biases if required -----// if(bias) // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); - helpers::addBias(*output, *bias, isNCHW); + helpers::addBias(block, *output, *bias, *output, isNCHW); if(!isNCHW) delete input; @@ -875,7 +875,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( ////////////////////////////////////////////////////////////////////////// template - static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { + static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, mC] always @@ -922,7 +922,8 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC] if(bias) - output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCHW); if(!isNCHW) delete input; @@ -2451,7 +2452,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); } void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { - BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, 
isNCHW), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); } void ConvolutionUtils::depthwiseConv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp index 6a08064fc..30d4d3f7a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp @@ -37,24 +37,16 @@ namespace nd4j { cOffset += inputs[e]->lengthOf(); } - Nd4jLong xCoord[MAX_RANK]; - // actually transferring data for (int e = 0; e < numArrays; e++) { auto z = reinterpret_cast(output->bufferWithOffset(offsets[e])); auto xBuffer = inputs[e]->bufferAsT(); auto xShapeInfo = inputs[e]->shapeInfo(); - auto xShape = shape::shapeOf(xShapeInfo); - auto xStride = shape::stride(xShapeInfo); - auto xRank = shape::rank(xShapeInfo); auto xLength = inputs[e]->lengthOf(); - - for (uint i = 0; i < xLength; i++) { - shape::index2coords(xRank, xShape, i, xLength, xCoord, order); - auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank); - z[i] = xBuffer[xOffset]; - } + + for (uint i = 0; i < xLength; i++) + z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp index af4e96e2e..def210457 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp @@ -30,7 +30,7 @@ namespace helpers { template static void ismax_(const NDArray* input, NDArray* output, const std::vector& dimensions) { - + if (input->isVector()) { int dimensionsLength = dimensions.size(); int length = input->lengthOf(); @@ -169,7 +169,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector for (int i = 0; i < tadLength; i++) { rZ[i] = maxIdx == i ? (Z) 1 : (Z) 0; } - } + } else if (tadEWS > 1 && zEWS > 1) { for (int i = 0; i < tadLength; i++) { if (rX[i * tadEWS] > maxValue) { @@ -184,7 +184,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector } } else { for (int i = 0; i < tadLength; i++) { - auto xOffset = shape::getIndexOffset(i, tadShapeShapeInfo, tadLength); + auto xOffset = shape::getIndexOffset(i, tadShapeShapeInfo); if (rX[xOffset] > maxValue) { maxIdx = i; maxValue = rX[xOffset]; @@ -193,7 +193,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector PRAGMA_OMP_SIMD for (int i = 0; i < tadLength; i++) { - auto zOffset = shape::getIndexOffset(i, tadPackZ.primaryShapeInfo(), tadLength); + auto zOffset = shape::getIndexOffset(i, tadPackZ.primaryShapeInfo()); rZ[zOffset] = maxIdx == i ? 
(Z) 1 : (Z) 0; } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp index e974755ac..9a2034fd0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp @@ -52,14 +52,14 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) for (Nd4jLong i = 0; i < xLen; ++i) { - shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data()); + shape::index2coords(i, xShapeInfo, coords.data()); - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank); - const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coords.data()); + const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords.data()); // condition to be on diagonal of innermost matrix if(coords[xRank - 2] == coords[xRank - 1]) - z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)]; + z[zOffset] = y[shape::getOffset(yShapeInfo, coords.data())]; else z[zOffset] = zeroPad ? static_cast(0) : x[xOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index ea273d33b..a83518899 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -73,12 +73,12 @@ namespace nd4j { if (idx < 0 || idx >= tLen) { PRAGMA_OMP_SIMD for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo(), tLen)] = zero; + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero; } } else { PRAGMA_OMP_SIMD for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo(), tLen)] = idx == t ? one : zero; + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero; } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp index bd14fbd8d..f46346876 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp @@ -53,8 +53,8 @@ namespace nd4j { for (Nd4jLong e = length - 1; e >= 0; --e) { - auto xOffset = shape::getIndexOffset(e, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(e, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(e, xShapeInfo); + auto zOffset = shape::getIndexOffset(e, zShapeInfo); sum = op == scalar::Add ? simdOps::Add::op(sum, x[xOffset]) : simdOps::Multiply::op(sum, x[xOffset]); if (!exclusive) @@ -83,8 +83,8 @@ namespace nd4j { for (int e = 0; e < length; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(e, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(e, xShapeInfo); + auto zOffset = shape::getIndexOffset(e, zShapeInfo); sum = op == scalar::Add ? 
simdOps::Add::op(sum, x[xOffset]) : simdOps::Multiply::op(sum, x[xOffset]); if (!exclusive) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 0922a1248..83deeca88 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -60,7 +60,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * // inArr[e] = inArr[idx]; // inArr[idx] = tmp; } - } + } else if (inEWS > 1) { PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { @@ -71,19 +71,19 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * // inArr[idx1] = tmp; swap(inArr, idx1, idx2); } - } + } else { PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { - - auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength); - auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer, inLength); + + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); //outArr[outOffset] = inArr[inOffset]; swap(outArr, inOffset, outOffset); } } - } + } else { // single step phase here auto outEWS = shape::elementWiseStride(outShapeBuffer); @@ -92,15 +92,15 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) - outArr[sLength - e] = inArr[e]; + for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) + outArr[sLength - e] = inArr[e]; if(inLength != numOfElemsToReverse) { PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) outArr[e] = inArr[e]; } - } + } else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { PRAGMA_OMP_PARALLEL_FOR @@ -112,14 +112,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) outArr[e * outEWS] = inArr[e * inEWS]; } - } + } else { PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) { - auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength); - auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer, outLength); + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; } @@ -128,9 +128,9 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) { - auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength); - auto outOffset = shape::getIndexOffset(e, outShapeBuffer, outLength); - outArr[outOffset] = inArr[inOffset]; + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(e, outShapeBuffer); + outArr[outOffset] = inArr[inOffset]; } } } @@ -151,7 +151,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input helpers::reverseArray(context, const_cast(input)->getBuffer(), const_cast(input)->getShapeInfo(), output->getBuffer(), output->getShapeInfo(), seqLengths->e(0)); } else { - + if(seqDim > batchDim) --seqDim; @@ -163,7 +163,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input for(int i = 0; i < inSubArrsSet->size(); ++i) { Nd4jLong 
numOfElemsToReverse = seqLengths->e(i); - + if(numOfElemsToReverse == 0 || numOfElemsToReverse == 1) { outSubArrsSet->at(i)->assign(inSubArrsSet->at(i)); } @@ -172,7 +172,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input auto outInnerSet = outSubArrsSet->at(i)->allTensorsAlongDimension({seqDim}); for(int j = 0; j < inInnerSet->size(); ++j) helpers::reverseArray(context, inInnerSet->at(j)->getBuffer(), inInnerSet->at(j)->getShapeInfo(), outInnerSet->at(j)->getBuffer(), outInnerSet->at(j)->getShapeInfo(), numOfElemsToReverse); - + delete inInnerSet; delete outInnerSet; } @@ -195,12 +195,12 @@ void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* outpu auto listOut = output->allTensorsAlongDimension(dimensions); auto listIn = input->allTensorsAlongDimension(dimensions); - + NDArray *subArrIn, *subArrOut; for(int i = 0; i < listIn->size(); ++i) { // listIn->size() = listOut->size() subArrIn = listIn->at(i); - subArrOut = listOut->at(i); + subArrOut = listOut->at(i); BUILD_SINGLE_SELECTOR(input->dataType(), helpers::reverseArray, (context, subArrIn->getBuffer(), subArrIn->getShapeInfo(), subArrOut->getBuffer(), subArrOut->getShapeInfo()), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index cc97e3c5b..5b4c44874 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -116,15 +116,15 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& for (Nd4jLong i = 0; i < zLen; ++i) { - shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data()); + shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); // evaluate spatial coordinates for x for(uint j = 1; j <= numOfSpatialDims; ++j) coords[j] += crop.e(j - 1, 0); // add crop left - z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)]; + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; } } @@ -298,9 +298,9 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) for (Nd4jLong i = 0; i < zLen; ++i) { - shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data()); + shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); bool within = true; @@ -318,7 +318,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra } if(within) - z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)]; + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; else z[zOffset] = 0.f; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 71181afe8..9e04ed4df 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -178,8 +178,6 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray const Nd4jLong* xShape = input.shapeOf(); const 
Nd4jLong* zShape = output.shapeOf(); - const Nd4jLong* xStride = input.stridesOf(); - const Nd4jLong* zStride = output.stridesOf(); const int rank = input.rankOf(); // both input and output have the same rank const int rankMinusOne = rank - 1; @@ -195,8 +193,8 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) for(uint i = 0; i < zLen; ++i) { - shape::index2coords(rank, zShape, i, zLen, coords.data()); - const auto zOffset = shape::getOffset(0, zShape, zStride, coords.data(), rank); + shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); bool within = true; for(int j = rankMinusOne; j >= 0; --j) { @@ -207,7 +205,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray } if(within) - z[zOffset] = x[shape::getOffset(0, xShape, xStride, coords.data(), rank)]; + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; else z[zOffset] = padVal; } @@ -220,8 +218,8 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) for(uint i = 0; i < zLen; ++i) { - shape::index2coords(rank, zShape, i, zLen, coords.data()); - const auto zOffset = shape::getOffset(0, zShape, zStride, coords.data(), rank); + shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); for(int j = rankMinusOne; j >= 0; --j) { @@ -231,7 +229,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray else if(coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right } - const auto xOffset = shape::getOffset(0, xShape, xStride, coords.data(), rank); + const auto xOffset = shape::getOffset(input.getShapeInfo(), coords.data()); z[zOffset] = x[xOffset]; } } @@ -580,9 +578,9 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { xCoordStart = coords.data(); } - shape::index2coords(zRank, output.shapeOf(), i, zLen, zCoordStart); + shape::index2coords(i, output.getShapeInfo(), zCoordStart); - const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), zCoordStart, zRank); + const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart); // last y coordinate uint coordToRestore; @@ -590,7 +588,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { coordToRestore = static_cast(zCoordStart[yRank - 1]); zCoordStart[yRank - 1] = 0; - const auto yOffset = shape::getOffset(0, indices.shapeOf(), indices.stridesOf(), zCoordStart, yRank); + const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart); //restore z coordinate if(yLastDim != xRank) @@ -600,7 +598,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { for(uint j = 0; j < yLastDim; ++j) xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride - const auto xOffset = shape::getOffset(0, input.shapeOf(), input.stridesOf(), xCoordStart, xRank); + const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); z[zOffset] = x[xOffset]; } @@ -1172,7 +1170,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(inIdx, outIdx)) for(int i = 0; i < outLen; ++i) { - shape::index2coords(rank, output.shapeOf(), i, outIdx.data()); + 
shape::index2coords(i, output.getShapeInfo(), outIdx.data()); for(int j = 0; j < rank; ++j) { @@ -1191,8 +1189,8 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o inIdx[j] = len - outIdx[j]; } - auto outOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), outIdx.data(), rank); - auto inOffset = shape::getOffset(0, input.shapeOf(), input.stridesOf(), inIdx.data(), rank); + auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx.data()); + auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx.data()); reinterpret_cast(output.buffer())[outOffset] = reinterpret_cast(input.getBuffer())[inOffset]; } } @@ -1259,7 +1257,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c for(Nd4jLong i=0; i(fidx) + gradOBuff[shape::getIndexOffset(i, gradO.getShapeInfo(), gradOLen)]); + gradI.p(fidx, gradI.e(fidx) + gradOBuff[shape::getIndexOffset(i, gradO.getShapeInfo())]); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu index f402944aa..21b2eecd4 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu @@ -60,9 +60,9 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo, for (int i = tid; i < xzLen; i += totalThreads) { - shape::index2coords(xzRank, xShapeInfo + 1, i, xzLen, coords); + shape::index2coords(i, xShapeInfo, coords); - const auto xzOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xzRank + 1, coords, xzRank); + const auto xzOffset = shape::getOffset(xShapeInfo, coords); const auto xVal = x[xzOffset]; @@ -72,7 +72,7 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo, if(yShapeInfo[j + 1] == 1) coords[j + 1] = 0; - z[xzOffset] = xVal * y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + yRank + 1, coords + 1, yRank)]; + z[xzOffset] = xVal * y[shape::getOffset(yShapeInfo, coords + 1)]; } else z[xzOffset] = xVal; @@ -139,11 +139,11 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI for (int i = tid; i < inLen; i += totalThreads) { - shape::index2coords(inRank, inShapeInfo + 1, i, inLen, coords); + shape::index2coords(i, inShapeInfo, coords); - const auto inOffset = shape::getOffset(0, inShapeInfo + 1, inShapeInfo + inRank + 1, coords, inRank); - const auto dLdOOffset = shape::getOffset(0, dLdOShapeInfo + 1, dLdOShapeInfo + inRank + 1, coords, inRank); - const auto dLdIOffset = shape::getOffset(0, dLdIShapeInfo + 1, dLdIShapeInfo + inRank + 1, coords, inRank); + const auto inOffset = shape::getOffset(inShapeInfo, coords); + const auto dLdOOffset = shape::getOffset(dLdOShapeInfo, coords); + const auto dLdIOffset = shape::getOffset(dLdIShapeInfo, coords); const auto xVal = in[inOffset]; const auto grO = dLdO[dLdOOffset]; @@ -154,8 +154,8 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI if(alphaShapeInfo[j + 1] == 1) coords[j + 1] = 0; - const auto alphaOffset = shape::getOffset(0, alphaShapeInfo + 1, alphaShapeInfo + alphaRank + 1, coords + 1, alphaRank); - const auto dLdAOffset = shape::getOffset(0, dLdAShapeInfo + 1, dLdAShapeInfo + alphaRank + 1, coords + 1, alphaRank); + const auto alphaOffset = shape::getOffset(alphaShapeInfo, coords + 1); + const auto dLdAOffset = shape::getOffset(dLdAShapeInfo, coords + 1); dLdI[dLdIOffset] = grO * alpha[alphaOffset]; @@ -223,7 +223,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const 
Nd4jLong *xShapeInfo, const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo, len); + const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo); shmem[threadIdx.x] = (threadIdx.x != 0) ? x[xOffset] : nd4j::math::nd4j_max(x[xOffset], temp); // take into account max element evaluated on previous iteration and stored in temp } else @@ -249,8 +249,8 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo, const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo, len); - const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo, len); + const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo); + const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo); z[zOffset] = nd4j::math::nd4j_exp(x[xOffset] - max); shmem[threadIdx.x] = (threadIdx.x != 0) ? z[zOffset] : (z[zOffset] + temp); // take into account sum element evaluated on previous iteration and stored in temp } @@ -272,7 +272,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo, for (int i = 0; i < numOfIters; ++i) { const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx >= len) continue; - const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo, len); + const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo); z[zOffset] /= shmem[0]; } } @@ -386,7 +386,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len); + const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : nd4j::math::nd4j_max(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp } else @@ -412,7 +412,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len); + const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); z[offset] = nd4j::math::nd4j_exp(x[offset] - max); shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp } @@ -434,7 +434,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape for (int i = 0; i < numOfIters; ++i) { const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx >= len) continue; - const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len); + const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); z[offset] = nd4j::math::nd4j_log(z[offset] / shmem[0]); } } @@ -505,7 +505,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len); + const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); shmem[threadIdx.x] = (threadIdx.x != 0) ? 
x[offset] : nd4j::math::nd4j_max(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp
}
else
@@ -531,7 +531,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
if(elemIdx < len) {
- const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
+ const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
z[offset] = nd4j::math::nd4j_exp(x[offset] - max);
shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp
}
@@ -553,7 +553,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong
for (int i = 0; i < numOfIters; ++i) {
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
if(elemIdx >= len) continue;
- const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
+ const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
z[offset] /= shmem[0];
z[offset] *= (1.f - z[offset]); // derivative
}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
new file mode 100644
index 000000000..7134d764a
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
@@ -0,0 +1,110 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+
+#include <ops/declarable/helpers/addBias.h>
+#include <PointersManager.h>
+
+namespace nd4j {
+namespace ops {
+namespace helpers {
+
+//////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+__global__ static void addBiasCuda( const void* vx, const Nd4jLong* xShapeInfo,
+ const void* vy, const Nd4jLong* yShapeInfo,
+ void* vz, const Nd4jLong* zShapeInfo,
+ const bool isNCHW) {
+
+ // bias [oC]
+
+ // if(input_rank == 4)
+ // input and output have same shapes: [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+ // if(input_rank == 5)
+ // input and output have same shapes: [bS, oD, oH, oW, oC] (NHWC) or [bS, oD, oC, oH, oW] (NCHW)
+
+ const X* x = reinterpret_cast<const X*>(vx);
+ const Y* y = reinterpret_cast<const Y*>(vy);
+ X* z = reinterpret_cast<X*>(vz);
+
+ __shared__ int rank, channelPosition;
+ __shared__ Nd4jLong *sharedMem, len;
+ __shared__ bool xzSameOffsets, xzAreSame;
+
+ if (threadIdx.x == 0) {
+
+ extern __shared__ unsigned char shmem[];
+ sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+ rank = shape::rank(xShapeInfo); // xRank == zRank
+ xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
+ len = shape::length(xShapeInfo);
+ channelPosition = isNCHW ?
1 : rank - 1; // second or last
+ xzAreSame = x == z;
+ }
+ __syncthreads();
+
+ auto coords = sharedMem + threadIdx.x * rank;
+
+ for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += blockDim.x * gridDim.x) {
+
+ shape::index2coords(i, xShapeInfo, coords);
+
+ const auto xOffsets = shape::getOffset(xShapeInfo, coords);
+ const auto zOffsets = xzSameOffsets ? xOffsets : shape::getOffset(zShapeInfo, coords);
+ const auto yOffsets = shape::getOffset(yShapeInfo, coords + channelPosition);
+
+ if(xzAreSame)
+ z[zOffsets] += static_cast<X>(y[yOffsets]);
+ else
+ z[zOffsets] = x[xOffsets] + static_cast<X>(y[yOffsets]);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void addBiasCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+ const void* vx, const Nd4jLong* xShapeInfo,
+ const void* vy, const Nd4jLong* yShapeInfo,
+ void* vz, const Nd4jLong* zShapeInfo,
+ const bool isNCHW) {
+
+ addBiasCuda<X, Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, isNCHW);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) {
+
+ PointersManager manager(block.launchContext(), "addBias");
+
+ const int threadsPerBlock = MAX_NUM_THREADS;
+ const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+ const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+
+ NDArray::prepareSpecialUse({&output}, {&input, &bias});
+ BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBiasCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, block.launchContext()->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), bias.getSpecialBuffer(), bias.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), isNCHW), FLOAT_TYPES, FLOAT_TYPES);
+ NDArray::registerSpecialUse({&output}, {&input, &bias});
+
+ manager.synchronize();
+}
+
+}
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
index c27c9fb8a..5b52d1b0b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
@@ -143,13 +143,13 @@ static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShape
for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
- auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
- auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
- auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+ auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo);
+ auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo);
+ auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo);
- auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
- auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
- auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+ auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo);
+ auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo);
+ auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo);
T h, v_min, v_max;
helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
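[Annotation, not part of the patch] The new addBias.cu above is a compact example of the shape-helper convention this whole patch migrates to: shape::index2coords(i, shapeInfo, coords) and shape::getOffset(shapeInfo, coords) take the packed shapeInfo buffer and unpack rank, shape and strides internally, where the removed overloads took a leading base offset, separate shape/stride pointers and an explicit rank; shape::getIndexOffset likewise drops its trailing length argument. A minimal sketch of the two calling styles, assuming only a valid shapeInfo buffer for some array and a linear element index i:

    Nd4jLong coords[MAX_RANK];
    shape::index2coords(i, shapeInfo, coords);                     // new: linear index -> coordinates
    const auto offByCoords = shape::getOffset(shapeInfo, coords);  // new: coordinates -> buffer offset
    const auto offByIndex  = shape::getIndexOffset(i, shapeInfo);  // new: fused index -> offset in one call
    // pre-patch equivalents of the three calls above:
    // shape::index2coords(rank, shapeInfo + 1, i, len, coords);
    // shape::getOffset(0, shapeInfo + 1, shapeInfo + rank + 1, coords, rank);
    // shape::getIndexOffset(i, shapeInfo, len);

diff --git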
a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu index a1dc4189a..b801765b2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu @@ -139,13 +139,13 @@ static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xT auto outputB = reinterpret_cast(zBuffer) + zOffsets[2]; for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { - auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo); + auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo); + auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo); - auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo); + auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo); + auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo); T h, s, v; // Convert the RGB color to Hue/V-range. diff --git a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu index 6c3dedd20..d9188e3a8 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu @@ -64,25 +64,25 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo, for (uint i = tid; i < minLen; i += totalThreads) { - const auto meanOffset = shape::getIndexOffset(i, meanShapeInfo, minLen); - const auto varianceOffset = shape::getIndexOffset(i, varianceShapeInfo, minLen); + const auto meanOffset = shape::getIndexOffset(i, meanShapeInfo); + const auto varianceOffset = shape::getIndexOffset(i, varianceShapeInfo); T sigmaInvGam = 1. 
/ nd4j::math::nd4j_sqrt(variance[varianceOffset] + epsilon); if(gamma != nullptr) - sigmaInvGam *= gamma[shape::getIndexOffset(i, gammaShapeInfo, minLen)]; + sigmaInvGam *= gamma[shape::getIndexOffset(i, gammaShapeInfo)]; auto betaOffset = 0; if(beta != nullptr) - betaOffset = shape::getIndexOffset(i, betaShapeInfo, minLen); + betaOffset = shape::getIndexOffset(i, betaShapeInfo); const auto xTad = x + xTadOffsets[i]; auto zTad = z + zTadOffsets[i]; for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::getIndexOffset(j, xTadShapeInfo, tadLen); - const auto zTadOffset = shape::getIndexOffset(j, zTadShapeInfo, tadLen); + const auto xTadOffset = shape::getIndexOffset(j, xTadShapeInfo); + const auto zTadOffset = shape::getIndexOffset(j, zTadShapeInfo); zTad[zTadOffset] = (xTad[xTadOffset] - mean[meanOffset]) * sigmaInvGam; @@ -130,10 +130,10 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo for (uint i = tid; i < xLen; i += totalThreads) { - shape::index2coords(xRank, shape::shapeOf(const_cast(xShapeInfo)), i, xLen, coords); + shape::index2coords(i, xShapeInfo, coords); - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coords, xRank); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast(zShapeInfo)), shape::stride(const_cast(zShapeInfo)), coords, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); if(minRank == xRank) { for (uint i = 0, j = 0; i < xRank; ++i) { @@ -146,20 +146,20 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo else // minRank = numDims = 1 in this case coords[0] = coords[dims[0]]; - const auto meanOffset = shape::getOffset(0, shape::shapeOf(const_cast(meanShapeInfo)), shape::stride(const_cast(meanShapeInfo)), coords, minRank); - const auto varianceOffset = shape::getOffset(0, shape::shapeOf(const_cast(varianceShapeInfo)), shape::stride(const_cast(varianceShapeInfo)), coords, minRank); + const auto meanOffset = shape::getOffset(meanShapeInfo, coords); + const auto varianceOffset = shape::getOffset(varianceShapeInfo, coords); T sigmaInvGam = 1. 
/ nd4j::math::nd4j_sqrt(variance[varianceOffset] + epsilon); if(gamma != nullptr) { - const auto gammaOffset = shape::getOffset(0, shape::shapeOf(const_cast(gammaShapeInfo)), shape::stride(const_cast(gammaShapeInfo)), coords, minRank); + const auto gammaOffset = shape::getOffset(gammaShapeInfo, coords); sigmaInvGam *= gamma[gammaOffset]; } z[zOffset] = (x[xOffset] - mean[meanOffset]) * sigmaInvGam; if(beta != nullptr) { - const auto betaOffset = shape::getOffset(0, shape::shapeOf(const_cast(betaShapeInfo)), shape::stride(const_cast(betaShapeInfo)), coords, minRank); + const auto betaOffset = shape::getOffset(betaShapeInfo, coords); z[zOffset] += beta[betaOffset]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu b/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu index 87e4948ec..90619c76c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu @@ -15,7 +15,7 @@ ******************************************************************************/ // -// Created by Yurii Shyrma on 11.12.2017 +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -117,10 +117,10 @@ __global__ void betaIncForArrayCuda(const void* va, const Nd4jLong* aShapeInfo, Nd4jLong len = shape::length(xShapeInfo); - const T a = *(reinterpret_cast(va) + shape::getIndexOffset(j, aShapeInfo, len)); - const T b = *(reinterpret_cast(vb) + shape::getIndexOffset(j, bShapeInfo, len)); - const T x = *(reinterpret_cast(vx) + shape::getIndexOffset(j, xShapeInfo, len)); - T& z = *(reinterpret_cast(vz) + shape::getIndexOffset(j, zShapeInfo, len)); + const T a = *(reinterpret_cast(va) + shape::getIndexOffset(j, aShapeInfo)); + const T b = *(reinterpret_cast(vb) + shape::getIndexOffset(j, bShapeInfo)); + const T x = *(reinterpret_cast(vx) + shape::getIndexOffset(j, xShapeInfo)); + T& z = *(reinterpret_cast(vz) + shape::getIndexOffset(j, zShapeInfo)); // t^{n-1} * (1 - t)^{n-1} is symmetric function with respect to x = 0.5 if(a == b && x == static_cast(0.5)) { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc index aefb97963..63e406cc6 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc +++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc @@ -35,12 +35,12 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp auto colShape = shape::shapeOf(colShapeBuffer); auto colStride = shape::stride(colShapeBuffer); auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); + auto imStride = shape::stride(imShapeBuffer); const int bS = imShape[0]; const int iC = imShape[1]; const int kH = colShape[2]; - const int kW = colShape[3]; + const int kW = colShape[3]; const int oH = colShape[4]; const int oW = colShape[5]; const Nd4jLong colStride0 = colStride[0]; @@ -58,31 +58,31 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp const auto imEWS = shape::elementWiseStride(imShapeBuffer); if(imEWS == 1) { memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); - } + } else if (imEWS > 1) { PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS) imBuff[i] = static_cast(0.f); - } - else { + } + else { const auto len = shape::length(imShapeBuffer); PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) - for (int i = 0; i < len; i++) - imBuff[shape::getIndexOffset(i, 
imShapeBuffer, len)] = static_cast(0.f); + for (int i = 0; i < len; i++) + imBuff[shape::getIndexOffset(i, imShapeBuffer)] = static_cast(0.f); } - + T *col, *im; int imRow, imCol; if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { - + PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + for (int b = 0; b < bS; b++) { + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + for (int colW = 0; colW < oW; ++colW) { imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; @@ -97,21 +97,21 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, } } } - } + } } else { PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { + for (int b = 0; b < bS; b++) { for (int colH = 0; colH < oH; ++colH) { for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; - + col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; @@ -120,9 +120,9 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, } } } - } + } } - } + } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu index 9ab7337c2..dc1935b83 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu @@ -61,9 +61,9 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI auto coords = sharedMem + threadIdx.x * colRank; - shape::index2coords(imRank, imShapeInfo + 1, imInd, imLen, coords); + shape::index2coords(imInd, imShapeInfo, coords); - const auto imOffset = shape::getOffset(0, imShapeInfo + 1, imShapeInfo + imRank + 1, coords, imRank); + const auto imOffset = shape::getOffset(imShapeInfo, coords); const int imH = coords[2] + pH; const int imW = coords[3] + pW; @@ -86,7 +86,7 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI coords[2] /= dH; coords[3] /= dW; - val += col[shape::getOffset(0, colShapeInfo + 1, colShapeInfo + colRank + 1, coords, colRank)]; + val += col[shape::getOffset(colShapeInfo, coords)]; } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu b/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu index 545d7c668..d2792b630 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu @@ -32,8 +32,8 @@ namespace nd4j { // each thread will compare 2 elements: E and E+1 for (int e = tid; e < length - 1; e += blockDim.x * gridDim.x) { - auto val0 = x[shape::getIndexOffset(e, xShapeInfo, length)]; - 
auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo, length)]; + auto val0 = x[shape::getIndexOffset(e, xShapeInfo)]; + auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo)]; bool v = false; if (isStrict) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu index d372f05c8..6f9a8c6ab 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu @@ -59,9 +59,9 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, tid, zLen, coords); + shape::index2coords(tid, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); int inArrIdx = 0; Nd4jLong *xShapeInfo = reinterpret_cast(pxShapeInfo)[inArrIdx]; @@ -72,7 +72,7 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL } const auto* x = reinterpret_cast(reinterpret_cast(pVx)[inArrIdx]); - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu b/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu index 12f14b20b..3738d7770 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu @@ -59,7 +59,7 @@ namespace helpers { auto tZ = z + tadOffsets[label]; T val = (weightsBuffer == nullptr ? (T)1.0f : w[t]); - auto idx = shape::getIndexOffset(pred, tadShape, arrLen); + auto idx = shape::getIndexOffset(pred, tadShape); tZ[idx] = val; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu index c08551318..273749bfd 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -63,7 +64,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI auto coords = sharedMem + threadIdx.x * colRank; - shape::index2coords(colRank, colShapeInfo + 1, colInd, colLen, coords); + shape::index2coords(colInd, colShapeInfo, coords); // const auto colW = coords[7]; // const auto colH = coords[6]; @@ -74,7 +75,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI // const auto c = coords[1]; // const auto b = coords[0]; - const auto colOffset = shape::getOffset(0, colShapeInfo + 1, colShapeInfo + colRank + 1, coords, colRank); + const auto colOffset = shape::getOffset(colShapeInfo, coords); coords[2] = -pD + coords[2] * dD + coords[5] * sD; // const auto volDep = (-pD + kDep * dD) + colD * sD; coords[3] = -pH + coords[3] * dH + coords[6] * sH; // const auto volRow = (-pH + kRow * dH) + colH * sH; @@ -83,7 +84,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI if (static_cast(coords[2]) >= static_cast(iD) || static_cast(coords[3]) >= static_cast(iH) || static_cast(coords[4]) >= static_cast(iW)) col[colOffset] = static_cast(0.); else - col[colOffset] = vol[shape::getOffset(0, volShapeInfo + 1, volShapeInfo + volRank + 1, coords, volRank)]; + col[colOffset] = 
vol[shape::getOffset(volShapeInfo, coords)]; } ////////////////////////////////////////////////////////////////////////// @@ -149,9 +150,9 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape auto coords = sharedMem + threadIdx.x * colRank; - shape::index2coords(volRank, volShapeInfo + 1, volInd, volLen, coords); + shape::index2coords(volInd, volShapeInfo, coords); - const auto volOffset = shape::getOffset(0, volShapeInfo + 1, volShapeInfo + volRank + 1, coords, volRank); + const auto volOffset = shape::getOffset(volShapeInfo, coords); const int imD = coords[2] + pD; const int imH = coords[3] + pH; @@ -181,7 +182,7 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape coords[3] /= dH; coords[4] /= dW; - val += col[shape::getOffset(0, colShapeInfo + 1, colShapeInfo + colRank + 1, coords, colRank)]; + val += col[shape::getOffset(colShapeInfo, coords)]; } } } @@ -268,8 +269,8 @@ static void conv2d_(nd4j::graph::Context& block, const NDArray* input, const NDA //----- add biases if required -----// if(bias) - output->applyBroadcast(broadcast::Add, {indIOioC}, bias); - // helpers::addBias(*output, *bias, isNCHW); + // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCHW); if(!isNCHW) delete input; @@ -283,7 +284,7 @@ void ConvolutionUtils::conv2d(nd4j::graph::Context& block, const NDArray* input, ////////////////////////////////////////////////////////////////////////// template -static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { +static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, mC] always @@ -330,7 +331,8 @@ static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC] if(bias) - output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCHW); if(!isNCHW) delete input; @@ -338,7 +340,7 @@ static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const ////////////////////////////////////////////////////////////////////////// void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { - BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); } ////////////////////////////////////////////////////////////////////////// @@ -735,9 +737,9 
@@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); int dstart = coords[2] * sD - pD; int hstart = coords[3] * sH - pH; @@ -768,7 +770,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) { for (coords[3] = hstart; coords[3] < hend; coords[3] += dH){ for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) { - T val = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + T val = x[shape::getOffset(xShapeInfo, coords)]; if (val > max) max = val; } @@ -784,7 +786,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - sum += x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + sum += x[shape::getOffset(xShapeInfo, coords)]; if (extraParam0 == 0) { //Exclude padding uint a = (dend - dstart) / dD + ((dend - dstart) % dD == 0 ? 0 : 1); @@ -805,7 +807,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0); + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); @@ -885,9 +887,9 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, yShapeInfo + 1, yInd, yLen, coords); + shape::index2coords(yInd, yShapeInfo, coords); - const auto yOffset = shape::getOffset(0, yShapeInfo + 1, yShapeInfo + rank + 1, coords, rank); + const auto yOffset = shape::getOffset(yShapeInfo, coords); int hstart = coords[2] * sH - pH; int wstart = coords[3] * sW - pW; @@ -913,7 +915,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf T max = -DataTypeUtils::max(); for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) { for (coords[3] = wstart; coords[3] < wend; coords[3] += dW){ - T val = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + T val = x[shape::getOffset(xShapeInfo, coords)]; if (val > max) { max = val; coord2 = coords[2]; @@ -923,7 +925,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf } coords[2] = coord2; coords[3] = coord3; - auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + auto zOffset = shape::getOffset(zShapeInfo, coords); nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], y[yOffset]); //z[zOffset] += y[yOffset]; } @@ -941,7 +943,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) for (coords[3] = wstart; coords[3] < wend; 
coords[3] += dW) - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val); + nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], val); } break; @@ -953,14 +955,14 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0); + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); val *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) { for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) { - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(x[xOffset])); } } @@ -1046,9 +1048,9 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, yShapeInfo + 1, yInd, yLen, coords); + shape::index2coords(yInd, yShapeInfo, coords); - const auto yOffset = shape::getOffset(0, yShapeInfo + 1, yShapeInfo + rank + 1, coords, rank); + const auto yOffset = shape::getOffset(yShapeInfo, coords); int dstart = coords[2] * sD - pD; int hstart = coords[3] * sH - pH; @@ -1080,7 +1082,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) { for (coords[3] = hstart; coords[3] < hend; coords[3] += dH){ for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) { - T val = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + T val = x[shape::getOffset(xShapeInfo, coords)]; if (val > max) { max = val; coord2 = coords[2]; @@ -1093,7 +1095,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf coords[2] = coord2; coords[3] = coord3; coords[4] = coord4; - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], y[yOffset]); + nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], y[yOffset]); } break; @@ -1110,7 +1112,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val); + nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], val); } break; @@ -1123,15 +1125,15 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - sum += 
nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0); + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); val *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) { for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) { for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) { - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(x[xOffset])); } } @@ -1363,14 +1365,14 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); coords[dimIH] /= factorH; coords[dimIH + 1] /= factorW; - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } @@ -1431,15 +1433,15 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); coords[dimID] /= factorD; coords[dimID + 1] /= factorH; coords[dimID + 2] /= factorW; - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } @@ -1504,9 +1506,9 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); z[zOffset] = 0; @@ -1515,7 +1517,7 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape for(coords[dimIH] = zCoord2; coords[dimIH] < zCoord2 + factorH; ++coords[dimIH]) for(coords[dimIH + 1] = zCoord3; coords[dimIH + 1] < zCoord3 + factorW; ++coords[dimIH + 1]) - z[zOffset] += x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] += x[shape::getOffset(xShapeInfo, coords)]; } ////////////////////////////////////////////////////////////////////////// @@ -1579,9 +1581,9 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const 
auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); z[zOffset] = 0; @@ -1592,7 +1594,7 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape for(coords[dimID] = zCoord2; coords[dimID] < zCoord2 + factorD; ++coords[dimID]) for(coords[dimID + 1] = zCoord3; coords[dimID + 1] < zCoord3 + factorH; ++coords[dimID + 1]) for(coords[dimID + 2] = zCoord4; coords[dimID + 2] < zCoord4 + factorW; ++coords[dimID + 2]) - z[zOffset] += x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] += x[shape::getOffset(xShapeInfo, coords)]; } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu index e95473739..1cd771b98 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu @@ -58,12 +58,12 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo, for (uint i = tid; i < lenWithoutLastDim; i += totalThreads) { - shape::index2coords(rank - 1, shape::shapeOf(const_cast(xShapeInfo)), i, lenWithoutLastDim, coords); + shape::index2coords(i, rank - 1, xShapeInfo + 1, coords); coords[rank - 1] = 0; - auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coords, rank); - auto yOffset = shape::getOffset(0, shape::shapeOf(const_cast(yShapeInfo)), shape::stride(const_cast(yShapeInfo)), coords, rank); + auto xOffset = shape::getOffset(xShapeInfo, coords); + auto yOffset = shape::getOffset(yShapeInfo, coords); const auto x0 = x[xOffset]; const auto y0 = y[yOffset]; @@ -80,7 +80,7 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo, const auto x2 = x[xOffset]; const auto y2 = y[yOffset]; - auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast(zShapeInfo)), shape::stride(const_cast(zShapeInfo)), coords, rank); + auto zOffset = shape::getOffset(zShapeInfo, coords); z[zOffset] = x1 * y2 - x2 * y1; zOffset += shape::stride(const_cast(zShapeInfo))[rank - 1]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/diag.cu b/libnd4j/include/ops/declarable/helpers/cuda/diag.cu index 0e861b866..fe2d412d9 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/diag.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/diag.cu @@ -42,7 +42,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; for (int t = tid; t < inputLength; t += step) { - z[shape::getIndexOffset(t * (inputLength + 1), outputShape, outputLength)] = x[shape::getIndexOffset(t, inputShape, inputLength)]; //tX]; + z[shape::getIndexOffset(t * (inputLength + 1), outputShape)] = x[shape::getIndexOffset(t, inputShape)]; //tX]; } } @@ -63,7 +63,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha const auto step = gridDim.x * blockDim.x; Nd4jLong i = threadIdx.x * (outputLength + 1); for (int t = tid; t < outputLength && i < inputLength; t += step) { - z[shape::getIndexOffset(t, outputShape, outputLength)] = x[shape::getIndexOffset(i, inputShape, inputLength)]; //tX]; + z[shape::getIndexOffset(t, outputShape)] = x[shape::getIndexOffset(i, inputShape)]; //tX]; i += outputLength + 1; } } diff --git 
a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu index de37ab276..92aa4c55a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu @@ -72,7 +72,7 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo auto xzCoords = sharedMem + threadIdx.x * (xzRank + yRank); auto yCoords = xzCoords + xzRank; - shape::index2coords(xzRank, zShapeInfo + 1, zInd, zLen, xzCoords); + shape::index2coords(zInd, zShapeInfo, xzCoords); const auto zOffset = shape::getOffset(zShapeInfo, xzCoords); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu index 5b4c27bd0..9b2a42d8f 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu @@ -41,7 +41,7 @@ namespace helpers { // if probability is ok - we're saving scaled value if (double(val) < probVal) - output[shape::getIndexOffset(e, outputShape, inLen)] = T(input[shape::getIndexOffset(e, inputShape, inLen)] / probVal); + output[shape::getIndexOffset(e, outputShape)] = T(input[shape::getIndexOffset(e, inputShape)] / probVal); } } @@ -140,11 +140,11 @@ namespace helpers { auto step = blockDim.x * gridDim.x; for (int e = tid; e < len; e += step) { - const auto zOffset = shape::getIndexOffset(e, outputShape, len); + const auto zOffset = shape::getIndexOffset(e, outputShape); // if probability was non-zero on FF step, we'll scale grads back if (output[zOffset] != T(0.)) - output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape, len)] / probValue); + output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape)] / probValue); } } @@ -173,8 +173,8 @@ namespace helpers { for (auto e = tid; e < inLen; e += step) { T val = nodeRng->relativeT(e, T(0.f), T(1.f)); - T xVal = input[shape::getIndexOffset(e, inputShape, inLen)]; - output[shape::getIndexOffset(e, outputShape, inLen)] = (val >= T(probValue) ? T(alpha * beta + alpha1) : T(alpha * (double)xVal + alpha1)); + T xVal = input[shape::getIndexOffset(e, inputShape)]; + output[shape::getIndexOffset(e, outputShape)] = (val >= T(probValue) ? 
T(alpha * beta + alpha1) : T(alpha * (double)xVal + alpha1)); } } template diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu b/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu index 75b541b72..c70283997 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu @@ -57,7 +57,7 @@ namespace nd4j { for (Nd4jLong e = threadIdx.x; e < iLimit; e += blockDim.x) { // load set of indices into shared memory if (e < iLength) - rawIndices[threadIdx.x] = i[shape::getIndexOffset(e, iShapeInfo, iLength)]; + rawIndices[threadIdx.x] = i[shape::getIndexOffset(e, iShapeInfo)]; __syncthreads(); // now we need to find out where our actual updates will be mapped @@ -76,7 +76,7 @@ namespace nd4j { // doing actual update if (e < iLength) if (trueIndices[threadIdx.x] >= 0) { - z[trueIndices[threadIdx.x]] = x[shape::getIndexOffset(e, xShapeInfo, xLength)]; + z[trueIndices[threadIdx.x]] = x[shape::getIndexOffset(e, xShapeInfo)]; } __syncthreads(); @@ -97,12 +97,12 @@ namespace nd4j { int outCnt = 0; for (Nd4jLong e = 0; e < iLength; e++) { - if (indices[shape::getIndexOffset(e, iShapeInfo, iLength)] == i) { + if (indices[shape::getIndexOffset(e, iShapeInfo)] == i) { auto dx = x + xTadOffsets[e]; auto dz = z + zTadOffsets[i][outCnt++]; for (int f = threadIdx.x; f < xLength; f += blockDim.x) { - dz[shape::getIndexOffset(f, zTadShapeInfos[i], xLength)] = dx[shape::getIndexOffset(f, xTadShapeInfo, xLength)]; + dz[shape::getIndexOffset(f, zTadShapeInfos[i])] = dx[shape::getIndexOffset(f, xTadShapeInfo)]; } } } @@ -190,9 +190,9 @@ namespace nd4j { auto iLength = shape::length(iShapeInfo); for (int i = threadIdx.x; i < iLength; i += blockDim.x) { - auto idx = indices[shape::getIndexOffset(i, iShapeInfo, iLength)]; + auto idx = indices[shape::getIndexOffset(i, iShapeInfo)]; if (idx >= 0 && idx < zLength) - z[shape::getIndexOffset(idx, zShapeInfo, zLength)] = x[shape::getIndexOffset(i, xShapeInfo, iLength)]; + z[shape::getIndexOffset(idx, zShapeInfo)] = x[shape::getIndexOffset(i, xShapeInfo)]; } } } @@ -215,13 +215,13 @@ namespace nd4j { auto xLength = shape::length(xShapeInfo); for (int i = 0; i < iLength; i++) { - auto idx = indices[shape::getIndexOffset(i, iShapeInfo, iLength)]; + auto idx = indices[shape::getIndexOffset(i, iShapeInfo)]; auto z = bz + zTadOffsets[idx]; auto x = reinterpret_cast(vx[e]) + xTadOffsets[e][i]; for (int f = threadIdx.x; f < zLength; f += blockDim.x) { - z[shape::getIndexOffset(f, zTadShapeInfo, zLength)] = x[shape::getIndexOffset(f, xShapeInfo, xLength)]; + z[shape::getIndexOffset(f, zTadShapeInfo)] = x[shape::getIndexOffset(f, xShapeInfo)]; } __syncthreads(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu b/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu index 9f6501cad..6cbedcc2a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu @@ -66,8 +66,8 @@ namespace helpers { // for (auto pixel = 0; pixel < lastDim; pixel++) { // Nd4jLong zPos[] = {i, j, pos}; // Nd4jLong xPos[] = {row, col, pixel}; -// auto zIndex = shape::getOffset(0, shape::shapeOf(outTadShape), shape::stride(outTadShape), zPos, 3); -// auto xIndex = shape::getOffset(0, shape::shapeOf(patchShape), shape::stride(patchShape), xPos, 3); +// auto zIndex = shape::getOffset(outTadShape, zPos); +// auto xIndex = shape::getOffset(patchShape, xPos); // if (theSame) { // SAME case // if (row >= 0 && col >= 0 && 
row < rowDim && col < colDim) // matrix[zIndex] = patch[xIndex]; //outMatrix->p(i, j, pos, patch->e(row, col, pixel)); @@ -86,18 +86,6 @@ namespace helpers { template static __global__ void globalExtractPatchesKernel(bool theSame, int batchCount, int sizeRow, int sizeCol, int rowDim, int colDim, int outRowDim, int outColDim, int strideRow, int strideCol, int rateRow, int rateCol, int rowCast, int colCast, int lastDim, T* input, Nd4jLong* patchShape, Nd4jLong* inputOffsets, T* output, Nd4jLong* outTadShape, Nd4jLong* outputOffsets) { - __shared__ Nd4jLong* xShapeOf; - __shared__ Nd4jLong* xStrideOf; - __shared__ Nd4jLong* zShapeOf; - __shared__ Nd4jLong* zStrideOf; - - if (0 == threadIdx.x) { - xShapeOf = shape::shapeOf(patchShape); - xStrideOf = shape::stride(patchShape); - zShapeOf = shape::shapeOf(outTadShape); - zStrideOf = shape::stride(outTadShape); - } - __syncthreads(); auto start = threadIdx.x + blockIdx.x * blockDim.x; @@ -128,7 +116,7 @@ namespace helpers { bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame); if (setUp) { // VALID or SAME cases - outMatrix[shape::getOffset(0, zShapeOf, zStrideOf, zPos, 3)] = patch[shape::getOffset(0, xShapeOf, xStrideOf, xPos, 3)]; + outMatrix[shape::getOffset(outTadShape, zPos)] = patch[shape::getOffset(patchShape, xPos)]; } pos++; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu index 6a818a2cd..df4e25130 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu @@ -35,17 +35,11 @@ namespace nd4j { auto xBuffer = reinterpret_cast(xBuffers[e]); auto xShapeInfo = xShapeInfos[e]; - auto xShape = shape::shapeOf(xShapeInfo); - auto xStride = shape::stride(xShapeInfo); - auto xRank = shape::rank(xShapeInfo); auto xLength = shape::length(xShapeInfo); // each element of this input array has own place within common output array - for (uint i = threadIdx.x; i < xLength; i += blockDim.x) { - shape::index2coords(xRank, xShape, i, xLength, xCoord, order); - auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank); - z[i] = xBuffer[xOffset]; - } + for (uint i = threadIdx.x; i < xLength; i += blockDim.x) + z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu index 4eb5450a3..308e58814 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu @@ -52,10 +52,9 @@ namespace helpers { auto step = blockDim.x * gridDim.x; for (int j = start; j < zLen; j += step) { - auto zIndex = shape::getIndexOffset(j, zShapeInfo, zLen); - auto yIndex = shape::getIndexOffset(j, yShapeInfo, yLen); - auto xIndex = shape::getIndexOffset(y[yIndex], xShapeInfo, xLen); - //printf("%lld , %lld\n", zIndex, xIndex); + auto zIndex = shape::getIndexOffset(j, zShapeInfo); + auto yIndex = shape::getIndexOffset(j, yShapeInfo); + auto xIndex = shape::getIndexOffset(y[yIndex], xShapeInfo); z[zIndex] = x[xIndex]; } } @@ -76,15 +75,14 @@ __global__ static void gatherCuda(const int numOfSubArrs, for (int i = blockIdx.x; i < numOfSubArrs; i += gridDim.x) { if (threadIdx.x == 0) { - x = reinterpret_cast(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo, numOfSubArrs)]]; + x = reinterpret_cast(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo)]]; z = reinterpret_cast(vz) + zOffsets[i]; } 
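// A hedged sketch (not the library implementation) of the signature change this patch applies
// everywhere: shape::getIndexOffset(i, shapeInfo, length) becomes shape::getIndexOffset(i, shapeInfo),
// and shape::getOffset(0, shapeOf, strideOf, coords, rank) becomes shape::getOffset(shapeInfo, coords).
// Assuming the usual libnd4j shapeInfo layout [rank, shape[0..rank), stride[0..rank), ...] — the same
// layout the hunks rely on when they index shapeInfo + 1 for shape and shapeInfo + rank + 1 for
// strides — the two-argument getOffset can recover everything from shapeInfo alone. getOffsetSketch
// is a hypothetical name used only for illustration:
//
//     Nd4jLong getOffsetSketch(const Nd4jLong* shapeInfo, const Nd4jLong* coords) {
//         const Nd4jLong rank    = shapeInfo[0];            // rank is the first element
//         const Nd4jLong* stride = shapeInfo + 1 + rank;    // strides follow the shape block
//         Nd4jLong offset = 0;
//         for (Nd4jLong d = 0; d < rank; ++d)
//             offset += coords[d] * stride[d];              // dot product of coords and strides
//         return offset;
//     }
//
// The leading 0 argument in the old calls was a base offset that is always zero at these call
// sites, which is presumably why the new overload drops it along with the redundant
// shapeOf()/stride()/rank arguments.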
__syncthreads(); for (int j = threadIdx.x; j < len; j += blockDim.x) { - auto zIndex = shape::getIndexOffset(j, zShapeInfo, len); - auto xIndex = shape::getIndexOffset(j, xShapeInfo, len); - //printf("%lld , %lld\n", zIndex, xIndex); + auto zIndex = shape::getIndexOffset(j, zShapeInfo); + auto xIndex = shape::getIndexOffset(j, xShapeInfo); z[zIndex] = x[xIndex]; } __syncthreads(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu index 6587b4ca7..11ba6571b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu @@ -83,9 +83,9 @@ namespace nd4j { for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(zRank, zShapeInfo + 1, i, zLen, zCoordStart); + shape::index2coords(i, zShapeInfo, zCoordStart); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + zRank + 1, zCoordStart, zRank); + const auto zOffset = shape::getOffset(zShapeInfo, zCoordStart); // last y coordinate int coordToRestore; @@ -93,7 +93,7 @@ namespace nd4j { coordToRestore = static_cast(zCoordStart[yRank - 1]); zCoordStart[yRank - 1] = 0; // last y coordinate - const auto yOffset = shape::getOffset(0, yShapeInfo + 1, yShapeInfo + yRank + 1, zCoordStart, yRank); + const auto yOffset = shape::getOffset(yShapeInfo, zCoordStart); //restore z coordinate if(yLastDim != xRank) @@ -103,7 +103,7 @@ namespace nd4j { for(uint j = 0; j < yLastDim; ++j) xCoordStart[j] = y[yOffset + j * yShapeInfo[2 * yRank]]; // last stride - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, xCoordStart, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, xCoordStart); z[zOffset] = x[xOffset]; printf("z[%lld] = x[%lld] = %f\n", zOffset, xOffset, (float) z[zOffset]); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu b/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu index 3bc30e373..9802ff231 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu @@ -43,8 +43,8 @@ namespace nd4j { auto tid = threadIdx.x + blockIdx.x * blockDim.x; for (Nd4jLong e = tid; e < length; e += blockDim.x * gridDim.x) { - auto _x = static_cast(x[shape::getIndexOffset(e, xShapeInfo, length)]); - auto _y = static_cast(y[shape::getIndexOffset(e, yShapeInfo, length)]); + auto _x = static_cast(x[shape::getIndexOffset(e, xShapeInfo)]); + auto _y = static_cast(y[shape::getIndexOffset(e, yShapeInfo)]); // we save intermediate result into shared memory shared[threadIdx.x] += __popcll(_x ^ _y); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu b/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu index 317f1d857..07d7bcd93 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu @@ -55,7 +55,7 @@ __global__ static void histogramFixedWidthCuda( const void* vx, const Nd4jLong* for (Nd4jLong i = tid; i < xLen; i += totalThreads) { - const X value = x[shape::getIndexOffset(i, xShapeInfo, xLen)]; + const X value = x[shape::getIndexOffset(i, xShapeInfo)]; Nd4jLong zIndex; @@ -66,7 +66,7 @@ __global__ static void histogramFixedWidthCuda( const void* vx, const Nd4jLong* else zIndex = static_cast((value - leftEdge) / binWidth); - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getIndexOffset(zIndex, zShapeInfo, nbins)], 1); + 
nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getIndexOffset(zIndex, zShapeInfo)], 1); } } @@ -101,7 +101,7 @@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con // const auto tid = blockIdx.x * gridDim.x + threadIdx.x; // const auto step = gridDim.x * blockDim.x; // for (int t = tid; t < bufferLength; t += step) { -// destination[t] = reinterpret_cast(source)[shape::getIndexOffset(t, sourceShape, bufferLength)]; +// destination[t] = reinterpret_cast(source)[shape::getIndexOffset(t, sourceShape)]; // } // } @@ -110,7 +110,7 @@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con // const auto tid = blockIdx.x * gridDim.x + threadIdx.x; // const auto step = gridDim.x * blockDim.x; // for (int t = tid; t < bufferLength; t += step) { -// reinterpret_cast(destination)[shape::getIndexOffset(t, destinationShape, bufferLength)] = source[t]; +// reinterpret_cast(destination)[shape::getIndexOffset(t, destinationShape)] = source[t]; // } // } @@ -130,7 +130,7 @@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con // for(auto i = tid; i < inputLength; i += step) { -// const T value = x[shape::getIndexOffset(i, inputShape, inputLength)]; +// const T value = x[shape::getIndexOffset(i, inputShape)]; // Nd4jLong currInd = static_cast((value - leftEdge) / binWidth); // if(value < secondEdge) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu index f2fb9d94a..62fcd0588 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu @@ -64,9 +64,9 @@ __global__ static void im2colCuda(const void *image, void *columns, auto coords = sharedMem + threadIdx.x * colRank; - shape::index2coords(colRank, colShapeInfo + 1, colInd, colLen, coords); + shape::index2coords(colInd, colShapeInfo, coords); - const auto colOffset = shape::getOffset(0, colShapeInfo + 1, colShapeInfo + colRank + 1, coords, colRank); + const auto colOffset = shape::getOffset(colShapeInfo, coords); coords[2] = (-pH + coords[2] * dH) + coords[4] * sH; // imH coords[3] = (-pW + coords[3] * dW) + coords[5] * sW; // imW @@ -74,7 +74,7 @@ __global__ static void im2colCuda(const void *image, void *columns, if (static_cast(coords[2]) >= static_cast(iH) || static_cast(coords[3]) >= static_cast(iW)) col[colOffset] = zeroPadVal; else - col[colOffset] = im[shape::getOffset(0, imShapeInfo + 1, imShapeInfo + imRank + 1, coords, imRank)]; + col[colOffset] = im[shape::getOffset(imShapeInfo, coords)]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu index 431524bf3..715792a8c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu @@ -196,8 +196,8 @@ namespace helpers { for (Nd4jLong e = start; e < channels; e += step) { Nd4jLong posX[] = {b, inY, inX, e}; Nd4jLong posZ[] = {b, y, x, e}; - auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 4); - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), posZ, 4); + auto xIndex = shape::getOffset(inputShape, posX); + auto zIndex = shape::getOffset(outputShape, posZ); output[zIndex] = input[xIndex]; } } @@ -284,10 +284,10 @@ namespace helpers { Nd4jLong y1Pos[] = {b, 0}; Nd4jLong y2Pos[] = {b, 2}; Nd4jLong x2Pos[] = {b, 3}; - Z y1 = 
boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), y1Pos, 2)];//->t(b, 0)]; - Z x1 = boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), x1Pos, 2)]; - Z y2 = boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), y2Pos, 2)]; - Z x2 = boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), x2Pos, 2)]; + Z y1 = boxes[shape::getOffset(boxesShape, y1Pos)];//->t(b, 0)]; + Z x1 = boxes[shape::getOffset(boxesShape, x1Pos)]; + Z y2 = boxes[shape::getOffset(boxesShape, y2Pos)]; + Z x2 = boxes[shape::getOffset(boxesShape, x2Pos)]; int bIn = indices[b]; if (bIn >= batchSize) { @@ -308,7 +308,7 @@ namespace helpers { auto step = blockDim.z * gridDim.z; for (int d = start; d < depth; d += step) { Nd4jLong zPos[] = {b, y, x, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); + auto zIndex = shape::getOffset(outputShape, zPos); output[zIndex] = (Z)extrapolationVal; //crops->p(b, y, x, d, extrapolationVal); } @@ -329,7 +329,7 @@ namespace helpers { auto step = blockDim.z * gridDim.z; for (int d = start; d < depth; d += step) { Nd4jLong zPos[] = {b, y, x, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); + auto zIndex = shape::getOffset(outputShape, zPos); output[zIndex] = (Z)extrapolationVal; // crops->p(b, y, x, d, extrapolationVal); } @@ -346,14 +346,14 @@ namespace helpers { Nd4jLong topRightPos[] = {bIn, topYIndex, right_x_index, d}; Nd4jLong bottomLeftPos[] = {bIn, bottomYIndex, left_x_index, d}; Nd4jLong bottomRightPos[] = {bIn, bottomYIndex, right_x_index, d}; - const T topLeft(images[shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), topLeftPos, 4)]); //->e(bIn, topYIndex, left_x_index, d)); - const T topRight(images[shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), topRightPos, 4)]); //->e(bIn, topYIndex, right_x_index, d)); - const T bottomLeft(images[shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), bottomLeftPos, 4)]);//->e(bIn, bottomYIndex, left_x_index, d)); - const T bottomRight(images[shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), bottomRightPos, 4)]); //->e(bIn, bottomYIndex, right_x_index, d)); + const T topLeft(images[shape::getOffset(imagesShape, topLeftPos)]); //->e(bIn, topYIndex, left_x_index, d)); + const T topRight(images[shape::getOffset(imagesShape, topRightPos)]); //->e(bIn, topYIndex, right_x_index, d)); + const T bottomLeft(images[shape::getOffset(imagesShape, bottomLeftPos)]);//->e(bIn, bottomYIndex, left_x_index, d)); + const T bottomRight(images[shape::getOffset(imagesShape, bottomRightPos)]); //->e(bIn, bottomYIndex, right_x_index, d)); const T top = topLeft + (topRight - topLeft) * x_lerp; const T bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; Nd4jLong zPos[] = {b, y, x, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); + auto zIndex = shape::getOffset(outputShape, zPos); output[zIndex] = Z(top + (bottom - top) * y_lerp); // crops->p(b, y, x, d, top + (bottom - top) * y_lerp); } @@ -368,7 +368,7 @@ namespace helpers { auto step = blockDim.z * gridDim.z; for (int d = start; d < depth; d += step) { Nd4jLong zPos[] = {b, y, x, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); + auto zIndex = shape::getOffset(outputShape, 
zPos); output[zIndex] = (Z)extrapolationVal; } continue; @@ -380,8 +380,8 @@ namespace helpers { for (int d = start; d < depth; d += step) { Nd4jLong zPos[] = {b, y, x, d}; Nd4jLong xPos[] = {bIn, closestYIndex, closestXIndex, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); - auto xIndex = shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), xPos, 4); + auto zIndex = shape::getOffset(outputShape, zPos); + auto xIndex = shape::getOffset(imagesShape, xPos); output[zIndex] = images[xIndex]; // crops->p(b, y, x, d, images->e(bIn, closestYIndex, closestXIndex, d)); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu index d96c1efa2..d221ae023 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu @@ -37,16 +37,15 @@ namespace helpers { Nd4jLong next1[] = {nextIndex, 1}; Nd4jLong next2[] = {nextIndex, 2}; Nd4jLong next3[] = {nextIndex, 3}; - Nd4jLong* shapeOf = shape::shapeOf(boxesShape); - Nd4jLong* strideOf = shape::stride(boxesShape); - T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, previous0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous2, 2)]); - T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, previous1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous3, 2)]); - T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, previous0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous2, 2)]); - T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, previous1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous3, 2)]); - T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, next0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next2, 2)]); - T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, next1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next3, 2)]); - T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, next0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next2, 2)]); - T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, next1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next3, 2)]); + + T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); + T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev); T 
areaNext = (maxYNext - minYNext) * (maxXNext - minXNext); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu index f0d1df1cc..ec4fd2a97 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu @@ -47,10 +47,10 @@ namespace helpers { Nd4jLong pos[] = {i, i - 1}; Nd4jLong posX[] = {i, i}; Nd4jLong posY[] = {i - 1, i - 1}; - auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); - auto dxIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2); - auto dyIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2); - auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); + auto xIndex = shape::getOffset(inputShape, pos); + auto dxIndex = shape::getOffset(inputShape, posX); + auto dyIndex = shape::getOffset(inputShape, posY); + auto zIndex = shape::getOffset(invertedShape, pos); // invert lower triangular matrix inverted[zIndex] = -input[xIndex] / (input[dxIndex] * input[dyIndex]); // math::atomics::nd4j_atomicAdd(&inverted[zIndex], - input[xIndex] * inverted[iIndex] / input[dIndex]); @@ -69,8 +69,8 @@ namespace helpers { for (int i = start; i < n; i += step) { Nd4jLong pos[] = {i, i}; - auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); - auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); + auto xIndex = shape::getOffset(inputShape, pos); + auto zIndex = shape::getOffset(invertedShape, pos); // math::atomics::nd4j_atomicDiv(&inverted[zIndex], input[xIndex]); // invert diagonal elements inverted[zIndex] /= input[xIndex]; @@ -85,18 +85,9 @@ namespace helpers { __shared__ T* inverted; __shared__ T* input; - __shared__ Nd4jLong* inputStride; - __shared__ Nd4jLong* invertedStride; - __shared__ Nd4jLong* invertedShapeOf; - __shared__ Nd4jLong* inputShapeOf; if (threadIdx.x == 0) { inverted = reinterpret_cast(invertedBuf); input = reinterpret_cast(inputBuf); - inputStride = shape::stride(inputShape); - invertedStride = shape::stride(invertedShape); - invertedShapeOf = shape::shapeOf(invertedShape); - inputShapeOf = shape::shapeOf(inputShape); - } __syncthreads(); @@ -106,9 +97,9 @@ namespace helpers { for (int i = start; i < n - 1; i += step) { Nd4jLong pos[] = {i, i + 1}; Nd4jLong posX[] = {i + 1, i + 1}; - auto xIndex = shape::getOffset(0, inputShapeOf, shape::stride(inputShape), pos, 2); - auto iIndex = shape::getOffset(0, invertedShapeOf, invertedStride, posX, 2); - auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStride, pos, 2); + auto xIndex = shape::getOffset(inputShape, pos); + auto iIndex = shape::getOffset(invertedShape, posX); + auto zIndex = shape::getOffset(invertedShape, pos); // invert upper matrix math::atomics::nd4j_atomicAdd(&inverted[zIndex], -input[xIndex] * inverted[iIndex]); // / input[yIndex]); //inputMatrix->t(i, i + 1) * invertedMatrix->t(i + 1, i + 1) / inputMatrix->t(i, i) @@ -130,12 +121,10 @@ namespace helpers { Nd4jLong posX[] = {i, k}; Nd4jLong posD[] = {i, i}; - auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2); - auto yIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posY, - 2); - auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2); - auto 
zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posZ, - 2); + auto xIndex = shape::getOffset(inputShape, posX); + auto yIndex = shape::getOffset(invertedShape, posY); + auto dIndex = shape::getOffset(inputShape, posD); + auto zIndex = shape::getOffset(invertedShape, posZ); // invert non-diagonal elements math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex] / input[dIndex]); } @@ -149,18 +138,10 @@ namespace helpers { invertUpKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) { __shared__ T* inverted; __shared__ T* input; - __shared__ Nd4jLong* inputShapeOf; - __shared__ Nd4jLong* invertedShapeOf; - __shared__ Nd4jLong* invertedStrideOf; - __shared__ Nd4jLong* inputStrideOf; if (threadIdx.x == 0) { inverted = reinterpret_cast<T*>(invertedBuf); input = reinterpret_cast<T*>(inputBuf); - inputShapeOf = shape::shapeOf(inputShape); - invertedShapeOf = shape::shapeOf(invertedShape); - inputStrideOf = shape::stride(inputShape); - invertedStrideOf = shape::stride(invertedShape); } __syncthreads(); @@ -171,9 +152,9 @@ namespace helpers { Nd4jLong posY[] = {k, j}; Nd4jLong posX[] = {i, k}; // inversion with Gauss-Jordan transformation - auto xIndex = shape::getOffset(0, inputShapeOf, inputStrideOf, posX, 2); - auto yIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posY, 2); - auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posZ, 2); + auto xIndex = shape::getOffset(inputShape, posX); + auto yIndex = shape::getOffset(invertedShape, posY); + auto zIndex = shape::getOffset(invertedShape, posZ); // invert upper non-diagonal elements math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]); } @@ -289,7 +270,7 @@ namespace helpers { auto step = blockDim.x * gridDim.x; for (int k = pos + start, j = start; j < n2; k += step, j += step) { - auto xIndex = shape::getIndexOffset(k, inputShape, inputLen); + auto xIndex = shape::getIndexOffset(k, inputShape); matrix[j] = (F) inputBuf[xIndex]; } } @@ -315,7 +296,7 @@ namespace helpers { auto step = blockDim.x * gridDim.x; for (int k = pos + start, j = start; j < n2; k += step, j += step) { - auto zIndex = shape::getIndexOffset(k, outputShape, outputLen); + auto zIndex = shape::getIndexOffset(k, outputShape); outputBuf[zIndex] = matrix[j]; } } @@ -331,7 +312,7 @@ namespace helpers { for (auto i = start; i < rowNum; i += step) { int val = source[i] - 1; Nd4jLong posF[] = {i, val}; - auto pos = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), posF, 2); + auto pos = shape::getOffset(shape, posF); permutation[pos] = F(1.f); } } @@ -522,7 +503,7 @@ namespace helpers { lup_(context, &matrix, nullptr, nullptr); // else // lup_(context, &matrix, nullptr, nullptr); - auto offset = shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf()); + auto offset = shape::getIndexOffset(e, output->shapeInfo()); auto inputBuf = reinterpret_cast(matrix.specialBuffer()); auto outputBuf = reinterpret_cast(output->specialBuffer()) + offset; // if (matrix.dataType() == input->dataType()) @@ -570,7 +551,7 @@ namespace helpers { lup_(context, &matrix, nullptr, nullptr); // else // lup_(context, &matrix, nullptr, nullptr); - auto offset = shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf()); + auto offset = shape::getIndexOffset(e, output->shapeInfo()); auto inputBuf = reinterpret_cast(matrix.specialBuffer()); auto outputBuf = reinterpret_cast(output->specialBuffer()) + offset; // if 
(matrix.dataType() == input->dataType()) @@ -596,34 +577,11 @@ namespace helpers { fillLowerUpperKernel(void *lowerBuf, Nd4jLong *lowerShape, void *upperBuf, Nd4jLong *upperShape, void *matrixBuf, Nd4jLong *matrixShape, Nd4jLong n) { - __shared__ - Nd4jLong *xShapeOf; - __shared__ - Nd4jLong *yShapeOf; - __shared__ - Nd4jLong *zShapeOf; - __shared__ - Nd4jLong *xStrideOf; - __shared__ - Nd4jLong *yStrideOf; - __shared__ - Nd4jLong *zStrideOf; - __shared__ - T *lowerMatrix; - __shared__ - T *upperMatrix; - __shared__ - T *matrix; + __shared__ T *lowerMatrix; + __shared__ T *upperMatrix; + __shared__ T *matrix; if (threadIdx.x == 0) { - xShapeOf = shape::shapeOf(lowerShape); - xStrideOf = shape::stride(lowerShape); - - yShapeOf = shape::shapeOf(upperShape); - yStrideOf = shape::stride(upperShape); - - zShapeOf = shape::shapeOf(matrixShape); - zStrideOf = shape::stride(matrixShape); lowerMatrix = reinterpret_cast(lowerBuf); upperMatrix = reinterpret_cast(upperBuf); matrix = reinterpret_cast(matrixBuf); @@ -634,10 +592,10 @@ namespace helpers { for (int j = threadIdx.x; j < n; j += blockDim.x) { Nd4jLong posX[] = {k, j}; Nd4jLong posD[] = {j, j}; - auto xPos = shape::getOffset(0, xShapeOf, xStrideOf, posX, 2); - auto yPos = shape::getOffset(0, yShapeOf, yStrideOf, posX, 2); - auto iPos = shape::getOffset(0, zShapeOf, zStrideOf, posX, 2); - auto dPos = shape::getOffset(0, zShapeOf, zStrideOf, posD, 2); + auto xPos = shape::getOffset(lowerShape, posX); + auto yPos = shape::getOffset(upperShape, posX); + auto iPos = shape::getOffset(matrixShape, posX); + auto dPos = shape::getOffset(matrixShape, posD); if (k >= j) lowerMatrix[xPos] = matrix[iPos];//(k, j); else @@ -850,18 +808,14 @@ namespace helpers { T *output = outputBuf; T *input = inputBuf; - Nd4jLong *shapeOf = shape::shapeOf(tadShape); - Nd4jLong *strideOf = shape::stride(tadShape); - for (auto i = blockIdx.x; i < batchNum; i += gridDim.x) { T *current = input + tadOffsets[i]; - auto zIndex = shape::getIndexOffset(i, outputShape, batchNum); + auto zIndex = shape::getIndexOffset(i, outputShape); for (auto e = threadIdx.x; e < n; e += blockDim.x) { Nd4jLong diag[] = {e, e}; - auto xIndex = shape::getOffset(0, shapeOf, strideOf, diag, 2); - math::atomics::nd4j_atomicAdd(&output[zIndex], - math::nd4j_log(current[xIndex] * current[xIndex])); + auto xIndex = shape::getOffset(tadShape, diag); + math::atomics::nd4j_atomicAdd(&output[zIndex],math::nd4j_log(current[xIndex] * current[xIndex])); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu index 01baaffb4..a3c754cf5 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu @@ -61,14 +61,14 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) { - shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords); + shape::index2coords(i, xShapeInfo, coords); - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank); - const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = areSameOffsets ? 
xOffset : shape::getOffset(zShapeInfo, coords); // condition to be on diagonal of innermost matrix if(coords[xRank - 2] == coords[xRank - 1]) - z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords, xRank - 1)]; + z[zOffset] = y[shape::getOffset(yShapeInfo, coords)]; else z[zOffset] = zeroPad ? static_cast(0) : x[xOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu index 41b71f5d7..e72ab1f5c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu @@ -40,11 +40,9 @@ namespace helpers { for (Nd4jLong i = blockIdx.y; i < rows; i += gridDim.y) { for (Nd4jLong j = threadIdx.x; j < cols; j += totalThreads) { Nd4jLong coords[2] = {i, j}; - Nd4jLong tadOffsetOut = shape::getOffset(0, shape::shapeOf(tadOnlyOutputShapeInfo), - shape::stride(tadOnlyOutputShapeInfo), coords, 2); - Nd4jLong tadOffsetIn = shape::getOffset(0, shape::shapeOf(tadOnlyInputShapeInfo), - shape::stride(tadOnlyInputShapeInfo), coords, 2); - //shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength) + Nd4jLong tadOffsetOut = shape::getOffset(tadOnlyOutputShapeInfo, coords); + Nd4jLong tadOffsetIn = shape::getOffset(tadOnlyInputShapeInfo, coords); + //shape::getIndexOffset(j, tadOnlyOutputShapeInfo) if (i >= j) { // check lower diagonals if (lowerBand > 0) { if ((i - j) > lowerBand) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu index a83067f01..ea428acb2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu @@ -41,9 +41,9 @@ namespace helpers { auto xOffset = tadOutputOffsets[i]; for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) { Nd4jLong coords[2] = {j, j}; - Nd4jLong tadOffset = shape::getOffset(0, shape::shapeOf(tadOnlyInputShapeInfo), shape::stride(tadOnlyInputShapeInfo), coords, 2); - //shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength) - *(reinterpret_cast(outputBuffer) + xOffset + shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength)) = *(reinterpret_cast(inputBuffer) + yOffset + tadOffset); + Nd4jLong tadOffset = shape::getOffset(tadOnlyInputShapeInfo, coords); + //shape::getIndexOffset(j, tadOnlyOutputShapeInfo) + *(reinterpret_cast(outputBuffer) + xOffset + shape::getIndexOffset(j, tadOnlyOutputShapeInfo)) = *(reinterpret_cast(inputBuffer) + yOffset + tadOffset); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu b/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu index d5af6328a..aa129ee8e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu @@ -27,12 +27,12 @@ namespace ops { namespace helpers { template - static _CUDA_G void indicesFiller(void *vz, Nd4jLong *zShapeInfo, Nd4jLong zLength, Nd4jLong part, Nd4jLong bSize) { + static _CUDA_G void indicesFiller(void *vz, Nd4jLong *zShapeInfo, Nd4jLong part, Nd4jLong bSize) { auto z = reinterpret_cast(vz); for (int b = blockIdx.x; b < bSize; b += gridDim.x) { for (Nd4jLong e = threadIdx.x; e < part; e += blockDim.x) { - z[shape::getIndexOffset(e + b * part, zShapeInfo, zLength)] = static_cast(e); + z[shape::getIndexOffset(e + b * part, zShapeInfo)] = static_cast(e); } } } @@ -74,7 +74,7 @@ namespace helpers { auto total = 
input->lengthOf(); auto part = total / bSize; - indicesFiller<<<256, 256, 1024, *block.launchContext()->getCudaStream()>>>(indices->specialBuffer(), indices->specialShapeInfo(), indices->lengthOf(), part, bSize); + indicesFiller<<<256, 256, 1024, *block.launchContext()->getCudaStream()>>>(indices->specialBuffer(), indices->specialShapeInfo(), part, bSize); /* for (int k = 0; k < total; ) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/merge.cu b/libnd4j/include/ops/declarable/helpers/cuda/merge.cu index 27c8fc630..14fda24ec 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/merge.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/merge.cu @@ -47,7 +47,7 @@ namespace nd4j { for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); - auto val = x[shape::getIndexOffset(e, xShape, length)];; + auto val = x[shape::getIndexOffset(e, xShape)]; if (mVal < val) { mIdx = static_cast(i); mVal = val; @@ -55,7 +55,7 @@ } __syncthreads(); - output[shape::getIndexOffset(e, outputShape, length)] = mIdx; + output[shape::getIndexOffset(e, outputShape)] = mIdx; } } @@ -105,13 +105,13 @@ namespace nd4j { for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); - auto val = x[shape::getIndexOffset(e, xShape, length)];; + auto val = x[shape::getIndexOffset(e, xShape)]; if (mVal < val) mVal = val; } __syncthreads(); - output[shape::getIndexOffset(e, outputShape, length)] = mVal; + output[shape::getIndexOffset(e, outputShape)] = mVal; } } @@ -160,10 +160,10 @@ namespace nd4j { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); - sum += x[shape::getIndexOffset(e, xShape, length)]; + sum += x[shape::getIndexOffset(e, xShape)]; } - output[shape::getIndexOffset(e, outputShape, length)] = sum / numArrays; + output[shape::getIndexOffset(e, outputShape)] = sum / numArrays; } } @@ -213,10 +213,10 @@ namespace nd4j { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); - sum += x[shape::getIndexOffset(e, xShape, length)]; + sum += x[shape::getIndexOffset(e, xShape)]; } - output[shape::getIndexOffset(e, outputShape, length)] = sum; + output[shape::getIndexOffset(e, outputShape)] = sum; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu b/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu index ea4a1e146..399447c9a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu @@ -55,8 +55,8 @@ namespace helpers { } } else { for (int i = threadIdx.x; i < length; i += blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = x[xOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu b/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu index 3b80f3df9..50a5a4025 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu @@ -31,22 +31,21 @@ namespace helpers { template <typename T> static __global__ void fillUpElementKernel(void* outputBuffer, Nd4jLong* outputShapeInfo, void* inputBuffer, Nd4jLong* inputShapeInfo, Nd4jLong* pTadShape, Nd4jLong* pTadOffsets, Nd4jLong n) { - __shared__ Nd4jLong bufferLength, arrLen; 
+ __shared__ Nd4jLong bufferLength; auto z = reinterpret_cast(outputBuffer); auto x = reinterpret_cast(inputBuffer); - if (threadIdx.x == 0) { - arrLen = shape::length(pTadShape); + if (threadIdx.x == 0) bufferLength = shape::length(outputShapeInfo); - } + __syncthreads(); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; for (int t = tid; t < bufferLength; t += step) { auto tX = x + pTadOffsets[t]; - z[shape::getIndexOffset(t, outputShapeInfo, bufferLength)] = tX[shape::getIndexOffset(n, pTadShape, arrLen)]; //tX]; + z[shape::getIndexOffset(t, outputShapeInfo)] = tX[shape::getIndexOffset(n, pTadShape)]; //tX]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu index 53b983d09..c0d1d95dc 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu @@ -61,14 +61,14 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(zRank, shape::shapeOf(const_cast(zShapeInfo)), i, zLen, coord); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast(zShapeInfo)), shape::stride(const_cast(zShapeInfo)), coord, zRank); + shape::index2coords(i, zShapeInfo, coord); + const auto zOffset = shape::getOffset(zShapeInfo, coord); const auto depthCoord = coord[axis]; for (uint j = axis; j < zRank - 1; ++j) coord[j] = coord[j + 1]; - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coord, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coord); const Nd4jLong idx = x[xOffset]; z[zOffset] = depthCoord == idx ? on : off; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu index e19ddcb1b..aede6243a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu @@ -48,7 +48,7 @@ namespace nd4j { auto z = reinterpret_cast(vz); __shared__ int rank, rankMinusOne; - __shared__ Nd4jLong zLen, yLen, totalThreads, *coords, *xShape, *zShape, *xStride, *zStride, shift1, shift2, yStride0; + __shared__ Nd4jLong zLen, totalThreads, *coords, *xShape, *zShape, shift1, shift2, yStride0; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; @@ -56,12 +56,9 @@ namespace nd4j { zLen = shape::length(zShapeInfo); xShape = shape::shapeOf(const_cast(xShapeInfo)); zShape = shape::shapeOf(const_cast(zShapeInfo)); - xStride = shape::stride(const_cast(xShapeInfo)); - zStride = shape::stride(const_cast(zShapeInfo)); yStride0 = shape::stride(const_cast(yShapeInfo))[0]; rank = shape::rank(xShapeInfo); zLen = shape::length(zShapeInfo); - yLen = 2 * rank; rankMinusOne = rank - 1; totalThreads = gridDim.x * blockDim.x; shift1 = mode == 1 ? 
0 : 1; // REFLECT : SYMMETRIC @@ -78,19 +75,19 @@ namespace nd4j { for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(rank, zShape, i, zLen, xzCoord); - const auto zOffset = shape::getOffset(0, zShape, zStride, xzCoord, rank); + shape::index2coords(i, zShapeInfo, xzCoord); + const auto zOffset = shape::getOffset(zShapeInfo, xzCoord); bool within = true; for(int j = rankMinusOne; j >= 0; --j) { if(xShape[j] == zShape[j]) continue; - const auto left = y[shape::getIndexOffset(yStride0 * j, yShapeInfo, yLen)]; + const auto left = y[shape::getIndexOffset(yStride0 * j, yShapeInfo)]; if(xzCoord[j] < left || xzCoord[j] >= left + xShape[j]) {within = false; break;} else {xzCoord[j] = xzCoord[j] - left;} } if(within) - z[zOffset] = x[shape::getOffset(0, xShape, xStride, xzCoord, rank)]; + z[zOffset] = x[shape::getOffset(xShapeInfo, xzCoord)]; else z[zOffset] = padVal; } @@ -99,18 +96,18 @@ namespace nd4j { for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(rank, zShape, i, zLen, xzCoord); - const auto zOffset = shape::getOffset(0, zShape, zStride, xzCoord, rank); + shape::index2coords(i, zShapeInfo, xzCoord); + const auto zOffset = shape::getOffset(zShapeInfo, xzCoord); for(int j = rankMinusOne; j >= 0; --j) { if(xShape[j] == zShape[j]) continue; - xzCoord[j] = xzCoord[j] - y[shape::getIndexOffset(yStride0 * j, yShapeInfo, yLen)]; // are ready to fill middle (within input dimension range) + xzCoord[j] = xzCoord[j] - y[shape::getIndexOffset(yStride0 * j, yShapeInfo)]; // are ready to fill middle (within input dimension range) if(xzCoord[j] < 0) xzCoord[j] = -xzCoord[j] - shift1; // means fill from left else if(xzCoord[j] >= xShape[j]) xzCoord[j] = 2 * xShape[j] - xzCoord[j] - shift2; // means fill from right } - const auto xOffset = shape::getOffset(0, xShape, xStride, xzCoord, rank); + const auto xOffset = shape::getOffset(xShapeInfo, xzCoord); z[zOffset] = x[xOffset]; } } @@ -164,14 +161,14 @@ namespace nd4j { auto step = blockDim.x * gridDim.x; for(int i = start; i < zLen; i+= step) { - auto zIndex = shape::getIndexOffset(i, zShape, zLen); - auto xIndex = shape::getIndexOffset(len - i, xShape, xLen); + auto zIndex = shape::getIndexOffset(i, zShape); + auto xIndex = shape::getIndexOffset(len - i, xShape); if (i < leftSide) // left side - xIndex = shape::getIndexOffset(leftSideCorrected - i, xShape, xLen); + xIndex = shape::getIndexOffset(leftSideCorrected - i, xShape); else if(i >= leftSide && i < leftSide + xLen) // middle - xIndex = shape::getIndexOffset(i - leftSide, xShape, xLen); + xIndex = shape::getIndexOffset(i - leftSide, xShape); // else // right side // z[i] = x[len - i]; @@ -187,8 +184,6 @@ namespace nd4j { __shared__ I const* pads; __shared__ F* z; __shared__ Nd4jLong zRank, rank; - __shared__ Nd4jLong* xShapeOf, *xStrideOf, *padsShapeOf, *padsStrideOf; - __shared__ Nd4jLong* zShapeOf, *zStrideOf; __shared__ Nd4jLong* xIdx; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; @@ -198,13 +193,6 @@ namespace nd4j { x = reinterpret_cast(vx);// pads = reinterpret_cast(paddings); z = reinterpret_cast(vz); - xShapeOf = shape::shapeOf(xShape); - xStrideOf = shape::stride(xShape); - zShapeOf = shape::shapeOf(zShape); - zRank = shape::rank(zShape); - zStrideOf = shape::stride(zShape); - padsShapeOf = shape::shapeOf(paddingShape); - padsStrideOf = shape::stride(paddingShape); } __syncthreads(); auto start = threadIdx.x + blockIdx.x * blockDim.x; @@ -214,14 +202,14 @@ namespace nd4j { auto xzCoord = xIdx + threadIdx.x * rank; //auto 
zxCoord = xIdx + (threadIdx.x + threadIdx.x % 2 + 1) * rank; - shape::index2coords(rank, zShapeOf, i, xzCoord); - auto outOffset = shape::getOffset(0, zShapeOf, zStrideOf, xzCoord, rank); + shape::index2coords(i, zShape, xzCoord); + auto outOffset = shape::getOffset(zShape, xzCoord); // auto intStep = blockDim.y * gridDim.y; for(int j = 0; j < rank; j++) { const Nd4jLong inLen = shape::sizeAt(xShape, j); Nd4jLong coords[2] = {j, 0}; - auto padOffset = shape::getOffset(0, padsShapeOf, padsStrideOf, coords, 2); // padding already has rank 2 + auto padOffset = shape::getOffset(paddingShape, coords); // padding already has rank 2 const auto leftSide = pads[padOffset]; const auto leftSideCorrected = leftSide - reflBorder; const Nd4jLong len = 2 * (inLen - 1) + leftSide + reflBorder; @@ -238,7 +226,7 @@ namespace nd4j { xzCoord[j] = xzCoord[j] - len; } - auto inOffset = shape::getOffset(0, xShapeOf, xStrideOf, xzCoord, rank); + auto inOffset = shape::getOffset(xShape, xzCoord); z[outOffset] = x[inOffset]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu index 7b325eb3e..ccfbbf943 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu @@ -43,8 +43,8 @@ namespace helpers { for (int tid = threadIdx.x; tid < tadLength; tid += blockDim.x) { auto top = 2 * tid + 1; if (top < tadLength) { - auto t0 = shape::getIndexOffset(top - 1, xTadShapeInfo, tadLength); - auto t1 = shape::getIndexOffset(top, xTadShapeInfo, tadLength); + auto t0 = shape::getIndexOffset(top - 1, xTadShapeInfo); + auto t1 = shape::getIndexOffset(top, xTadShapeInfo); if (x[t0] > x[t1]) { //swap values @@ -58,8 +58,8 @@ namespace helpers { for (int tid = threadIdx.x; tid < tadLength; tid += blockDim.x) { auto top = 2 * tid + 2; if (top < tadLength) { - auto t0 = shape::getIndexOffset(top - 1, xTadShapeInfo, tadLength); - auto t1 = shape::getIndexOffset(top, xTadShapeInfo, tadLength); + auto t0 = shape::getIndexOffset(top - 1, xTadShapeInfo); + auto t1 = shape::getIndexOffset(top, xTadShapeInfo); if (x[t0] > x[t1]) { //swap values @@ -76,7 +76,7 @@ namespace helpers { // saving final value if (threadIdx.x == 0) - z[shape::getIndexOffset(t, zShapeInfo, zLength)] = x[shape::getIndexOffset(position, xTadShapeInfo, tadLength)]; + z[shape::getIndexOffset(t, zShapeInfo)] = x[shape::getIndexOffset(position, xTadShapeInfo)]; __syncthreads(); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu index bddaf65e3..01b9464fa 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu @@ -34,31 +34,31 @@ __global__ static void polyGammaCuda(const void *vn, const Nd4jLong *nShapeInfo, const auto n = reinterpret_cast(vn); const auto x = reinterpret_cast(vx); - auto z = reinterpret_cast(vz); + auto z = reinterpret_cast(vz); __shared__ Nd4jLong len; - - if (threadIdx.x == 0) - len = shape::length(nShapeInfo); + + if (threadIdx.x == 0) + len = shape::length(nShapeInfo); __syncthreads(); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto totalThreads = gridDim.x * blockDim.x; for (int i = tid; i < len; i += totalThreads) { - - const auto nOffset = shape::getIndexOffset(i, nShapeInfo, len); - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - const auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); - const T 
nVal = n[nOffset]; - + const auto nOffset = shape::getIndexOffset(i, nShapeInfo); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + const auto zOffset = shape::getIndexOffset(i, zShapeInfo); + + const T nVal = n[nOffset]; + int sign = (static_cast(nVal) + 1) % 2 ? -1 : 1; T factorial = 1; if(nVal != 0 && nVal != 1) for(int i = 2; i <= nVal; ++i) - factorial *= i; + factorial *= i; z[zOffset] = sign * factorial * zetaScalar(nVal + 1, x[xOffset]); } @@ -75,10 +75,10 @@ static void polyGammaCudaLauncher(const int blocksPerGrid, const int threadsPerB void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) { NDArray::prepareSpecialUse({&z}, {&n, &x}); - + int threadsPerBlock = MAX_NUM_THREADS; int blocksPerGrid = (z.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; - + BUILD_SINGLE_SELECTOR(n.dataType(), polyGammaCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), n.getSpecialBuffer(), n.getSpecialShapeInfo(), x.getSpecialBuffer(), x.getSpecialShapeInfo(), z.getSpecialBuffer(), z.getSpecialShapeInfo()), FLOAT_TYPES); NDArray::registerSpecialUse({&z}, {&n, &x}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu index b1412343b..52dd8b815 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu @@ -68,12 +68,12 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op, } if(leftArrInd < tadLen) - shared[sharedInd] = xLeft = xTad[shape::getIndexOffset(leftArrInd, xTadShapeInfo, tadLen)]; + shared[sharedInd] = xLeft = xTad[shape::getIndexOffset(leftArrInd, xTadShapeInfo)]; // else // shared[sharedInd] = (op == scalar::Add) ? 0 : 1; if(rightArrInd < tadLen) - shared[sharedInd + 1] = xRight = xTad[shape::getIndexOffset(rightArrInd, xTadShapeInfo, tadLen)]; + shared[sharedInd + 1] = xRight = xTad[shape::getIndexOffset(rightArrInd, xTadShapeInfo)]; // else // shared[sharedInd + 1] = (op == scalar::Add) ? 0 : 1; @@ -117,7 +117,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op, result = (op == scalar::Add) ? result + xLeft : result * xLeft; if(i > 0) result = (op == scalar::Add) ? result + lastElemInChunk : result * lastElemInChunk; - zTad[shape::getIndexOffset(leftArrInd, zTadShapeInfo, tadLen)] = result; + zTad[shape::getIndexOffset(leftArrInd, zTadShapeInfo)] = result; } if(rightArrInd < tadLen) { @@ -128,7 +128,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op, result = (op == scalar::Add) ? result + lastElemInChunk : result * lastElemInChunk; if(i < numTadChunks - 1 && threadIdx.x == blockDim.x - 1) // last element in chunk lastElemInChunk = !exclusive ? result : (op == scalar::Add) ? 
result + xRight : result * xRight; - zTad[shape::getIndexOffset(rightArrInd, zTadShapeInfo, tadLen)] = result; + zTad[shape::getIndexOffset(rightArrInd, zTadShapeInfo)] = result; } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu index e86cd382a..aceebf7a0 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu @@ -34,14 +34,12 @@ namespace helpers { static __global__ void reverseArrayKernel(void* input, Nd4jLong *inputShape, void* output, Nd4jLong *outputShape, Nd4jLong numOfElemsToReverse) { const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; - __shared__ Nd4jLong length; __shared__ int linearStatus; __shared__ T* inputArr; __shared__ T* outputArr; __shared__ char inputOrder, outputOrder; if (threadIdx.x == 0) { - length = shape::length(inputShape); linearStatus = (shape::elementWiseStride(inputShape) == shape::elementWiseStride(outputShape)) && (inputOrder == outputOrder)? shape::elementWiseStride(inputShape):0; char inputOrder = shape::order(inputShape); @@ -56,31 +54,28 @@ namespace helpers { for (Nd4jLong e = tid; e < limit; e += step) { // we're calculating offsets within input array - auto fOffset = shape::getIndexOffset(e, inputShape, length); - auto lOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, inputShape, length); + auto fOffset = shape::getIndexOffset(e, inputShape); + auto lOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, inputShape); // now we're storing input values auto v1 = inputArr[fOffset]; auto v2 = inputArr[lOffset]; // now we're calculating offsets within output array - auto zfOffset = shape::getIndexOffset(e, outputShape, length); - auto zlOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, outputShape, length); + auto zfOffset = shape::getIndexOffset(e, outputShape); + auto zlOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, outputShape); // and saving values to output arrays outputArr[zfOffset] = v2; outputArr[zlOffset] = v1; - - //printf("TID: %i; E: %lld; z[%lld], z[%lld] = x[%lld], x[%lld];\n", tid, e, zfOffset, zlOffset, lOffset, fOffset); } // in case of odd array we'll have to move middle value if (odd && tid == 0) { - auto xOffset = shape::getIndexOffset(limit, inputShape, length); - auto zOffset = shape::getIndexOffset(limit, outputShape, length); + auto xOffset = shape::getIndexOffset(limit, inputShape); + auto zOffset = shape::getIndexOffset(limit, outputShape); outputArr[zOffset] = inputArr[xOffset]; - //printf("TID: %i; E: %lld; z[%lld] = x[%lld];\n", tid, limit, zOffset, xOffset); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/roll.cu b/libnd4j/include/ops/declarable/helpers/cuda/roll.cu index 216c6b7a0..d843feeff 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/roll.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/roll.cu @@ -53,11 +53,11 @@ namespace helpers { for (int i = tid; i < actualShift; i += blockDim.x * gridDim.x) { int sourceIndex = fullLength - actualShift + i; - auto xOffsetA = shape::getIndexOffset(i, xShapeInfo, fullLength); - auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo, fullLength); + auto xOffsetA = shape::getIndexOffset(i, xShapeInfo); + auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo); - auto zOffsetA = shape::getIndexOffset(i, zShapeInfo, fullLength); - auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo, fullLength); + auto 
zOffsetA = shape::getIndexOffset(i, zShapeInfo); + auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo); auto eA = x[xOffsetA]; auto eB = x[xOffsetB]; @@ -107,11 +107,11 @@ namespace helpers { int destinationIndex = fullLength - (count + 1) * actualShift + i; int sourceIndex = fullLength - count * actualShift + i; - auto xOffsetA = shape::getIndexOffset(destinationIndex, xShapeInfo, fullLength); - auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo, fullLength); + auto xOffsetA = shape::getIndexOffset(destinationIndex, xShapeInfo); + auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo); - auto zOffsetA = shape::getIndexOffset(destinationIndex, zShapeInfo, fullLength); - auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo, fullLength); + auto zOffsetA = shape::getIndexOffset(destinationIndex, zShapeInfo); + auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo); auto eA = x[xOffsetA]; auto eB = x[xOffsetB]; @@ -154,11 +154,11 @@ namespace helpers { int remainIdx = i + actualShift; int sourceIndex = remainIdx + remainShift; - auto xOffsetA = shape::getIndexOffset(remainIdx, xShapeInfo, fullLength); - auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo, fullLength); + auto xOffsetA = shape::getIndexOffset(remainIdx, xShapeInfo); + auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo); - auto zOffsetA = shape::getIndexOffset(remainIdx, zShapeInfo, fullLength); - auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo, fullLength); + auto zOffsetA = shape::getIndexOffset(remainIdx, zShapeInfo); + auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo); auto eA = x[xOffsetA]; auto eB = x[xOffsetB]; @@ -190,7 +190,7 @@ namespace helpers { } } else { for (int e = threadIdx.x; e < tadLength; e += blockDim.x) { - auto zOffset = shape::getIndexOffset(e, zShapeInfo, tadLength); + auto zOffset = shape::getIndexOffset(e, zShapeInfo); auto eA = x[zOffset]; auto eB = z[zOffset]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu index 0ac0a1882..82f421fdd 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu @@ -62,14 +62,14 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn if(i >= zLen) return; - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); coords[1] += cropBottom; coords[2] += cropLeft; - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; @@ -156,9 +156,9 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) { - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); // evaluate spatial coordinates for x for(uint j = 1; j <= numOfSpatialDims; ++j) { @@ -166,7 +166,7 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape coords[j] += y[yOffset]; // add crop 
left } - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } @@ -283,16 +283,16 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn if(i >= zLen) return; - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); if(coords[1] >= padBottom && coords[1] < zShapeInfo[2] - padTop && coords[2] >= padLeft && coords[2] < zShapeInfo[3] - padRight) { coords[1] -= padBottom; coords[2] -= padLeft; - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } @@ -383,9 +383,9 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < zLen; i += totalThreads) { - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); bool within = true; @@ -405,7 +405,7 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape } if(within) - z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] = x[shape::getOffset(xShapeInfo, coords)]; else z[zOffset] = 0.f; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu index 54d350f47..501b9bca4 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu @@ -57,8 +57,8 @@ namespace helpers { // for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { - // const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); - // const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + // const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + // const auto yOffset = shape::getIndexOffset(i, yShapeInfo); // switch (opCode) { // case pairwise::Add: @@ -99,8 +99,8 @@ namespace helpers { // __syncthreads(); // for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { - // const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); - // const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + // const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + // const auto yOffset = shape::getIndexOffset(i, yShapeInfo); // switch (opCode) { // case pairwise::Add: @@ -188,7 +188,7 @@ __global__ static void scatterLockCuda(const int opCode, for (int e = 0; e < xLen; e++) { - const Nd4jLong zIndex = x[shape::getIndexOffset(e, xShapeInfo, xLen)]; + const Nd4jLong zIndex = x[shape::getIndexOffset(e, xShapeInfo)]; const bool isOwner = zIndex < gridDim.x ? 
blockIdx.x == zIndex : blockIdx.x == zIndex % gridDim.x; if (!isOwner) @@ -199,8 +199,8 @@ __global__ static void scatterLockCuda(const int opCode, if(threadIdx.x != 0) continue; - const auto yOffset = shape::getIndexOffset(e, yTadShapeInfo, yTadLen); - const auto zOffset = shape::getIndexOffset(zIndex, zTadShapeInfo, zTadLen); + const auto yOffset = shape::getIndexOffset(e, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(zIndex, zTadShapeInfo); switch (opCode) { case pairwise::Add: @@ -241,8 +241,8 @@ __global__ static void scatterLockCuda(const int opCode, for (Nd4jLong i = threadIdx.x; i < zTadLen; i += blockDim.x) { - const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo, zTadLen); - const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo, zTadLen); + const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo); switch (opCode) { case pairwise::Add: @@ -326,19 +326,19 @@ __global__ static void scatterCuda(const int opCode, for (Nd4jLong i = tid; i < yLen; i += totalThreads) { - shape::index2coords(yRank, shape::shapeOf(const_cast<Nd4jLong*>(yShapeInfo)), i, yLen, yCoord); + shape::index2coords(i, yShapeInfo, yCoord); for (uint j = 0; j < xRank; ++j) xCoord[j] = yCoord[j]; - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), xCoord, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, xCoord); zCoord[0] = x[xOffset]; for (uint j = 0; j < yRank - xRank; ++j) zCoord[j + 1] = yCoord[xRank + j]; - const auto yOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(yShapeInfo)), shape::stride(const_cast<Nd4jLong*>(yShapeInfo)), yCoord, yRank); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), shape::stride(const_cast<Nd4jLong*>(zShapeInfo)), zCoord, zRank); + const auto yOffset = shape::getOffset(yShapeInfo, yCoord); + const auto zOffset = shape::getOffset(zShapeInfo, zCoord); switch (opCode) { case pairwise::Add: @@ -471,9 +471,9 @@ __global__ static void scatterNDLockCuda(const int opCode, const X* xTad = x + xOffsets[i]; for (uint k = 0; k < xLastDim; ++k) - zTadCoordsPerThread[k] = xTad[shape::getIndexOffset(k, xTadShapeInfo, xLastDim)]; + zTadCoordsPerThread[k] = xTad[shape::getIndexOffset(k, xTadShapeInfo)]; - const auto zTadIndex = shape::coords2index(xLastDim, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), zTadCoordsPerThread); + const auto zTadIndex = shape::coords2index(xLastDim, zShapeInfo + 1, zTadCoordsPerThread); const bool isOwner = zTadIndex < gridDim.x ? blockIdx.x == zTadIndex : blockIdx.x == zTadIndex % gridDim.x; @@ -485,8 +485,8 @@ __global__ static void scatterNDLockCuda(const int opCode, if(threadIdx.x != 0) continue; - const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo, yTadLen); - const auto zOffset = shape::getIndexOffset(zTadIndex, zTadShapeInfo, yTadLen); + const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(zTadIndex, zTadShapeInfo); switch (opCode) { case pairwise::Add: @@ -526,8 +526,8 @@ __global__ static void scatterNDLockCuda(const int opCode, for (Nd4jLong j = threadIdx.x; j < yTadLen; j += blockDim.x) { - const auto yOffset = shape::getIndexOffset(j, yTadShapeInfo, yTadLen); - const auto zOffset = shape::getIndexOffset(j, zTadShapeInfo, yTadLen); + const auto yOffset = shape::getIndexOffset(j, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(j, zTadShapeInfo); switch (opCode) { case pairwise::Add: @@ -618,22 +618,22 @@ __global__ static void scatterNDCuda(const int opCode, for (Nd4jLong i = tid; i < yLen; i += totalThreads) { - shape::index2coords(yRank, shape::shapeOf(const_cast<Nd4jLong*>(yShapeInfo)), i, yLen, yCoord); + shape::index2coords(i, yShapeInfo, yCoord); for (uint j = 0; j < xRank - 1; ++j) xCoord[j] = yCoord[j]; for (uint j = 0; j < xLastDim; ++j) { xCoord[xRank - 1] = j; - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), xCoord, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, xCoord); zCoord[j] = x[xOffset]; } for (uint j = xLastDim; j < zRank; ++j) zCoord[j] = yCoord[yRank - zRank + j]; - const auto yOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(yShapeInfo)), shape::stride(const_cast<Nd4jLong*>(yShapeInfo)), yCoord, yRank); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), shape::stride(const_cast<Nd4jLong*>(zShapeInfo)), zCoord, zRank); + const auto yOffset = shape::getOffset(yShapeInfo, yCoord); + const auto zOffset = shape::getOffset(zShapeInfo, zCoord); switch (opCode) { case pairwise::Add: @@ -760,18 +760,18 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo, auto coords = sharedMem + threadIdx.x * (xRank + 1); - shape::index2coords(xRank, xShapeInfo + 1, xInd, xLen, coords); + shape::index2coords(xInd, xShapeInfo, coords); // y last coordinate - coords[xRank] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank)]; + coords[xRank] = x[shape::getOffset(xShapeInfo, coords)]; - const auto yOffset = shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank + 2, coords, xRank + 1); + const auto yOffset = shape::getOffset(yShapeInfo, coords); if(z == nullptr) { // gradient calculation y[yOffset] -= 1.f; } else { - z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank)] = y[yOffset]; + z[shape::getOffset(zShapeInfo, coords)] = y[yOffset]; } }
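Note on the scatterNDLockCuda change above: shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)) and zShapeInfo + 1 are the same pointer, because the dims block of a shapeInfo vector starts one word past the rank, so the new coords2index overload can take the raw shapeInfo slice directly. A minimal illustrative sketch of the c-order fold coords2index performs over such a dims pointer (names are mine, not the library implementation):

#include <cstdint>
using Nd4jLong = int64_t;

// Fold `coords` back into a linear index over the first `rank` dimensions
// of `shape` (c-order); `shape` may be shapeInfo + 1, i.e. the dims block.
static Nd4jLong coords2indexSketch(int rank, const Nd4jLong* shape, const Nd4jLong* coords) {
    Nd4jLong index = 0;
    for (int i = 0; i < rank; ++i)
        index = index * shape[i] + coords[i];   // Horner-style accumulation
    return index;
}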
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu index f1eda6b01..37a465144 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu @@ -40,9 +40,9 @@ namespace nd4j { auto tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < iLength; i += blockDim.x * gridDim.x) { auto x = reinterpret_cast<X*>(vx) + xTadOffsets[i]; - auto idx = indices[shape::getIndexOffset(i, iShapeInfo, iLength)]; + auto idx = indices[shape::getIndexOffset(i, iShapeInfo)]; - x[shape::getIndexOffset(idx, xTadShape, xLength)] = u[shape::getIndexOffset(i, uShapeInfo, uLength)]; + x[shape::getIndexOffset(idx, xTadShape)] = u[shape::getIndexOffset(i, uShapeInfo)]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu index d8b3575ff..1ad55a111 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu @@ -63,8 +63,8 @@ namespace nd4j { for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); - const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + const auto yOffset = shape::getIndexOffset(i, yShapeInfo); switch (opCode) { case 0: diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu index 8830f37e7..cab6e50e7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu @@ -58,10 +58,10 @@ namespace nd4j { zLen = shape::length(outputShape); if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; val[segment] = z[zIndex]; } @@ -69,7 +69,7 @@ namespace nd4j { __syncthreads(); for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } } @@ -94,19 +94,19 @@ namespace nd4j { xLen = shape::length(inputShape); zLen = shape::length(outputShape); - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); //start = starts[segment]; //finish = start + lengths[segment]; if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; else z[zIndex] = -DataTypeUtils::max(); } __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment) { nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } @@ -140,16 +140,16 @@ namespace nd4j { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); //z[zIndex] = x[xIndex]; } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e,
inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[segment]) nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } @@ -276,12 +276,12 @@ namespace nd4j { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); if (nd4j::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { z[zOffset] = gradOut[gradOffsetO]; @@ -318,7 +318,7 @@ namespace nd4j { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[yIndex]; T* current = x + inputOffsets[i]; T* currentOut = z + outOffsets[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu index 19869f646..dc958f79c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu @@ -53,11 +53,11 @@ namespace helpers { //[zIndex] = if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; //val[segment] = ; - z[zIndex] = T(x[shape::getIndexOffset(start, inputShape, xLen)] / lengths[segment]); + z[zIndex] = T(x[shape::getIndexOffset(start, inputShape)] / lengths[segment]); // val[segment] = z[zIndex]; } @@ -65,7 +65,7 @@ namespace helpers { __syncthreads(); for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); if (lengths[segment]) nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex] / lengths[segment])); } @@ -91,11 +91,11 @@ namespace helpers { zLen = shape::length(outputShape); // if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); //start = starts[segment]; //finish = start + lengths[segment]; if (lengths[segment] > 0) - z[zIndex] = T(x[shape::getIndexOffset(starts[segment], inputShape, xLen)] / T(lengths[segment])); + z[zIndex] = T(x[shape::getIndexOffset(starts[segment], inputShape)] / T(lengths[segment])); else z[zIndex] = 0; //DataTypeUtils::max(); // val[segment] = z[zIndex]; @@ -105,8 +105,8 @@ namespace helpers { __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/T(lengths[segment]))); } 
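Every hunk in these helper files makes the same mechanical substitution: shape::getIndexOffset loses its trailing length argument, because the offset is fully determined by the index and the shapeInfo vector alone. A standalone sketch of the mapping, assuming the usual shapeInfo layout (rank word, then dims, then strides) and ignoring the ews fast path the real implementation can take; names are illustrative, not the library code:

#include <cstdint>
using Nd4jLong = int64_t;

// Map a logical element number to a buffer offset using shape and strides.
static Nd4jLong indexOffsetSketch(Nd4jLong index, const Nd4jLong* shapeInfo) {
    const Nd4jLong  rank   = shapeInfo[0];
    const Nd4jLong* shape  = shapeInfo + 1;         // dims block
    const Nd4jLong* stride = shapeInfo + 1 + rank;  // strides block

    Nd4jLong offset = 0;
    for (Nd4jLong i = rank - 1; i >= 0; --i) {      // peel c-order coordinates
        offset += (index % shape[i]) * stride[i];
        index  /= shape[i];
    }
    return offset;
}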
@@ -137,15 +137,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[segment]) nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); } @@ -261,11 +261,11 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); z[zOffset] = T(gradOut[gradOffsetO] / float(lengths[classIndex])); } @@ -294,14 +294,14 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { -// auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); +// auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[i]; //yIndex]; T* currentOut = z + outOffsets[i]; T* outGrad = gradOut + gradOutOffsets[segment]; for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - auto zIndex = shape::getIndexOffset(e, outTad, currentLen); - auto gradIndex = shape::getIndexOffset(e, gradOutTad, gradLen); + auto zIndex = shape::getIndexOffset(e, outTad); + auto gradIndex = shape::getIndexOffset(e, gradOutTad); if (lengths[segment] > 0) currentOut[zIndex] = T(outGrad[gradIndex] / float(lengths[segment])); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu index e5ea2eb91..506cfaa41 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu @@ -56,10 +56,10 @@ namespace helpers { zLen = shape::length(outputShape); if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; val[segment] = z[zIndex]; } @@ -67,7 +67,7 @@ namespace helpers { __syncthreads(); for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } @@ -98,9 +98,9 @@ namespace helpers { xLen = shape::length(inputShape); zLen = shape::length(outputShape); - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; else z[zIndex] = DataTypeUtils::max(); @@ -108,8 +108,8 @@ namespace helpers { __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment) { nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } @@ -140,15 +140,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); // if (lengths[indices[idx]]) nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } @@ -269,12 +269,12 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); if (nd4j::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { z[zOffset] = gradOut[gradOffsetO]; @@ -311,7 +311,7 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[yIndex]; T* current = x + inputOffsets[i]; T* currentOut = z + outOffsets[i];
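All of the segment kernels being touched here share one structure: one thread seeds z[segment] with the first element of the segment, the block synchronizes, and the remaining threads fold their elements in with the matching atomic (nd4j_atomicMax/Min/Add/Mul). A stripped-down CUDA sketch of that pattern for integer max, with hypothetical names and dense c-order buffers assumed (the library's nd4j::math::atomics wrappers also cover floating-point types):

// Illustrative only: one block per segment.
__global__ void segmentMaxSketch(const int* x, const int* starts, const int* lengths, int* z) {
    const int segment = blockIdx.x;
    const int start   = starts[segment];
    const int finish  = start + lengths[segment];

    if (threadIdx.x == 0)
        z[segment] = x[start];            // seed with the first element
    __syncthreads();

    for (int e = start + threadIdx.x + 1; e < finish; e += blockDim.x)
        atomicMax(&z[segment], x[e]);     // fold the rest in atomically
}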
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu index 5709a63ea..7814defe1 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu @@ -53,11 +53,11 @@ namespace helpers { zLen = shape::length(outputShape); if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; //val[segment] = ; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; val[segment] = z[zIndex]; } @@ -67,7 +67,7 @@ namespace helpers { // auto step = blockDim.x * gridDim.x; for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); nd4j::math::atomics::nd4j_atomicMul(&val[segment], x[xIndex]); } __syncthreads(); @@ -98,11 +98,11 @@ namespace helpers { zLen = shape::length(outputShape); // if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); //start = starts[segment]; //finish = start + lengths[segment]; if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; else z[zIndex] = 0; //DataTypeUtils::max(); // val[segment] = z[zIndex]; @@ -112,8 +112,8 @@ namespace helpers { __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); } @@ -144,15 +144,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[segment] > 0) nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); } @@ -268,12 +268,12 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); z[zOffset] = gradOut[gradOffsetO] * gradIn[gradOffsetI] / x[xOffset]; } @@ -307,7 +307,7 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[yIndex]; T* current = x + inputOffsets[i]; T* currentOut = z + outOffsets[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu index 229d41cc9..f4237ac44 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu @@ -51,11 +51,11 @@ namespace helpers { zLen = shape::length(outputShape); // if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); //start = starts[segment]; //finish = start + lengths[segment]; if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)] / nd4j::math::nd4j_sqrt(lengths[segment]); + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)] / nd4j::math::nd4j_sqrt(lengths[segment]); else z[zIndex] = 0; //DataTypeUtils::max(); // val[segment] = z[zIndex]; @@ -65,8 +65,8 @@ namespace helpers { __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); } @@ -97,15 +97,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); z[zIndex] = x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment]); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); } } @@ -177,11 +177,11 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); z[zOffset] = T(gradOut[gradOffsetO] / math::nd4j_sqrt(lengths[classIndex])); } @@ -211,14 +211,14 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { -// auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); +// auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[i]; //yIndex]; T* currentOut = z + outOffsets[i]; T* outGrad = gradOut + gradOutOffsets[segment]; for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - auto zIndex = shape::getIndexOffset(e, outTad, currentLen); - auto gradIndex = shape::getIndexOffset(e, gradOutTad, gradLen); + auto zIndex = shape::getIndexOffset(e, outTad); + auto gradIndex = shape::getIndexOffset(e, gradOutTad); if (lengths[segment] > 0) currentOut[zIndex] = T(outGrad[gradIndex] / math::nd4j_sqrt(lengths[segment])); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu index 4b8976f4e..cf4ddd942 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu @@ -58,18 +58,18 @@ namespace helpers { if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; //val[segment] = ; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; } } __syncthreads(); for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } } @@ -99,9 +99,9 @@ namespace helpers { xLen = shape::length(inputShape); zLen = shape::length(outputShape); - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; else z[zIndex] = 0; //DataTypeUtils::max(); } @@ -109,8 +109,8 @@ namespace helpers { if (lengths[segment] > 0) for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } @@ -141,15 +141,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[indices[idx]]) nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } @@ -269,11 +269,11 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); z[zOffset] = gradOut[gradOffsetO]; } @@ -302,7 +302,7 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[yIndex]; T* currentOut = z + outOffsets[i]; T* outGrad = gradOut + gradOutOffsets[segment]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu index 7318dbaea..c07db1b95 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu @@ -40,8 +40,8 @@ namespace helpers { for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x) for(auto k = threadIdx.x; k < inputLen; k += blockDim.x) - if (i < input[shape::getIndexOffset(k, inputShape, inputLen)]) - output[shape::getIndexOffset(k * maxIndex + i, outputShape, outputLen)] = B(true); + if (i < input[shape::getIndexOffset(k, inputShape)]) + output[shape::getIndexOffset(k * maxIndex + i, outputShape)] = B(true); }
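For reference, the sequence_mask kernel just rewritten computes output[k, i] = (i < input[k]); the k * maxIndex + i expression is simply the c-order flattening of that 2-D coordinate. An equivalent host-side sketch with illustrative names:

#include <cstdint>
#include <vector>

// lengths[k] holds the valid length of sequence k; row k of the mask gets
// `true` in its first lengths[k] positions and `false` elsewhere.
std::vector<bool> sequenceMaskSketch(const std::vector<int64_t>& lengths, int64_t maxIndex) {
    std::vector<bool> mask(lengths.size() * maxIndex, false);
    for (size_t k = 0; k < lengths.size(); ++k)
        for (int64_t i = 0; i < maxIndex; ++i)
            if (i < lengths[k])
                mask[k * maxIndex + i] = true;   // same k * maxIndex + i flattening
    return mask;
}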
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu index 5c00244f8..5ce883a59 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu @@ -157,11 +157,11 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo, if(tid >= len) return; - shape::index2coords(rank, xShapeInfo + 2, tid, len, coords + 1); // loop through last two dimensions of x : {bS, 2*K} + shape::index2coords(tid, rank - 1, xShapeInfo + 2, coords + 1); // loop through last two dimensions of x : {bS, 2*K} - const auto maskOffst = mask ? shape::getOffset(0, maskShapeInfo + 1, maskShapeInfo + rank, coords + 1, rank - 1) : 0; - const auto c0Offset = shape::getOffset(0, c0ShapeInfo + 1, c0ShapeInfo + rank, coords + 1, rank - 1); - const auto bFOffset = shape::getOffset(0, bShapeInfo + 1, bShapeInfo + rank - 1, coords + 2, rank - 2); + const auto maskOffst = mask ? shape::getOffset(maskShapeInfo, coords + 1) : 0; + const auto c0Offset = shape::getOffset(c0ShapeInfo, coords + 1); + const auto bFOffset = shape::getOffset(bShapeInfo, coords + 2); const auto bROffset = bFOffset + 2 * K * bShapeInfo[2]; // 2*K*b_stride const T maskVal = mask ? mask[maskOffst] : static_cast<T>(1); @@ -176,12 +176,12 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo, else coords[0] = 0; - auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); - auto htOffset = shape::getOffset(0, htShapeInfo + 1, htShapeInfo + rank + 1, coords, rank); - auto ctOffset = shape::getOffset(0, ctShapeInfo + 1, ctShapeInfo + rank + 1, coords, rank); + auto xOffset = shape::getOffset(xShapeInfo, coords); + auto htOffset = shape::getOffset(htShapeInfo, coords); + auto ctOffset = shape::getOffset(ctShapeInfo, coords); coords[2] *= 3; - auto wiOffset0 = shape::getOffset(0, wiShapeInfo + 1, wiShapeInfo + rank + 1, coords, rank); + auto wiOffset0 = shape::getOffset(wiShapeInfo, coords); auto wiOffset1 = wiOffset0 + wiShapeInfo[rank + 3]; // add last stride auto wiOffset2 = wiOffset1 + wiShapeInfo[rank + 3]; // add last stride @@ -363,15 +363,15 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI if(tid >= len) return; - shape::index2coords(rank, xShapeInfo + 2, tid, len, coords + 1); // loop through last two dimensions of x : {bS, 2*K} + shape::index2coords(tid, rank - 1, xShapeInfo + 2, coords + 1); // loop through last two dimensions of x : {bS, 2*K} - const auto maskOffst = mask ? shape::getOffset(0, maskShapeInfo + 1, maskShapeInfo + rank, coords + 1, rank - 1) : 0; - const auto c0Offset = shape::getOffset(0, c0ShapeInfo + 1, c0ShapeInfo + rank, coords + 1, rank - 1); - const auto gradCtOffset = shape::getOffset(0, gradCtShapeInfo + 1, gradCtShapeInfo + rank, coords + 1, rank - 1); - const auto gradC0Offset = shape::getOffset(0, gradC0ShapeInfo + 1, gradC0ShapeInfo + rank, coords + 1, rank - 1); - const auto bFOffset = shape::getOffset(0, bShapeInfo + 1, bShapeInfo + rank - 1, coords + 2, rank - 2); + const auto maskOffst = mask ? shape::getOffset(maskShapeInfo, coords + 1) : 0; + const auto c0Offset = shape::getOffset(c0ShapeInfo, coords + 1); + const auto gradCtOffset = shape::getOffset(gradCtShapeInfo, coords + 1); + const auto gradC0Offset = shape::getOffset(gradC0ShapeInfo, coords + 1); + const auto bFOffset = shape::getOffset(bShapeInfo, coords + 2); const auto bROffset = bFOffset + 2 * K * bShapeInfo[2]; // 2*K*b_stride - // const auto gradBFOffset = shape::getOffset(0, gradBShapeInfo + 1, gradBShapeInfo + rank, coords + 1, rank - 1); + // const auto gradBFOffset = shape::getOffset(gradBShapeInfo, coords + 1); const auto gradBFOffset = coords[1] * gradBShapeInfo[3] / 2 + coords[2] * gradBShapeInfo[4]; const auto gradBROffset = gradBFOffset + gradBShapeInfo[3]; @@ -382,16 +382,16 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI else coords[0] = time - 1; - auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); - auto ctOffset = shape::getOffset(0, ctShapeInfo + 1, ctShapeInfo + rank + 1, coords, rank); - auto gradIOffset = shape::getOffset(0, gradIShapeInfo + 1, gradIShapeInfo + rank + 1, coords, rank); - auto gradHtOffset = shape::getOffset(0, gradHtShapeInfo + 1, gradHtShapeInfo + rank + 1, coords, rank); + auto xOffset = shape::getOffset(xShapeInfo, coords); + auto ctOffset = shape::getOffset(ctShapeInfo, coords); + auto gradIOffset = shape::getOffset(gradIShapeInfo, coords); + auto gradHtOffset = shape::getOffset(gradHtShapeInfo, coords); coords[2] *= 3; - auto gradWiOffset0 = shape::getOffset(0, gradWiShapeInfo + 1, gradWiShapeInfo + rank + 1, coords, rank); + auto gradWiOffset0 = shape::getOffset(gradWiShapeInfo, coords); auto gradWiOffset1 = gradWiOffset0 + gradWiShapeInfo[rank + 3]; // add last stride auto gradWiOffset2 = gradWiOffset1 + gradWiShapeInfo[rank + 3]; // add last stride - auto wiOffset0 = shape::getOffset(0, wiShapeInfo + 1, wiShapeInfo + rank + 1, coords, rank); + auto wiOffset0 = shape::getOffset(wiShapeInfo, coords); auto wiOffset1 = wiOffset0 + wiShapeInfo[rank + 3]; // add last stride auto wiOffset2 = wiOffset1 + wiShapeInfo[rank + 3]; // add last stride
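Both SRU kernels now use the overload shape::index2coords(tid, rank - 1, xShapeInfo + 2, coords + 1): pointing two words into shapeInfo skips the rank word and the leading time dimension, so tid is de-linearized over only the trailing {bS, 2*K} dims while coords[0] stays free for the time step the kernel fills in afterwards. An illustrative sketch of that partial de-linearization (c-order; not the library code):

#include <cstdint>
using Nd4jLong = int64_t;

// Split `index` into coordinates over `rank` dimensions given in `shape`
// (which may be a tail slice of a larger shape), last dimension fastest.
static void index2coordsSketch(Nd4jLong index, int rank, const Nd4jLong* shape, Nd4jLong* coords) {
    for (int i = rank - 1; i >= 0; --i) {
        coords[i] = index % shape[i];
        index    /= shape[i];
    }
}

// Usage mirroring the SRU call: tid -> coords[1], coords[2] over {bS, 2*K}:
// index2coordsSketch(tid, rank - 1, xShapeInfo + 2, coords + 1);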
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/stack.cu b/libnd4j/include/ops/declarable/helpers/cuda/stack.cu index e492baf8e..e88f5ade8 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/stack.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/stack.cu @@ -39,7 +39,7 @@ namespace helpers { if(tadShape == nullptr) { // scalar case for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < inputListLength; i += gridDim.x * blockDim.x) - z[shape::getIndexOffset(i, zShapeInfo, inputListLength)] = reinterpret_cast<T*>(inputList[i])[0]; + z[shape::getIndexOffset(i, zShapeInfo)] = reinterpret_cast<T*>(inputList[i])[0]; } else { @@ -50,7 +50,7 @@ namespace helpers { auto xShapeInfo = reinterpret_cast<Nd4jLong*>(inputShapeList[t]); for (int e = threadIdx.x; e < arrLen; e += blockDim.x) - tZ[shape::getIndexOffset(e, tadShape, arrLen)] = tX[shape::getIndexOffset(e, xShapeInfo, arrLen)]; + tZ[shape::getIndexOffset(e, tadShape)] = tX[shape::getIndexOffset(e, xShapeInfo)]; } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/svd.cu b/libnd4j/include/ops/declarable/helpers/cuda/svd.cu index 0695119da..b39ebf81b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/svd.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/svd.cu @@ -65,12 +65,12 @@ __global__ static void inverseColumnSignCuda(void* vu, const Nd4jLong* uShapeInf // u for (Nd4jLong i = ind; i < uLen; i += gridDim.x * blockDim.x) { - shape::index2coords(rank, uShapeInfo + 1, i, uLen, coords); + shape::index2coords(i, uShapeInfo, coords); if(coords[rank - 1] == 0 || coords[rank - 1] == uLastButOneColumn) // do not change sign in first and last but one columns continue; - const auto uOffset = shape::getOffset(0, uShapeInfo + 1, uShapeInfo + rank + 1, coords, rank); + const auto uOffset = shape::getOffset(uShapeInfo, coords); u[uOffset] = -u[uOffset]; } @@ -78,12 +78,12 @@ __global__ static void inverseColumnSignCuda(void* vu, const Nd4jLong* uShapeInf // v for (Nd4jLong i = ind; i < vLen; i += gridDim.x * blockDim.x) { - shape::index2coords(rank, vShapeInfo + 1, i, vLen, coords); + shape::index2coords(i, vShapeInfo, coords); if(coords[rank - 2] == 0 || coords[rank - 2] == vLastButOneColumn) // do not change sign in first and last but one columns continue; - const auto vOffset = shape::getOffset(0, vShapeInfo + 1, vShapeInfo + rank + 1, coords, rank); + const auto vOffset = shape::getOffset(vShapeInfo, coords); v[vOffset] = -v[vOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu b/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu index db6213dd3..972013835 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu @@ -50,15 +50,15 @@ __global__ static void inTopKCuda(const void* vx, const Nd4jLong* xShapeInfo, xTadLen = shape::length(xTadShapeInfo); xTad = reinterpret_cast<const X*>(vx) + xTadOffsets[blockIdx.x]; - idx = y[shape::getIndexOffset(blockIdx.x, yShapeInfo, shape::length(yShapeInfo))]; // shape::length(yShapeInfo) == numTads - elemToCompare = xTad[shape::getIndexOffset(idx, xTadShapeInfo, xTadLen)]; + idx = y[shape::getIndexOffset(blockIdx.x, yShapeInfo)]; // shape::length(yShapeInfo) == numTads + elemToCompare = xTad[shape::getIndexOffset(idx, xTadShapeInfo)]; } __syncthreads(); sharedMem[threadIdx.x] = 0; for (Nd4jLong i = threadIdx.x; i < xTadLen; i += blockDim.x) - if(elemToCompare < xTad[shape::getIndexOffset(i, xTadShapeInfo, xTadLen)]) + if(elemToCompare < xTad[shape::getIndexOffset(i, xTadShapeInfo)]) ++sharedMem[threadIdx.x]; __syncthreads(); @@ -71,7 +71,7 @@ __global__ static void inTopKCuda(const void* vx, const Nd4jLong* xShapeInfo, } if (threadIdx.x == 0) - z[shape::getIndexOffset(blockIdx.x, zShapeInfo, shape::length(zShapeInfo))] = *sharedMem < k; + z[shape::getIndexOffset(blockIdx.x, zShapeInfo)] = *sharedMem < k; } /////////////////////////////////////////////////////////////////// @@ -117,9 +117,9 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con auto z = reinterpret_cast<X*>(vz) + zTadOffsets[t]; for (int e = threadIdx.x; e < k; e += blockDim.x) { - auto idx = i[shape::getIndexOffset(e, iTadShapeInfo, k)]; + auto idx = i[shape::getIndexOffset(e, iTadShapeInfo)]; - z[shape::getIndexOffset(e, zTadShapeInfo, k)] = x[shape::getIndexOffset(idx, xTadShapeInfo, tadLength)]; + z[shape::getIndexOffset(e, zTadShapeInfo)] = x[shape::getIndexOffset(idx, xTadShapeInfo)]; } } } @@ -153,7 +153,7 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con // local max values/indices for (int e = threadIdx.x; e < tadLength; e++) { - auto value = x[shape::getIndexOffset(e, xTadShapeInfo, tadLength)]; + auto value = x[shape::getIndexOffset(e, xTadShapeInfo)]; // we'll compare this value to current stored ones for (int f = 0; f < scanWidth; f++) { @@ -180,8 +180,8 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con // at this point we know local minimum for next iteration if (threadIdx.x == 0) { localMaximum = tempValues[scanWidth - 1]; - z[shape::getIndexOffset(p, zTadShapeInfo, k)] = tempValues[scanWidth - 1]; - i[shape::getIndexOffset(p, iTadShapeInfo, k)] = tempIndices[scanWidth - 1]; + z[shape::getIndexOffset(p, zTadShapeInfo)] = tempValues[scanWidth - 1]; + i[shape::getIndexOffset(p, iTadShapeInfo)] = tempIndices[scanWidth - 1]; } __syncthreads(); } @@ -194,8 +194,8 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con for (int tid = threadIdx.x; tid < k; tid += blockDim.x) { auto top = 2 * tid + 1; if (top < k) { - auto t0 = shape::getIndexOffset(top - 1, iTadShapeInfo, k); - auto t1 = shape::getIndexOffset(top, iTadShapeInfo, k); + auto t0 = shape::getIndexOffset(top - 1, iTadShapeInfo); + auto t1 = shape::getIndexOffset(top, iTadShapeInfo); if (i[t0] > i[t1]) { // swap indices first @@ -215,8 +215,8 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con for (int tid = threadIdx.x; tid < k; tid += blockDim.x) { auto top = 2 * tid + 2; if (top < k) { - auto t0 = shape::getIndexOffset(top - 1, iTadShapeInfo, k); - auto t1 = shape::getIndexOffset(top, iTadShapeInfo, k); + auto t0 = shape::getIndexOffset(top - 1, iTadShapeInfo); + auto t1 = shape::getIndexOffset(top, iTadShapeInfo); if (i[t0] > i[t1]) { // swap indices first diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu index c3e4f497e..0a707ffb3 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu @@ -54,9 +54,9 @@ __global__ static void invertPermutationCuda(const void* vx, const Nd4jLong* xSh for (Nd4jLong i = tid; i < len; i += totalThreads) { - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); const Nd4jLong index = x[xOffset]; - const auto zOffset = shape::getIndexOffset(index, zShapeInfo, len); + const auto zOffset = shape::getIndexOffset(index, zShapeInfo); z[zOffset] = i; } } @@ -112,15 +112,15 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi for (uint m = blockIdx.x; m < zLen; m += gridDim.x) { // one block per each element of z, that is per each matrix - shape::index2coords(zRank, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), m, zLen, coords); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), shape::stride(const_cast<Nd4jLong*>(zShapeInfo)), coords, zRank); + shape::index2coords(m, zShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); sharedMem[threadIdx.x] = 0; for (uint i = threadIdx.x; i < diagLen; i += blockDim.x) { coords[zRank] = coords[zRank + 1] = i; - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), coords, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); sharedMem[threadIdx.x] += x[xOffset]; } @@ -197,14 +197,14 @@ __global__ static void triuBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo for (Nd4jLong i = tid; i < len; i += totalThreads) { - shape::index2coords(rank, zShapeInfo + 1, i, len, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); if((coords[rank - 2] + diag > coords[rank - 1])) // row + diag > col z[zOffset] = 0; else - z[zOffset] = x[areSameOffsets ? zOffset : shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] = x[areSameOffsets ? zOffset : shape::getOffset(xShapeInfo, coords)]; } } @@ -263,7 +263,7 @@ __global__ static void tileBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - const auto zOffset = shape::getIndexOffset(i, zShapeInfo, zLen); + const auto zOffset = shape::getIndexOffset(i, zShapeInfo); shape::outerArrayOffsets(xOffsets, i, xShapeInfo, zShapeInfo, memBuff); @@ -329,8 +329,8 @@ __global__ static void clipByNormBPWholeArrCuda(const void* vx, const Nd4jLong* __syncthreads(); // fill shared memory with array elements - const auto xVal = x[shape::getIndexOffset(tid, xShapeInfo, len)]; - const auto yVal = y[shape::getIndexOffset(tid, yShapeInfo, len)]; + const auto xVal = x[shape::getIndexOffset(tid, xShapeInfo)]; + const auto yVal = y[shape::getIndexOffset(tid, yShapeInfo)]; shMem[2*threadIdx.x] = static_cast<Z>(xVal * xVal); // for norm shMem[2*threadIdx.x + 1] = static_cast<Z>(xVal * yVal); // for input * gradO @@ -414,12 +414,12 @@ __global__ static void clipByNormBPCalcGradCuda(const void* vx, const Nd4jLong* } __syncthreads(); - const auto yOffset = shape::getIndexOffset(tid, yShapeInfo, len); - const auto zOffset = shape::getIndexOffset(tid, zShapeInfo, len); + const auto yOffset = shape::getIndexOffset(tid, yShapeInfo); + const auto zOffset = shape::getIndexOffset(tid, zShapeInfo); if(norm > clipNormVal) { - const auto xOffset = shape::getIndexOffset(tid, xShapeInfo, len); + const auto xOffset = shape::getIndexOffset(tid, xShapeInfo); const Z factor1 = static_cast<Z>(1) / norm; // 1 / norm const Z factor2 = factor1 / (norm * norm); // 1 / (norm * norm * norm) @@ -462,8 +462,8 @@ __global__ static void clipByNormBPTadsCuda(const void* vx, const Nd4jLong* xTad for (uint i = threadIdx.x; i < tadLen; i += blockDim.x) { - const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo, tadLen); - const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo, tadLen); + const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo); + const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo); shMem[2*threadIdx.x] = static_cast<Z>(xTad[xOffset] * xTad[xOffset]); // for norm shMem[2*threadIdx.x + 1] = static_cast<Z>(xTad[xOffset] * yTad[yOffset]); // for input * gradO @@ -491,12 +491,12 @@ __global__ static void clipByNormBPTadsCuda(const void* vx, const Nd4jLong* xTad for (uint i = threadIdx.x; i < tadLen; i += blockDim.x) { - const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo, tadLen); - const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo, tadLen); + const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo); if(norm > clipNormVal) { - const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo, tadLen); + const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo); const Z factor1 = static_cast<Z>(1) / norm; // 1 / norm const Z factor2 = factor1 / (norm * norm); // 1 / (norm * norm * norm)
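The factor1/factor2 pair in these two clip-by-norm backprop kernels implements the gradient of z = x * clipNorm / norm for norm > clipNorm, i.e. dL/dx_i = clipNorm * (gradO_i / norm - x_i * <x, gradO> / norm^3). A scalar sketch under the assumption that norm and the dot product have already been reduced (as the kernels do in shared memory); names are illustrative:

// Per-element gradient of clip-by-norm, given norm = ||x|| and
// dot = <x, gradO> already reduced across the array/TAD.
static double clipByNormGradSketch(double xi, double gradOi, double norm, double dot, double clipNorm) {
    if (norm <= clipNorm)
        return gradOi;                               // inside the ball: identity
    const double factor1 = 1.0 / norm;               // 1 / norm
    const double factor2 = factor1 / (norm * norm);  // 1 / norm^3
    return clipNorm * (factor1 * gradOi - factor2 * xi * dot);
}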
@@ -563,23 +563,25 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } template <typename T> - static __global__ void swapShuffleKernel(T* input, Nd4jLong* shape, Nd4jLong firstDim, Nd4jLong len, nd4j::graph::RandomGenerator* rng) { + static __global__ void swapShuffleKernel(T* input, Nd4jLong* shape, Nd4jLong firstDim, nd4j::graph::RandomGenerator* rng) { auto tid = blockIdx.x * blockDim.x; auto step = blockDim.x * gridDim.x; for (int i = firstDim - 1 - tid - threadIdx.x; i > 0; i -= step) { int r = rng->relativeInt(i) % i; if (i != r) { - T e0 = input[shape::getIndexOffset(i, shape, len)]; - T e1 = input[shape::getIndexOffset(r, shape, len)]; + const auto iOffset = shape::getIndexOffset(i, shape); + const auto rOffset = shape::getIndexOffset(r, shape); + T e0 = input[iOffset]; + T e1 = input[rOffset]; //math::nd4j_swap(input(i), input(r)); - input[shape::getIndexOffset(i, shape, len)] = e1; - input[shape::getIndexOffset(r, shape, len)] = e0; + input[iOffset] = e1; + input[rOffset] = e0; } } } template <typename T> - static __global__ void fillShuffleKernel(T* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape, Nd4jLong firstDim, Nd4jLong len, int* indices, nd4j::graph::RandomGenerator* rng) { + static __global__ void fillShuffleKernel(T* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape, Nd4jLong firstDim, int* indices, nd4j::graph::RandomGenerator* rng) { // PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) auto tid = blockIdx.x * blockDim.x; @@ -587,9 +589,9 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr for(int i = firstDim - 1 - tid - threadIdx.x; i > 0; i -= step) { int r = rng->relativeInt(i) % i; - output[shape::getIndexOffset(i, outputShape, len)] = input[shape::getIndexOffset(indices[r], inputShape, len)]; + output[shape::getIndexOffset(i, outputShape)] = input[shape::getIndexOffset(indices[r], inputShape)]; if(i != r) { - output[shape::getIndexOffset(r, outputShape, len)] = input[shape::getIndexOffset(indices[i], inputShape, len)]; + output[shape::getIndexOffset(r, outputShape)] = input[shape::getIndexOffset(indices[i], inputShape)]; // output.p(r, input.e(indices[i])); // math::nd4j_swap(indices[i], indices[r]); atomicExch(&indices[i], indices[r]); @@ -618,7 +620,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr cudaMemcpy(dRandom, &rng, sizeof(nd4j::graph::RandomGenerator), cudaMemcpyHostToDevice); T* inputBuf = reinterpret_cast<T*>(input.specialBuffer()); if(isInplace) { - swapShuffleKernel<T><<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), firstDim, input.lengthOf(), dRandom); + swapShuffleKernel<T><<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), firstDim, dRandom); } else { std::vector<int> indices(firstDim); @@ -628,7 +630,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr PointersManager pointersManager(context, "helper::randomShuffle_"); int* indicesDev = reinterpret_cast<int*>(pointersManager.replicatePointer(indices.data(), indices.size() * sizeof(int))); T* outputBuf = reinterpret_cast<T*>(output.specialBuffer()); - fillShuffleKernel<T><<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), outputBuf, output.specialShapeInfo(), firstDim, input.lengthOf(), indicesDev, dRandom); + fillShuffleKernel<T><<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), outputBuf, output.specialShapeInfo(), firstDim, indicesDev, dRandom); pointersManager.synchronize(); } // rng.rewindH(firstDim - 1); @@ -704,7 +706,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } __syncthreads(); for (int j = threadIdx.x; j < len; j+= blockDim.x) { - auto xIndex = shape::getIndexOffset(j, shape, len); + auto xIndex = shape::getIndexOffset(j, shape); if(norm2Buf[arr] > clipNorm) z[xIndex] *= clipNorm / norm2Buf[arr]; // case with ews = 1 and ordering is 'c' @@ -714,23 +716,22 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template <typename T> static __global__ void clipByNormKernel(Nd4jLong numOfSubArrs, T* inputBuffer, Nd4jLong* shape, Nd4jLong* inputOffsets, T* outputBuffer, Nd4jLong* outputShape, Nd4jLong* outputOffsets, T* norm2Buf, Nd4jLong* norm2shape, T clipNorm) { + for (Nd4jLong arr = blockIdx.x; arr < numOfSubArrs; arr += gridDim.x) { __shared__ T* x, *z; - __shared__ Nd4jLong lenX, lenZ; + __shared__ Nd4jLong lenZ; __shared__ T norm2; if (threadIdx.x == 0) { - lenX = shape::length(shape); x = inputBuffer + inputOffsets[arr]; z = outputBuffer + outputOffsets[arr]; lenZ = shape::length(outputShape); - norm2 = norm2Buf[shape::getIndexOffset(arr, norm2shape, numOfSubArrs)]; - //printf("%d: %lf (vs %lf) %lld %lld\n", arr, norm2, clipNorm, lenX, lenZ); + norm2 = norm2Buf[shape::getIndexOffset(arr, norm2shape)]; } __syncthreads(); for (Nd4jLong j = threadIdx.x; j < lenZ; j+= blockDim.x) { - auto xIndex = shape::getIndexOffset(j, shape, lenX); - auto zIndex = shape::getIndexOffset(j, outputShape, lenZ); + auto xIndex = shape::getIndexOffset(j, shape); + auto zIndex = shape::getIndexOffset(j, outputShape); if(norm2 > clipNorm) { z[zIndex] = x[xIndex] * clipNorm / norm2; // case with ews = 1 and ordering is 'c' } else { @@ -916,8 +917,8 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr else outputBuf[e] = inputBuf[e]; } else { - auto inputOffset = shape::getIndexOffset(e, inputShape, length); - auto outputOffset = shape::getIndexOffset(e, outputShape, length); + auto inputOffset = shape::getIndexOffset(e, inputShape); + auto outputOffset = shape::getIndexOffset(e, outputShape); if (inputBuf[inputOffset] > rightBound) outputBuf[outputOffset] = (T) rightBound; else if (inputBuf[inputOffset] < leftBound) outputBuf[outputOffset] = (T) leftBound; else outputBuf[outputOffset] = inputBuf[outputOffset]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/weights.cu b/libnd4j/include/ops/declarable/helpers/cuda/weights.cu index 55f859295..622732d7d 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/weights.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/weights.cu @@ -29,7 +29,7 @@ namespace helpers { template <typename T> static __device__ void adjustWeightsKernelD(void* inputBuffer, Nd4jLong* inputShape, void* weightsBuffer, Nd4jLong* weightsShape, - void* outputBuffer, Nd4jLong inputLength, Nd4jLong weightsLength, + void* outputBuffer, Nd4jLong inputLength, Nd4jLong outputLength, int val) { // typedef Nd4jLong T; auto tid = threadIdx.x; @@ -39,13 +39,13 @@ namespace helpers { //for (int e = 0; e < inputLength; e++) { for (Nd4jLong e = tid; e < inputLength; e += blockDim.x) { - Nd4jLong xOffset = shape::getIndexOffset(e, inputShape, inputLength); + Nd4jLong xOffset = shape::getIndexOffset(e, inputShape); int current = *(reinterpret_cast<T*>(inputBuffer) + xOffset); if (current == val) { //printf("%lld\n", xOffset); - //Nd4jLong zOffset = shape::getIndexOffset(val, outputShape, outputLength); + //Nd4jLong zOffset = shape::getIndexOffset(val, outputShape); if (weightsBuffer != nullptr) { - Nd4jLong yOffset = shape::getIndexOffset(e, weightsShape, weightsLength); + Nd4jLong yOffset = shape::getIndexOffset(e, weightsShape); //atomicAdd(); //*reinterpret_cast<T*>(outputBuffer) += reinterpret_cast<T*>(weightsBuffer)[yOffset]; nd4j::math::atomics::nd4j_atomicAdd(reinterpret_cast<T*>(outputBuffer), reinterpret_cast<T*>(weightsBuffer)[yOffset]); //output->p(val, output->e(val) + 1); @@ -74,22 +74,19 @@ namespace helpers { //auto tid = blockIdx.x * blockDim.x + threadIdx.x; // * blockDim.x; // + threadIdx.x; int threadCount = gridDim.x * blockDim.x; Nd4jLong inputLength = shape::length(inputShape); - Nd4jLong weightsLength = 0; - if (weightsBuffer != nullptr) - weightsLength = shape::length(weightsShape); Nd4jLong outputLength = shape::length(outputShape); - Nd4jLong borderLen = 1;//outputLength / gridDim.x + outputLength % gridDim.x; + Nd4jLong borderLen = 1; for (Nd4jLong e = blockIdx.x; e < outputLength; e += threadCount) { //if (blockIdx.x < outputLength) { //if (e + threadCount < outputLength) { - Nd4jLong zOffset = shape::getIndexOffset(e, outputShape, outputLength); + Nd4jLong zOffset = shape::getIndexOffset(e, outputShape); //printf("%d %d %d\n", blockIdx.x, blockDim.x, threadIdx.x); //Nd4jLong borderLen = 1; T* outputBufferZ = reinterpret_cast<T*>(outputBuffer) + zOffset; adjustWeightsKernelD<T>(inputBuffer, inputShape, weightsBuffer, weightsShape, (void*)outputBufferZ, - inputLength, weightsLength, outputLength, (int)zOffset); + inputLength, outputLength, (int)zOffset); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu index b131ff83f..ada547ac3 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu @@ -46,9 +46,9 @@ __global__ static void zetaCuda(const void *vx, const Nd4jLong *xShapeInfo, for (int i = tid; i < len; i += totalThreads) { - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - const auto qOffset = shape::getIndexOffset(i, qShapeInfo, len); - const auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + const auto qOffset = shape::getIndexOffset(i, qShapeInfo); + const auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = zetaScalar<T>(x[xOffset], q[qOffset]); } diff --git a/libnd4j/include/ops/declarable/helpers/flatten.h b/libnd4j/include/ops/declarable/helpers/flatten.h index 05421383f..0513e45ea 100644 --- a/libnd4j/include/ops/declarable/helpers/flatten.h +++ b/libnd4j/include/ops/declarable/helpers/flatten.h @@ -24,12 +24,45 @@ #include #include -namespace nd4j { - namespace ops { - namespace helpers { - void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order); +namespace nd4j { +namespace ops { +namespace helpers { + + +////////////////////////////////////////////////////////////////////// +void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order); + + +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD Nd4jLong getIndexOffsetOrdered(Nd4jLong index, const Nd4jLong *shapeInfo, const char order) { + + Nd4jLong offset = 0; + + if (order == 'c') { + + for(uint i = shapeInfo[0]; i > 1; --i) { + offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]]; + index /= shapeInfo[i]; + } + + offset += index * shapeInfo[1 + shapeInfo[0]]; // last iteration + } + else { + + for(uint i = 1; i < shapeInfo[0]; ++i) { + offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]]; + index /= shapeInfo[i]; + } + + offset += index * shapeInfo[2 * shapeInfo[0]]; // last iteration + } + + return offset; +} + + +} +} } #endif //DEV_TESTS_FLATTEN_H
length)]; - values[shape::getIndexOffset(j, yShapeInfo, length)] = vtmp; + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; i++; j--; } } else { - while (key[shape::getIndexOffset(i, xShapeInfo, length)] < pivot) + while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) i++; - while (key[shape::getIndexOffset(j, xShapeInfo, length)] > pivot) + while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) j--; if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo, length)]; - key[shape::getIndexOffset(i, xShapeInfo, length)] = key[shape::getIndexOffset(j, xShapeInfo, length)]; - key[shape::getIndexOffset(j, xShapeInfo, length)] = ktmp; + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - vtmp = values[shape::getIndexOffset(i, yShapeInfo, length)]; - values[shape::getIndexOffset(i, yShapeInfo, length)] = values[shape::getIndexOffset(j, yShapeInfo, length)]; - values[shape::getIndexOffset(j, yShapeInfo, length)] = vtmp; + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; i++; j--; @@ -523,10 +522,9 @@ PRAGMA_OMP_TASK template void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { - auto length = shape::length(xShapeInfo); int i = left, j = right; X ktmp; - Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo, length)]; + Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; Y vtmp; @@ -534,35 +532,35 @@ PRAGMA_OMP_TASK /* PARTITION PART */ while (i <= j) { if (descending) { - while (value[shape::getIndexOffset(i, yShapeInfo, length)] > pivot) + while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) i++; - while (value[shape::getIndexOffset(j, yShapeInfo, length)] < pivot) + while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) j--; if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo, length)]; - key[shape::getIndexOffset(i, xShapeInfo, length)] = key[shape::getIndexOffset(j, xShapeInfo, length)]; - key[shape::getIndexOffset(j, xShapeInfo, length)] = ktmp; + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - vtmp = value[shape::getIndexOffset(i, yShapeInfo, length)]; - value[shape::getIndexOffset(i, yShapeInfo, length)] = value[shape::getIndexOffset(j, yShapeInfo, length)]; - value[shape::getIndexOffset(j, yShapeInfo, length)] = vtmp; + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; i++; j--; } } else { - while (value[shape::getIndexOffset(i, yShapeInfo, length)] < pivot) + while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) i++; - while (value[shape::getIndexOffset(j, yShapeInfo, length)] > pivot) + while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) j--; if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo, length)]; - key[shape::getIndexOffset(i, xShapeInfo, length)] = 
key[shape::getIndexOffset(j, xShapeInfo, length)]; - key[shape::getIndexOffset(j, xShapeInfo, length)] = ktmp; + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - vtmp = value[shape::getIndexOffset(i, yShapeInfo, length)]; - value[shape::getIndexOffset(i, yShapeInfo, length)] = value[shape::getIndexOffset(j, yShapeInfo, length)]; - value[shape::getIndexOffset(j, yShapeInfo, length)] = vtmp; + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; i++; j--; diff --git a/libnd4j/include/ops/special_accumulation_ops.h b/libnd4j/include/ops/special_accumulation_ops.h index 7a587e754..3f2b2ed1d 100644 --- a/libnd4j/include/ops/special_accumulation_ops.h +++ b/libnd4j/include/ops/special_accumulation_ops.h @@ -114,15 +114,15 @@ namespace simdOps { tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); numTads = shape::length(xShapeInfo) / tadLength; } - __syncthreads(); + __syncthreads(); for (int r = blockIdx.x; r < numTads; r += gridDim.x) { auto tadOffsetForBlock = tadOffsets[r]; sPartials[threadIdx.x] = startingValue(dx + tadOffsetForBlock); - for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = update(sPartials[threadIdx.x], op(dx[xOffset], result[r]), extraParams); } __syncthreads(); @@ -198,8 +198,8 @@ namespace simdOps { auto offset = tadOffsets[i]; T start = startingValue(x + offset); - for (int j = 0; j < tadLength; j++) { - auto xOffset = offset + shape::getIndexOffset(j, tadOnlyShapeInfo, tadLength); + for (int j = 0; j < tadLength; j++) { + auto xOffset = offset + shape::getIndexOffset(j, tadOnlyShapeInfo); start = update(start, op(x[xOffset], result[i]), extraParams); } diff --git a/libnd4j/include/ops/special_ops.h b/libnd4j/include/ops/special_ops.h index 33cce53c6..8f6ef6b5b 100644 --- a/libnd4j/include/ops/special_ops.h +++ b/libnd4j/include/ops/special_ops.h @@ -81,8 +81,8 @@ namespace simdOps { static inline __device__ void execSpecialCuda( T *dx, Nd4jLong *xShapeBuffer, Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, + Z *extraParams, + int *allocationPointer, Z *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { __shared__ int kH; @@ -119,7 +119,7 @@ namespace simdOps { __shared__ int kHEff; __shared__ int kWEff; __shared__ bool fOrder; - + if (threadIdx.x == 0) { kH = (int)extraParams[0]; @@ -266,7 +266,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha const Nd4jLong sH = (int)extraParams[2]; const Nd4jLong sW = (int)extraParams[3]; const Nd4jLong pH = (int)extraParams[4]; - const Nd4jLong pW = (int)extraParams[5]; + const Nd4jLong pW = (int)extraParams[5]; const Nd4jLong dH = (int)extraParams[6]; const Nd4jLong dW = (int)extraParams[7]; Nd4jLong poolingMode = (int)extraParams[9]; @@ -285,7 +285,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha const int iH = shape::sizeAt(inShapeBuffer, 2); const int iW = shape::sizeAt(inShapeBuffer, 3); const 
int oH = shape::sizeAt(outShapeBuffer, 2); - const int oW = shape::sizeAt(outShapeBuffer, 3); + const int oW = shape::sizeAt(outShapeBuffer, 3); const Nd4jLong iStride0 = shape::stride(inShapeBuffer)[0]; const Nd4jLong iStride1 = shape::stride(inShapeBuffer)[1]; const Nd4jLong iStride2 = shape::stride(inShapeBuffer)[2]; @@ -296,28 +296,28 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha const Nd4jLong oStride3 = shape::stride(outShapeBuffer)[3]; const Nd4jLong iStep2 = dH*iStride2; - const Nd4jLong iStep3 = dW*iStride3; + const Nd4jLong iStep3 = dW*iStride3; const int kProd = kH*kW; - const T iStep2Inv = 1./iStep2; + const T iStep2Inv = 1./iStep2; const T iStep3Inv = 1./iStep3; Nd4jLong hstart, wstart, hend, wend; T sum, *pIn; - if(poolingMode == 0) { // max + if(poolingMode == 0) { // max PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { + for(int c = 0; c < iC; ++c) { for(int oh = 0; oh < oH; ++oh) { for(int ow = 0; ow < oW; ++ow) { - + pIn = in + b * iStride0 + c * iStride1; - + hstart = oh * sH - pH; - wstart = ow * sW - pW; + wstart = ow * sW - pW; hend = hstart + kHEff; wend = wstart + kWEff; - + if(hstart < 0) hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); if(wstart < 0) @@ -333,8 +333,8 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha wend *= iStride3; sum = -nd4j::DataTypeUtils::max(); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { T val = pIn[kh + kw]; if (val > sum) @@ -344,16 +344,16 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } } } - } + } } -/*************************************************************************/ +/*************************************************************************/ else if(poolingMode == 1) { // avg PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { + for(int c = 0; c < iC; ++c) { for(int oh = 0; oh < oH; ++oh) { for(int ow = 0; ow < oW; ++ow) { - + pIn = in + b * iStride0 + c * iStride1; hstart = oh * sH - pH; @@ -376,30 +376,30 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha wend *= iStride3; sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) sum += pIn[kh + kw]; - + if ((int) extraParam0 == 0) //Exclude padding sum /= static_cast(nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(iStep2))) * static_cast(nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(iStep3))); //Accounts for dilation else if ((int) extraParam0 == 1) //Include padding sum /= kProd; - + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } } } } - } -/*************************************************************************/ + } +/*************************************************************************/ else if(poolingMode == 2) { // pnorm PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { + for(int c = 0; c < iC; ++c) { for(int oh = 0; oh < oH; ++oh) { for(int ow = 0; ow < oW; ++ow) { - + pIn = in + b * 
iStride0 + c * iStride1; hstart = oh * sH - pH; @@ -422,13 +422,13 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha wend *= iStride3; sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - + sum = nd4j::math::nd4j_pow(sum, (T) 1. / extraParam0); - + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } } @@ -482,7 +482,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } template - class + class Im2col { public: static const bool requiresSpecial = true; @@ -502,8 +502,8 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha static inline __device__ void execSpecialCuda( T *dx, Nd4jLong *xShapeBuffer, T *result, Nd4jLong *zShapeBuffer, - T *extraParams, - int *allocationPointer, T *reductionPointer, + T *extraParams, + int *allocationPointer, T *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ @@ -606,7 +606,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha T *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ - // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] + // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] int kH = (int)extraParams[0]; int kW = (int)extraParams[1]; @@ -615,7 +615,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha int pH = (int)extraParams[4]; int pW = (int)extraParams[5]; int dH = (int)extraParams[6]; //Dilation, height/y dimension - int dW = (int)extraParams[7]; //Dilation, width/x dimension + int dW = (int)extraParams[7]; //Dilation, width/x dimension T zeroPadVal = extraParams[9]; auto colShape = shape::shapeOf(colShapeBuffer); @@ -642,33 +642,33 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha T *col, *im; int imRow, imCol; - + if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - + for (int colW = 0; colW < oW; ++colW) { + imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; - + col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - + im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) *col = zeroPadVal; - else + else *col = *im; } } } } } - } + } } else { @@ -677,18 +677,18 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha for (int colH = 0; colH < oH; ++colH) { for (int colW 
= 0; colW < oW; ++colW) { for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; - + col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) *col = zeroPadVal; - else + else *col = *im; } } @@ -743,8 +743,8 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha static inline __device__ void execSpecialCuda( T *dx, Nd4jLong *xShapeBuffer, Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, + Z *extraParams, + int *allocationPointer, Z *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { @@ -782,8 +782,8 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha static inline __device__ void execSpecialCuda( X *dx, Nd4jLong *xShapeBuffer, X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, + X *extraParams, int *allocationPointer, + X *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { __shared__ int strideex, stridech, stridekrow, stridekcol, striderow, stridecol, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, imgHeight, imgWidth, dY, dX, samples, depth, imgH, imgW, height_col, width_col, n, kEffectiveW, kEffectiveH; @@ -856,7 +856,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { int h_k = (h_im - h_col * strideY); int w_k = (w_im - w_col * strideX); - + if(h_k % dY == 0 && w_k % dX == 0){ h_k /= dY; w_k /= dX; @@ -892,7 +892,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha auto colShape = shape::shapeOf(colShapeBuffer); auto colStride = shape::stride(colShapeBuffer); auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); + auto imStride = shape::stride(imShapeBuffer); const int sH = (int)extraParams[0]; const int sW = (int)extraParams[1]; @@ -900,13 +900,13 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha const int pW = (int)extraParams[3]; const int iH = (int)extraParams[4]; const int iW = (int)extraParams[5]; - const int dH = (int)extraParams[6]; - const int dW = (int)extraParams[7]; + const int dH = (int)extraParams[6]; + const int dW = (int)extraParams[7]; const int bS = imShape[0]; const int iC = imShape[1]; const int kH = colShape[2]; - const int kW = colShape[3]; + const int kW = colShape[3]; const int oH = colShape[4]; const int oW = colShape[5]; const Nd4jLong colStride0 = colStride[0]; @@ -932,12 +932,12 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + for (int b = 0; b < bS; b++) { + for (int c = 0; 
c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + for (int colW = 0; colW < oW; ++colW) { imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; @@ -952,21 +952,21 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } } } - } + } } else { PRAGMA_OMP_PARALLEL_FOR_ARGS(private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { + for (int b = 0; b < bS; b++) { for (int colH = 0; colH < oH; ++colH) { for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; - + col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; @@ -975,9 +975,9 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } } } - } + } } - } + } } } @@ -1021,10 +1021,10 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha static const bool requiresSpecial = true; #ifdef __CUDACC__ - static inline __device__ void execSpecialCuda(X *dx, Nd4jLong *xShapeBuffer, - X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, + static inline __device__ void execSpecialCuda(X *dx, Nd4jLong *xShapeBuffer, + X *result, Nd4jLong *zShapeBuffer, + X *extraParams, int *allocationPointer, + X *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { __shared__ Nd4jLong xLength; @@ -1064,12 +1064,12 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha dx[idx2] = dx[idx1]; dx[idx1] = tmp; } - } - else { + } + else { for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer, xLength); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer, xLength); + auto xOffset = shape::getIndexOffset(e, xShapeBuffer); + auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); result[zOffset] = dx[xOffset]; } } @@ -1094,12 +1094,12 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { result[(sLength - e) * zEWS] = dx[e * xEWS]; } - } - else { + } + else { for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer, xLength); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer, xLength); + auto xOffset = shape::getIndexOffset(e, xShapeBuffer); + auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); result[zOffset] = dx[xOffset]; } } @@ -1134,13 +1134,13 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha dx[idx2] = dx[idx1]; dx[idx1] = tmp; } - } + } else { PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer, xLength); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer, xLength); + for (Nd4jLong e = 0; e < xLength / 2; e++) { + auto xOffset = shape::getIndexOffset(e, xShapeBuffer); + auto zOffset = shape::getIndexOffset(sLength - e, 
xShapeBuffer); result[zOffset] = dx[xOffset]; } @@ -1160,13 +1160,13 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha for (Nd4jLong e = 0; e < xLength; e++) { result[(sLength - e) * zEWS] = dx[e * xEWS]; } - } + } else { PRAGMA_OMP_PARALLEL_FOR_SIMD for (Nd4jLong e = 0; e < xLength; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer, xLength); - auto zOffset = shape::getIndexOffset(sLength - e, zShapeBuffer, xLength); + auto xOffset = shape::getIndexOffset(e, xShapeBuffer); + auto zOffset = shape::getIndexOffset(sLength - e, zShapeBuffer); result[zOffset] = dx[xOffset]; } } @@ -1192,7 +1192,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha void *vx, Nd4jLong *xShapeBuffer, void *vresult, Nd4jLong *zShapeBuffer, void *vextraParams, - int *allocationPointer, void *reductionPointer, + int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto dx = reinterpret_cast(vx); @@ -1263,10 +1263,10 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha tadShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); } - + const uint tadLen = shape::length(tadShapeInfo); const uint numOfTads = shape::length(xShapeInfo) / tadLen; - + if(shape::elementWiseStride(tadShapeInfo) == 1) { PRAGMA_OMP_PARALLEL_FOR_SIMD @@ -1277,18 +1277,18 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha X max = -nd4j::DataTypeUtils::max(); X sum = 0; - + for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); - + max = nd4j::math::nd4j_max(max, inBuff[j]); + for (uint j = 0; j < tadLen; ++j) { X temp = nd4j::math::nd4j_exp(inBuff[j] - max); outBuff[j] = temp; sum += temp; } - + for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; + outBuff[j] /= sum; } } else { @@ -1300,17 +1300,17 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha shape::calcOffsets(tadShapeInfo, offsets); PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { + for (uint i = 0; i < numOfTads; ++i) { X* inBuff = x + tadOffsets[i]; X* outBuff = z + tadOffsets[i]; X max = -nd4j::DataTypeUtils::max(); - X sum = 0.f; + X sum = 0.f; + + for(uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - for (uint j = 0; j < tadLen; ++j) { X temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); outBuff[offsets[j]] = temp; @@ -1351,7 +1351,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha delete[] maxResultShapeBuffer; delete[] maxResult; - } + } } else if (shape::isVector(xShapeInfo)) { auto max = -nd4j::DataTypeUtils::max(); @@ -1416,7 +1416,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha void *vx, Nd4jLong *xShapeBuffer, void *vresult, Nd4jLong *zShapeBuffer, void *vextraParams, - int *allocationPointer, void *reductionPointer, + int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto shape = shape::shapeOf(xShapeBuffer); @@ -1578,7 +1578,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha void *vx, Nd4jLong *xShapeBuffer, void *vresult, Nd4jLong *zShapeBuffer, void *vextraParams, - int *allocationPointer, void *reductionPointer, + int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { 
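
// Every hunk in this file applies the same mechanical substitution: shape::getIndexOffset
// has dropped its trailing arrLen argument, because the array length is recoverable from
// the shapeInfo descriptor itself. Below is a minimal sketch of the resulting call
// pattern, modeled on the f * (1 - f) derivative loop in a later hunk of this file;
// the function name elementwiseDerivative and its parameters are illustrative only,
// not part of this patch, and the shape.h helpers are assumed to be in scope:

template <typename X>
static void elementwiseDerivative(X *buffer, const Nd4jLong *shapeInfo) {
    const auto len = shape::length(shapeInfo);                     // length comes from shapeInfo...
    for (Nd4jLong i = 0; i < len; i++) {
        const auto offset = shape::getIndexOffset(i, shapeInfo);   // ...so it is no longer passed here
        buffer[offset] = buffer[offset] * ((X) 1.0f - buffer[offset]);
    }
}
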
auto dx = reinterpret_cast(vx); @@ -1650,7 +1650,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha auto dx = reinterpret_cast(vx); auto result = reinterpret_cast(vresult); auto extraParams = reinterpret_cast(vextraParams); - + if (shape::isMatrix(xShapeBuffer, 2)) { auto shape = shape::shapeOf(xShapeBuffer); @@ -1700,9 +1700,9 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } } else { - - for (int i = 0; i < len; i++) { - Nd4jLong zOffset = shape::getIndexOffset(i, zShapeBuffer, len); + + for (int i = 0; i < len; i++) { + Nd4jLong zOffset = shape::getIndexOffset(i, zShapeBuffer); result[zOffset] = result[zOffset] * ((X) 1.0f - result[zOffset]); } } @@ -2013,8 +2013,8 @@ PRAGMA_OMP_CRITICAL static inline __device__ void execSpecialCuda( void *vx, Nd4jLong *xShapeBuffer, void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, int *allocationPointer, - void *reductionPointer, + void *vextraParams, int *allocationPointer, + void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto dx = reinterpret_cast(vx); @@ -2162,7 +2162,7 @@ PRAGMA_OMP_CRITICAL //decompose in to several sub tads after //moving all dimensions (in sorted order) //to the back. - //permuted version of the x shape info for setting up the tad problem + //permuted version of the x shape info for setting up the tad problem auto tadShapeShapeInfo = tadShapeInfo; if(tadShapeInfo==nullptr) { auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeBuffer, dimension, dimensionLength); @@ -2170,7 +2170,7 @@ PRAGMA_OMP_CRITICAL tadShapeShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); tadShapeInfo = tadShapeShapeInfo; - } + } auto tadLength = shape::length(tadShapeInfo);//shape::tadLength(xShapeBuffer, dimension, dimensionLength); auto tads = shape::length(xShapeBuffer) / tadLength; diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index 0d90c212a..1ae310ad4 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -111,24 +111,24 @@ namespace randomOps { } // __syncthreads(); // Eliminated due RTX20xx specific } - } + } else { - + for (Nd4jLong i = tid; i < zLength; i+=blockDim.x * gridDim.x) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, zLength); + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); T prob = rng->relativeT(i); T cumProb = (T) 0.0f; for (Nd4jLong f = 0; f < yLength; f++) { - - auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer, yLength); + + auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); T relProb = y[yOffset2]; cumProb += relProb; - if (prob <= cumProb || f == yLength - 1) { - - auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer, xLength); + if (prob <= cumProb || f == yLength - 1) { + + auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); z[zOffset2] = x[xOffset2]; f += yLength; } @@ -179,25 +179,25 @@ namespace randomOps { } } } - } + } else { PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) for (Nd4jLong i = 0; i < zLength; i++) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, zLength); + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); T prob = rng->relativeT(i); T cumProb = (T) 0.0f; for (Nd4jLong f = 0; f < yLength; f++) { - - auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer, yLength); + + auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); T relProb = y[yOffset2]; cumProb += relProb; - if (prob <= cumProb || f == yLength - 
1) { - - auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer, xLength); + if (prob <= cumProb || f == yLength - 1) { + + auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); z[zOffset2] = x[xOffset2]; break; } @@ -571,8 +571,8 @@ namespace randomOps { } } }; - -////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////// // This Op produces random Gaussian values within [mean-2*stddev,mean+2*stddev] template class TruncatedNormalDistribution { diff --git a/libnd4j/include/pairwise_util.h b/libnd4j/include/pairwise_util.h index c4b84bee2..e87e8961d 100755 --- a/libnd4j/include/pairwise_util.h +++ b/libnd4j/include/pairwise_util.h @@ -50,7 +50,7 @@ namespace shape { Nd4jLong elementWiseStride(const Nd4jLong *shapeInfo); char order(const Nd4jLong *shapeInfo); bool isStrideSimple(const Nd4jLong* shapeInfo); - Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen); + Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo); } */ @@ -269,13 +269,13 @@ public: Nd4jLong chunks; Nd4jLong modulo; Nd4jLong remainder; - + BlockInformation(Nd4jLong length, int threshold) { threads = length / threshold; threads = nd4j::math::nd4j_max(1, threads); threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); - + items = length / threads; remainder = length % threads; if(items < 1) diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index d8cf86495..a5664a24b 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -403,7 +403,7 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_3) { auto input = NDArrayFactory::create('c', {3, 3, 8, 8}); auto weightsD = NDArrayFactory::create('c', {1, 3, 1, 1}); auto weightsP = NDArrayFactory::create('c', {2, 3, 1, 1}); - auto bias = NDArrayFactory::create('c', {1, 2}); + auto bias = NDArrayFactory::create('c', {2}); auto output = NDArrayFactory::create('c', {3, 2, 8, 8}); output.assign(0.0); @@ -911,7 +911,7 @@ TEST_F(ConvolutionTests1, TestDeconv_ff_2) { auto input = NDArrayFactory::create('c', {3, 3, 4, 4}); auto weights = NDArrayFactory::create('c',{3, 2, 1, 1}); - auto bias = NDArrayFactory::create('c', {1, 2}); + auto bias = NDArrayFactory::create('c', {2}); input.linspace(1); weights.linspace(1); @@ -935,11 +935,11 @@ TEST_F(ConvolutionTests1, TestDeconv_ff_2) { TYPED_TEST(TypedConvolutionTests1, Test_Conv1D_ff_1) { auto input = NDArrayFactory::create('c', {2, 2, 6}); auto weights = NDArrayFactory::create('c', {2, 2, 3}, {1,5,9,3,7,11,2,6,10,4,8,12}); - auto bias = NDArrayFactory::create('c', {1, 3}); + auto bias = NDArrayFactory::create('c', {3}); auto expFF = NDArrayFactory::create('c', {2, 3, 5}, {59.0, 69.0, 79.0, 89.0, 99.0, 132.0, 158.0, 184.0, 210.0, 236.0, 205.0, 247.0, 289.0, 331.0, 373.0, 179.0, 189.0, 199.0, 209.0, 219.0, 444.0, 470.0, 496.0, 522.0, 548.0, 709.0, 751.0, 793.0, 835.0, 877.0}); auto expEps = NDArrayFactory::create('c', {2, 2, 6}, {130.0, 293.0, 326.0, 359.0, 392.0, 220.0, 166.0, 371.0, 416.0, 461.0, 506.0, 280.0, 355.0, 788.0, 821.0, 854.0, 887.0, 490.0, 481.0, 1046.0, 1091.0, 1136.0, 1181.0, 640.0}); auto expGW = NDArrayFactory::create('c', {3, 2, 2}, {1415.0, 1520.0, 2045.0, 2150.0, 1865.0, 2020.0, 2795.0, 2950.0, 2315.0, 2520.0, 3545.0, 3750.0}); - auto expGB = NDArrayFactory::create('c', {1, 3}, {105.0, 155.0, 205.0}); + auto expGB = NDArrayFactory::create('c', {3}, {105.0, 
155.0, 205.0}); expGW.permutei({2,1,0}); input.linspace(1); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp index 01e8e82c2..7428539f3 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp @@ -47,7 +47,7 @@ TEST_F(DeclarableOpsTests11, test_mixed_biasadd_1) { auto exp = NDArrayFactory::create('c', {2, 3}, {1.f, 2.f, 3.f, 1.f, 2.f, 3.f}); nd4j::ops::biasadd op; - auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); + auto status = op.execute({&x, &y}, {&z}, {}, {}, {true}); ASSERT_EQ(Status::OK(), status); ASSERT_EQ(exp, z); @@ -66,11 +66,11 @@ TEST_F(DeclarableOpsTests11, test_listdiff_1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-12.49997,-13.04346, -13.63635, -14.28571,-14.99999,-15.78947, -16.66666, -17.64705,-18.75 ,-20. , -21.42857, -23.07692, -24.99999,-27.27272, -29.99999, -33.33332,-37.49999,-42.85713, -49.99998, -59.99998,-74.99995,-99.99992,-149.99986,-299.99911}); NDArray dLdwExp('c', {2,3,4}, {3.21887, 4.96807, 6.10512, 6.80726, 7.15461, 7.19051, 6.93973, 6.41584, 5.62456, 4.56548, 3.2326 , 1.61444, @@ -80,14 +80,14 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {0}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -103,16 +103,16 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {2,1,4}, {15.99805, 16.72406, 16.27746, 14.83754,-44.97147,-59.99582,-79.28771,-107.35497}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {0}); @@ -120,7 +120,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -129,11 +129,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-12.49997,-13.04346, -13.63635, -14.28571,-14.99999,-15.78947, -16.66666, -17.64705,-18.75 ,-20. 
, -21.42857, -23.07692, -24.99999,-27.27272, -29.99999, -33.33332,-37.49999,-42.85713, -49.99998, -59.99998,-74.99995,-99.99992,-149.99986,-299.99911}); NDArray dLdwExp('c', {}, {-227.77286}); @@ -142,14 +142,14 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -165,22 +165,22 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test4) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {4.8876 , -46.29156, -186.36887}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto *dLdw = results->at(1); // dLdw->printIndexedBuffer(); // dLdw->printShapeInfo(); @@ -193,11 +193,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-1.04166,-1.08696, -1.13636, -1.19048,-1.25 ,-1.31579, -1.38889, -1.47059,-1.5625 ,-1.66667, -1.78571, -1.92308, -2.08333,-2.27273, -2.5 , -2.77778,-3.125 ,-3.57143, -4.16667, -5. 
,-6.25 ,-8.33333,-12.49999,-24.99993}); NDArray dLdwExp('c', {2,3,4}, {1.05912, 1.20488, 1.29964, 1.35815, 1.3871 , 1.39009, 1.36919, 1.32553, 1.25959, 1.17133, 1.06026, 0.92541, @@ -207,14 +207,14 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -230,16 +230,16 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {6.73432, 2.46939,-9.20372}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {2}); @@ -247,7 +247,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -256,16 +256,16 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test7) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {}, {0.}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {2}); @@ -273,20 +273,20 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. ,-1.5 ,-1.57895, -1.66667, -1.76471,-1.875 ,-2. , -2.14286, -2.30769, -2.5 ,-2.72727, -3. , -3.33333,-3.75 ,-4.28571, -5. , -6. 
,-7.49999,-9.99999,-14.99999,-29.99991}); NDArray dLdwExp('c', {2,3,4}, {1.56625, 1.74117, 1.85487, 1.92509, 1.95982, 1.96341, 1.93833, 1.88594, 1.80682, 1.70091, 1.56762, 1.4058 , @@ -307,7 +307,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -323,11 +323,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.52083,-0.54348,-0.56818, -0.59524,-0.625 ,-0.65789,-0.69444, -0.73529,-0.78125,-0.83333,-0.89286, -0.96154, -1.04167,-1.13636,-1.25 , -1.38889,-1.5625 ,-1.78571,-2.08333, -2.5 ,-3.125 ,-4.16666,-6.24999,-12.49996}); NDArray dLdwExp('c', {2,3,4}, {0.13412, 0.207 , 0.25438, 0.28364, 0.29811, 0.2996 , 0.28916, 0.26733, 0.23436, 0.19023, 0.13469, 0.06727, @@ -338,13 +338,13 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - + nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -357,10 +357,10 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); @@ -375,8 +375,8 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -386,7 +386,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); @@ -401,8 +401,8 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -412,11 +412,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, { 0. , 0. , 0. , 0. ,-0.75 ,-0.789473,-0.833333, -0.882353,-0.9375 ,-1. 
,-1.071428, -1.153846, -1.25 ,-1.363636,-1.5 , -1.666666,-1.875 ,-2.142857,-2.499999, -2.999999,-3.749997,-4.999997,-7.499993,-14.999956}); NDArray dLdwExp('c', {2,3,4}, {0.16094, 0.2484 , 0.30526, 0.34036, 0.35773, 0.35953, 0.34699, 0.32079, 0.28123, 0.22827, 0.16163, 0.08072, @@ -433,15 +433,15 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { weights.t(2) = 0.; weights.t(3) = 0.; - + nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -455,11 +455,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test13) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , -2.08333,-2.27273, -2.5 , -2.77778,-3.125 ,-3.57143, -4.16667, -5. ,-6.25 ,-8.33333,-12.49999,-24.99993}); NDArray dLdwExp('c', {2,3,1}, {1.75828, 2.30839, 1.25309, -1.35098, -6.16602,-16.78383}); @@ -471,16 +471,16 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test13) { weights.assign(0.5); weights.t(0) = 0.; weights.t(1) = 0.; - weights.t(2) = 0.; - + weights.t(2) = 0.; + nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -494,10 +494,10 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test13) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, summaryStatsData_test1) { - + functions::summarystats::SummaryStatsData var1; functions::summarystats::SummaryStatsData var2; - var2.n = var2.mean = var2.M2 = var2.M3 = var2.M4 = var2.bias = 5; + var2.n = var2.mean = var2.M2 = var2.M3 = var2.M4 = var2.bias = 5; functions::summarystats::SummaryStatsData* arr = new functions::summarystats::SummaryStatsData[2]; arr[0] = var1; @@ -515,11 +515,11 @@ TEST_F(DeclarableOpsTests11, summaryStatsData_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.96, -1.92, -2.88, -3.84, -4.8 , -5.76, -6.72, -7.68, -8.64, -9.6 ,-10.56,-11.52, -12.48,-13.44,-14.4 ,-15.36,-16.32,-17.28,-18.24,-19.2 ,-20.16,-21.12,-22.08,-23.04}); NDArray dLdwExp('c', {2,3,4}, {0.9216 , 3.6864 , 8.2944 , 14.7456 , 23.04 , 33.1776 , 45.1584 , 58.9824 , 74.6496 , 92.16 ,111.51361,132.7104 , @@ -527,14 +527,14 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, 
{0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto dLdp = results->at(0); + auto dLdp = results->at(0); auto dLdw = results->at(1); auto dLdl = results->at(2); @@ -547,19 +547,19 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {2,1,4}, {98.61121,129.024 , 164.9664 , 206.4384 , 828.51837,925.28644,1027.58398,1135.41113}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {0}); @@ -567,7 +567,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -576,25 +576,25 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test3) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.96, -1.92, -2.88, -3.84, -4.8 , -5.76, -6.72, -7.68, -8.64, -9.6 ,-10.56,-11.52, -12.48,-13.44,-14.4 ,-15.36,-16.32,-17.28,-18.24,-19.2 ,-20.16,-21.12,-22.08,-23.04}); NDArray dLdwExp('c', {}, {4515.84}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -610,22 +610,22 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test4) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {807.32153, 1426.63684, 2281.88159}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); @@ -636,11 +636,11 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.08,-0.16,-0.24,-0.32,-0.4 ,-0.48,-0.56,-0.64,-0.72,-0.8 ,-0.88,-0.96, -1.04,-1.12,-1.2 ,-1.28,-1.36,-1.44,-1.52,-1.6 ,-1.68,-1.76,-1.84,-1.92}); NDArray dLdwExp('c', {2,3,4}, {-15.6032,-15.3728,-14.9888,-14.4512,-13.76 
,-12.9152,-11.9168,-10.7648, -9.4592, -8. , -6.3872, -4.6208, @@ -648,14 +648,14 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -671,16 +671,16 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {-58.16319, -6.5536 , 64.71682}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); @@ -688,7 +688,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -697,16 +697,16 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test7) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {}, {0.}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); @@ -714,20 +714,20 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. ,0. ,0. ,0. 
,-0.48 ,-0.576,-0.672,-0.768,-0.864,-0.96 ,-1.056,-1.152, -1.248,-1.344,-1.44 ,-1.536,-1.632,-1.728,-1.824,-1.92 ,-2.016,-2.112,-2.208,-2.304}); NDArray dLdwExp('c', {2,3,4}, {-22.3488 ,-22.07232,-21.61152,-20.9664 ,-20.13696,-19.1232 ,-17.92512,-16.54272,-14.976 ,-13.22496,-11.2896 , -9.16992, @@ -746,7 +746,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -762,11 +762,11 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.04,-0.08,-0.12,-0.16,-0.2 ,-0.24,-0.28,-0.32,-0.36,-0.4 ,-0.44,-0.48, -0.52,-0.56,-0.6 ,-0.64,-0.68,-0.72,-0.76,-0.8 ,-0.84,-0.88,-0.92,-0.96}); NDArray dLdwExp('c', {2,3,4}, {0.0384, 0.1536, 0.3456, 0.6144, 0.96 , 1.3824, 1.8816, 2.4576, 3.1104, 3.84 , 4.6464, 5.5296, @@ -775,13 +775,13 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - + nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -797,7 +797,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); @@ -812,8 +812,8 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -823,7 +823,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); @@ -838,8 +838,8 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -849,11 +849,11 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0.,0.,0.,0., -0.24 
,-0.288,-0.336,-0.384,-0.432,-0.48 ,-0.528,-0.576, -0.624,-0.672,-0.72 ,-0.768,-0.816,-0.864,-0.912,-0.96 ,-1.008,-1.056,-1.104,-1.152}); NDArray dLdwExp('c', {2,3,4}, {0.04608, 0.18432, 0.41472, 0.73728, 1.152 , 1.65888, 2.25792, 2.94912, 3.73248, 4.608 , 5.57568, 6.63552, @@ -866,15 +866,15 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { weights.t(1) = 0.; weights.t(2) = 0.; weights.t(3) = 0.; - + nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -888,11 +888,11 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test13) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., -1.04,-1.12,-1.2 ,-1.28,-1.36,-1.44,-1.52,-1.6 ,-1.68,-1.76,-1.84,-1.92}); NDArray dLdwExp('c', {2,3,1}, {2.304 , 13.3632 , 34.2528 , 64.97279,105.5232 ,155.90401}); @@ -902,16 +902,16 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test13) { weights.assign(0.5); weights.t(0) = 0.; weights.t(1) = 0.; - weights.t(2) = 0.; - + weights.t(2) = 0.; + nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -963,11 +963,11 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5, -0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5}); NDArray dLdwExp('c', {2,3,4}, {0.96, 1.92, 2.88, 3.84, 4.8 , 5.76, 6.72, 7.68, 8.64, 9.6 ,10.56,11.52, @@ -975,14 +975,14 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto dLdp = results->at(0); + auto dLdp = results->at(0); auto dLdw = results->at(1); auto dLdl = results->at(2); @@ -995,19 +995,19 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); - + NDArray 
dLdwExp('c', {2,1,4}, {14.4 , 17.28, 20.16, 23.04, 48.96, 51.84, 54.72, 57.6}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {0}); @@ -1015,7 +1015,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1024,25 +1024,25 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test3) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5, -0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5}); NDArray dLdwExp('c', {}, {288.}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1058,22 +1058,22 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test4) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {65.28, 96., 126.72001}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); @@ -1084,11 +1084,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167, -0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167}); NDArray dLdwExp('c', {2,3,4}, {-0.92,-0.84,-0.76,-0.68,-0.6 ,-0.52,-0.44,-0.36,-0.28,-0.2 ,-0.12,-0.04, @@ -1096,14 +1096,14 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1119,16 
+1119,16 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {-2.56, 0., 2.56}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); @@ -1136,7 +1136,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1145,16 +1145,16 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test7) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {}, {0.}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); @@ -1162,20 +1162,20 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0. ,-0. ,-0. ,-0. 
,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05, -0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05}); NDArray dLdwExp('c', {2,3,4}, {-1.296,-1.2 ,-1.104,-1.008,-0.912,-0.816,-0.72 ,-0.624,-0.528,-0.432,-0.336,-0.24 , @@ -1194,7 +1194,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1210,11 +1210,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083}); NDArray dLdwExp('c', {2,3,4}, {0.04, 0.08, 0.12, 0.16, 0.2 , 0.24, 0.28, 0.32,0.36, 0.4 , 0.44, 0.48, @@ -1223,13 +1223,13 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - + nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1245,7 +1245,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); @@ -1260,8 +1260,8 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1271,7 +1271,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); @@ -1286,8 +1286,8 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1297,11 +1297,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, 
nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., -0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025}); NDArray dLdwExp('c', {2,3,4}, {0.048, 0.096, 0.144, 0.192,0.24 , 0.288, 0.336, 0.384,0.432, 0.48 , 0.528, 0.576, @@ -1314,15 +1314,15 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { weights.t(1) = 0.; weights.t(2) = 0.; weights.t(3) = 0.; - + nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -1336,11 +1336,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., -0.04167, -0.04167, -0.04167, -0.04167,-0.04167, -0.04167, -0.04167, -0.04167,-0.04167, -0.04167, -0.04167, -0.04167}); NDArray dLdwExp('c', {2,3,1}, {0.8 ,2.08,3.36,4.64,5.92,7.2 }); @@ -1350,16 +1350,16 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { weights.assign(0.5); weights.t(0) = 0.; weights.t(1) = 0.; - weights.t(2) = 0.; - + weights.t(2) = 0.; + nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -1373,11 +1373,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { - + NDArray x = NDArrayFactory::create('c', {2,3,4}); NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); - + x.linspace(1); y.linspace(1); exp.linspace(2,2); @@ -1385,7 +1385,7 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { auto results = op.execute({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto res = results->at(0); res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); @@ -1439,11 +1439,11 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.25999, -0.755 , -1.25 , -1.745 , -2.24001, -2.73502, -3.23004, -3.72508, -4.22014, -4.71523, -5.21034, -5.70548, -6.20066, -6.69587, -7.19113, -7.68643, -8.18177, 
-8.67717, -9.17262, -9.66813,-10.1637 ,-10.65932,-11.15501,-11.65077}); NDArray dLdwExp('c', {2,3,4}, {0.73395, 0.75335, 0.69315, 0.55335, 0.33395, 0.03495, -0.34366, -0.80186, -1.33967, -1.95708, -2.65411, -3.43074, @@ -1453,14 +1453,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1473,14 +1473,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test2) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.18499,-0.53 ,-0.875 ,-1.22 ,-1.56501,-1.91002,-2.25504,-2.60008,-2.94514,-3.29023,-3.63534,-3.98048, -4.32566,-4.67087,-5.01613,-5.36143,-5.70677,-6.05217,-6.39762,-6.74313,-7.0887 ,-7.43432,-7.78001,-8.12577}); NDArray dLdwExp('c', {2,1,4}, {0.43622, -0.19079, -0.98462, -1.94525,-18.09855,-20.72768,-23.52373,-26.48669}); @@ -1489,14 +1489,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test2) { logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1512,11 +1512,11 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.18499,-0.53 ,-0.875 ,-1.22 ,-1.56501,-1.91002,-2.25504,-2.60008,-2.94514,-3.29023,-3.63534,-3.98048, -4.32566,-4.67087,-5.01613,-5.36143,-5.70677,-6.05217,-6.39762,-6.74313,-7.0887 ,-7.43432,-7.78001,-8.12577}); NDArray dLdwExp('c', {}, {-91.52109}); @@ -1525,14 +1525,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1548,22 +1548,22 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test4) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {-12.54779,-28.13393,-50.83936}); - + logits.linspace(-0.08, 0.04); 
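    // Note on the call pattern used throughout these loss-grad tests: in
    // op.execute({inputs}, {tArgs}, {iArgs}) the first list carries the input
    // arrays, the second the floating-point arguments, and the third the
    // integer arguments. For sigm_cross_entropy_loss_grad the float argument
    // is the label-smoothing factor and the integer argument selects the
    // reduction mode (0 = none, 1 = sum, 2 = weighted mean, 3 = mean over
    // nonzero weights -- mode names inferred from how the expected gradients
    // in this file scale, not taken from the op's documentation). Also,
    // linspace(a, s) fills a buffer starting at a with step s, so the logits
    // here hold -0.08, -0.04, 0., 0.04, ... A minimal sketch of one call:
    //
    //   nd4j::ops::sigm_cross_entropy_loss_grad gradOp;
    //   auto res = gradOp.execute({&logits, &weights, &labels},
    //                             {0.3},   // label smoothing (assumed meaning)
    //                             {1});    // reduction mode: sum (assumed)
    //   auto dLdp = res->at(0);            // gradient w.r.t. logits
    //   auto dLdw = res->at(1);            // gradient w.r.t. weights
    //   auto dLdl = res->at(2);            // gradient w.r.t. labels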
labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); @@ -1574,11 +1574,11 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.01542,-0.04417,-0.07292,-0.10167,-0.13042,-0.15917,-0.18792,-0.21667,-0.24543,-0.27419,-0.30294,-0.33171, -0.36047,-0.38924,-0.41801,-0.44679,-0.47556,-0.50435,-0.53314,-0.56193,-0.59072,-0.61953,-0.64833,-0.67715}); NDArray dLdwExp('c', {2,3,4}, {0.37794, 0.37906, 0.37554, 0.36739, 0.35461, 0.33719, 0.31514, 0.28846, 0.25714, 0.22119, 0.18061, 0.13539, @@ -1588,14 +1588,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1611,16 +1611,16 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {1.4966 , 0.19776,-1.69436}); logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {2}); @@ -1628,7 +1628,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1637,16 +1637,16 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test7) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {}, {0.}); - + logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {2}); @@ -1654,20 +1654,20 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); 
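    // This test and tests 12/13 below zero out the leading weights through
    // NDArray's flat-index accessor, written here as weights.t(i). In current
    // libnd4j that accessor is templated, weights.t<double>(i); the angle
    // brackets appear to have been stripped by the formatting of this patch.
    // Zero weights null the matching entries of dLdp, which is why the
    // expected arrays open with runs of zeros. Sketch of the masking pattern,
    // assuming the templated accessor:
    //
    //   NDArray w('c', {2,3,4}, nd4j::DataType::DOUBLE);
    //   w.assign(0.5);
    //   for (Nd4jLong i = 0; i < 4; ++i)
    //       w.t<double>(i) = 0.;   // mask the first four weight entries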
NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, { 0. , 0. , 0. , 0. ,-0.1565 ,-0.191 ,-0.2255 ,-0.26001,-0.29451,-0.32902,-0.36353,-0.39805, -0.43257,-0.46709,-0.50161,-0.53614,-0.57068,-0.60522,-0.63976,-0.67431,-0.70887,-0.74343,-0.778 ,-0.81258}); NDArray dLdwExp('c', {2,3,4}, {0.54353, 0.54487, 0.54065, 0.53087, 0.51553, 0.49463, 0.46817, 0.43615, 0.39857, 0.35543, 0.30672, 0.25246, @@ -1687,7 +1687,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1703,11 +1703,11 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.00771, -0.02208, -0.03646, -0.05083,-0.06521, -0.07958, -0.09396, -0.10834,-0.12271, -0.13709, -0.15147, -0.16585, -0.18024, -0.19462, -0.20901, -0.22339,-0.23778, -0.25217, -0.26657, -0.28096,-0.29536, -0.30976, -0.32417, -0.33857}); NDArray dLdwExp('c', {2,3,4}, {0.03008, 0.03064, 0.02888, 0.02481, 0.01841, 0.00971, -0.00132, -0.01466,-0.03032, -0.0483 , -0.06859, -0.0912 , @@ -1717,13 +1717,13 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { logits.linspace(-0.08, 0.04); labels.linspace(1); weights.assign(0.5); - + nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1736,10 +1736,10 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); @@ -1754,8 +1754,8 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1765,7 +1765,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test11) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); @@ -1780,8 +1780,8 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test11) { auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1791,11 +1791,11 @@ TEST_F(DeclarableOpsTests11, 
sigm_cross_entropy_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. ,-0.07825, -0.0955 , -0.11275, -0.13 ,-0.14726, -0.16451, -0.18177, -0.19902, -0.21628, -0.23354, -0.25081, -0.26807,-0.28534, -0.30261, -0.31988, -0.33716,-0.35443, -0.37172, -0.389 , -0.40629}); NDArray dLdwExp('c', {2,3,4}, {0.0361 , 0.03677, 0.03466, 0.02977, 0.0221 , 0.01165, -0.00158, -0.01759,-0.03638, -0.05795, -0.08231, -0.10944, @@ -1810,15 +1810,15 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { weights.t(2) = 0.; weights.t(3) = 0.; - + nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -1832,11 +1832,11 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test13) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , -0.36047, -0.38924, -0.41801, -0.44679,-0.47556, -0.50435, -0.53314, -0.56193,-0.59072, -0.61953, -0.64833, -0.67715}); NDArray dLdwExp('c', {2,3,1}, {0.22882, 0.02428,-0.4768 ,-1.27447,-2.36878,-3.75981,}); @@ -1847,16 +1847,16 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test13) { weights.assign(0.5); weights.t(0) = 0.; weights.t(1) = 0.; - weights.t(2) = 0.; - + weights.t(2) = 0.; + nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -1940,61 +1940,61 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test1) { NDArray labels('c', {2,4}, {0,0,1,0, 0,1,0,0}, nd4j::DataType::INT32); NDArray logits('c', {2,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,4}, {0.1176, 0.1224, -0.3726, 0.1326, 0.1176, -0.3776, 0.1274, 0.1326}); NDArray dLdwExp('c', {2}, {1.36729, 1.40729}); logits.linspace(-0.08, 0.04); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::softmax_cross_entropy_loss_grad op; - auto results = op.execute({&logits, &weights, &labels}, {0.}, {0}); - + auto results = op.execute({&logits, &weights, &labels}, {0.}, {0}); + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - 
ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test2) { NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); NDArray logits('c', {4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {4}, {0.125, 0.125, -0.375, 0.125}); NDArray dLdwExp('c', {1}, {1.38629}); logits = 2.; - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {1}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; -} +} ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test3) { @@ -2002,30 +2002,30 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test3) { NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); NDArray logits('c', {4}, nd4j::DataType::DOUBLE); NDArray weights('c', {}, {0}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {4}, {0.125, 0.125, -0.375, 0.125}); NDArray dLdwExp('c', {}, {1.38629}); logits = 2.; - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {1}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; -} +} ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test4) { @@ -2033,30 +2033,30 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test4) { NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); NDArray logits('c', {4}, nd4j::DataType::DOUBLE); NDArray weights('c', {}, {0}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {4}, {0.23521, 0.2448 , -0.7452 , 0.26519}); NDArray dLdwExp('c', {}, {0.}); logits.linspace(-0.08, 0.04); - weights = 0.5; + weights = 0.5; nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {2}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; -} +} ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test5) { @@ -2064,30 +2064,30 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test5) { NDArray labels('c', {4}, {0,0,1,0}, 
nd4j::DataType::INT32); NDArray logits('c', {4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {4}, {0.1176, 0.1224, -0.3726, 0.1326}); NDArray dLdwExp('c', {1}, {1.36729}); logits.linspace(-0.08, 0.04); - weights = 0.5; + weights = 0.5; nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {3}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; -} +} ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { @@ -2095,7 +2095,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { NDArray labels('c', {2,4}, {0,0,1,0, 0,1,0,0}, nd4j::DataType::INT32); NDArray logits('c', {2,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,4}, {0.0801, 0.0849, -0.2601, 0.0951, 0.0801, -0.2651, 0.0899, 0.0951}); NDArray dLdwExp('c', {2}, {-0.014000, 0.014000}); @@ -2105,12 +2105,12 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {2}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2126,27 +2126,27 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test7) { NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1, 1,0,0,0, 0,1,0,0}, nd4j::DataType::INT32); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3}, {0.5, 0., 1.5}); - + NDArray dLdpExp('c', {2,3,4}, {-0.0956 , 0.0306 , 0.03185, 0.03315, 0.,-0., 0., 0., 0.0882 , 0.0918 ,-0.27945, 0.09945, 0.0294 , 0.0306 , 0.03185,-0.09185,-0., 0., 0., 0., 0.0882 ,-0.2832 , 0.09555, 0.09945}); NDArray dLdwExp('c', {1,3}, {0.69365, 0.71365, 0.69365}); - logits.linspace(-0.08, 0.04); + logits.linspace(-0.08, 0.04); nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {3}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); - + auto *dLdl = results->at(2); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } @@ -2157,40 +2157,40 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test8) { NDArray labels('c', {2,3,4,5}, {1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0, 0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0}, nd4j::DataType::INT32); - + NDArray logits('c', {2,3,4,5}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1,4}, nd4j::DataType::DOUBLE); - 
+ NDArray dLdpExp('c', {2,3,4,5}, {-0.03399, 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, - 0.00866, 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, - 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, - 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, - 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, - 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, 0.00832, - 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, 0.00768, - 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, 0.00832, 0.00866, + 0.00866, 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, + 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, + 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, + 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, + 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, 0.00832, + 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, 0.00768, + 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901}); NDArray dLdwExp('c', {1,1,4}, {0.005, 0.00167, -0.00167, -0.005}); - logits.linspace(-0.08, 0.04); - weights.assign(0.5); + logits.linspace(-0.08, 0.04); + weights.assign(0.5); nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {2}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); - // dLdp->printIndexedBuffer(); + // dLdp->printIndexedBuffer(); // ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); // ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } @@ -2212,19 +2212,19 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test1) { NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1, 1,0,0,0, 0,1,0,0}); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.76479, 0.2448, 0.2548, 0.26519, 0.23521,-0.7552, 0.2548, 0.26519, 0.23521, 0.2448,-0.7452, 0.26519, 0.23521, 0.2448, 0.2548,-0.73481,-0.76479, 0.2448, 0.2548, 0.26519, 0.23521,-0.7552, 0.2548, 0.26519}); - logits.linspace(-0.08, 0.04); + logits.linspace(-0.08, 0.04); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); 
ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2236,19 +2236,19 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test2) { NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,1, 0,0,1,0, 0,0,0,1, 1,0,1,0, 0,1,0,0}); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.71836, 0.28164, 0.28164, 0.28164, 0.33051, -0.66949, 0.33051, -0.66949, 0.38785, 0.38785, -0.61215, 0.38785, 0.28164, 0.28164, 0.28164, -0.71836,-0.66949, 0.33051, -0.66949, 0.33051, 0.38785, -0.61215, 0.38785, 0.38785}); - logits.linspace(-0.08, 0.04); + logits.linspace(-0.08, 0.04); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {1}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2260,18 +2260,18 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test3) { NDArray labels('c', {2,3}, {1,0,0, 0,1,1}); NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3}, {-0.52996, 0.47004, 0.47004, 0.52996, -0.47004, -0.47004}); - logits.linspace(-0.08, 0.04); + logits.linspace(-0.08, 0.04); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2283,17 +2283,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test4) { NDArray labels('c', {2,1}, {1,1}); NDArray logits('c', {2,1}, {-0.04, 0.04}); - - NDArray dLdpExp('c', {2,1}, {0., 0.}); + + NDArray dLdpExp('c', {2,1}, {0., 0.}); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {1}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2305,17 +2305,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test5) { NDArray labels('c', {2,1}, {1,0}); NDArray logits('c', {2,1}, {-0.04, 0.04}); - - NDArray dLdpExp('c', {2,1}, {-0.51999, 0.51999}); + + NDArray dLdpExp('c', {2,1}, {-0.51999, 0.51999}); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2327,17 +2327,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test6) { NDArray labels('c', {1,2}, {1,1}); NDArray logits('c', {1,2}, {-0.04, 0.04}); - - NDArray dLdpExp('c', {1,2}, {0, 0}); + + NDArray dLdpExp('c', {1,2}, {0, 0}); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2349,17 +2349,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test7) { NDArray labels('c', {2}, {0,1}); NDArray logits('c', {2}, {-0.04, 0.04}); - + NDArray dLdpExp('c', {2}, {0.48001, -0.48001}); 
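    // The expected value follows from the usual softmax cross-entropy
    // gradient, dL/dlogits = softmax(logits) - labels (stated as a reading
    // aid; the op itself is the reference). Worked out for this test:
    //
    //   double e0 = std::exp(-0.04), e1 = std::exp(0.04);   // the two logits
    //   double p0 = e0 / (e0 + e1);                         // ~0.48001
    //   double p1 = e1 / (e0 + e1);                         // ~0.51999
    //   // labels = {0, 1}  =>  dLdp = {p0 - 0., p1 - 1.} = {0.48001, -0.48001}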
nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2371,17 +2371,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test8) { NDArray labels('c', {1}, {1}); NDArray logits('c', {1}, {0.04}); - + NDArray dLdpExp('c', {1}, {0}); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2420,19 +2420,19 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test1) { NDArray labels('c', {2}, {2,1}, nd4j::DataType::INT64); NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3}, {0.30061, 0.33222, -0.63283, 0.30061, -0.66778, 0.36717}); - - logits.linspace(0.1, 0.1); + + logits.linspace(0.1, 0.1); nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&labels, &logits}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2444,19 +2444,19 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test2) { NDArray labels('c', {2}, {0,1}, nd4j::DataType::INT64); NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); - - NDArray dLdpExp('c', {2,3}, {-0.69939, 0.33222, 0.36717, 0.30061, -0.66778, 0.36717}); - + + NDArray dLdpExp('c', {2,3}, {-0.69939, 0.33222, 0.36717, 0.30061, -0.66778, 0.36717}); + logits.linspace(-0.1, 0.1); nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&labels, &logits}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2468,17 +2468,17 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test3) { NDArray labels('c', {}, {1}, nd4j::DataType::INT64); NDArray logits('c', {2}, {-0.2, 0.3}); - + NDArray dLdpExp('c', {2}, {0.37754, -0.37754}); nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&labels, &logits}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2490,7 +2490,7 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test4) { NDArray labels('c', {2,3}, {0,1,1, 3,3,2}, nd4j::DataType::INT64); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.78616, 0.23633, 0.26118, 0.28865, 0.21384, -0.76367, 0.26118, 0.28865, 0.21384, -0.76367, 0.26118, 0.28865, 0.21384, 0.23633, 0.26118, -0.71135, 0.21384, 0.23633, 0.26118, -0.71135, 0.21384, 0.23633, -0.73882, 0.28865}); logits.linspace(-0.5, 0.1); @@ -2498,11 +2498,11 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test4) { nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&labels, &logits}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, 
 results->status());

-    auto *dLdp = results->at(0);
-
+    auto *dLdp = results->at(0);
+
     ASSERT_TRUE(dLdpExp.isSameShape(dLdp));
     ASSERT_TRUE(dLdpExp.equalsTo(dLdp));

@@ -2514,17 +2514,17 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test5) {

     NDArray labels('c', {1,1}, {0}, nd4j::DataType::INT64);
     NDArray logits('c', {1,1,2}, {-0.3,0.2});
-
-    NDArray dLdpExp('c', {1,1,2}, {-0.62246, 0.62246});
+
+    NDArray dLdpExp('c', {1,1,2}, {-0.62246, 0.62246});

     nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op;

     auto results = op.execute({&labels, &logits}, {}, {});
-
+
     ASSERT_EQ(ND4J_STATUS_OK, results->status());

-    auto *dLdp = results->at(0);
-
+    auto *dLdp = results->at(0);
+
     ASSERT_TRUE(dLdpExp.isSameShape(dLdp));
     ASSERT_TRUE(dLdpExp.equalsTo(dLdp));

diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp
index fd32f8a79..da5f5f75d 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp
@@ -245,7 +245,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, Test_Pooling_Parity_12) {
 TEST_F(DeclarableOpsTests4, Test_BiasAdd_NHWC_1) {
     auto x = NDArrayFactory::create('c', {2, 3, 3, 2});
-    auto bias = NDArrayFactory::create('c', {1, 2}, {1, 2});
+    auto bias = NDArrayFactory::create('c', {2}, {1, 2});
     auto exp = NDArrayFactory::create('c', {2, 3, 3, 2}, {1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f});

     nd4j::ops::biasadd op;
@@ -263,11 +263,11 @@ TEST_F(DeclarableOpsTests4, Test_BiasAdd_NHWC_1) {
 TEST_F(DeclarableOpsTests4, Test_BiasAdd_NCHW_1) {
     auto x = NDArrayFactory::create('c', {2, 2, 3, 3});
-    auto bias = NDArrayFactory::create('c', {1, 2}, {1, 2});
-    auto exp = NDArrayFactory::create('c', {2, 2, 3, 3}, {1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f});
+    auto bias = NDArrayFactory::create('c', {2}, {1, 2});
+    auto exp = NDArrayFactory::create('c', {2, 2, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2});

     nd4j::ops::biasadd op;
-    auto result = op.execute({&x, &bias}, {}, {}, {}, false, nd4j::DataType::DOUBLE);
+    auto result = op.execute({&x, &bias}, {}, {}, {true}, false, nd4j::DataType::DOUBLE);

     ASSERT_EQ(ND4J_STATUS_OK, result->status());

@@ -360,6 +360,42 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_2) {
     delete result;
 }

+TEST_F(DeclarableOpsTests4, Test_FlattenTests_3) {
+    NDArray x('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT32);
+    NDArray y('f', {2,2}, nd4j::DataType::INT32);
+    NDArray exp('c', {8}, {1, 2, 3, 4, 1, 2, 3, 4}, nd4j::DataType::INT32);
+
+    y.assign(x);
+
+    nd4j::ops::flatten op;
+    auto result = op.execute({&x, &y}, {}, {'c'});
+    ASSERT_EQ(ND4J_STATUS_OK, result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
+TEST_F(DeclarableOpsTests4, Test_FlattenTests_4) {
+    NDArray x('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT32);
+    NDArray y('f', {2,2}, nd4j::DataType::INT32);
+    NDArray exp('c', {8}, {1, 3, 2, 4, 1, 3, 2, 4}, nd4j::DataType::INT32);
+
+    y.assign(x);
+
+    nd4j::ops::flatten op;
+    auto result = op.execute({&x, &y}, {}, {'f'});
+    ASSERT_EQ(ND4J_STATUS_OK, result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
 TEST_F(DeclarableOpsTests4, Test_FloorTests_1) {
     auto x = NDArrayFactory::create('c', {3, 3}, {1.5, 2.3, 3.4, 4.3, 5.9, 6.1, 7.2, 8.9, 9.7});
     auto exp = NDArrayFactory::create('c', {3,3});
@@ -608,7 +645,7 @@ TEST_F(DeclarableOpsTests4, Test_BiasAdd_1) {
     auto exp = NDArrayFactory::create('c', {2, 3}, {1, 2, 3, 1, 2, 3});

     nd4j::ops::biasadd op;
-    auto result = op.execute({&x, &row}, {}, {}, {}, false, nd4j::DataType::DOUBLE);
+    auto result = op.execute({&x, &row}, {}, {}, {true}, false, nd4j::DataType::DOUBLE);

     ASSERT_EQ(ND4J_STATUS_OK, result->status());

diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
index a5e808867..e6c692f5b 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
@@ -1610,21 +1610,8 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_1) {
 ////////////////////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests6, MatrixInverse_010) {

-    auto x = NDArrayFactory::create('c', {1, 5, 5}, {
-        1., 0., 0., 0., 0.,
-        2., 1., 0., 0., 0.,
-        30., 2., 1., 0., 0.,
-        4., 3., 2., 1., 0.,
-        5., 4., 3., 2., 1.,
-    });
-
-    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {
-        1.0, 0.0, 0.0, 0.0, 0.,
-        -2.0, 1.0, 0., 0., 0.,
-        -26.0, -2.0, 1, 0, 0.,
-        54.0, 1.0, -2.0, 1, 0.,
-        -27.0, 0.0, 1.0, -2.0, 1.
-    });
+    auto x = NDArrayFactory::create('c', {1, 5, 5}, {1., 0., 0., 0., 0.,2., 1., 0., 0., 0.,30., 2., 1., 0., 0.,4., 3., 2., 1., 0.,5., 4., 3., 2., 1.,});
+    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {1.0, 0.0, 0.0, 0.0, 0.,-2.0, 1.0, 0., 0., 0.,-26.0, -2.0, 1, 0, 0.,54.0, 1.0, -2.0, 1, 0.,-27.0, 0.0, 1.0, -2.0, 1.});

     nd4j::ops::matrix_inverse op;
     auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32);
@@ -1632,8 +1619,6 @@
     ASSERT_EQ(ND4J_STATUS_OK, result->status());

     auto z = result->at(0);
-//    z->printIndexedBuffer("010 Output ");
-//    exp.printIndexedBuffer("010 Expected ");

     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -1644,24 +1629,9 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_01) {

-    auto x = NDArrayFactory::create('c', {1, 5, 5}, {
-        2., 4., 60., 8., 10.,
-        0., 1., 2., 3., 4.,
-        0., 0., 2., 4., 6.,
-        0., 0., 0., 1., 2.,
-        0., 0., 0., 0., 4.
-
-    });
-
-    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {
-        0.5, -2.0, -13.0, 54.0, -6.75,
-        0.0, 1.0, -1.0, 1.0, 0.0,
-        0, 0, 0.5, -2.0, 0.25,
-        0, 0, 0, 1.0, -0.5,
-        0, 0, 0, 0, 0.25
-
-    });
+    auto x = NDArrayFactory::create('c', {1, 5, 5}, {2., 4., 60., 8., 10., 0., 1., 2., 3., 4., 0., 0., 2., 4., 6., 0., 0., 0., 1., 2., 0., 0., 0., 0., 4. });
+    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {0.5, -2.0, -13.0, 54.0, -6.75, 0.0, 1.0, -1.0, 1.0, 0.0, 0, 0, 0.5, -2.0, 0.25, 0, 0, 0, 1.0, -0.5, 0, 0, 0, 0, 0.25 });

     nd4j::ops::matrix_inverse op;
     auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32);
@@ -1680,21 +1650,8 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_02) {

-    auto x = NDArrayFactory::create('c', {1, 5, 5}, {
-        1., 0., 0., 0., 0.,
-        2., 1., 0., 0., 0.,
-        30., 2., 1., 0., 0.,
-        4., 3., 2., 1., 0.,
-        5., 4., 3., 2., 1.
-    });
-
-    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {
-        1.0, 0.0, 0.0, 0.0, 0.,
-        -2.0, 1.0, 0., 0., 0.,
-        -26.0, -2.0, 1, 0, 0.,
-        54.0, 1.0, -2.0, 1, 0.,
-        -27.0, 0.0, 1.0, -2.0, 1.
-    });
+    auto x = NDArrayFactory::create('c', {1, 5, 5}, {1., 0., 0., 0., 0., 2., 1., 0., 0., 0., 30., 2., 1., 0., 0., 4., 3., 2., 1., 0., 5., 4., 3., 2., 1. });
+    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {1.0, 0.0, 0.0, 0.0, 0., -2.0, 1.0, 0., 0., 0., -26.0, -2.0, 1, 0, 0., 54.0, 1.0, -2.0, 1, 0., -27.0, 0.0, 1.0, -2.0, 1. });

     nd4j::ops::matrix_inverse op;
     auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32);

diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
index d0d67000b..21af8e380 100644
--- a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
@@ -169,7 +169,7 @@ TEST_F(JavaInteropTests, TestSconv2d_1) {
     auto input = NDArrayFactory::create('c', {3, 3, 8, 8});
     auto weightsD = NDArrayFactory::create('c', {1, 3, 1, 1});
     auto weightsP = NDArrayFactory::create('c', {2, 3, 1, 1});
-    auto bias = NDArrayFactory::create('c', {1, 2});
+    auto bias = NDArrayFactory::create('c', {2});
     auto output = NDArrayFactory::create('c', {3, 2, 8, 8});
     output.assign(0.0);

diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp
index 32ff23847..9f9937368 100644
--- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp
@@ -1259,7 +1259,7 @@ TEST_F(NDArrayTest2, reduce_1) {
         for (int x = 0; x < 4; x++) {
             for (int y = 0; y < 4; y++) {
                 Nd4jLong indices[] = {0, 0, x, y, i, j};
-                Nd4jLong offset = shape::getOffset(0, arr6.shapeOf(), arr6.stridesOf(), indices, arr6.rankOf());
+                Nd4jLong offset = shape::getOffset(arr6.getShapeInfo(), indices);
                 sum += ((double*)arr6.getBuffer())[offset];
             }
         }

diff --git a/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp b/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp
index 5005808bf..05f823e4a 100644
--- a/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp
@@ -159,8 +159,6 @@ TEST_F(OneOffTests, test_conv2d_nhwc_failed_1) {
     auto z = graph->getVariableSpace()->getVariable(9)->getNDArray();
     ASSERT_TRUE(z != nullptr);

-    // z->printIndexedBuffer("z");
-
     ASSERT_EQ(e, *z);

     delete graph;

diff --git a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
index b6981a5c3..0254d1877 100644
--- a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
@@ -683,7 +683,7 @@ TEST_F(ParityOpsTests, Test_Reshape_TF_1) {
 TEST_F(ParityOpsTests, Test_Bias_Add_1) {
     auto x = NDArrayFactory::create('c', {10, 5});
     x.assign(0.0);
-    auto bias = NDArrayFactory::create('c', {1, 5}, {1, 2, 3, 4, 5});
+    auto bias = NDArrayFactory::create('c', {5}, {1, 2, 3, 4, 5});

     nd4j::ops::biasadd op;
     auto result = op.execute({&x, &bias}, {}, {});

     auto z = result->at(0);
-
     auto tads = z->allTensorsAlongDimension({1});
     for (int e = 0; e < tads->size(); e++) {
         ASSERT_TRUE(bias.equalsTo(tads->at(e)));

diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp
index 98b9cd026..071c33fab 100644
--- a/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp
@@ -67,7 +67,7 @@ TEST_F(ShapeTests, Test_ShapeEquality_1) {
     Nd4jLong shape[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, -1, 102};
     Nd4jLong shape_GOOD[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, 1, 99};
     Nd4jLong shape_BAD[] = {4, 3, 3, 4, 5, 60, 20, 5, 1, 0, -1, 102};
-
+
     ASSERT_TRUE(shape::equalsSoft(shape, shape_GOOD));
     ASSERT_FALSE(shape::equalsSoft(shape, shape_BAD));

@@ -77,7 +77,7 @@ TEST_F(ShapeTests, Test_ShapeEquality_2) {
     Nd4jLong shape[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, -1, 102};
     Nd4jLong shape_GOOD[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, -1, 102};
     Nd4jLong shape_BAD[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, -1, 99};
-
+
     ASSERT_TRUE(shape::equalsStrict(shape, shape_GOOD));
     ASSERT_FALSE(shape::equalsStrict(shape, shape_BAD));

@@ -86,45 +86,24 @@ TEST_F(ShapeTests, Test_Ind2SubC_1) {
     Nd4jLong shape[] = {3, 5};
     Nd4jLong c0[2];
-    shape::index2coords(2, shape, 0, c0);
+    shape::index2coords(0, 2, shape, c0);

     ASSERT_EQ(0, c0[0]);
     ASSERT_EQ(0, c0[1]);

     Nd4jLong c1[2];
-    shape::index2coords(2, shape, 1, c1);
+    shape::index2coords(1, 2, shape, c1);

     ASSERT_EQ(0, c1[0]);
     ASSERT_EQ(1, c1[1]);

     Nd4jLong c6[2];
-    shape::index2coords(2, shape, 5, c6);
+    shape::index2coords(5, 2, shape, c6);

     ASSERT_EQ(1, c6[0]);
-    ASSERT_EQ(0, c6[1]);
+    ASSERT_EQ(0, c6[1]);
 }

-TEST_F(ShapeTests, Test_Ind2Sub_1) {
-    Nd4jLong shape[] = {3, 5};
-
-    Nd4jLong c0[2];
-    shape::index2coords(2, shape, 0, c0, 'f');
-
-    ASSERT_EQ(0, c0[0]);
-    ASSERT_EQ(0, c0[1]);
-
-    Nd4jLong c1[2];
-    shape::index2coords(2, shape, 1, c1, 'f');
-
-    ASSERT_EQ(1, c1[0]);
-    ASSERT_EQ(0, c1[1]);
-
-    Nd4jLong c6[2];
-    shape::index2coords(2, shape, 5, c6, 'f');
-
-    ASSERT_EQ(2, c6[0]);
-    ASSERT_EQ(1, c6[1]);
-}

 TEST_F(ShapeTests, Test_ShapeDetector_1) {
     Nd4jLong shape[] = {2, 5, 3, 3, 1, 0, 1, 99};
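// The ShapeTests hunks above pin down the two shape-helper signatures this
// patch migrates to: shape::index2coords now takes the linear index first,
// index2coords(index, rank, shape, coords) instead of the old
// index2coords(rank, shape, index, coords), and shape::getOffset now takes
// the packed shape-info buffer plus coordinates. A round-trip sketch under
// the new signatures (the getOffset call mirrors the NDArrayTests2 hunk
// above; arr stands for any NDArray with this shape):
//
//   Nd4jLong shape[] = {3, 5};                  // rank-2, as in Test_Ind2SubC_1
//   Nd4jLong coords[2];
//   shape::index2coords(5, 2, shape, coords);   // linear index 5 -> {1, 0}
//   // Nd4jLong offset = shape::getOffset(arr.getShapeInfo(), coords);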