From 58550b7c98390d008bae637db10500e891064265 Mon Sep 17 00:00:00 2001
From: Yurii Shyrma
Date: Wed, 11 Mar 2020 15:21:59 +0200
Subject: [PATCH] [WIP] Shyrma coords (#305)

* - provide faster index2coords function for cpu

Signed-off-by: Yurii

* - new faster index2coords function is introduced into cpu code

Signed-off-by: Yurii

* - replace long long coordinates with int coordinates

Signed-off-by: Yurii

* - add missed overload of coords2index function

Signed-off-by: Yurii

* - restart jenkins

Signed-off-by: Yurii

* - rollback changes in convolutions.cu and addBias.cu

Signed-off-by: Yurii
---
 libnd4j/include/array/cpu/NDArray.cpp          |  27 +++--
 libnd4j/include/array/cuda/NDArray.cu          |  26 ++--
 .../helpers/benchmark/ParametersBatch.h        |   2 +-
 libnd4j/include/helpers/cpu/MmulHelper.cpp     |   6 +-
 .../include/helpers/cuda_off/MmulHelper.cu     |  16 +--
 libnd4j/include/helpers/impl/ShapeUtils.cpp    |   1 -
 libnd4j/include/helpers/shape.h                |  98 ++++++++++------
 libnd4j/include/loops/cpu/broadcasting.hpp     |   4 +-
 .../include/loops/cpu/broadcasting_bool.hpp    |   4 +-
 .../include/loops/cpu/broadcasting_int.hpp     |   4 +-
 libnd4j/include/loops/cuda/broadcasting.chpp   |   2 +-
 .../include/loops/cuda/broadcasting_bool.cu    |   2 +-
 .../include/loops/cuda/broadcasting_int.cu     |   2 +-
 .../loops/cuda/specials/concatKernel.cu        |  10 +-
 .../generic/compat/compat_string_split.cpp     |   5 +-
 .../ops/declarable/helpers/cpu/batchnorm.cpp   |  36 +++---
 .../declarable/helpers/cpu/imagesHelpers.cpp   |  34 +++---
 .../declarable/helpers/cpu/matrixSetDiag.cpp   |   7 +-
 .../ops/declarable/helpers/cpu/s_t_b.cpp       |  34 ++++--
 .../ops/declarable/helpers/cpu/scatter.cpp     |   4 +-
 .../ops/declarable/helpers/cpu/split.cpp       |   9 +-
 .../ops/declarable/helpers/cpu/transforms.cpp  | 111 ++++++++++--------
 .../declarable/helpers/cuda/activations.cu     |   4 +-
 .../ops/declarable/helpers/cuda/addBias.cu     |   4 +-
 .../ops/declarable/helpers/cuda/batchnorm.cu   |   2 +-
 .../ops/declarable/helpers/cuda/concat.cu      |   2 +-
 .../declarable/helpers/cuda/convolutions.cu    |  14 +--
 .../ops/declarable/helpers/cuda/cross.cu       |   8 +-
 .../ops/declarable/helpers/cuda/dilation2d.cu  |   8 +-
 .../ops/declarable/helpers/cuda/flatten.cu     |   2 +-
 .../ops/declarable/helpers/cuda/im2col.cu      |   8 +-
 .../declarable/helpers/cuda/imagesHelpers.cu   |  10 +-
 .../declarable/helpers/cuda/matrixSetDiag.cu   |  10 +-
 .../ops/declarable/helpers/cuda/s_t_b.cu       |  32 ++---
 .../ops/declarable/helpers/cuda/scatter.cu     |   8 +-
 .../ops/declarable/helpers/cuda/split.cu       |   2 +-
 .../ops/declarable/helpers/cuda/sru.cu         |  20 ++--
 .../ops/declarable/helpers/cuda/transforms.cu  |  26 ++--
 .../helpers/impl/sparse_to_dense.cpp           |   2 +-
 .../ops/declarable/helpers/impl/where.cpp      |   7 +-
 libnd4j/include/ops/impl/specials_single.hpp   |  16 ++-
 .../layers_tests/PlaygroundTests.cpp           |   1 +
 libnd4j/tests_cpu/layers_tests/TadTests.cpp    |   2 +-
 43 files changed, 359 insertions(+), 273 deletions(-)

diff --git a/libnd4j/include/array/cpu/NDArray.cpp b/libnd4j/include/array/cpu/NDArray.cpp
index 7d91d1373..1d97ba61c 100644
--- a/libnd4j/include/array/cpu/NDArray.cpp
+++ b/libnd4j/include/array/cpu/NDArray.cpp
@@ -95,22 +95,29 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
     const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target.getShapeInfo());

     auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
+        int coords[MAX_RANK], temp;
+
         for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, target.getShapeInfo(), coords);
+
+            shape::index2coordsCPU(start, i, target.getShapeInfo(), coords);
+
            const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
            // if( (row + upper < col) || (row + lower > col) )
            if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
                z[zOffset] = value;
            else if (this != &target) {      // when this and target are different arrays
-               if (xRank != zRank)
+               if (xRank != zRank) {
+                   temp = coords[0];
                    coords[0] = coords[1];
+               }

                const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords);
                z[zOffset] = x[xOffset];
+
+               if (xRank != zRank)      // restore first coordinate
+                   coords[0] = temp;
            }
        }
    };
@@ -376,12 +383,16 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int>& repeats)
            if (repSize > 1) {
                 for (uint j = 0; j < repSize; ++j) {
                     coords[axis] -= repeats[j];
@@ -394,6 +405,8 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int>& repeats)
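The fillAsTriangular hunk above is the template for most CPU changes in this patch: one int coordinate buffer per thread, updated incrementally by index2coordsCPU (defined later in the shape.h diff), with any in-loop mutation saved to temp and restored. A reduced, self-contained sketch of that save/restore discipline, with made-up values rather than library code:

    #include <cstdio>

    int main() {
        // Hypothetical reduced example: the coords buffer persists across
        // iterations, so a mutation made for one lookup must be undone, exactly
        // as fillAsTriangular does with coords[0] when the ranks differ.
        int coords[2] = {0, 0};
        for (int i = 0; i < 4; ++i) {
            coords[1] = i;                 // stand-in for the incremental update
            const int temp = coords[0];    // save before clobbering
            coords[0] = coords[1];         // temporary re-mapping for a lookup
            printf("lookup at (%d,%d)\n", coords[0], coords[1]);
            coords[0] = temp;              // restore, or the next iteration is wrong
        }
        return 0;
    }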
diff --git a/libnd4j/include/array/cuda/NDArray.cu b/libnd4j/include/array/cuda/NDArray.cu
     const auto x = reinterpret_cast<const T*>(vx);
     auto z = reinterpret_cast<T*>(vz);

-    __shared__ int zRank, xRank, areSameOffsets;                // xRank == zRank always, except when xRank = 1, in this case zRank = 2
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;         // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+    __shared__ int zRank, xRank, areSameOffsets, *sharedMem;    // xRank == zRank always, except when xRank = 1, in this case zRank = 2
+    __shared__ Nd4jLong zLen, totalThreads;                     // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
         xRank = shape::rank(xShapeInfo);
         zRank = shape::rank(zShapeInfo);
@@ -137,7 +137,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(decltype(*target.getShapeInfo())) * target.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * target.rankOf() + 128;

     PointersManager manager(getContext(), "NDArray::fillAsTriangular");
@@ -155,12 +155,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
     auto x = reinterpret_cast<T*>(vx);

-    __shared__ int rank;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong len, totalThreads;              // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         rank = shape::rank(xShapeInfo);
         len = shape::length(xShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
@@ -201,7 +201,7 @@ void NDArray::setIdentity() {
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(decltype(getShapeInfo())) * rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * rankOf() + 128;

     PointersManager manager(getContext(), "NDArray::setIdentity");
@@ -398,13 +398,13 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo,
     const X* x = reinterpret_cast<const X*>(vx);
           Z* z = reinterpret_cast<Z*>(vz);

-    __shared__ int rank;
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen = zLen
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong zLen, totalThreads;              // xLen = zLen

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         rank = shape::rank(zShapeInfo);     // xRank = zRank
         zLen = shape::length(zShapeInfo);   // xLen <= zLen
@@ -460,7 +460,7 @@ NDArray NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = output.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = output.rankOf() * sizeof(int) * threadsPerBlock + 128;

     PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");
@@ -484,7 +484,7 @@ void NDArray::repeat(const int axis, const std::vector<int>& repeats, NDArray& t
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = target.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = target.rankOf() * sizeof(int) * threadsPerBlock + 128;

     PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");

diff --git a/libnd4j/include/helpers/benchmark/ParametersBatch.h b/libnd4j/include/helpers/benchmark/ParametersBatch.h
index 5a045d5cd..c477f3ea8 100644
--- a/libnd4j/include/helpers/benchmark/ParametersBatch.h
+++ b/libnd4j/include/helpers/benchmark/ParametersBatch.h
@@ -46,7 +46,7 @@ namespace sd {
             int totalIterations = 1;

             // hehe
-            Nd4jLong xCoords[MAX_RANK];
+            int xCoords[MAX_RANK];
             Nd4jLong xShape[MAX_RANK];
             int xRank = _spaces.size();

diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp
index edbc45fd4..62d8153ef 100644
--- a/libnd4j/include/helpers/cpu/MmulHelper.cpp
+++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp
@@ -63,7 +63,7 @@ static void usualGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
         for (auto i = start; i < stop; ++i) {

             // evaluate C coordinates
-            shape::index2coords(i, cShapeInfo, cCoords.data());
+            shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());

             // evaluate A coordinates
             aCoords[aMaxis] = cCoords[cMaxis];
@@ -433,12 +433,12 @@ static void batchedGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
     auto func = PRAGMA_THREADS_FOR {

-        std::vector<Nd4jLong> aCoords(aRank), bCoords(bRank), cCoords(cRank);
+        std::vector<int> aCoords(aRank), bCoords(bRank), cCoords(cRank);

         for (auto i = start; i < stop; ++i) {

             // evaluate C coordinates
-            shape::index2coords(i, cShapeInfo, cCoords.data());
+            shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());

             // calculate index of current batch
             Nd4jLong batchInd;

diff --git a/libnd4j/include/helpers/cuda_off/MmulHelper.cu b/libnd4j/include/helpers/cuda_off/MmulHelper.cu
index 13fc3b1b5..5e9304e88 100644
--- a/libnd4j/include/helpers/cuda_off/MmulHelper.cu
+++ b/libnd4j/include/helpers/cuda_off/MmulHelper.cu
@@ -40,15 +40,15 @@ static __global__ void usualCudaGemm(const void* vA, const Nd4jLong* aShapeInfo,
     const T2* B = reinterpret_cast<const T2*>(vB);
           T3* C = reinterpret_cast< T3*>(vC);

-    __shared__ int K;
+    __shared__ int K, *coords;
     __shared__ bool betaPresent;
-    __shared__ Nd4jLong cLen, totalThreads, *coords;
+    __shared__ Nd4jLong cLen, totalThreads;
     __shared__ T3 alphaZ, betaZ;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        coords = reinterpret_cast<Nd4jLong*>(shmem);
+        coords = reinterpret_cast<int*>(shmem);
         cLen = shape::length(cShapeInfo);
         K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
@@ -263,7 +263,7 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, dou
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 6 + 128;  // 6 = aRank + bRank + cRank
+    const int sharedMem = threadsPerBlock * sizeof(int) * 6 + 128;       // 6 = aRank + bRank + cRank

     NDArray::prepareSpecialUse({C}, {A, B});

     // BUILD_TRIPLE_SELECTOR(aType, bType, cType, usualGemm, (blocksPerGrid, threadsPerBlock, sharedMem, stream, A->getSpecialBuffer(), A->getSpecialShapeInfo(), B->getSpecialBuffer(), B->getSpecialShapeInfo(), C->getSpecialBuffer(), C->getSpecialShapeInfo(), 0, 1, 0, 1, 0, 1, alpha, beta), NUMERIC_TYPES, NUMERIC_TYPES, FLOAT_TYPES);
@@ -529,14 +529,14 @@ static __global__ void batchedCudaGemm(const void* vA, const Nd4jLong* aShapeInf
           T3* C = reinterpret_cast< T3*>(vC);

     __shared__ bool betaPresent;
-    __shared__ int aRank, bRank, cRank, K;
-    __shared__ Nd4jLong cLen, totalThreads, *coords;
+    __shared__ int aRank, bRank, cRank, K, *coords;
+    __shared__ Nd4jLong cLen, totalThreads;
     __shared__ T3 alphaZ, betaZ;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        coords = reinterpret_cast<Nd4jLong*>(shmem);
+        coords = reinterpret_cast<int*>(shmem);
         cLen = shape::length(cShapeInfo);
         K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
@@ -649,7 +649,7 @@ NDArray* MmulHelper::mmulNxN(const NDArray* A, const NDArray* B, NDArray* C, con
     const int threadsPerBlock = MAX_NUM_THREADS / 8;
     const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * (aRank + bRank + cRank) + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * (aRank + bRank + cRank) + 128;

     PointersManager manager(A->getContext(), "MmulHelper::mmulNxN");
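Every CUDA launcher touched in this patch budgets dynamic shared memory the same way: one scratch coordinate per thread per dimension, plus a 128-byte pad. Narrowing the scratch type from Nd4jLong to int therefore roughly halves the request per block. A sketch of the arithmetic; threadsPerBlock and rank are assumed values, not taken from the library:

    #include <cstdint>
    #include <cstdio>

    int main() {
        using Nd4jLong = int64_t;                    // the library's index type
        const size_t threadsPerBlock = 128;          // assumed block size (MAX_NUM_THREADS / 2 style)
        const size_t rank = 6;                       // e.g. aRank + bRank + cRank in mmulMxM
        const size_t before = threadsPerBlock * sizeof(Nd4jLong) * rank + 128;
        const size_t after  = threadsPerBlock * sizeof(int) * rank + 128;
        printf("dynamic shared memory per block: %zu -> %zu bytes\n", before, after);
        return 0;
    }

Less shared memory per block generally allows more resident blocks per SM, which is the likely motivation alongside the smaller coordinate arithmetic.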
diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp
index fdccd51bc..aa8e917cc 100644
--- a/libnd4j/include/helpers/impl/ShapeUtils.cpp
+++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp
@@ -306,7 +306,6 @@ std::vector<Nd4jLong> ShapeUtils::evalRepeatShape(int axis, const std::vector<int>& repeats, const NDArray& arr)

diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h
     shape::index2coords(const_cast(maxIdx), maxShapeInfo, maxIdxs);

-    Nd4jLong minIdxs[MAX_RANK];
+    int minIdxs[MAX_RANK];
     maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);

     return shape::coords2index(minShapeInfo, minIdxs);
@@ -4374,17 +4376,17 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
 //////////////////////////////////////////////////////////////////////
 INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {

-    Nd4jLong maxIdxs[MAX_RANK];
+    int maxIdxs[MAX_RANK];
     shape::index2coords(const_cast(maxIdx), maxShapeInfo, maxIdxs);

-    Nd4jLong minIdxs[MAX_RANK];
+    int minIdxs[MAX_RANK];
     maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);

     return getOffset(minShapeInfo, minIdxs);
 }

 //////////////////////////////////////////////////////////////////////
-    INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude) {
+    INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, int* memBuff, const int* dimsToExclude) {

         const auto rankMin = shape::rank(minShapeInfo);
         const auto rankMax = shape::rank(maxShapeInfo);
@@ -4394,8 +4396,8 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
         const auto diff = rankMax - rankMin;     // the size of dimsToExclude is equal to diff

-        Nd4jLong* indices = memBuff;
-        Nd4jLong* increment = memBuff + rankMax;
+        int* indices = memBuff;
+        int* increment = memBuff + rankMax;

         int N, minI, maxI;
@@ -4457,7 +4459,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
     }

 //////////////////////////////////////////////////////////////////////
-    INLINEDEF _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {
+    INLINEDEF _CUDA_HD int outerArrayIndexes(int* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {

         const auto rankMin = shape::rank(minShapeInfo);
         const auto rankMax = shape::rank(maxShapeInfo);
@@ -4469,9 +4471,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
         const auto diff = rankMax - rankMin;     // the size of dimsToExclude is equal to diff

-        Nd4jLong buffer[MAX_RANK];
-        Nd4jLong* indices = buffer;
-        Nd4jLong* increment = buffer + MAX_RANK/2;
+        int indices[MAX_RANK], increment[MAX_RANK];

         int N, minI, maxI;
@@ -4886,7 +4886,7 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jL
 }

 //////////////////////////////////////////////////////////////////////
-INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims) {
+INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims) {

     for(uint i = dimsSize - 1; i > 0; --i) {
         coords[tadDims[i]] = index % shapeInfo[1 + tadDims[i]];
@@ -4895,6 +4895,34 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo,
     coords[tadDims[0]] = index;      // last iteration
 }

+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords) {
+
+    if(startIndex == index) {
+        shape::index2coords(index, shapeInfo, coords);
+    }
+    else {
+        int axis = shapeInfo[0] - 1;
+        while(coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
+            coords[axis--] = 0;
+        ++coords[axis];
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords) {
+
+    if(startIndex == index) {
+        shape::index2coords(index, shapeInfo, coords);
+    }
+    else {
+        int axis = shapeInfo[0] - 1;
+        while(coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
+            coords[axis--] = 0;
+        ++coords[axis];
+    }
+}
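The two index2coordsCPU overloads above are the heart of the PR: on the first index of a thread's range they fall back to the full div/mod index2coords, and afterwards they just tick the trailing coordinates over like an odometer, which is O(1) amortized per element instead of O(rank). A standalone model of that increment, assuming c-order and a plain shape array in place of shapeInfo:

    #include <cstdio>

    // Illustrative only, not the library function: advance c-order coordinates
    // by one index, carrying over any axis that has reached its end.
    void nextCoords(const int* shape, int rank, int* coords) {
        int axis = rank - 1;
        while (coords[axis] == shape[axis] - 1)   // carry over exhausted axes
            coords[axis--] = 0;
        ++coords[axis];
    }

    int main() {
        int shape[3] = {2, 3, 4};
        int coords[3] = {0, 0, 0};                // coordinates of index 0
        for (int i = 1; i < 2 * 3 * 4; ++i) {
            nextCoords(shape, 3, coords);
            printf("%d -> (%d,%d,%d)\n", i, coords[0], coords[1], coords[2]);
        }
        return 0;
    }

This is also why the callers below pass `start` as the first argument: the odometer is only valid when indices are visited consecutively from the seeded one.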
+
 //////////////////////////////////////////////////////////////////////
 // INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) {
@@ -5131,23 +5159,23 @@ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo,

 //////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
+// INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {

-    Nd4jLong result = 9223372036854775807LL;
+//     Nd4jLong result = 9223372036854775807LL;

-    for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
+//     for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {

-        const auto currentStride = shape::stride(inShapeInfo)[i];
+//         const auto currentStride = shape::stride(inShapeInfo)[i];

-        if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
-            continue;
+//         if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
+//             continue;

-        if(result > currentStride)
-            result = currentStride;
-    }
+//         if(result > currentStride)
+//             result = currentStride;
+//     }

-    return result == 9223372036854775807LL ? 1 : result;
-}
+//     return result == 9223372036854775807LL ? 1 : result;
+// }

diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp
index d69396e70..2b24dc17a 100644
--- a/libnd4j/include/loops/cpu/broadcasting.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting.hpp
@@ -739,11 +739,11 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
         auto func = PRAGMA_THREADS_FOR {

-            Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+            int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

             for (auto i = start; i < stop; ++i) {

-                shape::index2coords(i, zShapeInfo, zCoords);
+                shape::index2coordsCPU(start, i, zShapeInfo, zCoords);

                 for (uint j = 0; j < rank; ++j) {
                     xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.hpp b/libnd4j/include/loops/cpu/broadcasting_bool.hpp
index 9f9e95f50..ef8a35c48 100644
--- a/libnd4j/include/loops/cpu/broadcasting_bool.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting_bool.hpp
@@ -449,11 +449,11 @@ void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
         auto func = PRAGMA_THREADS_FOR {

-            Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+            int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

             for (auto i = start; i < stop; ++i) {

-                shape::index2coords(i, zShapeInfo, zCoords);
+                shape::index2coordsCPU(start, i, zShapeInfo, zCoords);

                 for (uint j = 0; j < rank; ++j) {
                     xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

diff --git a/libnd4j/include/loops/cpu/broadcasting_int.hpp b/libnd4j/include/loops/cpu/broadcasting_int.hpp
index 9c5186f8c..95f54881d 100644
--- a/libnd4j/include/loops/cpu/broadcasting_int.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting_int.hpp
@@ -609,11 +609,11 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
         auto func = PRAGMA_THREADS_FOR {

-            Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+            int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

             for (auto i = start; i < stop; ++i) {

-                shape::index2coords(i, zShapeInfo, zCoords);
+                shape::index2coordsCPU(start, i, zShapeInfo, zCoords);

                 for (uint j = 0; j < rank; ++j) {
                     xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

diff --git a/libnd4j/include/loops/cuda/broadcasting.chpp b/libnd4j/include/loops/cuda/broadcasting.chpp
index f54386975..49270ddcc 100644
--- a/libnd4j/include/loops/cuda/broadcasting.chpp
+++ b/libnd4j/include/loops/cuda/broadcasting.chpp
@@ -275,7 +275,7 @@ __device__ void Broadcast<X, Y, Z>::transformCuda(
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

     for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu
index 513db1b7c..aae6bb141 100644
--- a/libnd4j/include/loops/cuda/broadcasting_bool.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu
@@ -291,7 +291,7 @@ __device__ void BroadcastBool<X, Z>::transformCuda(const void *vx, const Nd4jLong
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

     for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu
index 651aaecc5..f9ad3218c 100644
--- a/libnd4j/include/loops/cuda/broadcasting_int.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_int.cu
@@ -271,7 +271,7 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

     for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

diff --git a/libnd4j/include/loops/cuda/specials/concatKernel.cu b/libnd4j/include/loops/cuda/specials/concatKernel.cu
index 59c6c5380..a4a849e49 100644
--- a/libnd4j/include/loops/cuda/specials/concatKernel.cu
+++ b/libnd4j/include/loops/cuda/specials/concatKernel.cu
@@ -137,7 +137,7 @@ namespace sd {
                     T *dataTAD = currentData + inputOffset;
                     T *resultTAD = result + resultOffset;

-                    Nd4jLong sub[MAX_RANK];
+                    int sub[MAX_RANK];

                     shape::index2coords(arrOffset, zTadShape, sub);

@@ -166,7 +166,7 @@ namespace sd {
                     auto dataTAD = currentData + inputOffset;
                     auto resultTAD = result + resultOffset;

-                    Nd4jLong sub[MAX_RANK];
+                    int sub[MAX_RANK];

                     shape::index2coords(arrOffset, zTadShape, sub);
                     Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
@@ -199,7 +199,7 @@ namespace sd {
                             resultTAD[baseIdx + k * tadEWS] = dataTAD[k];
                         }
                     } else {
-                        Nd4jLong yIdx[MAX_RANK];
+                        int yIdx[MAX_RANK];
                         auto yRank = shape::rank(currentTad);

                         for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
@@ -214,8 +214,8 @@ namespace sd {
                     //if (threadIdx.x == 0 && blockIdx.x == 0)
                     //    printf("Branch C; yLength: %i;\n", yLength);

-                    Nd4jLong zIdx[MAX_RANK];
-                    Nd4jLong yIdx[MAX_RANK];
+                    int zIdx[MAX_RANK];
+                    int yIdx[MAX_RANK];

                     auto yRank = shape::rank(currentTad);
                     auto tadRank = shape::rank(zTadShape);

diff --git a/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp b/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
index e835dc711..a59e3f02c 100644
--- a/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
+++ b/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
@@ -39,8 +39,7 @@ namespace sd {
             delim->syncToHost();

             // output rank N+1 wrt input rank
-            std::vector<Nd4jLong> ocoords(input->rankOf() + 1);
-            std::vector<Nd4jLong> icoords(input->rankOf());
+            std::vector<int> icoords(input->rankOf());

             // getting buffer lengths
             // FIXME: it'll be bigger, since it'll include delimiters,
@@ -54,7 +53,7 @@ namespace sd {
                 auto s = input->e<std::string>(e);

                 // getting base index
-                shape::index2coords(e, input->shapeInfo(), icoords.data());
+                shape::index2coordsCPU(0, e, input->shapeInfo(), icoords.data());

                 // getting number of substrings
                 auto cnt = StringUtils::countSubarrays(s.c_str(), s.length(), d.c_str(), d.length()) + 1;

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
index a0e6cf061..2293fe843 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
@@ -64,7 +64,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
     Nd4jLong* xOffsets = new Nd4jLong[steps];
     Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps];
-    Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()];
+    int* auxBuff = new int[2 * input->rankOf()];

     for (Nd4jLong j = 0; j < lenSmall; ++j) {

@@ -139,40 +139,42 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
     auto func = PRAGMA_THREADS_FOR {

-        Nd4jLong coords[MAX_RANK];
+        int xzCoords[MAX_RANK], minCoords[MAX_RANK];
+
+        for (uint i = 0, j = 0; i < xRank; ++i)
+            if(j < numAxes && i != axes[j])
+                minCoords[i] = 0;
+            else
+                ++j;

         for (auto i = start; i < stop; i++) {

-            shape::index2coords(i, input->getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, input->getShapeInfo(), xzCoords);

-            const auto xOffset = shape::getOffset(input->getShapeInfo(), coords);
-            const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), coords);
+            const auto xOffset = shape::getOffset(input->getShapeInfo(), xzCoords);
+            const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), xzCoords);

             if(minRank == xRank) {
-                for (uint i = 0, j = 0; i < xRank; ++i) {
-                    if(j < numAxes && i != axes[j])
-                        coords[i] = 0;
-                    else
-                        ++j;
-                }
+                for (uint j = 0; j < numAxes; ++j)
+                    minCoords[axes[j]] = xzCoords[axes[j]];
             }
             else // minRank = numAxes = 1 in this case
-                coords[0] = coords[axes[0]];
+                minCoords[0] = xzCoords[axes[0]];

-            const auto meanOffset     = shape::getOffset(mean->getShapeInfo(), coords);
-            const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), coords);
+            const auto meanOffset     = shape::getOffset(mean->getShapeInfo(), minCoords);
+            const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), minCoords);

             T sigmaInvGam = 1. / sd::math::nd4j_sqrt<T, T>(v[varianceOffset] + epsilon);

             if(g != nullptr) {
-                const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), coords);
+                const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), minCoords);
                 sigmaInvGam *= g[gammaOffset];
             }

             z[zOffset] = (x[xOffset] - m[meanOffset]) * sigmaInvGam;

             if(b != nullptr) {
-                const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), coords);
+                const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), minCoords);
                z[zOffset] += b[betaOffset];
             }
         }
     };
@@ -184,7 +186,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray

//////////////////////////////////////////////////////////////////////////
 void batchnorm(const NDArray* input, const NDArray* mean, const NDArray* variance, const NDArray* gamma, const NDArray* beta, NDArray* output, const std::vector<int>& axes, const double epsilon) {

-    // batchnorm2_ is slower
+    // batchnorm2_ is still slower ?
     BUILD_SINGLE_SELECTOR(input->dataType(), batchnorm_, (input, mean, variance, gamma, beta, output, axes, epsilon), FLOAT_TYPES);
 }
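batchnorm2_ now keeps two coordinate buffers: xzCoords for the input/output element and minCoords for the broadcast parameter arrays (mean, variance, gamma, beta), whose non-axis entries are zeroed once before the loop instead of on every element. A runnable sketch of that mapping; toParamCoords and the NCHW values are illustrative, not library code:

    #include <cstdio>
    #include <vector>

    // Hypothetical helper: map full-array coordinates to the coordinates of a
    // parameter array that varies only along `axes`. Non-axis entries stay
    // pinned at 0, so only axis coordinates are copied per element.
    void toParamCoords(const std::vector<int>& xzCoords, const std::vector<int>& axes,
                       std::vector<int>& minCoords) {
        for (int a : axes)
            minCoords[a] = xzCoords[a];   // only axis coordinates ever change
    }

    int main() {
        std::vector<int> xzCoords = {2, 5, 7, 3};   // assumed NCHW element coordinates
        std::vector<int> axes = {1};                // normalization over the channel axis
        std::vector<int> minCoords(4, 0);           // zeroed once, as in the patch
        toParamCoords(xzCoords, axes, minCoords);
        printf("param coords: (%d,%d,%d,%d)\n", minCoords[0], minCoords[1], minCoords[2], minCoords[3]);
        return 0;
    }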

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
index 46729fbb8..682677ef3 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
@@ -51,19 +51,19 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
     auto func = PRAGMA_THREADS_FOR{

-        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, output.getShapeInfo(), coords);
-            const auto zOffset  = shape::getOffset(output.getShapeInfo(), coords);
-            const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
-            const auto xOffset1 = xOffset0 + input.strideAt(dimC);
-            const auto xOffset2 = xOffset1 + input.strideAt(dimC);
-            z[zOffset] = 0.2989f*x[xOffset0] + 0.5870f*x[xOffset1] + 0.1140f*x[xOffset2];
-        }
-    };
+        int coords[MAX_RANK];
+        for (auto i = start; i < stop; i++) {
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
+            const auto zOffset  = shape::getOffset(output.getShapeInfo(), coords);
+            const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
+            const auto xOffset1 = xOffset0 + input.strideAt(dimC);
+            const auto xOffset2 = xOffset1 + input.strideAt(dimC);
+            z[zOffset] = 0.2989f*x[xOffset0] + 0.5870f*x[xOffset1] + 0.1140f*x[xOffset2];
+        }
+    };

-    samediff::Threads::parallel_for(func, 0, output.lengthOf(), 1);
-    return;
+    samediff::Threads::parallel_for(func, 0, output.lengthOf(), 1);
+    return;
 }

 void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) {
@@ -78,9 +78,9 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
     const int rank = input.rankOf();
     bool bSimple = (dimC == rank - 1 && 'c' == input.ordering() && 1 == input.ews() && 'c' == output.ordering() && 1 == output.ews());
-    
+
     if (bSimple) {
-        
+
         auto func = PRAGMA_THREADS_FOR{
             for (auto i = start; i < stop; i += increment) {
                 op(x[i], x[i + 1], x[i + 2], z[i], z[i + 1], z[i + 2]);
@@ -177,12 +177,12 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
     const T* x = input->bufferAsT<T>();
     T* z = output->bufferAsT<T>();

-    // TODO: Use tensordot or other optimizied helpers to see if we can get better performance. 
+    // TODO: Use tensordot or other optimizied helpers to see if we can get better performance.
     if (dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') {

         auto func = PRAGMA_THREADS_FOR{
-            for (auto i = start; i < stop; i += increment) {  
+            for (auto i = start; i < stop; i += increment) {
                 //simple M*v //tr.T*v.T // v * tr //rule: (AB)' =B'A' // v.shape (1,3) row vector
                 T x0, x1, x2;
@@ -192,7 +192,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
                 z[i]   = x0 * tr[0][0] + x1 * tr[1][0] + x2 * tr[2][0];
                 z[i+1] = x0 * tr[0][1] + x1 * tr[1][1] + x2 * tr[2][1];
                 z[i+2] = x0 * tr[0][2] + x1 * tr[1][2] + x2 * tr[2][2];
-                
+
             }
         };

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
index 3372950f2..60df150a9 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
@@ -49,9 +49,12 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp
     const auto xLen = input.lengthOf();

     auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
+        int coords[MAX_RANK];
+
         for (Nd4jLong i = 0; i < xLen; ++i) {
-            shape::index2coords(i, xShapeInfo, coords);
+
+            shape::index2coordsCPU(start, i, xShapeInfo, coords);

             const auto xOffset = shape::getOffset(xShapeInfo, coords);
             const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords);

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
index bbbb9199e..6a854bba8 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
@@ -113,18 +113,23 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
     // loop through input array
     auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
+        int zCoords[MAX_RANK], xCoords[MAX_RANK];
+
         for (auto i = start; i < stop; i++) {

-            shape::index2coords(i, output.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);

-            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+            memcpy(xCoords, zCoords, rank * sizeof(int));

             // evaluate spatial coordinates for x
             for (uint j = 1; j <= numOfSpatialDims; ++j)
-                coords[j] += crop.e<uint>(j - 1, 0);       // add crop left
+                xCoords[j] += crop.e<uint>(j - 1, 0);      // add crop left

-            z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
+            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
+
+            z[zOffset] = x[xOffset];
         }
     };

@@ -299,11 +304,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
     // loop through output array
     auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, output.getShapeInfo(), coords);
-            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+
+        int zCoords[MAX_RANK], xCoords[MAX_RANK];
+
+        for (auto i = start; i < stop; i++) {
+
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
+
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
+
+            memcpy(xCoords, zCoords, rank * sizeof(int));

             bool within = true;

@@ -312,16 +322,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
                 const auto padLeft  = padding.e<uint>(j - 1, 0);
                 const auto padRight = padding.e<uint>(j - 1, 1);

-                within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight);
+                within &= zCoords[j] >= padLeft && zCoords[j] < output.sizeAt(j) - padRight;

                 if (!within)
                     break;

-                coords[j] -= padLeft;                    // get coordinates for x
+                xCoords[j] = zCoords[j] - padLeft;       // get coordinates for x
             }

             if (within)
-                z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+                z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
             else
                 z[zOffset] = 0.f;
         }

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
index 2d9250f9b..dd83a8618 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
@@ -43,11 +43,11 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int
     auto func = PRAGMA_THREADS_FOR {

-        Nd4jLong xCoords[MAX_RANK];
+        int xCoords[MAX_RANK];

         for (auto i = start; i < stop; i++) {

-            shape::index2coords(i, xShapeInfo, xCoords);
+            shape::index2coordsCPU(start, i, xShapeInfo, xCoords);

             const Nd4jLong currentInd = x[shape::getOffset(xShapeInfo, xCoords)];

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
index d138d9892..2e30cdf0a 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
@@ -96,14 +96,17 @@ namespace helpers {
         auto func = PRAGMA_THREADS_FOR{

-            Nd4jLong coords[MAX_RANK];
+            int coords[MAX_RANK], temp;
+
             for (auto i = start; i < stop; i += increment) {

-                shape::index2coords(i, input.getShapeInfo(), coords);
+                shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);

                 const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);

                 uint outArrIdx = 0;
+                temp = coords[axis];
+
                 while (coords[axis] >= zDim) {
                     coords[axis] -= zDim;
                     ++outArrIdx;
@@ -112,6 +115,8 @@ namespace helpers {
                 T* z = outArrs[outArrIdx]->bufferAsT<T>();
                 const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
                 z[zOffset] = xBuff[xOffset];
+
+                coords[axis] = temp;
             }
         };
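The walk over coords[axis] above is how split locates the destination array and the local coordinate within it; the patch saves and restores coords[axis] around the walk because the buffer is reused for the next index. A self-contained model (pickOutput is a hypothetical name, not the library's):

    #include <cstdio>

    // Given the coordinate along the split axis in the input, find which
    // (equally sized) output it falls into and its local coordinate there.
    int pickOutput(int& axisCoord, int zDim) {
        int outArrIdx = 0;
        while (axisCoord >= zDim) {
            axisCoord -= zDim;
            ++outArrIdx;
        }
        return outArrIdx;
    }

    int main() {
        int c = 7;                      // axis coordinate in the concatenated input
        const int zDim = 3;             // size of each output along the axis
        const int idx = pickOutput(c, zDim);
        printf("output #%d, local coord %d\n", idx, c);   // output #2, local coord 1
        return 0;
    }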

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
index fa3570879..7169cca4a 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
@@ -188,24 +188,35 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
         const T padVal = padValue.e<T>(0);

         auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong coords[MAX_RANK];
+
+            int zCoords[MAX_RANK], xCoords[MAX_RANK];
+
             for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), coords);
-                const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+
+                shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
+                const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
+
+                memcpy(xCoords, zCoords, rank * sizeof(int));

                 bool within = true;
+
                 for (int j = rankMinusOne; j >= 0; --j) {
-                    if (xShape[j] == zShape[j]) continue;
+
+                    if (xShape[j] == zShape[j])
+                        continue;
+
                     const auto left = paddings.e<Nd4jLong>(j, 0);
-                    if (coords[j] < left || coords[j] >= left + xShape[j]) {
+
+                    if (zCoords[j] < left || zCoords[j] >= left + xShape[j]) {
                         within = false;
                         break;
                     }
-                    else { coords[j] = coords[j] - left; }
+                    else
+                        xCoords[j] = zCoords[j] - left;
                 }

                 if (within)
-                    z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+                    z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
                 else
                     z[zOffset] = padVal;
             }
         };
@@ -219,20 +230,30 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
         const Nd4jLong shift2 = mode == 1 ? 2 : 1;         // REFLECT : SYMMETRIC

         auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong coords[MAX_RANK];
+
+            int zCoords[MAX_RANK], xCoords[MAX_RANK];
+
             for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), coords);
-                const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+
+                shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
+                const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
+
+                memcpy(xCoords, zCoords, rank * sizeof(int));

                 for (int j = rankMinusOne; j >= 0; --j) {

-                    if (xShape[j] == zShape[j]) continue;
-                    coords[j] = coords[j] - paddings.e<Nd4jLong>(j, 0);                              // are ready to fill middle (within input dimension range)
-                    if (coords[j] < 0) coords[j] = -coords[j] - shift1;                              // means fill from left
-                    else if (coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right
+                    if (xShape[j] == zShape[j])
+                        continue;
+
+                    xCoords[j] = zCoords[j] - paddings.e<Nd4jLong>(j, 0);      // are ready to fill middle (within input dimension range)
+
+                    if (xCoords[j] < 0)
+                        xCoords[j] = -xCoords[j] - shift1;                     // means fill from left
+                    else if (xCoords[j] >= xShape[j])
+                        xCoords[j] = 2 * xShape[j] - xCoords[j] - shift2;      // means fill from right
                 }

-                const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
+                const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
                 z[zOffset] = x[xOffset];
             }
         };
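pad_'s mirror branch reduces to one index transform per dimension. A standalone model of it; per the shift values in the patch, (shift1, shift2) is (0, 2) for REFLECT and (1, 1) for SYMMETRIC, and mirrorIndex is an illustrative name:

    #include <cstdio>

    // Map an output coordinate to the input coordinate it copies from.
    int mirrorIndex(int zCoord, int padLeft, int xDim, int shift1, int shift2) {
        int x = zCoord - padLeft;                         // position relative to the input
        if (x < 0)       return -x - shift1;              // mirrored off the left edge
        if (x >= xDim)   return 2 * xDim - x - shift2;    // mirrored off the right edge
        return x;                                         // interior: straight copy
    }

    int main() {
        // REFLECT-pad a length-3 axis by 2 on the left: expect x indices 2 1 0 1 2
        for (int z = 0; z < 5; ++z)
            printf("%d ", mirrorIndex(z, 2, 3, 0, 2));
        printf("\n");
        return 0;
    }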
@@ -562,45 +583,37 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
     const Nd4jLong zLen = output.lengthOf();

-    const int yLastDim = indices.sizeAt(-1);
+    const uint yLastDim = indices.sizeAt(-1);
+
+    const int diff = zRank - xRank;
+    const bool bEqual = yLastDim == xRank;

     auto func = PRAGMA_THREADS_FOR {

-        Nd4jLong coords[MAX_RANK * 3];
+        int xCoords[MAX_RANK], zCoords[MAX_RANK], temp;
+
         for (auto i = start; i < stop; i++) {

-            Nd4jLong *zCoordStart, *xCoordStart;
-
-            if (yLastDim == xRank) {
-                zCoordStart = coords;
-                xCoordStart = coords;
-            } else if (zRank >= xRank) {
-                zCoordStart = coords;
-                xCoordStart = coords + zRank - xRank;
-            } else {
-                zCoordStart = coords + xRank - zRank;
-                xCoordStart = coords;
-            }
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);

-            shape::index2coords(i, output.getShapeInfo(), zCoordStart);
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);

-            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart);
+            temp = zCoords[yRank - 1];
+            zCoords[yRank - 1] = 0;
+            const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoords);
+            zCoords[yRank - 1] = temp;

-            // last y coordinate
-            uint coordToRestore;
-            if (yLastDim != xRank)
-                coordToRestore = static_cast<uint>(zCoordStart[yRank - 1]);
+            if(bEqual)
+                memcpy(xCoords, zCoords, zRank * sizeof(int));
+            else if(diff >= 0)
+                memcpy(xCoords, zCoords + diff, xRank * sizeof(int));
+            else
+                memcpy(xCoords - diff, zCoords, zRank * sizeof(int));

-            zCoordStart[yRank - 1] = 0;
-            const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart);
+            for (uint j = 0; j < yLastDim; ++j)
+                xCoords[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]];   // last stride

-            //restore z coordinate
-            if (yLastDim != xRank)
-                zCoordStart[yRank - 1] = coordToRestore;
-
-            // construct coordinates for x
-            for (int j = 0; j < yLastDim; ++j)
-                xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]];   // last stride
-
-            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
+            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);

             z[zOffset] = x[xOffset];
         }
@@ -1188,10 +1201,12 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
     else {

         auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong inIdx[MAX_RANK];
-            Nd4jLong outIdx[MAX_RANK];
+
+            int inIdx[MAX_RANK], outIdx[MAX_RANK];
+
             for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), outIdx);
+
+                shape::index2coordsCPU(start, i, output.getShapeInfo(), outIdx);

                 for (int j = 0; j < rank; ++j) {
                     const Nd4jLong inLen = input.sizeAt(j);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
index 6f658c72e..4243c6e04 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
@@ -52,7 +52,7 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
     __syncthreads();

     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];

     for (int i = tid; i < xzLen; i += blockDim.x * gridDim.x) {
         shape::index2coords(i, xShapeInfo, coords);
@@ -124,7 +124,7 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
     __syncthreads();

     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];

     for (int i = tid; i < inLen; i += totalThreads) {
         shape::index2coords(i, inShapeInfo, coords);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
index dad5a5b06..0878a1c77 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
@@ -45,7 +45,7 @@ __global__ static void addBiasCuda( const void* vx, const Nd4jLong* xShapeInfo,
           X* z = reinterpret_cast<X*>(vz);

     __shared__ int rank, channelPosition, posOfNonUnityDim;
-    __shared__ Nd4jLong *sharedMem, len;
+    __shared__ Nd4jLong len, *sharedMem;
     __shared__ bool xzSameOffsets, xzAreSame;

     if (threadIdx.x == 0) {
@@ -130,7 +130,7 @@ void addBias(sd::graph::Context& block, const NDArray& input, const NDArray& bia
                 FLOAT_TYPES, FLOAT_TYPES);
     }
     else { // default case
-        const int threadsPerBlock = MAX_NUM_THREADS / 2;
+        const int threadsPerBlock = MAX_NUM_THREADS / 4;
         const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
         const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
index 5e113ff2f..2daac26c3 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
@@ -124,7 +124,7 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
     }
     __syncthreads();

-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];

     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
index 6a41a9fc4..10e1d132c 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@@ -51,7 +51,7 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];

     for (uint64_t i = tid; i < zLen; i += totalThreads) {

         shape::index2coords(i, zShapeInfo, coords);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
index d02e99987..76ba2e1df 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@@ -706,7 +706,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
     T* z = reinterpret_cast<T*>(vz);

     __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -858,7 +858,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
     Nd4jLong coord2, coord3;
     __shared__ int rank, kHeff, kWeff, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, yLen;
+    __shared__ Nd4jLong yLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1017,7 +1017,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
     Nd4jLong coord2, coord3, coord4;
     __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, yLen;
+    __shared__ Nd4jLong yLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1342,7 +1342,7 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
     T* z = reinterpret_cast<T*>(vz);

     __shared__ int rank, dimIH;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1410,7 +1410,7 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
     T* z = reinterpret_cast<T*>(vz);

     __shared__ int rank, dimID;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1480,7 +1480,7 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
     __shared__ int rank, dimIH;
     __shared__ uint factorH, factorW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1554,7 +1554,7 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
     __shared__ int rank, dimID;
     __shared__ uint factorD, factorH, factorW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
index 38e8d0cca..d7694641c 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
@@ -36,8 +36,8 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ const T* x;
     __shared__ const T* y;
     __shared__ T* z;
-    __shared__ int rank;
-    __shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong lenWithoutLastDim, totalThreads;

     if (threadIdx.x == 0) {
         x = reinterpret_cast<const T*>(vx);
         y = reinterpret_cast<const T*>(vy);
         z = reinterpret_cast<T*>(vz);

         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         totalThreads = gridDim.x * blockDim.x;

         rank = shape::rank(xShapeInfo);
@@ -106,7 +106,7 @@ void crossBatched(sd::LaunchContext* context, NDArray *x, NDArray *y, NDArray *z
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (x->lengthOf() / x->sizeAt(-1) + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = sizeof(Nd4jLong) * threadsPerBlock * x->rankOf() + 128;
+    const int sharedMem = sizeof(int) * threadsPerBlock * x->rankOf() + 128;

     PointersManager manager(context, "cross");

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
index b318465a7..c05b5fb6d 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
@@ -43,13 +43,13 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo,
     const X* y = reinterpret_cast<const X*>(vy);
           Z* z = reinterpret_cast<Z*>(vz);

-    __shared__ int xzRank, yRank;
+    __shared__ int xzRank, yRank, *sharedMem;
     __shared__ uint iH, iW, kH, kW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);

         zLen = shape::length(zShapeInfo);

@@ -119,7 +119,7 @@ void dilation2d(sd::LaunchContext* context, NDArray *input, NDArray *weights, ND
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (output->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(int) * threadsPerBlock + 128;

     NDArray::prepareSpecialUse({output}, {input, weights});
     BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), dilation2dCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), weights->getSpecialBuffer(), weights->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), sH, sW, pH, pW, dH, dW), FLOAT_TYPES);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
index e8b5e83c0..3600104e1 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
@@ -27,7 +27,7 @@ namespace sd {
    template <typename T>
    void _CUDA_G flattenKernel(void **xBuffers, Nd4jLong **xShapeInfos, Nd4jLong *offsets, Nd4jLong numInputs, void *zBuffer, Nd4jLong *zShapeInfo, char order) {

-        Nd4jLong xCoord[MAX_RANK];
+        int xCoord[MAX_RANK];

        // each block of threads works on 1 input array
        for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
index 5bce6c1b7..0dbca8c47 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
@@ -40,12 +40,12 @@ __global__ static void im2colCuda(const void *image, void *columns,
     const auto im = reinterpret_cast<const T*>(image);
           auto col = reinterpret_cast<T*>(columns);

-    __shared__ Nd4jLong colLen, *sharedMem, iH, iW;
-    __shared__ int imRank, colRank;
+    __shared__ Nd4jLong colLen, iH, iW;
+    __shared__ int imRank, colRank, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);

         colRank = 6;
         imRank = 4;

@@ -81,7 +81,7 @@ __global__ static void im2colCuda(const void *image, void *columns,

//////////////////////////////////////////////////////////////////////////
 template <typename T>
 static void im2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, sd::LaunchContext & context, const void *image, void *columns, const Nd4jLong *imShapeInfo, const Nd4jLong *colShapeInfo, int sH, int sW, int pH, int pW, int dH, int dW, double zeroPadVal) {
-    im2colCuda<<>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
+    im2colCuda<<>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
 }

//////////////////////////////////////////////////////////////////////////

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu b/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu
index 11ec1f46e..54f306ef7 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu
@@ -149,19 +149,19 @@ __global__ void rgbToGrsCuda(const void *vx, const Nd4jLong *xShapeInfo, void *v
     const auto x = reinterpret_cast<const T*>(vx);
     auto z = reinterpret_cast<T*>(vz);

-    __shared__ Nd4jLong zLen, *sharedMem;
-    __shared__ int rank;                  // xRank == zRank
+    __shared__ Nd4jLong zLen;
+    __shared__ int rank, *sharedMem;      // xRank == zRank

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);

         zLen = shape::length(zShapeInfo);
         rank = shape::rank(zShapeInfo);
     }
     __syncthreads();

-    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+    auto coords = sharedMem + threadIdx.x * rank;

     for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) {

@@ -197,7 +197,7 @@ void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray&
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = input.rankOf() * sizeof(int) * threadsPerBlock + 128;

     NDArray::prepareSpecialUse({&output}, {&input});
     BUILD_SINGLE_SELECTOR(input.dataType(), rgbToGrsCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), dimC), NUMERIC_TYPES);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
index c771d12ff..e5773abf5 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@@ -39,14 +39,14 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI
     const auto y = reinterpret_cast<const T*>(vy);
     auto z = reinterpret_cast<T*>(vz);

-    __shared__ int xRank;                    // xRank = zRank, xRank = yRank + 1
-    __shared__ Nd4jLong xLen, *sharedMem;    // xLen = zLen
+    __shared__ int xRank, *sharedMem;        // xRank = zRank, xRank = yRank + 1
+    __shared__ Nd4jLong xLen;                // xLen = zLen
     __shared__ bool areSameOffsets;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);

         areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
@@ -56,7 +56,7 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI

     __syncthreads();

-    auto coords = sharedMem + threadIdx.x * xRank;  // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
+    auto coords = sharedMem + threadIdx.x * xRank;  // we provide (xRank * sizeof(int) * threadIdx.x) amount of shared memory per each thread
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

     for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
@@ -86,7 +86,7 @@ void matrixSetDiag(sd::LaunchContext* context, const NDArray& input, const NDArr
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * input.rankOf() + 128;

     PointersManager manager(context, "matrixSetDiag");
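On the device side, the sizing above pairs with a fixed carve-up of the shared buffer: thread t uses elements [t * rank, (t + 1) * rank) as its coordinate scratch. A compilable skeleton of the pattern; coordsScratchKernel and the launch values are illustrative, not the library kernel:

    #include <cstdio>

    __global__ void coordsScratchKernel(int rank) {
        __shared__ int* sharedMem;
        if (threadIdx.x == 0) {
            extern __shared__ unsigned char shmem[];
            sharedMem = reinterpret_cast<int*>(shmem);   // alias the dynamic buffer once
        }
        __syncthreads();

        int* coords = sharedMem + threadIdx.x * rank;    // this thread's coordinates
        for (int d = 0; d < rank; ++d)
            coords[d] = 0;                               // index2coords/getOffset would work here
    }

    int main() {
        const int threadsPerBlock = 128, rank = 4;
        const int sharedMem = threadsPerBlock * sizeof(int) * rank + 128;  // same formula as the launchers
        coordsScratchKernel<<<1, threadsPerBlock, sharedMem>>>(rank);
        cudaDeviceSynchronize();
        return 0;
    }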
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
index da61b0e48..5784699d0 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
@@ -43,12 +43,12 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
 
     const auto x = reinterpret_cast<const T*>(vx);
     auto z = reinterpret_cast<T*>(vz);
 
-    __shared__ int rank;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong zLen;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -103,7 +103,7 @@ void batchToSpace(sd::LaunchContext* context, const NDArray& input, NDArray& out
 
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
 
     PointersManager manager(context, "batchToSpace");
 
@@ -138,13 +138,13 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape
 
     const auto y = reinterpret_cast<const Y*>(vy);
     auto z = reinterpret_cast<X*>(vz);
 
-    __shared__ int rank;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong zLen;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -234,7 +234,7 @@ void batchToSpaceND(sd::LaunchContext* context, const NDArray& input, const NDAr
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
 
     PointersManager manager(context, "batchToSpaceND");
 
@@ -264,12 +264,12 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
 
     const auto x = reinterpret_cast<const T*>(vx);
     auto z = reinterpret_cast<T*>(vz);
 
-    __shared__ int rank;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong zLen;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -326,7 +326,7 @@ void spaceToBatch(sd::LaunchContext* context, const NDArray& input, NDArray& out
 
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
 
     PointersManager manager(context, "spaceToBatch");
 
@@ -364,13 +364,13 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape
 
     const auto y = reinterpret_cast<const Y*>(vy);
     auto z = reinterpret_cast<X*>(vz);
 
-    __shared__ int rank; // xRank = zRank, yRank = 2;
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
+    __shared__ int rank, *sharedMem; // xRank = zRank, yRank = 2;
+    __shared__ Nd4jLong zLen, totalThreads;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -473,7 +473,7 @@ void spaceToBatchND(sd::LaunchContext* context, const NDArray& input, const NDAr
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
 
     PointersManager manager(context, "spaceToBatchND");
 
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
index 4e7360004..364ad83d2 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@@ -628,12 +628,12 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
 
     auto y = reinterpret_cast<Z*>(vy);
     auto z = reinterpret_cast<Z*>(vz);
 
-    __shared__ Nd4jLong xLen, *sharedMem;
-    __shared__ int xRank; // xRank = zRank, yRank = xRank + 1
+    __shared__ Nd4jLong xLen;
+    __shared__ int xRank, *sharedMem; // xRank = zRank, yRank = xRank + 1
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         xLen = shape::length(xShapeInfo);
         xRank = shape::rank(xShapeInfo);
@@ -678,7 +678,7 @@ void scatterForLoss(sd::LaunchContext* context, const NDArray& indices, NDArray&
 
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = updates.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = updates.rankOf() * sizeof(int) * threadsPerBlock + 128;
 
     if(calcGrad) {
         NDArray::prepareSpecialUse({&updates}, {&indices});
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/split.cu b/libnd4j/include/ops/declarable/helpers/cuda/split.cu
index 5c4b4b014..5690d786c 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/split.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/split.cu
@@ -54,7 +54,7 @@ __global__ static void splitCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];
 
     for (uint64_t i = tid; i < xLen; i += totalThreads) {
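Unlike the kernels above, splitCuda keeps its coordinates in a per-thread local array, so no shared-memory sizing is involved; the int narrowing simply halves the scratch from 8*MAX_RANK to 4*MAX_RANK bytes per thread. A sketch of that local-array variant (copyKernel is illustrative, not from the patch):

    // Grid-stride loop with a thread-private coordinate array; no dynamic shared memory.
    template <typename T>
    __global__ void copyKernel(const T* x, T* z, const Nd4jLong* xShapeInfo,
                               const Nd4jLong* zShapeInfo, const Nd4jLong len) {

        int coords[MAX_RANK];   // thread-private; 4 bytes per dimension instead of 8

        for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += gridDim.x * blockDim.x) {
            shape::index2coords(i, zShapeInfo, coords);
            z[shape::getOffset(zShapeInfo, coords)] = x[shape::getOffset(xShapeInfo, coords)];
        }
    }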
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
index 0ded43593..518525ecf 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
@@ -135,13 +135,13 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
 
     const int rank = 3;
 
-    __shared__ int time, K;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;
+    __shared__ int time, K, *sharedMem;
+    __shared__ Nd4jLong len, totalThreads;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         time = xShapeInfo[1];
         K = xShapeInfo[3] / 2;
@@ -152,7 +152,7 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
 
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+    auto coords = sharedMem + threadIdx.x * rank;
 
     if(tid >= len)
         return;
@@ -245,7 +245,7 @@ void sruBI(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDAr
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock; // loop through last two dimensions of x array -> bS, 2*K
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
 
     NDArray::prepareSpecialUse({ht, ct}, {x, &wi, b, c0, mask});
     BUILD_SINGLE_SELECTOR(x->dataType(), sruBICudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ht->specialBuffer(), ht->specialShapeInfo(), ct->specialBuffer(), ct->specialShapeInfo()), FLOAT_TYPES);
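BUILD_SINGLE_SELECTOR dispatches on the array's runtime DataType to a template instantiation of the launcher; conceptually it behaves like a type switch over the listed type set. A simplified sketch of the idea (not the actual macro expansion):

    // Rough equivalent of BUILD_SINGLE_SELECTOR(x->dataType(), sruBICudaLauncher, (...), FLOAT_TYPES):
    switch (x->dataType()) {
        case sd::DataType::FLOAT32: sruBICudaLauncher<float>(/* launch args */);   break;
        case sd::DataType::DOUBLE:  sruBICudaLauncher<double>(/* launch args */);  break;
        case sd::DataType::HALF:    sruBICudaLauncher<float16>(/* launch args */); break;
        default: throw std::runtime_error("sruBI: unsupported data type");
    }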
@@ -340,13 +340,13 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI
 
     const int rank = 3;
 
-    __shared__ int time, K;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;
+    __shared__ int time, K, *sharedMem;
+    __shared__ Nd4jLong len, totalThreads;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         time = xShapeInfo[1];
         K = xShapeInfo[3] / 2;
@@ -358,7 +358,7 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI
 
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+    auto coords = sharedMem + threadIdx.x * rank;
 
     if(tid >= len)
         return;
@@ -513,7 +513,7 @@ void sruBIBP(sd::LaunchContext* context, NDArray* x, const NDArray* w, const NDA
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock; // loop through last two dimensions of x array -> bS, 2*K
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
 
     NDArray::prepareSpecialUse({gradI, &gradWi, &gradBias, gradC0}, {x, &wi, b, c0, ct, gradCt, gradHt, mask});
     BUILD_SINGLE_SELECTOR(x->dataType(), sruBIBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ct->getSpecialBuffer(), ct->getSpecialShapeInfo(), gradHt->getSpecialBuffer(), gradHt->getSpecialShapeInfo(), gradCt->getSpecialBuffer(), gradCt->getSpecialShapeInfo(), gradI->specialBuffer(), gradI->specialShapeInfo(), gradWi.specialBuffer(), gradWi.specialShapeInfo(), gradBias.specialBuffer(), gradBias.specialShapeInfo(), gradC0->specialBuffer(), gradC0->specialShapeInfo()), FLOAT_TYPES);
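Both SRU launchers derive the grid from the product of the last two dimensions of x (bS and 2*K), one thread per (batch, feature) pair, while the time loop runs inside the kernel. A worked example of the arithmetic, assuming MAX_NUM_THREADS is 1024 (the actual constant may differ):

    // x: [time, bS, 2*K] = [50, 32, 256]
    const int threadsPerBlock = 1024 / 4;                                             // 256
    const int blocksPerGrid   = (32 * 256 + threadsPerBlock - 1) / threadsPerBlock;   // 8192 / 256 = 32
    const int sharedMem       = threadsPerBlock * sizeof(int) * 3 + 128;              // rank-3 coords: 3200 bytes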
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
index 2c4299a90..b4dcfb2f6 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
@@ -93,13 +93,13 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
 
     auto z = reinterpret_cast<T*>(vz);
 
     __shared__ T* sharedMem;
-    __shared__ int xRank, zRank; // xRank = zRank + 2
-    __shared__ Nd4jLong xLen, zLen, *coordsMem;
+    __shared__ int xRank, zRank, *coordsMem; // xRank = zRank + 2
+    __shared__ Nd4jLong xLen, zLen;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<T*>(shmem);
-        coordsMem = reinterpret_cast<Nd4jLong*>(shmem + blockDim.x * sizeof(T));
+        coordsMem = reinterpret_cast<int*>(shmem + blockDim.x * sizeof(T));
 
         xRank = shape::rank(xShapeInfo);
         zRank = shape::rank(zShapeInfo);
@@ -109,7 +109,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
 
     }
     __syncthreads();
 
-    Nd4jLong* coords = coordsMem + threadIdx.x * xRank;
+    auto coords = coordsMem + threadIdx.x * xRank;
 
     for (uint m = blockIdx.x; m < zLen; m += gridDim.x) { // one block per each element of z, that is per each matrix
 
@@ -160,7 +160,7 @@ void trace(sd::LaunchContext* context, const NDArray& input, NDArray& output) {
 
     const uint diagLen = input.sizeAt(-1) < input.sizeAt(-2) ? input.sizeAt(-1) : input.sizeAt(-2);
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * (sizeof(Nd4jLong) * input.rankOf() + input.sizeOfT()) + 128;
+    const int sharedMem = threadsPerBlock * (sizeof(int) * input.rankOf() + input.sizeOfT()) + 128;
 
     NDArray::prepareSpecialUse({&output}, {&input});
     BUILD_SINGLE_SELECTOR(input.dataType(), traceCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), diagLen), LIBND4J_TYPES);
@@ -177,13 +177,13 @@ __global__ static void triuBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
 
     const auto x = reinterpret_cast<const T*>(vx); // gradO
     auto z = reinterpret_cast<T*>(vz);             // gradI
 
-    __shared__ int rank, areSameOffsets; // xRank = zRank
-    __shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen = zLen
+    __shared__ int rank, areSameOffsets, *sharedMem; // xRank = zRank
+    __shared__ Nd4jLong len, totalThreads; // xLen = zLen
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
         rank = shape::rank(xShapeInfo);
         len = shape::length(zShapeInfo);
@@ -221,7 +221,7 @@ void triuBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gra
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (gradO.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * gradO.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * gradO.rankOf() + 128;
 
     PointersManager manager(context, "triuBP");
 
@@ -240,13 +240,13 @@ __global__ static void tileBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
 
     const auto x = reinterpret_cast<const T*>(vx); // gradO
     auto z = reinterpret_cast<T*>(vz);             // gradI
 
-    __shared__ int xRank, zRank; // xRank >= zRank
-    __shared__ Nd4jLong numOfXOffsets, zLen, totalThreads, *sharedMem; // xLen >= zLen
+    __shared__ int xRank, zRank, *sharedMem; // xRank >= zRank
+    __shared__ Nd4jLong numOfXOffsets, zLen, totalThreads; // xLen >= zLen
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         xRank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -289,7 +289,7 @@ void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 2 * gradO.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * 2 * gradO.rankOf() + 128;
 
     PointersManager manager(context, "tileBP");
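traceCuda is the one kernel here that packs two differently-typed regions into a single dynamic buffer: blockDim.x values of T for per-block partial sums, followed by the int coordinate scratch. The int region starts at byte offset blockDim.x * sizeof(T), which stays 4-byte aligned since blockDim.x is a multiple of the warp size. The layout, isolated as a sketch:

    extern __shared__ unsigned char shmem[];
    T*   sums   = reinterpret_cast<T*>(shmem);                             // blockDim.x elements of T
    int* coords = reinterpret_cast<int*>(shmem + blockDim.x * sizeof(T));  // rank ints per thread

    // The host-side size in trace() above mirrors both regions:
    //   threadsPerBlock * (sizeof(int) * rank + sizeOfT) + 128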
diff --git a/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp b/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp
index 7a99b4ecc..36044907e 100644
--- a/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp
+++ b/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp
@@ -32,7 +32,7 @@ namespace sd {
 
     auto indices = reinterpret_cast<const I*>(vindices);
     auto output = reinterpret_cast<X*>(voutput);
 
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];
     uint64_t pos = 0;
     for (uint64_t e = 0L; e < length; e++) { // indices come in blocks
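The sparse fill consumes the flat indices buffer as consecutive rank-sized coordinate blocks, one block per stored value. A hedged sketch of that loop shape (names and the exact blocking are illustrative, not copied from the file):

    int coords[MAX_RANK];
    uint64_t pos = 0;                                 // position in the values buffer
    for (uint64_t e = 0; e < length; e += rank) {     // one block of `rank` indices per point
        for (int j = 0; j < rank; ++j)
            coords[j] = static_cast<int>(indices[e + j]);
        output[shape::getOffset(outShapeInfo, coords)] = values[pos++];
    }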
diff --git a/libnd4j/include/ops/declarable/helpers/impl/where.cpp b/libnd4j/include/ops/declarable/helpers/impl/where.cpp
index f73bcae9a..df8fd1074 100644
--- a/libnd4j/include/ops/declarable/helpers/impl/where.cpp
+++ b/libnd4j/include/ops/declarable/helpers/impl/where.cpp
@@ -29,11 +29,14 @@ namespace sd {
 
     NDArrayList list(0, true);
     int cnt = 0;
 
-    Nd4jLong idx[MAX_RANK];
+    int idx[MAX_RANK];
+
     for (Nd4jLong e = 0; e < condition.lengthOf(); e++) {
-        shape::index2coords(e, condition.getShapeInfo(), idx);
+
+        shape::index2coordsCPU(0, e, condition.getShapeInfo(), idx);
 
         auto offset = shape::getOffset(condition.getShapeInfo(), idx);
+
         if (condition.e<bool>(offset)) {
             auto array = NDArrayFactory::create_('c', {1, condition.rankOf()}, output.dataType(), output.getContext());
             for (int f = 0; f < condition.rankOf(); f++)
diff --git a/libnd4j/include/ops/impl/specials_single.hpp b/libnd4j/include/ops/impl/specials_single.hpp
index 3cf3d113e..ed86315f7 100644
--- a/libnd4j/include/ops/impl/specials_single.hpp
+++ b/libnd4j/include/ops/impl/specials_single.hpp
@@ -178,16 +178,18 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
 
     // general case
     auto func = PRAGMA_THREADS_FOR {
 
-        Nd4jLong coords[MAX_RANK];
+        int coords[MAX_RANK], temp;
 
         for (auto i = start; i < stop; i += increment) {
-            shape::index2coords(i, output.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
+
             const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
 
             uint inArrIdx = 0;
             uint xDim = inArrs[inArrIdx]->sizeAt(axis);
 
+            temp = coords[axis];
             while (coords[axis] >= xDim) {
                 coords[axis] -= xDim;
                 xDim = inArrs[++inArrIdx]->sizeAt(axis);
@@ -197,6 +199,8 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
 
             const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords);
             zBuff[zOffset] = x[xOffset];
+
+            coords[axis] = temp;
         }
     };
 
@@ -298,13 +302,15 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
 
     auto func = PRAGMA_THREADS_FOR {
 
-        Nd4jLong coords[MAX_RANK];
+        int coords[MAX_RANK], temp;
+
         for (auto i = start; i < stop; i += increment) {
 
-            shape::index2coords(i, input.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);
 
             const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
 
             uint outArrIdx = 0;
+            temp = coords[axis];
 
             while (coords[axis] >= zDim) {
                 coords[axis] -= zDim;
@@ -314,6 +320,8 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
 
             T* z = outArrs[outArrIdx]->bufferAsT<T>();
             const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
             z[zOffset] = xBuff[xOffset];
+
+            coords[axis] = temp;
         }
     };
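index2coordsCPU is the CPU-side optimization this patch leans on: because these loops visit flat indices sequentially, the coordinates of index i can be derived from those of i - 1 with a single increment-and-carry instead of a full division chain. That is also why concatCpuGeneric and splitCpuGeneric now save and restore coords[axis]: the next iteration's incremental update assumes coords was left describing index i. A sketch of the idea on plain arrays (the real helper works on shapeInfo buffers):

    // Odometer-style advance: valid only when indices are visited in order from startIndex.
    inline void index2coordsCPU(const int64_t startIndex, const int64_t index,
                                const int* shape, const int rank, int* coords) {
        if (index == startIndex) {                   // first call: full de-linearization
            int64_t i = index;
            for (int j = rank - 1; j >= 0; --j) { coords[j] = i % shape[j]; i /= shape[j]; }
            return;
        }
        for (int j = rank - 1; j >= 0; --j) {        // later calls: increment with carry
            if (++coords[j] < shape[j]) break;
            coords[j] = 0;
        }
    }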
diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
index 0769a9aef..94156d4bc 100644
--- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
@@ -258,6 +258,7 @@ TEST_F(PlaygroundTests, test_bert_2) {
 
     delete graph;
 }
 
+
 TEST_F(PlaygroundTests, test_one_off_ops_1) {
     auto x = NDArrayFactory::create<float>('c', {4, 128, 768});
     auto y = NDArrayFactory::create<float>('c', {4, 128, 1});
diff --git a/libnd4j/tests_cpu/layers_tests/TadTests.cpp b/libnd4j/tests_cpu/layers_tests/TadTests.cpp
index 421e71b01..5dfdf401d 100644
--- a/libnd4j/tests_cpu/layers_tests/TadTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/TadTests.cpp
@@ -289,7 +289,7 @@ TEST_F(TadTests, calcOffsets_1) {
 
 TEST_F(TadTests, outerArrayIndexes_1) {
 
     NDArray x('c', {2,3,4,5}, sd::DataType::FLOAT32);
-    Nd4jLong maxIdxs[120];
+    int maxIdxs[120];
 
     NDArray y1('c', {3,5}, sd::DataType::FLOAT32);
     const std::vector<int> dimsToExclude1 = {0,2};