diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/blas/cuda/NDArray.cu
index 126837ad9..60498aaf7 100644
--- a/libnd4j/blas/cuda/NDArray.cu
+++ b/libnd4j/blas/cuda/NDArray.cu
@@ -78,7 +78,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
         areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
@@ -87,7 +86,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coords = sharedMem + threadIdx.x * zRank;
@@ -153,14 +151,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong len, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
         rank = shape::rank(xShapeInfo);
         len = shape::length(xShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coords = sharedMem + threadIdx.x * rank;
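The two NDArray.cu kernels above, and most of the helper kernels below, share one idiom: thread 0 fills shared variables from the shape info, and every thread must pass a __syncthreads() barrier before reading them. A minimal sketch of that idiom, assuming hypothetical names (this is not code from the patch):

    // nvcc shared_init_sketch.cu
    #include <cuda_runtime.h>

    // Thread 0 publishes block-wide state through shared memory; the barrier
    // makes the write visible to the rest of the block before any read.
    __global__ void sharedInitSketch(const long long* shapeInfo, int* out, int n) {
        __shared__ long long len;                  // written only by thread 0

        if (threadIdx.x == 0)
            len = shapeInfo[0];                    // stands in for shape::length(...)
        __syncthreads();                           // without it, reads of 'len' race

        const int tid  = threadIdx.x + blockIdx.x * blockDim.x;
        const int step = gridDim.x * blockDim.x;
        for (int i = tid; i < n && i < (int)len; i += step)
            out[i] = i;
    }

The hunks above only delete stray blank lines around that idiom; the barrier itself stays in place.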
diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu
index b85ce8760..11e2c0188 100755
--- a/libnd4j/blas/cuda/NativeOps.cu
+++ b/libnd4j/blas/cuda/NativeOps.cu
@@ -1183,7 +1183,6 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShapeInfo,
     __shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenZ, arrLenPerBlock, start, end;
 
     if (threadIdx.x == 0) {
-
         blocksPerArr = (gridDim.x - gridDim.x % numOfArrs) / numOfArrs;     // floor
         arrIdx = blockIdx.x / blocksPerArr;
         if (arrIdx >= numOfArrs)
@@ -1200,8 +1199,8 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShapeInfo,
         start = arrLenPerBlock * (blockIdx.x % blocksPerArr);
         end = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
     }
-
     __syncthreads();
+
     for (Nd4jLong i = threadIdx.x + start; i < end; i += blockDim.x) {
         auto zOffset = shape::getIndexOffset(i, zShapeInfo, arrLenZ);
         auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLen);
@@ -3165,7 +3164,6 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArrs,
         arrLenX = shape::length(xShapeInfo);
         arrLenY = shape::length(yShapeInfo);
     }
-
     __syncthreads();
 
     if (arrLenX != arrLenY)
diff --git a/libnd4j/include/loops/cuda/broadcasting.chpp b/libnd4j/include/loops/cuda/broadcasting.chpp
index dc8a3eeb1..d930d8cad 100644
--- a/libnd4j/include/loops/cuda/broadcasting.chpp
+++ b/libnd4j/include/loops/cuda/broadcasting.chpp
@@ -128,7 +128,6 @@ namespace functions {
 
             if (threadIdx.x == 0) {
-
                 tadLength = _length(tadOnlyShapeInfo);
                 tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                 numTads = _length(yShapeInfo) / tadLength;
@@ -194,7 +193,6 @@ namespace functions {
             __shared__ Nd4jLong zEWS;
 
             if (threadIdx.x == 0) {
-
                 tadLength = _length(tadOnlyShapeInfo);
                 tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                 numTads = _length(xShapeInfo) / tadLength;
diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu
index 6cc3f3cbb..8981790f5 100644
--- a/libnd4j/include/loops/cuda/broadcasting_bool.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu
@@ -185,7 +185,6 @@ namespace functions {
             __shared__ Nd4jLong zEWS;
 
             if (threadIdx.x == 0) {
-
                 tadLength = shape::length(tadOnlyShapeInfo);  //shape::tadLength(xShapeInfo, dimension, dimensionLength);
                 tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                 numTads = shape::length(xShapeInfo) / tadLength;
diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu
index 94793f8e8..18e5b1432 100644
--- a/libnd4j/include/loops/cuda/indexreduce.cu
+++ b/libnd4j/include/loops/cuda/indexreduce.cu
@@ -231,7 +231,6 @@ namespace functions {
             xLength = shape::length(xShapeInfo);
         }
-
         __syncthreads();
 
         if (!resultScalar) {
@@ -267,6 +266,7 @@ namespace functions {
                 if (threadIdx.x == 0) {
                     result[r] = sPartials[threadIdx.x].index;
                 }
+                __syncthreads();
             }
         }
         else {
@@ -287,6 +287,7 @@ namespace functions {
                 if (threadIdx.x == 0) {
                     result[i] = sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
                 }
+                __syncthreads();
             }
         }
     } else {
diff --git a/libnd4j/include/loops/cuda/pairwise.chpp b/libnd4j/include/loops/cuda/pairwise.chpp
index a7e6f32cc..3f7134887 100644
--- a/libnd4j/include/loops/cuda/pairwise.chpp
+++ b/libnd4j/include/loops/cuda/pairwise.chpp
@@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
     __shared__ Nd4jLong len;
 
     if (threadIdx.x == 0) {
-
         xEws = shape::elementWiseStride(xShapeInfo);
         yEws = shape::elementWiseStride(yShapeInfo);
         zEws = shape::elementWiseStride(zShapeInfo);
diff --git a/libnd4j/include/loops/cuda/pairwise_bool.cu b/libnd4j/include/loops/cuda/pairwise_bool.cu
index 0834386f2..62f040191 100644
--- a/libnd4j/include/loops/cuda/pairwise_bool.cu
+++ b/libnd4j/include/loops/cuda/pairwise_bool.cu
@@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
     __shared__ Nd4jLong len;
 
     if (threadIdx.x == 0) {
-
         xEws = shape::elementWiseStride(xShapeInfo);
         yEws = shape::elementWiseStride(yShapeInfo);
         zEws = shape::elementWiseStride(zShapeInfo);
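The indexreduce.cu hunks add the other barrier this patch is about: when the shared partials buffer is reused across iterations of the outer TAD loop, the thread-0 write-back must be followed by __syncthreads(), otherwise fast threads can start overwriting the partials for iteration r+1 while thread 0 is still reading iteration r. A sketch of the fixed shape, with a hypothetical sum kernel standing in for the index reduction:

    #include <cuda_runtime.h>

    // One block reduces several rows; sPartials is reused every iteration.
    __global__ void tadReduceSketch(const float* x, float* result, int numTads, int tadLength) {
        extern __shared__ float sPartials[];

        for (int r = blockIdx.x; r < numTads; r += gridDim.x) {
            float acc = 0.f;
            for (int e = threadIdx.x; e < tadLength; e += blockDim.x)
                acc += x[r * tadLength + e];
            sPartials[threadIdx.x] = acc;
            __syncthreads();

            for (int s = blockDim.x / 2; s > 0; s >>= 1) {   // assumes power-of-two blockDim
                if (threadIdx.x < s)
                    sPartials[threadIdx.x] += sPartials[threadIdx.x + s];
                __syncthreads();
            }

            if (threadIdx.x == 0)
                result[r] = sPartials[0];
            __syncthreads();   // the added barrier: protects sPartials for the next iteration
        }
    }

The same trailing barrier is what the segment_*.cu and summarystatsreduce.cu hunks below add after their thread-0 write-backs.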
diff --git a/libnd4j/include/loops/cuda/reduce3.chpp b/libnd4j/include/loops/cuda/reduce3.chpp
index 819c215fc..01b595da1 100644
--- a/libnd4j/include/loops/cuda/reduce3.chpp
+++ b/libnd4j/include/loops/cuda/reduce3.chpp
@@ -125,7 +125,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
     __shared__ Z* sPartials;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sPartials = reinterpret_cast<Z*>(shmem);
@@ -137,7 +136,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
         else
             extraZ[2] = (Z) 0.0f;
     }
-
     __syncthreads();
 
     sPartials[threadIdx.x] = OpType::startingValue(x);
@@ -377,7 +375,6 @@ __device__ void Reduce3<X,Z>::transform(void *vx, Nd4jLong *xShapeInfo,
     __shared__ char yTadOrder;
 
     if(threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sPartials = reinterpret_cast<Z*>(shmem);
diff --git a/libnd4j/include/loops/cuda/summarystatsreduce.cu b/libnd4j/include/loops/cuda/summarystatsreduce.cu
index 1e2f3ce4f..deca80217 100644
--- a/libnd4j/include/loops/cuda/summarystatsreduce.cu
+++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu
@@ -217,7 +217,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRank,
                 if (threadIdx.x == 0) {
                     z[r] = OpType::getValue(postProcessOrNot, sPartials[threadIdx.x]);
                 }
-
+                __syncthreads();
             }
         }
         else {
@@ -285,8 +285,8 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRank,
                     SummaryStatsData<X> *pBuffer = (SummaryStatsData<X>*) reductionBuffer;
                     pBuffer[blockIdx.x] = sPartials[0];
                 }
-                __syncthreads();
                 __threadfence();
+                __syncthreads();
 
                 if (tid == 0) {
                     unsigned int ticket = atomicInc(&tc[16384], gridDim.x);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
index 1397874f8..80e5d885b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
@@ -52,7 +52,6 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
         xzRank = shape::rank(xShapeInfo);
         yRank = shape::rank(yShapeInfo);
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -132,7 +131,6 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeInfo,
         inRank = shape::rank(inShapeInfo);
         alphaRank = shape::rank(alphaShapeInfo);
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
index e8062e126..c27c9fb8a 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
@@ -45,7 +45,6 @@ static void _CUDA_G adjustHueCuda(const void* vx, const Nd4jLong* xShapeInfo,
         xDimCstride = shape::stride(xShapeInfo)[dimC];
         zDimCstride = shape::stride(zShapeInfo)[dimC];
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
index 4ab8da304..a1dc4189a 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
@@ -47,7 +47,6 @@ static void _CUDA_G adjustSaturationCuda(const void* vx, const Nd4jLong* xShapeInfo,
         xDimCstride = shape::stride(xShapeInfo)[dimC];
         zDimCstride = shape::stride(zShapeInfo)[dimC];
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
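summarystatsreduce.cu also reorders __threadfence()/__syncthreads() in the cross-block hand-off: the fence must come first so this block's partial result is visible grid-wide before any of its threads reaches the atomic ticket that elects the last block. A simplified sketch of that protocol (illustrative names; the real kernel reduces SummaryStatsData values, not floats):

    #include <cuda_runtime.h>

    __device__ unsigned int ticketCounter = 0;     // zeroed before launch

    __global__ void lastBlockSketch(float* partials, float* out) {
        __shared__ bool amLast;

        if (threadIdx.x == 0)
            partials[blockIdx.x] = 1.f;            // this block's partial result
        __threadfence();                           // publish it grid-wide first...
        __syncthreads();                           // ...then release the block

        if (threadIdx.x == 0) {
            unsigned int ticket = atomicInc(&ticketCounter, gridDim.x);
            amLast = (ticket == gridDim.x - 1);    // last arrival sees all partials
        }
        __syncthreads();

        if (amLast && threadIdx.x == 0) {
            float sum = 0.f;
            for (unsigned int i = 0; i < gridDim.x; ++i)
                sum += partials[i];
            *out = sum;
        }
    }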
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
index 7678779ac..6c3dedd20 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
@@ -53,7 +53,6 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong minLen, tadLen, totalThreads;
 
     if (threadIdx.x == 0) {
-
         totalThreads = gridDim.x * blockDim.x;
 
         minLen = shape::length(meanShapeInfo);
@@ -116,7 +115,6 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo,
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
         totalThreads = gridDim.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu
index e02bce146..8a21f03a5 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu
@@ -38,7 +38,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeInfo,
     __shared__ Nd4jLong *sharedMem, imLen;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -53,7 +52,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeInfo,
         imLen = shape::length(imShapeInfo);
     }
-
     __syncthreads();
 
     const auto imInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
index 94675c587..d372f05c8 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@@ -50,7 +50,6 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jLong* zShapeInfo,
         rank = shape::rank(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
index 44a0156d7..a37078ad9 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@@ -43,7 +43,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeInfo,
     __shared__ Nd4jLong *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -56,7 +55,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeInfo,
         iH = volShapeInfo[4];
         iW = volShapeInfo[5];
     }
-
     __syncthreads();
 
     const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -127,7 +125,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShapeInfo,
     __shared__ Nd4jLong *sharedMem, volLen;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -144,7 +141,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShapeInfo,
         volLen = shape::length(volShapeInfo);
     }
-
     __syncthreads();
 
     const auto volInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -404,7 +400,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
     __shared__ int bS, iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
 
     if (threadIdx.x == 0) {
-
         bS = shape::sizeAt(xShapeInfo, 0);
         iC = shape::sizeAt(xShapeInfo, 1);
         oH = shape::sizeAt(zShapeInfo, 2);
@@ -428,7 +423,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
         kHEff = kH + (kH-1)*(dH-1);
         kWEff = kW + (kW-1)*(dW-1);
     }
-
     __syncthreads();
 
     int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -501,7 +495,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
     __shared__ bool fOrder;
 
     if (threadIdx.x == 0) {
-
         bS = shape::sizeAt(xShapeInfo, 0);
         iC = shape::sizeAt(xShapeInfo, 1);
         oH = shape::sizeAt(zShapeInfo, 2);
@@ -525,7 +518,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
         kHEff = kH + (kH-1)*(dH-1);
         kWEff = kW + (kW-1)*(dW-1);
     }
-
     __syncthreads();
 
     int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -594,7 +586,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
     __shared__ bool fOrder;
 
     if (threadIdx.x == 0) {
-
         bS = shape::sizeAt(xShapeInfo, 0);
         iC = shape::sizeAt(xShapeInfo, 1);
         oH = shape::sizeAt(zShapeInfo, 2);
@@ -618,7 +609,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
         kHEff = kH + (kH-1)*(dH-1);
         kWEff = kW + (kW-1)*(dW-1);
     }
-
     __syncthreads();
 
     int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -737,7 +727,6 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
         kProd = kD * kH * kW;
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -888,7 +877,6 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo,
         kProd = kH * kW;
     }
-
     __syncthreads();
 
     const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1043,7 +1031,6 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo,
         kProd = kD * kH * kW;
     }
-
     __syncthreads();
 
     const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1356,7 +1343,6 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         rank = 4;
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1425,7 +1411,6 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         rank = 5;
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1499,7 +1484,6 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo,
         factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
         factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1573,7 +1557,6 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo,
         factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
         factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
index 630d39941..e95473739 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
@@ -40,7 +40,6 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         x = reinterpret_cast<const T*>(vx);
         y = reinterpret_cast<const T*>(vy);
         z = reinterpret_cast<T*>(vz);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
index a636af891..de37ab276 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
@@ -62,7 +62,6 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo,
         kH = yShapeInfo[1];
         kW = yShapeInfo[2];
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
index a01b4f555..e31349e31 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
@@ -32,13 +32,8 @@ namespace helpers {
     static __global__ void dropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
         auto tid = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
-        __shared__ T const* input;
-        __shared__ T* output;
-
-        if (threadIdx.x == 0) {
-            input = reinterpret_cast<T const*>(inputBuf);
-            output = reinterpret_cast<T*>(outputBuf);
-        }
+        T const* input = reinterpret_cast<T const*>(inputBuf);
+        T* output = reinterpret_cast<T*>(outputBuf);
 
         for (Nd4jLong e = 0; e < inLen; ++e) {
             T val = nodeRng->relativeT<T>(e, T(0.f), T(1.f));
@@ -134,6 +129,7 @@ namespace helpers {
             output = reinterpret_cast<T*>(outputBuf);
             input = reinterpret_cast<T const*>(gradOutBuf);
         }
+        __syncthreads();
 
         auto tid = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
@@ -159,13 +155,8 @@ namespace helpers {
     static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
         auto tid = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
-        __shared__ T const* input;
-        __shared__ T* output;
-
-        if (threadIdx.x == 0) {
-            input = reinterpret_cast<T const*>(inputBuf);
-            output = reinterpret_cast<T*>(outputBuf);
-        }
+        T const* input = reinterpret_cast<T const*>(inputBuf);
+        T* output = reinterpret_cast<T*>(outputBuf);
 
         for (auto e = tid; e < inLen; e += step) {
             T val = nodeRng->relativeT<T>(e, T(0.f), T(1.f));
@@ -209,7 +200,7 @@ namespace helpers {
         std::vector<Nd4jLong> dims(reduceShape->lengthOf());
         reduceShape->syncToHost(); // to ensure that follows are actual
         bool fit = true;
-//        PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
+
         for( int i = 0; i < dims.size(); i++ ) {
             if (fit) {
                 dims[i] = reduceShape->e<Nd4jLong>(i);
@@ -225,9 +216,9 @@ namespace helpers {
         REQUIRE_TRUE(fit, 0, "alpha_dropout: Noise shape should fit to input rank.");
         std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
         chunk->assign(1.f);
-        //chunk->applyRandom<...>(rng, nullptr, chunk.get(), &probValue);
-        //NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
+        alphaDropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), seed, probValue, alpha, alpha1, beta);
+
         // broadcast chunk to full matrix
         std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
         dropOutMultiplier->assign(1.f);
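dropout.cu is the first of several helpers in this patch (image_suppression.cu, lrn.cu, and the lup.cu invert kernels further down) where a __shared__ pointer plus thread-0 initialization is replaced by a plain local: a reinterpret_cast of a kernel argument is computed identically by every thread, so staging it through shared memory buys nothing and costs a barrier. A before/after sketch, assuming a hypothetical kernel:

    #include <cuda_runtime.h>

    // After the change: each thread casts for itself -- no shared slot,
    // no __syncthreads() needed before first use.
    template <typename T>
    __global__ void scaleSketch(const void* vin, void* vout, T factor, int len) {
        const T* in  = reinterpret_cast<const T*>(vin);   // uniform per-thread local
        T*       out = reinterpret_cast<T*>(vout);

        const int tid  = threadIdx.x + blockIdx.x * blockDim.x;
        const int step = gridDim.x * blockDim.x;
        for (int i = tid; i < len; i += step)
            out[i] = in[i] * factor;
    }

The dropoutBPKernel hunk, by contrast, keeps its shared-pointer block and instead gains the missing __syncthreads() after it.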
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
index aabd9e949..4eb5450a3 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
@@ -76,7 +76,6 @@ __global__ static void gatherCuda(const int numOfSubArrs,
     for (int i = blockIdx.x; i < numOfSubArrs; i += gridDim.x) {
 
         if (threadIdx.x == 0) {
-
             x = reinterpret_cast<const X*>(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo, numOfSubArrs)]];
             z = reinterpret_cast<X*>(vz) + zOffsets[i];
         }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
index 71dc284a6..709f0ed2c 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
@@ -47,7 +47,6 @@ namespace nd4j {
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -61,7 +60,6 @@ namespace nd4j {
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coord = sharedMem + threadIdx.x * maxRank;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu
index 540e2e9ab..ac56b69d5 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu
@@ -61,7 +61,6 @@ namespace nd4j {
         static __global__ void lastStep(Nd4jLong* resultBuf, Nd4jLong* tempBufferA, Nd4jLong* tempResult, Nd4jLong length, Nd4jLong blockSize) {
 
             if (threadIdx.x == 0) {
-
                 if (length <= blockSize)
                     *resultBuf = *tempBufferA;
                 else
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
index 3e8ec6836..f2fb9d94a 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
@@ -44,7 +44,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
     __shared__ int imRank, colRank;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -56,7 +55,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
         iH = imShapeInfo[3];
         iW = imShapeInfo[4];
     }
-
     __syncthreads();
 
     const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
index 0da1fbc28..d96c1efa2 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
@@ -88,12 +88,9 @@ namespace helpers {
     template <typename I>
     static __global__ void copyIndices(void* indices, void* indicesLong, Nd4jLong len) {
-        __shared__ I* indexBuf;
-        __shared__ Nd4jLong* srcBuf;
-        if (threadIdx.x == 0) {
-            indexBuf = reinterpret_cast<I*>(indices);
-            srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);
-        }
+        I* indexBuf = reinterpret_cast<I*>(indices);
+        Nd4jLong* srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);;
+
         auto tid = threadIdx.x + blockIdx.x * blockDim.x;
         auto step = blockDim.x * gridDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu b/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
index f27511b3a..239d280dc 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
@@ -29,11 +29,7 @@ namespace helpers {
     template <typename T>
    static _CUDA_G void lrnKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
        extern __shared__ char sharedChar[];
-       __shared__ T* shared;
-       if (threadIdx.x == 0)
-           shared = reinterpret_cast<T*>(sharedChar);
-       __syncthreads();
-
+       T* shared = reinterpret_cast<T*>(sharedChar);
 
        auto xEws = shape::elementWiseStride(xTadShapeInfo);
        auto zEws = shape::elementWiseStride(zTadShapeInfo);
@@ -69,16 +65,8 @@ namespace helpers {
     template <typename X, typename Z>
    static _CUDA_G void lrnBPKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
        extern __shared__ char sharedChar[];
-       __shared__ X* sharedX;
-       __shared__ Z* sharedY;
-
-       if (threadIdx.x == 0) {
-           sharedX = reinterpret_cast<X*>(sharedChar);
-           sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
-       }
-
-       __syncthreads();
-
+       X* sharedX = reinterpret_cast<X*>(sharedChar);
+       Z* sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
 
        auto xEws = shape::elementWiseStride(xTadShapeInfo);
        auto zEws = shape::elementWiseStride(zTadShapeInfo);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
index ffd652ee7..1bf40ba7b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
@@ -57,14 +57,8 @@ namespace helpers {
//    }
     template <typename T>
     static __global__ void invertKernelLow(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         auto start = threadIdx.x + blockIdx.x * blockDim.x;
         auto step = blockDim.x * gridDim.x;
@@ -84,14 +78,8 @@ namespace helpers {
     template <typename T>
     static __global__ void upvertKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         auto start = threadIdx.x + blockIdx.x * blockDim.x;
         auto step = blockDim.x * gridDim.x;
@@ -107,14 +95,8 @@ namespace helpers {
     template <typename T>
     static __global__ void upvertKernelUp(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         auto start = threadIdx.x + blockIdx.x * blockDim.x;
         auto step = blockDim.x * gridDim.x;
@@ -135,17 +117,8 @@ namespace helpers {
     template <typename T>
     static __global__ void invertLowKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
-
-//        auto start = threadIdx.x + blockIdx.x * blockDim.x;
-//        auto step = blockDim.x * gridDim.x;
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         for (int i = blockIdx.x + 2; i < n; i += gridDim.x) {
             for (int j = i - 2; j >= 0; --j)
@@ -166,17 +139,8 @@ namespace helpers {
     template <typename T>
     static __global__ void invertUpKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
-
-//        auto start = threadIdx.x + blockIdx.x * blockDim.x;
-//        auto step = blockDim.x * gridDim.x;
+        T* inverted = reinterpret_cast<T*>(invertedBuf);;
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         for (int i = n - blockIdx.x - 2; i >= 0; i -= gridDim.x) {
             for (int j = i + 2; j < n; j++)
@@ -366,11 +330,8 @@ namespace helpers {
     template <typename F>
     static __global__ void fillUpPermutation(void* output, Nd4jLong* shape, int* source, int rowNum) {
-        __shared__ F* permutation;
+        F* permutation = reinterpret_cast<F*>(output);
 
-        if (threadIdx.x == 0) {
-            permutation = reinterpret_cast<F*>(output);
-        }
         auto start = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
         for (auto i = start; i < rowNum; i += step) {
@@ -709,13 +670,8 @@ namespace helpers {
     template <typename F>
     __global__ void adjustResultsKernel(F* dArray, Nd4jLong* shape, Nd4jLong* offsets, Nd4jLong batchSize, Nd4jLong n) {
         //auto i = blockIdx.x * blockDim.x + threadIdx.x;
-        __shared__ Nd4jLong* shapeOf;
-        __shared__ Nd4jLong* strideOf;
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
-            shapeOf = shape::shapeOf(shape);
-            strideOf = shape::stride(shape);
-        }
-        __syncthreads();
+        Nd4jLong* shapeOf = shape::shapeOf(shape);
+        Nd4jLong* strideOf = shape::stride(shape);
 
         for (auto i = blockIdx.x; i < batchSize; i+= gridDim.x) {
             auto current = dArray + offsets[i];
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
index 8dcd0683b..95eb5f439 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@@ -37,6 +37,7 @@ namespace helpers {
         outLength = shape::length(outputShape);
         diagonalLen = shape::length(diagonalShape);
     }
+    __syncthreads();
 
     for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
         for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu
index 2e4240057..53b983d09 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu
@@ -46,7 +46,6 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz,
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
         xRank = shape::rank(xShapeInfo);
@@ -54,11 +53,10 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz,
         zLen = shape::length(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
+    __syncthreads();
 
     auto coord = sharedMem + threadIdx.x * zRank;
 
-    __syncthreads();
-
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
 
     for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
index ef74180c8..e19ddcb1b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
@@ -51,7 +51,6 @@ namespace nd4j {
     __shared__ Nd4jLong zLen, yLen, totalThreads, *coords, *xShape, *zShape, *xStride, *zStride, shift1, shift2, yStride0;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         coords = reinterpret_cast<Nd4jLong*>(shmem);
         zLen = shape::length(zShapeInfo);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu
index 3d7a1a6a3..7b325eb3e 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu
@@ -77,6 +77,7 @@ namespace helpers {
         // saving final value
         if (threadIdx.x == 0)
             z[shape::getIndexOffset(t, zShapeInfo, zLength)] = x[shape::getIndexOffset(position, xTadShapeInfo, tadLength)];
+        __syncthreads();
     }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
index 56dc8c558..94d3c02ea 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
@@ -40,8 +40,7 @@ __global__ static void polyGammaCuda(const void *vn, const Nd4jLong *nShapeInfo,
     if (threadIdx.x == 0)
         len = shape::length(nShapeInfo);
-
-        __syncthreads();
+    __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
     const auto totalThreads = gridDim.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu
index 53cfcc22d..b1412343b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu
@@ -93,6 +93,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op,
             if (threadIdx.x == 0)
                 shared[blockDim2 - 1] = (op == scalar::Add) ? 0 : 1;
+            __syncthreads();
 
             for (uint d = 1; d < blockDim2; d *= 2) {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
index 55f635e1b..bcd484fe9 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
@@ -47,7 +47,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -55,7 +54,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coords = sharedMem + threadIdx.x * rank;
@@ -138,7 +136,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -146,7 +143,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coords = sharedMem + threadIdx.x * rank;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
index 776d92c45..43480f75d 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@@ -308,7 +308,6 @@ __global__ static void scatterCuda(const int opCode,
     __shared__ Nd4jLong yLen, totalThreads, *coord;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         coord = reinterpret_cast<Nd4jLong*>(shmem);
         yLen = shape::length(yShapeInfo);
@@ -317,7 +316,6 @@ __global__ static void scatterCuda(const int opCode,
         yRank = shape::rank(yShapeInfo);
         zRank = shape::rank(zShapeInfo);
     }
-
     __syncthreads();
 
     auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@@ -455,12 +453,10 @@ __global__ static void scatterNDLockCuda(const int opCode,
     __shared__ int xLastDim;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         zTadCoords = reinterpret_cast<Nd4jLong*>(shmem);
         xLastDim = xTadShapeInfo[1];    // xTad has rank = 1 always
     }
-
     __syncthreads();
 
     Nd4jLong* zTadCoordsPerThread = zTadCoords + threadIdx.x * xLastDim;
@@ -598,7 +594,6 @@ __global__ static void scatterNDCuda(const int opCode,
     __shared__ Nd4jLong yLen, totalThreads, *coord;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         coord = reinterpret_cast<Nd4jLong*>(shmem);
         yLen = shape::length(yShapeInfo);
@@ -608,7 +603,6 @@ __global__ static void scatterNDCuda(const int opCode,
         zRank = shape::rank(zShapeInfo);
         xLastDim = xShapeInfo[xRank];
     }
-
     __syncthreads();
 
     auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@@ -752,7 +746,6 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
         xLen = shape::length(xShapeInfo);
         xRank = shape::rank(xShapeInfo);
     }
-
     __syncthreads();
 
     const auto xInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu
index 4a64cd4c7..d8b3575ff 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu
@@ -56,7 +56,6 @@ namespace nd4j {
         arrLenX = shape::length(xShapeInfo);
         arrLenY = shape::length(yShapeInfo);
     }
-
     __syncthreads();
 
     if (arrLenX != arrLenY)
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu
index 20796b1d1..180af41e1 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu
@@ -266,6 +266,7 @@ namespace nd4j {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
@@ -311,6 +312,7 @@ namespace nd4j {
         gradLen = shape::length(epsShape);
         currentLen = shape::length(outTad);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
         auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
index c60272188..3f2168da4 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
@@ -248,6 +248,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu
index de602201b..0c67b41d5 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu
@@ -257,6 +257,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
@@ -302,6 +303,7 @@ namespace helpers {
         gradLen = shape::length(epsShape);
         currentLen = shape::length(outTad);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
         auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu
index 7454756b5..78f21916d 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu
@@ -75,7 +75,7 @@ namespace helpers {
         if (threadIdx.x == 0) {
             z[zIndex] = val[segment];
         }
-
+        __syncthreads();
     }
     // -------------------------------------------------------------------------------------------------------------- //
     template
@@ -256,6 +256,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
@@ -298,6 +299,7 @@ namespace helpers {
         gradLen = shape::length(epsShape);
         currentLen = shape::length(outTad);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
         auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu
index 875f63e77..4141cefba 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu
@@ -168,6 +168,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu
index 1d9d983ef..37dacee09 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu
@@ -256,6 +256,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
@@ -292,6 +293,7 @@ namespace helpers {
         gradLen = shape::length(epsShape);
         currentLen = shape::length(outTad);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
         auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu
index 411d7eac1..7318dbaea 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu
@@ -36,6 +36,7 @@ namespace helpers {
         inputLen = shape::length(inputShape);
         outputLen = shape::length(outputShape);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x)
         for(auto k = threadIdx.x; k < inputLen; k += blockDim.x)
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu b/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu
index 6a9fd28e6..3b854415b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu
@@ -375,8 +375,9 @@ namespace nd4j {
             }
 
             if (threadIdx.x == 0) {
                 if (hasError)
-                neu1[0] = DataTypeUtils::infOrMax<T>();
+                    neu1[0] = DataTypeUtils::infOrMax<T>();
             }
+            __syncthreads();
         }
 
         template
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
index 150c616a6..5c00244f8 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
@@ -149,7 +149,6 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
index 5406e8bbd..7e35ec819 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
@@ -135,6 +135,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz,
         if (threadIdx.x == 0)
             z[zOffset] = *sharedMem;
+        __syncthreads();
     }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu
index feb6ce6e5..94464bbbc 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu
@@ -39,8 +39,7 @@ __global__ static void zetaCuda(const void *vx, const Nd4jLong *xShapeInfo,
     if (threadIdx.x == 0)
         len = shape::length(xShapeInfo);
-
-        __syncthreads();
+    __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
     const auto totalThreads = gridDim.x * blockDim.x;
diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
index e0d53a66f..d05fd1682 100644
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
@@ -18208,6 +18208,24 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
     }
 // #endif
 
+// #if NOT_EXCLUDED(OP_space_to_batch_nd)
+        @Namespace("nd4j::ops") public static class space_to_batch_nd extends DeclarableCustomOp {
+            static { Loader.load(); }
+            /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+            public space_to_batch_nd(Pointer p) { super(p); }
+            /** Native array allocator. Access with {@link Pointer#position(long)}. */
+            public space_to_batch_nd(long size) { super((Pointer)null); allocateArray(size); }
+            private native void allocateArray(long size);
+            @Override public space_to_batch_nd position(long position) {
+                return (space_to_batch_nd)super.position(position);
+            }
+
+            public space_to_batch_nd() { super((Pointer)null); allocate(); }
+            private native void allocate();
+            public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
+        }
+// #endif
+
 /**
  *
  *