syncthreads (#136)

Signed-off-by: raver119 <raver119@gmail.com>
2019-08-20 18:28:43 +03:00 · 2019-08-20 18:28:43 +03:00 · 23c8738d4a
commit 23c8738d4a
parent 38310777ee
47 changed files with 71 additions and 170 deletions
--- a/libnd4j/blas/cuda/NDArray.cu
+++ b/libnd4j/blas/cuda/NDArray.cu
@ -78,7 +78,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
        areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
@ -87,7 +86,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
        zLen  = shape::length(zShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
    }
-
    __syncthreads();

    auto coords = sharedMem + threadIdx.x * zRank;
@ -153,14 +151,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
    __shared__ Nd4jLong len, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
        rank = shape::rank(xShapeInfo);
        len  = shape::length(xShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
    }
-
    __syncthreads();

    auto coords = sharedMem + threadIdx.x * rank;
--- a/libnd4j/blas/cuda/NativeOps.cu
+++ b/libnd4j/blas/cuda/NativeOps.cu
@ -1183,7 +1183,6 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx,  void* pxShape
    __shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenZ, arrLenPerBlock, start, end;

    if (threadIdx.x == 0) {
-
        blocksPerArr = (gridDim.x - gridDim.x % numOfArrs) / numOfArrs;     // floor
        arrIdx = blockIdx.x / blocksPerArr;
        if (arrIdx >= numOfArrs)
@ -1200,8 +1199,8 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx,  void* pxShape
        start = arrLenPerBlock * (blockIdx.x % blocksPerArr);
        end   = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
    }
-
    __syncthreads();
+
    for (Nd4jLong i = threadIdx.x + start; i < end; i += blockDim.x) {
        auto zOffset = shape::getIndexOffset(i, zShapeInfo, arrLenZ);
        auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLen);
@ -3165,7 +3164,6 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArr
            arrLenX = shape::length(xShapeInfo);
            arrLenY = shape::length(yShapeInfo);
        }
-
        __syncthreads();

        if (arrLenX != arrLenY)
--- a/libnd4j/include/loops/cuda/broadcasting.chpp
+++ b/libnd4j/include/loops/cuda/broadcasting.chpp
@ -128,7 +128,6 @@ namespace functions {


            if (threadIdx.x == 0) {
-
                tadLength = _length(tadOnlyShapeInfo);
                tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                numTads = _length(yShapeInfo) / tadLength;
@ -194,7 +193,6 @@ namespace functions {
            __shared__ Nd4jLong zEWS;

            if (threadIdx.x == 0) {
-                                
   	            tadLength = _length(tadOnlyShapeInfo);
                tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                numTads = _length(xShapeInfo) / tadLength;
--- a/libnd4j/include/loops/cuda/broadcasting_bool.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu
@ -185,7 +185,6 @@ namespace functions {
            __shared__ Nd4jLong zEWS;
      
            if (threadIdx.x == 0) {
-
   	            tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength);
                tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                numTads = shape::length(xShapeInfo) / tadLength;
--- a/libnd4j/include/loops/cuda/indexreduce.cu
+++ b/libnd4j/include/loops/cuda/indexreduce.cu
@ -231,7 +231,6 @@ namespace functions {

                xLength = shape::length(xShapeInfo);
            }
-
            __syncthreads();

            if (!resultScalar) {
@ -267,6 +266,7 @@ namespace functions {
                        if (threadIdx.x == 0) {
                            result[r] = sPartials[threadIdx.x].index;
                        }
+                        __syncthreads();
                    }
                } else {

@ -287,6 +287,7 @@ namespace functions {
                        if (threadIdx.x == 0) {
                            result[i] = sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
                        }
+                        __syncthreads();
                    }
                }
            } else {
--- a/libnd4j/include/loops/cuda/pairwise.chpp
+++ b/libnd4j/include/loops/cuda/pairwise.chpp
@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
 	__shared__ Nd4jLong len;

 	if (threadIdx.x == 0) {
-
 		xEws = shape::elementWiseStride(xShapeInfo);
 		yEws = shape::elementWiseStride(yShapeInfo);
    	zEws = shape::elementWiseStride(zShapeInfo);
--- a/libnd4j/include/loops/cuda/pairwise_bool.cu
+++ b/libnd4j/include/loops/cuda/pairwise_bool.cu
@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
 	__shared__ Nd4jLong len;

 	if (threadIdx.x == 0) {
-		
 		xEws = shape::elementWiseStride(xShapeInfo);
 		yEws = shape::elementWiseStride(yShapeInfo);
    	zEws = shape::elementWiseStride(zShapeInfo);
--- a/libnd4j/include/loops/cuda/reduce3.chpp
+++ b/libnd4j/include/loops/cuda/reduce3.chpp
@ -125,7 +125,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
 	__shared__ Z* sPartials;

 	if (threadIdx.x == 0) {
-
 		extern __shared__ unsigned char shmem[];
        sPartials = reinterpret_cast<Z*>(shmem);

@ -137,7 +136,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
 		else
 			extraZ[2] = (Z) 0.0f;
 	}
-
    __syncthreads();

 	sPartials[threadIdx.x] = OpType::startingValue(x);
@ -377,7 +375,6 @@ __device__ void Reduce3<X,Z>::transform(void *vx, Nd4jLong *xShapeInfo,
 	__shared__ char yTadOrder;

 	if(threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sPartials = reinterpret_cast<Z*>(shmem);

--- a/libnd4j/include/loops/cuda/summarystatsreduce.cu
+++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu
@ -217,7 +217,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
                        if (threadIdx.x == 0) {
                            z[r] = OpType::getValue(postProcessOrNot, sPartials[threadIdx.x]);
                        }
-
+                        __syncthreads();
                    }
                }
                else {
@ -285,8 +285,8 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
                        SummaryStatsData<X> *pBuffer = (SummaryStatsData<X>*) reductionBuffer;
                        pBuffer[blockIdx.x] = sPartials[0];
                    }
-                    __syncthreads();
                    __threadfence();
+                    __syncthreads();

                    if (tid == 0) {
                        unsigned int ticket = atomicInc(&tc[16384], gridDim.x);
--- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
@ -52,7 +52,6 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
 		xzRank = shape::rank(xShapeInfo);
 		yRank  = shape::rank(yShapeInfo);
 	}
-
 	__syncthreads();

 	const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@ -132,7 +131,6 @@ __global__ linkage void preluBPCuda(const void *vIn,    const Nd4jLong *inShapeI
 		inRank     = shape::rank(inShapeInfo);
 		alphaRank  = shape::rank(alphaShapeInfo);
 	}
-
 	__syncthreads();

 	const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
@ -45,7 +45,6 @@ static void _CUDA_G adjustHueCuda(const void* vx, const Nd4jLong* xShapeInfo, co
        xDimCstride = shape::stride(xShapeInfo)[dimC];
        zDimCstride = shape::stride(zShapeInfo)[dimC];
    }
-
    __syncthreads();

    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
@ -47,7 +47,6 @@ static void _CUDA_G adjustSaturationCuda(const void* vx, const Nd4jLong* xShapeI
        xDimCstride = shape::stride(xShapeInfo)[dimC];
        zDimCstride = shape::stride(zShapeInfo)[dimC];
    }
-
    __syncthreads();

    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
@ -53,7 +53,6 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo,
    __shared__ Nd4jLong minLen, tadLen, totalThreads;

    if (threadIdx.x == 0) {
-
        totalThreads = gridDim.x * blockDim.x;

        minLen = shape::length(meanShapeInfo);
@ -116,7 +115,6 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo


    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem    = reinterpret_cast<Nd4jLong*>(shmem);
        totalThreads = gridDim.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu
@ -38,7 +38,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI
    __shared__ Nd4jLong *sharedMem, imLen;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

@ -53,7 +52,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI

        imLen = shape::length(imShapeInfo);
    }
-
    __syncthreads();

    const auto imInd = threadIdx.x + blockIdx.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@ -50,7 +50,6 @@ __global__ static void concatCuda(void* pVx,  void* pxShapeInfo, void* vz, Nd4jL
        rank = shape::rank(zShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
    }
-
    __syncthreads();

    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@ -43,7 +43,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
    __shared__ Nd4jLong *sharedMem;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

@ -56,7 +55,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
        iH = volShapeInfo[4];
        iW = volShapeInfo[5];
    }
-
    __syncthreads();

    const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -127,7 +125,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape
    __shared__ Nd4jLong *sharedMem, volLen;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

@ -144,7 +141,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape

        volLen = shape::length(volShapeInfo);
    }
-
    __syncthreads();

    const auto volInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -404,7 +400,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
    __shared__ int bS, iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;

    if (threadIdx.x == 0) {
-
        bS = shape::sizeAt(xShapeInfo, 0);
        iC = shape::sizeAt(xShapeInfo, 1);
        oH = shape::sizeAt(zShapeInfo, 2);
@ -428,7 +423,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
        kHEff = kH + (kH-1)*(dH-1);
        kWEff = kW + (kW-1)*(dW-1);
    }
-
    __syncthreads();

    int tid = blockIdx.x * gridDim.x + threadIdx.x;
@ -501,7 +495,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
    __shared__ bool fOrder;

    if (threadIdx.x == 0) {
-
        bS = shape::sizeAt(xShapeInfo, 0);
        iC = shape::sizeAt(xShapeInfo, 1);
        oH = shape::sizeAt(zShapeInfo, 2);
@ -525,7 +518,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
        kHEff = kH + (kH-1)*(dH-1);
        kWEff = kW + (kW-1)*(dW-1);
    }
-
    __syncthreads();

    int tid = blockIdx.x * gridDim.x + threadIdx.x;
@ -594,7 +586,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
    __shared__ bool fOrder;

    if (threadIdx.x == 0) {
-
        bS = shape::sizeAt(xShapeInfo, 0);
        iC = shape::sizeAt(xShapeInfo, 1);
        oH = shape::sizeAt(zShapeInfo, 2);
@ -618,7 +609,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
        kHEff = kH + (kH-1)*(dH-1);
        kWEff = kW + (kW-1)*(dW-1);
    }
-
    __syncthreads();

    int tid = blockIdx.x * gridDim.x + threadIdx.x;
@ -737,7 +727,6 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,

        kProd = kD * kH * kW;
    }
-
    __syncthreads();

    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -888,7 +877,6 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf

        kProd = kH * kW;
    }
-
    __syncthreads();

    const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1043,7 +1031,6 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf

        kProd = kD * kH * kW;
    }
-
    __syncthreads();

    const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1356,7 +1343,6 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
        zLen = shape::length(zShapeInfo);
        rank = 4;
    }
-
    __syncthreads();

    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1425,7 +1411,6 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
        zLen = shape::length(zShapeInfo);
        rank = 5;
    }
-
    __syncthreads();

    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1499,7 +1484,6 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
        factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
        factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
    }
-
    __syncthreads();

    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1573,7 +1557,6 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
        factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
        factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
    }
-
    __syncthreads();

    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
@ -40,7 +40,6 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
    __shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;

    if (threadIdx.x == 0) {
-
    	x = reinterpret_cast<const T*>(vx);
    	y = reinterpret_cast<const T*>(vy);
    	z = reinterpret_cast<T*>(vz);
--- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
@ -62,7 +62,6 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo
        kH = yShapeInfo[1];
        kW = yShapeInfo[2];
    }
-
    __syncthreads();

    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
@ -32,13 +32,8 @@ namespace helpers {
    static __global__ void dropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
        auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = blockDim.x * gridDim.x;
-        __shared__ T const* input;
-        __shared__ T* output;
-
-        if (threadIdx.x == 0) {
-            input = reinterpret_cast<T const*>(inputBuf);
-            output = reinterpret_cast<T*>(outputBuf);
-        }
+        T const* input = reinterpret_cast<T const*>(inputBuf);
+        T* output = reinterpret_cast<T*>(outputBuf);

        for (Nd4jLong e = 0; e < inLen; ++e) {
            T val = nodeRng->relativeT(e, T(0.f), T(1.f));
@ -134,6 +129,7 @@ namespace helpers {
            output = reinterpret_cast<T*>(outputBuf);
            input = reinterpret_cast<T*>(gradOutBuf);
        }
+        __syncthreads();

        auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = blockDim.x * gridDim.x;
@ -159,13 +155,8 @@ namespace helpers {
    static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
        auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = blockDim.x * gridDim.x;
-        __shared__ T const* input;
-        __shared__ T* output;
-
-        if (threadIdx.x == 0) {
-            input = reinterpret_cast<T const*>(inputBuf);
-            output = reinterpret_cast<T*>(outputBuf);
-        }
+        T const* input = reinterpret_cast<T const*>(inputBuf);
+        T* output = reinterpret_cast<T*>(outputBuf);

        for (auto e = tid; e < inLen; e += step) {
            T val = nodeRng->relativeT(e, T(0.f), T(1.f));
@ -209,7 +200,7 @@ namespace helpers {
            std::vector<Nd4jLong> dims(reduceShape->lengthOf());
            reduceShape->syncToHost(); // to ensure that follows are actual
            bool fit = true;
-//            PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
+
            for( int i = 0; i < dims.size(); i++ ) {
                if (fit) {
                    dims[i] = reduceShape->e<Nd4jLong>(i);
@ -225,9 +216,9 @@ namespace helpers {
            REQUIRE_TRUE(fit, 0, "alpha_dropout: Noise shape should fit to input rank.");
            std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
            chunk->assign(1.f);
-            //chunk->applyRandom<randomOps::DropOutInverted<T>>(rng, nullptr, chunk.get(), &probValue);
-            //NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
+
            alphaDropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), seed, probValue, alpha, alpha1, beta);
+
            // broadcast chunk to full matrix
            std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
            dropOutMultiplier->assign(1.f);
--- a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
@ -76,7 +76,6 @@ __global__ static void gatherCuda(const int numOfSubArrs,
    for (int i = blockIdx.x; i < numOfSubArrs; i += gridDim.x) {

        if (threadIdx.x == 0) {
-
            x = reinterpret_cast<const X*>(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo, numOfSubArrs)]];
            z = reinterpret_cast<X*>(vz) + zOffsets[i];
        }
--- a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
@ -47,7 +47,6 @@ namespace nd4j {
                __shared__ Nd4jLong zLen, totalThreads, *sharedMem;

                if (threadIdx.x == 0) {
-
                    extern __shared__ unsigned char shmem[];
                    sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

@ -61,7 +60,6 @@ namespace nd4j {

                    totalThreads = gridDim.x * blockDim.x;
                }
-
                __syncthreads();

                auto coord = sharedMem + threadIdx.x * maxRank;
--- a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu
@ -61,7 +61,6 @@ namespace nd4j {

            static __global__ void lastStep(Nd4jLong* resultBuf, Nd4jLong* tempBufferA, Nd4jLong* tempResult, Nd4jLong length, Nd4jLong blockSize) {
                if (threadIdx.x == 0) {
-
                    if (length <= blockSize)
                        *resultBuf = *tempBufferA;
                    else
--- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
@ -44,7 +44,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
    __shared__ int imRank, colRank;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

@ -56,7 +55,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
        iH = imShapeInfo[3];
        iW = imShapeInfo[4];
    }
-
    __syncthreads();

    const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
@ -88,12 +88,9 @@ namespace helpers {

    template <typename I>
    static __global__ void copyIndices(void* indices,  void* indicesLong, Nd4jLong len) {
-        __shared__ I* indexBuf;
-        __shared__ Nd4jLong* srcBuf;
-        if (threadIdx.x == 0) {
-            indexBuf = reinterpret_cast<I*>(indices);
-            srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);
-        }
+        I* indexBuf = reinterpret_cast<I*>(indices);
+        Nd4jLong* srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);;
+
        auto tid = threadIdx.x + blockIdx.x * blockDim.x;
        auto step = blockDim.x * gridDim.x;

--- a/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
@ -29,11 +29,7 @@ namespace helpers {
    template <typename T>
    static _CUDA_G void lrnKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
        extern __shared__ char sharedChar[];
-        __shared__ T* shared;
-        if (threadIdx.x == 0)
-            shared = reinterpret_cast<T*>(sharedChar);
-        __syncthreads();
-
+        T* shared = reinterpret_cast<T*>(sharedChar);

        auto xEws = shape::elementWiseStride(xTadShapeInfo);
        auto zEws = shape::elementWiseStride(zTadShapeInfo);
@ -69,16 +65,8 @@ namespace helpers {
    template <typename X, typename Z>
    static _CUDA_G void lrnBPKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
        extern __shared__ char sharedChar[];
-        __shared__ X* sharedX;
-        __shared__ Z* sharedY;
-
-        if (threadIdx.x == 0) {
-            sharedX = reinterpret_cast<X*>(sharedChar);
-            sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
-        }
-
-        __syncthreads();
-
+        X* sharedX = reinterpret_cast<X*>(sharedChar);
+        Z* sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);

        auto xEws = shape::elementWiseStride(xTadShapeInfo);
        auto zEws = shape::elementWiseStride(zTadShapeInfo);
--- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
@ -57,14 +57,8 @@ namespace helpers {
 //    }
    template <typename T>
    static __global__ void invertKernelLow(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);

        auto start = threadIdx.x + blockIdx.x * blockDim.x;
        auto step = blockDim.x * gridDim.x;
@ -84,14 +78,8 @@ namespace helpers {

    template <typename T>
    static __global__ void upvertKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);

        auto start = threadIdx.x + blockIdx.x * blockDim.x;
        auto step = blockDim.x * gridDim.x;
@ -107,14 +95,8 @@ namespace helpers {

    template <typename T>
    static __global__ void upvertKernelUp(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);

        auto start = threadIdx.x + blockIdx.x * blockDim.x;
        auto step = blockDim.x * gridDim.x;
@ -135,17 +117,8 @@ namespace helpers {

    template <typename T>
    static __global__ void invertLowKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
-
-//        auto start = threadIdx.x + blockIdx.x * blockDim.x;
-//        auto step = blockDim.x * gridDim.x;
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);

        for (int i = blockIdx.x + 2; i < n; i += gridDim.x) {
            for (int j = i - 2; j >= 0; --j)
@ -166,17 +139,8 @@ namespace helpers {

    template <typename T>
    static __global__ void invertUpKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
-
-//        auto start = threadIdx.x + blockIdx.x * blockDim.x;
-//        auto step = blockDim.x * gridDim.x;
+        T* inverted = reinterpret_cast<T*>(invertedBuf);;
+        T* input = reinterpret_cast<T*>(inputBuf);

        for (int i = n - blockIdx.x - 2; i >= 0; i -= gridDim.x) {
            for (int j = i + 2; j < n; j++)
@ -366,11 +330,8 @@ namespace helpers {

    template <typename F>
    static __global__ void fillUpPermutation(void* output, Nd4jLong* shape, int* source, int rowNum) {
-        __shared__ F* permutation;
+        F* permutation = reinterpret_cast<F*>(output);

-        if (threadIdx.x == 0) {
-            permutation = reinterpret_cast<F*>(output);
-        }
        auto start = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = blockDim.x * gridDim.x;
        for (auto i = start; i < rowNum; i += step) {
@ -709,13 +670,8 @@ namespace helpers {
    template <typename F>
    __global__ void adjustResultsKernel(F* dArray, Nd4jLong* shape, Nd4jLong* offsets, Nd4jLong batchSize, Nd4jLong n) {
        //auto i = blockIdx.x * blockDim.x + threadIdx.x;
-        __shared__ Nd4jLong* shapeOf;
-        __shared__ Nd4jLong* strideOf;
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
-            shapeOf = shape::shapeOf(shape);
-            strideOf = shape::stride(shape);
-        }
-        __syncthreads();
+        Nd4jLong* shapeOf = shape::shapeOf(shape);
+        Nd4jLong* strideOf = shape::stride(shape);

        for (auto i = blockIdx.x; i < batchSize; i+= gridDim.x) {
            auto current = dArray + offsets[i];
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@ -37,6 +37,7 @@ namespace helpers {
            outLength = shape::length(outputShape);
            diagonalLen = shape::length(diagonalShape);
        }
+        __syncthreads();

        for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
            for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
--- a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu
@ -46,7 +46,6 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo
    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
        xRank = shape::rank(xShapeInfo);
@ -54,11 +53,10 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo
        zLen  = shape::length(zShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
    }
+    __syncthreads();

    auto coord = sharedMem + threadIdx.x * zRank;

-    __syncthreads();
-
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
--- a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
@ -51,7 +51,6 @@ namespace nd4j {
                __shared__ Nd4jLong zLen, yLen, totalThreads, *coords, *xShape, *zShape, *xStride, *zStride, shift1, shift2, yStride0;

                if (threadIdx.x == 0) {
-
                    extern __shared__ unsigned char shmem[];
                    coords    = reinterpret_cast<Nd4jLong*>(shmem);
                    zLen     = shape::length(zShapeInfo);
--- a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu
@ -77,6 +77,7 @@ namespace helpers {
            // saving final value
            if (threadIdx.x == 0)
                z[shape::getIndexOffset(t, zShapeInfo, zLength)] = x[shape::getIndexOffset(position, xTadShapeInfo, tadLength)];
+            __syncthreads();
        }
    }

--- a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
@ -40,8 +40,7 @@ __global__ static void polyGammaCuda(const void *vn, const Nd4jLong *nShapeInfo,
    
    if (threadIdx.x == 0)         
        len = shape::length(nShapeInfo);    
-
-    __syncthreads();    
+    __syncthreads();

    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    const auto totalThreads = gridDim.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu
@ -93,6 +93,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op,

        if (threadIdx.x == 0)
            shared[blockDim2 - 1] = (op == scalar::Add) ? 0 : 1;
+        __syncthreads();

        for (uint d = 1; d < blockDim2; d *= 2) {

--- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
@ -47,7 +47,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

@ -55,7 +54,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
        zLen  = shape::length(zShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
    }
-
    __syncthreads();

    auto coords = sharedMem + threadIdx.x * rank;
@ -138,7 +136,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

@ -146,7 +143,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
        zLen  = shape::length(zShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
    }
-
    __syncthreads();

    auto coords = sharedMem + threadIdx.x * rank;
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@ -308,7 +308,6 @@ __global__ static void scatterCuda(const int opCode,
    __shared__ Nd4jLong yLen, totalThreads, *coord;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        coord = reinterpret_cast<Nd4jLong*>(shmem);
        yLen = shape::length(yShapeInfo);
@ -317,7 +316,6 @@ __global__ static void scatterCuda(const int opCode,
        yRank = shape::rank(yShapeInfo);
        zRank = shape::rank(zShapeInfo);
    }
-
    __syncthreads();

    auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@ -455,12 +453,10 @@ __global__ static void scatterNDLockCuda(const int opCode,
    __shared__ int xLastDim;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        zTadCoords = reinterpret_cast<Nd4jLong*>(shmem);
        xLastDim = xTadShapeInfo[1];   // xTad has rank = 1 always
    }
-
    __syncthreads();

    Nd4jLong* zTadCoordsPerThread = zTadCoords + threadIdx.x * xLastDim;
@ -598,7 +594,6 @@ __global__ static void scatterNDCuda(const int opCode,
    __shared__ Nd4jLong yLen, totalThreads, *coord;

    if (threadIdx.x == 0) {
-
        extern __shared__ unsigned char shmem[];
        coord = reinterpret_cast<Nd4jLong*>(shmem);
        yLen = shape::length(yShapeInfo);
@ -608,7 +603,6 @@ __global__ static void scatterNDCuda(const int opCode,
        zRank = shape::rank(zShapeInfo);
        xLastDim = xShapeInfo[xRank];
    }
-
    __syncthreads();

    auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@ -752,7 +746,6 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
        xLen  = shape::length(xShapeInfo);
        xRank = shape::rank(xShapeInfo);
    }
-
    __syncthreads();

    const auto xInd = threadIdx.x + blockIdx.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu
@ -56,7 +56,6 @@ namespace nd4j {
                        arrLenX = shape::length(xShapeInfo);
                        arrLenY = shape::length(yShapeInfo);
                    }
-
                    __syncthreads();

                    if (arrLenX != arrLenY)
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu
@ -266,6 +266,7 @@ namespace nd4j {
                    gradOut = reinterpret_cast<T*>(eps);
                    gradLen = shape::length(epsShape);
                }
+                __syncthreads();

                auto start = blockIdx.x * blockDim.x + threadIdx.x;
                auto step = gridDim.x * blockDim.x;
@ -311,6 +312,7 @@ namespace nd4j {
                    gradLen = shape::length(epsShape);
                    currentLen = shape::length(outTad);
                }
+                __syncthreads();

                for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
                    auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
@ -248,6 +248,7 @@ namespace helpers {
            gradOut = reinterpret_cast<T*>(eps);
            gradLen = shape::length(epsShape);
        }
+        __syncthreads();

        auto start = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = gridDim.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu
@ -257,6 +257,7 @@ namespace helpers {
            gradOut = reinterpret_cast<T*>(eps);
            gradLen = shape::length(epsShape);
        }
+        __syncthreads();

        auto start = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = gridDim.x * blockDim.x;
@ -302,6 +303,7 @@ namespace helpers {
            gradLen = shape::length(epsShape);
            currentLen = shape::length(outTad);
        }
+        __syncthreads();

        for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
            auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu
@ -75,7 +75,7 @@ namespace helpers {
        if (threadIdx.x == 0) {
            z[zIndex] = val[segment];
        }
-
+        __syncthreads();
    }
    // -------------------------------------------------------------------------------------------------------------- //
    template <typename T, typename I>
@ -256,6 +256,7 @@ namespace helpers {
            gradOut = reinterpret_cast<T*>(eps);
            gradLen = shape::length(epsShape);
        }
+        __syncthreads();

        auto start = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = gridDim.x * blockDim.x;
@ -298,6 +299,7 @@ namespace helpers {
            gradLen = shape::length(epsShape);
            currentLen = shape::length(outTad);
        }
+        __syncthreads();

        for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
            auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu
@ -168,6 +168,7 @@ namespace helpers {
            gradOut = reinterpret_cast<T*>(eps);
            gradLen = shape::length(epsShape);
        }
+        __syncthreads();

        auto start = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = gridDim.x * blockDim.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu
@ -256,6 +256,7 @@ namespace helpers {
            gradOut = reinterpret_cast<T*>(eps);
            gradLen = shape::length(epsShape);
        }
+        __syncthreads();

        auto start = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = gridDim.x * blockDim.x;
@ -292,6 +293,7 @@ namespace helpers {
            gradLen = shape::length(epsShape);
            currentLen = shape::length(outTad);
        }
+        __syncthreads();

        for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
            auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
--- a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu
@ -36,6 +36,7 @@ namespace helpers {
            inputLen = shape::length(inputShape);
            outputLen = shape::length(outputShape);
        }
+        __syncthreads();

        for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x)
            for(auto k = threadIdx.x; k < inputLen; k += blockDim.x)
--- a/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu
@ -375,8 +375,9 @@ namespace nd4j {
                }
                if (threadIdx.x == 0) {
                    if (hasError)
-                    neu1[0] = DataTypeUtils::infOrMax<T>();
+                        neu1[0] = DataTypeUtils::infOrMax<T>();
                }
+                __syncthreads();
            }

            template <typename T>
--- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
@ -149,7 +149,6 @@ __global__ static void sruBICuda(const void* vx,    const Nd4jLong* xShapeInfo,

        totalThreads = gridDim.x * blockDim.x;
    }
-
    __syncthreads();

    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
@ -135,6 +135,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi

        if (threadIdx.x == 0)
            z[zOffset] = *sharedMem;
+        __syncthreads();
    }

 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu
@ -39,8 +39,7 @@ __global__ static void zetaCuda(const void *vx, const Nd4jLong *xShapeInfo,
    
    if (threadIdx.x == 0)         
        len = shape::length(xShapeInfo);    
-
-    __syncthreads();    
+    __syncthreads();

    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    const auto totalThreads = gridDim.x * blockDim.x;
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
@ -18208,6 +18208,24 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
                                                                                }
 //         #endif

+//         #if NOT_EXCLUDED(OP_space_to_batch_nd)
+        @Namespace("nd4j::ops") public static class space_to_batch_nd extends DeclarableCustomOp {
+            static { Loader.load(); }
+            /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+            public space_to_batch_nd(Pointer p) { super(p); }
+            /** Native array allocator. Access with {@link Pointer#position(long)}. */
+            public space_to_batch_nd(long size) { super((Pointer)null); allocateArray(size); }
+            private native void allocateArray(long size);
+            @Override public space_to_batch_nd position(long position) {
+                return (space_to_batch_nd)super.position(position);
+            }
+        
+                                                                                    public space_to_batch_nd() { super((Pointer)null); allocate(); }
+                                                                                    private native void allocate();
+                                                                                    public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
+                                                                                }
+//         #endif
+
        /**
         *
         *