diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/blas/cuda/NDArray.cu
index 126837ad9..60498aaf7 100644
--- a/libnd4j/blas/cuda/NDArray.cu
+++ b/libnd4j/blas/cuda/NDArray.cu
@@ -78,7 +78,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
         areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
@@ -87,7 +86,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coords = sharedMem + threadIdx.x * zRank;
@@ -153,14 +151,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong len, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
         rank = shape::rank(xShapeInfo);
         len = shape::length(xShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coords = sharedMem + threadIdx.x * rank;
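The two NDArray.cu kernels above, and most of the helper kernels below, share one idiom: thread 0 fills shared variables from the shape info, and every thread must pass a __syncthreads() barrier before reading them. A minimal sketch of that idiom, assuming hypothetical names (this is not code from the patch):

    // nvcc shared_init_sketch.cu
    #include <cuda_runtime.h>

    // Thread 0 publishes block-wide state through shared memory; the barrier
    // makes the write visible to the rest of the block before any read.
    __global__ void sharedInitSketch(const long long* shapeInfo, int* out, int n) {
        __shared__ long long len;                  // written only by thread 0

        if (threadIdx.x == 0)
            len = shapeInfo[0];                    // stands in for shape::length(...)
        __syncthreads();                           // without it, reads of 'len' race

        const int tid  = threadIdx.x + blockIdx.x * blockDim.x;
        const int step = gridDim.x * blockDim.x;
        for (int i = tid; i < n && i < (int)len; i += step)
            out[i] = i;
    }

The hunks above only delete stray blank lines around that idiom; the barrier itself stays in place.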
diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu
index b85ce8760..11e2c0188 100755
--- a/libnd4j/blas/cuda/NativeOps.cu
+++ b/libnd4j/blas/cuda/NativeOps.cu
@@ -1183,7 +1183,6 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShapeInfo,
     __shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenZ, arrLenPerBlock, start, end;
 
     if (threadIdx.x == 0) {
-
         blocksPerArr = (gridDim.x - gridDim.x % numOfArrs) / numOfArrs;     // floor
         arrIdx = blockIdx.x / blocksPerArr;
         if (arrIdx >= numOfArrs)
@@ -1200,8 +1199,8 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShapeInfo,
         start = arrLenPerBlock * (blockIdx.x % blocksPerArr);
         end = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
     }
-
     __syncthreads();
+
     for (Nd4jLong i = threadIdx.x + start; i < end; i += blockDim.x) {
         auto zOffset = shape::getIndexOffset(i, zShapeInfo, arrLenZ);
         auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLen);
@@ -3165,7 +3164,6 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArrs,
         arrLenX = shape::length(xShapeInfo);
         arrLenY = shape::length(yShapeInfo);
     }
-
     __syncthreads();
 
     if (arrLenX != arrLenY)
diff --git a/libnd4j/include/loops/cuda/broadcasting.chpp b/libnd4j/include/loops/cuda/broadcasting.chpp
index dc8a3eeb1..d930d8cad 100644
--- a/libnd4j/include/loops/cuda/broadcasting.chpp
+++ b/libnd4j/include/loops/cuda/broadcasting.chpp
@@ -128,7 +128,6 @@ namespace functions {
 
             if (threadIdx.x == 0) {
-
                 tadLength = _length(tadOnlyShapeInfo);
                 tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                 numTads = _length(yShapeInfo) / tadLength;
@@ -194,7 +193,6 @@ namespace functions {
             __shared__ Nd4jLong zEWS;
 
             if (threadIdx.x == 0) {
-
                 tadLength = _length(tadOnlyShapeInfo);
                 tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                 numTads = _length(xShapeInfo) / tadLength;
diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu
index 6cc3f3cbb..8981790f5 100644
--- a/libnd4j/include/loops/cuda/broadcasting_bool.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu
@@ -185,7 +185,6 @@ namespace functions {
             __shared__ Nd4jLong zEWS;
 
             if (threadIdx.x == 0) {
-
                 tadLength = shape::length(tadOnlyShapeInfo);  //shape::tadLength(xShapeInfo, dimension, dimensionLength);
                 tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
                 numTads = shape::length(xShapeInfo) / tadLength;
diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu
index 94793f8e8..18e5b1432 100644
--- a/libnd4j/include/loops/cuda/indexreduce.cu
+++ b/libnd4j/include/loops/cuda/indexreduce.cu
@@ -231,7 +231,6 @@ namespace functions {
             xLength = shape::length(xShapeInfo);
         }
-
         __syncthreads();
 
         if (!resultScalar) {
@@ -267,6 +266,7 @@ namespace functions {
                 if (threadIdx.x == 0) {
                     result[r] = sPartials[threadIdx.x].index;
                 }
+                __syncthreads();
             }
         }
         else {
@@ -287,6 +287,7 @@ namespace functions {
                 if (threadIdx.x == 0) {
                     result[i] = sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
                 }
+                __syncthreads();
             }
         }
     } else {
diff --git a/libnd4j/include/loops/cuda/pairwise.chpp b/libnd4j/include/loops/cuda/pairwise.chpp
index a7e6f32cc..3f7134887 100644
--- a/libnd4j/include/loops/cuda/pairwise.chpp
+++ b/libnd4j/include/loops/cuda/pairwise.chpp
@@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
     __shared__ Nd4jLong len;
 
     if (threadIdx.x == 0) {
-
         xEws = shape::elementWiseStride(xShapeInfo);
         yEws = shape::elementWiseStride(yShapeInfo);
         zEws = shape::elementWiseStride(zShapeInfo);
diff --git a/libnd4j/include/loops/cuda/pairwise_bool.cu b/libnd4j/include/loops/cuda/pairwise_bool.cu
index 0834386f2..62f040191 100644
--- a/libnd4j/include/loops/cuda/pairwise_bool.cu
+++ b/libnd4j/include/loops/cuda/pairwise_bool.cu
@@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
     __shared__ Nd4jLong len;
 
     if (threadIdx.x == 0) {
-
         xEws = shape::elementWiseStride(xShapeInfo);
         yEws = shape::elementWiseStride(yShapeInfo);
         zEws = shape::elementWiseStride(zShapeInfo);
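The indexreduce.cu hunks add the other barrier this patch is about: when the shared partials buffer is reused across iterations of the outer TAD loop, the thread-0 write-back must be followed by __syncthreads(), otherwise fast threads can start overwriting the partials for iteration r+1 while thread 0 is still reading iteration r. A sketch of the fixed shape, with a hypothetical sum kernel standing in for the index reduction:

    #include <cuda_runtime.h>

    // One block reduces several rows; sPartials is reused every iteration.
    __global__ void tadReduceSketch(const float* x, float* result, int numTads, int tadLength) {
        extern __shared__ float sPartials[];

        for (int r = blockIdx.x; r < numTads; r += gridDim.x) {
            float acc = 0.f;
            for (int e = threadIdx.x; e < tadLength; e += blockDim.x)
                acc += x[r * tadLength + e];
            sPartials[threadIdx.x] = acc;
            __syncthreads();

            for (int s = blockDim.x / 2; s > 0; s >>= 1) {   // assumes power-of-two blockDim
                if (threadIdx.x < s)
                    sPartials[threadIdx.x] += sPartials[threadIdx.x + s];
                __syncthreads();
            }

            if (threadIdx.x == 0)
                result[r] = sPartials[0];
            __syncthreads();   // the added barrier: protects sPartials for the next iteration
        }
    }

The same trailing barrier is what the segment_*.cu and summarystatsreduce.cu hunks below add after their thread-0 write-backs.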
diff --git a/libnd4j/include/loops/cuda/reduce3.chpp b/libnd4j/include/loops/cuda/reduce3.chpp
index 819c215fc..01b595da1 100644
--- a/libnd4j/include/loops/cuda/reduce3.chpp
+++ b/libnd4j/include/loops/cuda/reduce3.chpp
@@ -125,7 +125,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
     __shared__ Z* sPartials;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sPartials = reinterpret_cast<Z*>(shmem);
@@ -137,7 +136,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
         else
             extraZ[2] = (Z) 0.0f;
     }
-
     __syncthreads();
 
     sPartials[threadIdx.x] = OpType::startingValue(x);
@@ -377,7 +375,6 @@ __device__ void Reduce3<X,Z>::transform(void *vx, Nd4jLong *xShapeInfo,
     __shared__ char yTadOrder;
 
     if(threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sPartials = reinterpret_cast<Z*>(shmem);
diff --git a/libnd4j/include/loops/cuda/summarystatsreduce.cu b/libnd4j/include/loops/cuda/summarystatsreduce.cu
index 1e2f3ce4f..deca80217 100644
--- a/libnd4j/include/loops/cuda/summarystatsreduce.cu
+++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu
@@ -217,7 +217,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRank,
                 if (threadIdx.x == 0) {
                     z[r] = OpType::getValue(postProcessOrNot, sPartials[threadIdx.x]);
                 }
-
+                __syncthreads();
             }
         }
         else {
@@ -285,8 +285,8 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRank,
                     SummaryStatsData<X> *pBuffer = (SummaryStatsData<X>*) reductionBuffer;
                     pBuffer[blockIdx.x] = sPartials[0];
                 }
-                __syncthreads();
                 __threadfence();
+                __syncthreads();
 
                 if (tid == 0) {
                     unsigned int ticket = atomicInc(&tc[16384], gridDim.x);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
index 1397874f8..80e5d885b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
@@ -52,7 +52,6 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
         xzRank = shape::rank(xShapeInfo);
         yRank = shape::rank(yShapeInfo);
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -132,7 +131,6 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeInfo,
         inRank = shape::rank(inShapeInfo);
         alphaRank = shape::rank(alphaShapeInfo);
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
index e8062e126..c27c9fb8a 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
@@ -45,7 +45,6 @@ static void _CUDA_G adjustHueCuda(const void* vx, const Nd4jLong* xShapeInfo,
         xDimCstride = shape::stride(xShapeInfo)[dimC];
         zDimCstride = shape::stride(zShapeInfo)[dimC];
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
index 4ab8da304..a1dc4189a 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
@@ -47,7 +47,6 @@ static void _CUDA_G adjustSaturationCuda(const void* vx, const Nd4jLong* xShapeInfo,
         xDimCstride = shape::stride(xShapeInfo)[dimC];
         zDimCstride = shape::stride(zShapeInfo)[dimC];
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
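summarystatsreduce.cu also reorders __threadfence()/__syncthreads() in the cross-block hand-off: the fence must come first so this block's partial result is visible grid-wide before any of its threads reaches the atomic ticket that elects the last block. A simplified sketch of that protocol (illustrative names; the real kernel reduces SummaryStatsData values, not floats):

    #include <cuda_runtime.h>

    __device__ unsigned int ticketCounter = 0;     // zeroed before launch

    __global__ void lastBlockSketch(float* partials, float* out) {
        __shared__ bool amLast;

        if (threadIdx.x == 0)
            partials[blockIdx.x] = 1.f;            // this block's partial result
        __threadfence();                           // publish it grid-wide first...
        __syncthreads();                           // ...then release the block

        if (threadIdx.x == 0) {
            unsigned int ticket = atomicInc(&ticketCounter, gridDim.x);
            amLast = (ticket == gridDim.x - 1);    // last arrival sees all partials
        }
        __syncthreads();

        if (amLast && threadIdx.x == 0) {
            float sum = 0.f;
            for (unsigned int i = 0; i < gridDim.x; ++i)
                sum += partials[i];
            *out = sum;
        }
    }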
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
index 7678779ac..6c3dedd20 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
@@ -53,7 +53,6 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong minLen, tadLen, totalThreads;
 
     if (threadIdx.x == 0) {
-
         totalThreads = gridDim.x * blockDim.x;
 
         minLen = shape::length(meanShapeInfo);
@@ -116,7 +115,6 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo,
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
         totalThreads = gridDim.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu
index e02bce146..8a21f03a5 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu
@@ -38,7 +38,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeInfo,
     __shared__ Nd4jLong *sharedMem, imLen;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -53,7 +52,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeInfo,
         imLen = shape::length(imShapeInfo);
     }
-
     __syncthreads();
 
     const auto imInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
index 94675c587..d372f05c8 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@@ -50,7 +50,6 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jLong* zShapeInfo,
         rank = shape::rank(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
index 44a0156d7..a37078ad9 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@@ -43,7 +43,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeInfo,
     __shared__ Nd4jLong *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -56,7 +55,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeInfo,
         iH = volShapeInfo[4];
         iW = volShapeInfo[5];
     }
-
     __syncthreads();
 
     const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -127,7 +125,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShapeInfo,
     __shared__ Nd4jLong *sharedMem, volLen;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -144,7 +141,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShapeInfo,
         volLen = shape::length(volShapeInfo);
     }
-
     __syncthreads();
 
     const auto volInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -404,7 +400,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
     __shared__ int bS, iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
 
     if (threadIdx.x == 0) {
-
         bS = shape::sizeAt(xShapeInfo, 0);
         iC = shape::sizeAt(xShapeInfo, 1);
         oH = shape::sizeAt(zShapeInfo, 2);
@@ -428,7 +423,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
         kHEff = kH + (kH-1)*(dH-1);
         kWEff = kW + (kW-1)*(dW-1);
     }
-
     __syncthreads();
 
     int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -501,7 +495,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
     __shared__ bool fOrder;
 
     if (threadIdx.x == 0) {
-
         bS = shape::sizeAt(xShapeInfo, 0);
         iC = shape::sizeAt(xShapeInfo, 1);
         oH = shape::sizeAt(zShapeInfo, 2);
@@ -525,7 +518,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
         kHEff = kH + (kH-1)*(dH-1);
         kWEff = kW + (kW-1)*(dW-1);
     }
-
     __syncthreads();
 
     int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -594,7 +586,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
     __shared__ bool fOrder;
 
     if (threadIdx.x == 0) {
-
         bS = shape::sizeAt(xShapeInfo, 0);
         iC = shape::sizeAt(xShapeInfo, 1);
         oH = shape::sizeAt(zShapeInfo, 2);
@@ -618,7 +609,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo,
         kHEff = kH + (kH-1)*(dH-1);
         kWEff = kW + (kW-1)*(dW-1);
     }
-
     __syncthreads();
 
     int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -737,7 +727,6 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
         kProd = kD * kH * kW;
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -888,7 +877,6 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo,
         kProd = kH * kW;
     }
-
     __syncthreads();
 
     const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1043,7 +1031,6 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo,
         kProd = kD * kH * kW;
     }
-
     __syncthreads();
 
     const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1356,7 +1343,6 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         rank = 4;
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1425,7 +1411,6 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         rank = 5;
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1499,7 +1484,6 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo,
         factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
         factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1573,7 +1557,6 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo,
         factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
         factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
index 630d39941..e95473739 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
@@ -40,7 +40,6 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         x = reinterpret_cast<const T*>(vx);
         y = reinterpret_cast<const T*>(vy);
         z = reinterpret_cast<T*>(vz);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
index a636af891..de37ab276 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
@@ -62,7 +62,6 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo,
         kH = yShapeInfo[1];
         kW = yShapeInfo[2];
     }
-
     __syncthreads();
 
     const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
index a01b4f555..e31349e31 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
@@ -32,13 +32,8 @@ namespace helpers {
     static __global__ void dropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
         auto tid = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
-        __shared__ T const* input;
-        __shared__ T* output;
-
-        if (threadIdx.x == 0) {
-            input = reinterpret_cast<T const*>(inputBuf);
-            output = reinterpret_cast<T*>(outputBuf);
-        }
+        T const* input = reinterpret_cast<T const*>(inputBuf);
+        T* output = reinterpret_cast<T*>(outputBuf);
 
         for (Nd4jLong e = 0; e < inLen; ++e) {
             T val = nodeRng->relativeT<T>(e, T(0.f), T(1.f));
@@ -134,6 +129,7 @@ namespace helpers {
             output = reinterpret_cast<T*>(outputBuf);
             input = reinterpret_cast<T const*>(gradOutBuf);
         }
+        __syncthreads();
 
         auto tid = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
@@ -159,13 +155,8 @@ namespace helpers {
     static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
         auto tid = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
-        __shared__ T const* input;
-        __shared__ T* output;
-
-        if (threadIdx.x == 0) {
-            input = reinterpret_cast<T const*>(inputBuf);
-            output = reinterpret_cast<T*>(outputBuf);
-        }
+        T const* input = reinterpret_cast<T const*>(inputBuf);
+        T* output = reinterpret_cast<T*>(outputBuf);
 
         for (auto e = tid; e < inLen; e += step) {
             T val = nodeRng->relativeT<T>(e, T(0.f), T(1.f));
@@ -209,7 +200,7 @@ namespace helpers {
         std::vector<Nd4jLong> dims(reduceShape->lengthOf());
         reduceShape->syncToHost(); // to ensure that follows are actual
         bool fit = true;
-//        PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
+
         for( int i = 0; i < dims.size(); i++ ) {
             if (fit) {
                 dims[i] = reduceShape->e<Nd4jLong>(i);
@@ -225,9 +216,9 @@ namespace helpers {
         REQUIRE_TRUE(fit, 0, "alpha_dropout: Noise shape should fit to input rank.");
         std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
         chunk->assign(1.f);
-        //chunk->applyRandom<...>(rng, nullptr, chunk.get(), &probValue);
-        //NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
+        alphaDropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), seed, probValue, alpha, alpha1, beta);
+
         // broadcast chunk to full matrix
         std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
         dropOutMultiplier->assign(1.f);
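dropout.cu is the first of several helpers in this patch (image_suppression.cu, lrn.cu, and the lup.cu invert kernels further down) where a __shared__ pointer plus thread-0 initialization is replaced by a plain local: a reinterpret_cast of a kernel argument is computed identically by every thread, so staging it through shared memory buys nothing and costs a barrier. A before/after sketch, assuming a hypothetical kernel:

    #include <cuda_runtime.h>

    // After the change: each thread casts for itself -- no shared slot,
    // no __syncthreads() needed before first use.
    template <typename T>
    __global__ void scaleSketch(const void* vin, void* vout, T factor, int len) {
        const T* in  = reinterpret_cast<const T*>(vin);   // uniform per-thread local
        T*       out = reinterpret_cast<T*>(vout);

        const int tid  = threadIdx.x + blockIdx.x * blockDim.x;
        const int step = gridDim.x * blockDim.x;
        for (int i = tid; i < len; i += step)
            out[i] = in[i] * factor;
    }

The dropoutBPKernel hunk, by contrast, keeps its shared-pointer block and instead gains the missing __syncthreads() after it.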
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
index aabd9e949..4eb5450a3 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
@@ -76,7 +76,6 @@ __global__ static void gatherCuda(const int numOfSubArrs,
     for (int i = blockIdx.x; i < numOfSubArrs; i += gridDim.x) {
 
         if (threadIdx.x == 0) {
-
             x = reinterpret_cast<const X*>(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo, numOfSubArrs)]];
             z = reinterpret_cast<X*>(vz) + zOffsets[i];
         }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
index 71dc284a6..709f0ed2c 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
@@ -47,7 +47,6 @@ namespace nd4j {
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -61,7 +60,6 @@ namespace nd4j {
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coord = sharedMem + threadIdx.x * maxRank;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu
index 540e2e9ab..ac56b69d5 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu
@@ -61,7 +61,6 @@ namespace nd4j {
         static __global__ void lastStep(Nd4jLong* resultBuf, Nd4jLong* tempBufferA, Nd4jLong* tempResult, Nd4jLong length, Nd4jLong blockSize) {
 
             if (threadIdx.x == 0) {
-
                 if (length <= blockSize)
                     *resultBuf = *tempBufferA;
                 else
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
index 3e8ec6836..f2fb9d94a 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
@@ -44,7 +44,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
     __shared__ int imRank, colRank;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -56,7 +55,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
         iH = imShapeInfo[3];
         iW = imShapeInfo[4];
     }
-
     __syncthreads();
 
     const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
index 0da1fbc28..d96c1efa2 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
@@ -88,12 +88,9 @@ namespace helpers {
     template <typename I>
     static __global__ void copyIndices(void* indices, void* indicesLong, Nd4jLong len) {
-        __shared__ I* indexBuf;
-        __shared__ Nd4jLong* srcBuf;
-        if (threadIdx.x == 0) {
-            indexBuf = reinterpret_cast<I*>(indices);
-            srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);
-        }
+        I* indexBuf = reinterpret_cast<I*>(indices);
+        Nd4jLong* srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);;
+
         auto tid = threadIdx.x + blockIdx.x * blockDim.x;
         auto step = blockDim.x * gridDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu b/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
index f27511b3a..239d280dc 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
@@ -29,11 +29,7 @@ namespace helpers {
     template <typename T>
    static _CUDA_G void lrnKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
        extern __shared__ char sharedChar[];
-       __shared__ T* shared;
-       if (threadIdx.x == 0)
-           shared = reinterpret_cast<T*>(sharedChar);
-       __syncthreads();
-
+       T* shared = reinterpret_cast<T*>(sharedChar);
 
        auto xEws = shape::elementWiseStride(xTadShapeInfo);
        auto zEws = shape::elementWiseStride(zTadShapeInfo);
@@ -69,16 +65,8 @@ namespace helpers {
     template <typename X, typename Z>
    static _CUDA_G void lrnBPKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
        extern __shared__ char sharedChar[];
-       __shared__ X* sharedX;
-       __shared__ Z* sharedY;
-
-       if (threadIdx.x == 0) {
-           sharedX = reinterpret_cast<X*>(sharedChar);
-           sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
-       }
-
-       __syncthreads();
-
+       X* sharedX = reinterpret_cast<X*>(sharedChar);
+       Z* sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
 
        auto xEws = shape::elementWiseStride(xTadShapeInfo);
        auto zEws = shape::elementWiseStride(zTadShapeInfo);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
index ffd652ee7..1bf40ba7b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
@@ -57,14 +57,8 @@ namespace helpers {
//    }
     template <typename T>
     static __global__ void invertKernelLow(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         auto start = threadIdx.x + blockIdx.x * blockDim.x;
         auto step = blockDim.x * gridDim.x;
@@ -84,14 +78,8 @@ namespace helpers {
     template <typename T>
     static __global__ void upvertKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         auto start = threadIdx.x + blockIdx.x * blockDim.x;
         auto step = blockDim.x * gridDim.x;
@@ -107,14 +95,8 @@ namespace helpers {
     template <typename T>
     static __global__ void upvertKernelUp(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         auto start = threadIdx.x + blockIdx.x * blockDim.x;
         auto step = blockDim.x * gridDim.x;
@@ -135,17 +117,8 @@ namespace helpers {
     template <typename T>
     static __global__ void invertLowKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
-
-//        auto start = threadIdx.x + blockIdx.x * blockDim.x;
-//        auto step = blockDim.x * gridDim.x;
+        T* inverted = reinterpret_cast<T*>(invertedBuf);
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         for (int i = blockIdx.x + 2; i < n; i += gridDim.x) {
             for (int j = i - 2; j >= 0; --j)
@@ -166,17 +139,8 @@ namespace helpers {
     template <typename T>
     static __global__ void invertUpKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
-        __shared__ T* inverted;
-        __shared__ T* input;
-
-        if (threadIdx.x == 0) {
-            inverted = reinterpret_cast<T*>(invertedBuf);
-            input = reinterpret_cast<T*>(inputBuf);
-        }
-        __syncthreads();
-
-//        auto start = threadIdx.x + blockIdx.x * blockDim.x;
-//        auto step = blockDim.x * gridDim.x;
+        T* inverted = reinterpret_cast<T*>(invertedBuf);;
+        T* input = reinterpret_cast<T*>(inputBuf);
 
         for (int i = n - blockIdx.x - 2; i >= 0; i -= gridDim.x) {
             for (int j = i + 2; j < n; j++)
@@ -366,11 +330,8 @@ namespace helpers {
     template <typename F>
     static __global__ void fillUpPermutation(void* output, Nd4jLong* shape, int* source, int rowNum) {
-        __shared__ F* permutation;
+        F* permutation = reinterpret_cast<F*>(output);
 
-        if (threadIdx.x == 0) {
-            permutation = reinterpret_cast<F*>(output);
-        }
         auto start = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
         for (auto i = start; i < rowNum; i += step) {
@@ -709,13 +670,8 @@ namespace helpers {
     template <typename F>
     __global__ void adjustResultsKernel(F* dArray, Nd4jLong* shape, Nd4jLong* offsets, Nd4jLong batchSize, Nd4jLong n) {
         //auto i = blockIdx.x * blockDim.x + threadIdx.x;
-        __shared__ Nd4jLong* shapeOf;
-        __shared__ Nd4jLong* strideOf;
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
-            shapeOf = shape::shapeOf(shape);
-            strideOf = shape::stride(shape);
-        }
-        __syncthreads();
+        Nd4jLong* shapeOf = shape::shapeOf(shape);
+        Nd4jLong* strideOf = shape::stride(shape);
 
         for (auto i = blockIdx.x; i < batchSize; i+= gridDim.x) {
             auto current = dArray + offsets[i];
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
index 8dcd0683b..95eb5f439 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@@ -37,6 +37,7 @@ namespace helpers {
         outLength = shape::length(outputShape);
         diagonalLen = shape::length(diagonalShape);
     }
+    __syncthreads();
 
     for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
         for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu
index 2e4240057..53b983d09 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu
@@ -46,7 +46,6 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz,
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
         xRank = shape::rank(xShapeInfo);
@@ -54,11 +53,10 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz,
         zLen = shape::length(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
+    __syncthreads();
 
     auto coord = sharedMem + threadIdx.x * zRank;
 
-    __syncthreads();
-
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
 
     for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
index ef74180c8..e19ddcb1b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
@@ -51,7 +51,6 @@ namespace nd4j {
     __shared__ Nd4jLong zLen, yLen, totalThreads, *coords, *xShape, *zShape, *xStride, *zStride, shift1, shift2, yStride0;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         coords = reinterpret_cast<Nd4jLong*>(shmem);
         zLen = shape::length(zShapeInfo);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu
index 3d7a1a6a3..7b325eb3e 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu
@@ -77,6 +77,7 @@ namespace helpers {
         // saving final value
         if (threadIdx.x == 0)
             z[shape::getIndexOffset(t, zShapeInfo, zLength)] = x[shape::getIndexOffset(position, xTadShapeInfo, tadLength)];
+        __syncthreads();
     }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
index 56dc8c558..94d3c02ea 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
@@ -40,8 +40,7 @@ __global__ static void polyGammaCuda(const void *vn, const Nd4jLong *nShapeInfo,
     if (threadIdx.x == 0)
         len = shape::length(nShapeInfo);
-
-        __syncthreads();
+    __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
     const auto totalThreads = gridDim.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu
index 53cfcc22d..b1412343b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu
@@ -93,6 +93,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op,
             if (threadIdx.x == 0)
                 shared[blockDim2 - 1] = (op == scalar::Add) ? 0 : 1;
+            __syncthreads();
 
             for (uint d = 1; d < blockDim2; d *= 2) {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
index 55f635e1b..bcd484fe9 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
@@ -47,7 +47,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -55,7 +54,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coords = sharedMem + threadIdx.x * rank;
@@ -138,7 +136,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -146,7 +143,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeInfo,
         zLen = shape::length(zShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     auto coords = sharedMem + threadIdx.x * rank;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
index 776d92c45..43480f75d 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@@ -308,7 +308,6 @@ __global__ static void scatterCuda(const int opCode,
     __shared__ Nd4jLong yLen, totalThreads, *coord;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         coord = reinterpret_cast<Nd4jLong*>(shmem);
         yLen = shape::length(yShapeInfo);
@@ -317,7 +316,6 @@ __global__ static void scatterCuda(const int opCode,
         yRank = shape::rank(yShapeInfo);
         zRank = shape::rank(zShapeInfo);
     }
-
     __syncthreads();
 
     auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@@ -455,12 +453,10 @@ __global__ static void scatterNDLockCuda(const int opCode,
     __shared__ int xLastDim;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         zTadCoords = reinterpret_cast<Nd4jLong*>(shmem);
         xLastDim = xTadShapeInfo[1];    // xTad has rank = 1 always
     }
-
     __syncthreads();
 
     Nd4jLong* zTadCoordsPerThread = zTadCoords + threadIdx.x * xLastDim;
@@ -598,7 +594,6 @@ __global__ static void scatterNDCuda(const int opCode,
     __shared__ Nd4jLong yLen, totalThreads, *coord;
 
     if (threadIdx.x == 0) {
-
         extern __shared__ unsigned char shmem[];
         coord = reinterpret_cast<Nd4jLong*>(shmem);
         yLen = shape::length(yShapeInfo);
@@ -608,7 +603,6 @@ __global__ static void scatterNDCuda(const int opCode,
         zRank = shape::rank(zShapeInfo);
         xLastDim = xShapeInfo[xRank];
     }
-
     __syncthreads();
 
     auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@@ -752,7 +746,6 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
         xLen = shape::length(xShapeInfo);
         xRank = shape::rank(xShapeInfo);
     }
-
     __syncthreads();
 
     const auto xInd = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu
index 4a64cd4c7..d8b3575ff 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu
@@ -56,7 +56,6 @@ namespace nd4j {
         arrLenX = shape::length(xShapeInfo);
         arrLenY = shape::length(yShapeInfo);
     }
-
     __syncthreads();
 
     if (arrLenX != arrLenY)
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu
index 20796b1d1..180af41e1 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu
@@ -266,6 +266,7 @@ namespace nd4j {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
@@ -311,6 +312,7 @@ namespace nd4j {
         gradLen = shape::length(epsShape);
         currentLen = shape::length(outTad);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
         auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
index c60272188..3f2168da4 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
@@ -248,6 +248,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu
index de602201b..0c67b41d5 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu
@@ -257,6 +257,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
@@ -302,6 +303,7 @@ namespace helpers {
         gradLen = shape::length(epsShape);
         currentLen = shape::length(outTad);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
         auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu
index 7454756b5..78f21916d 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu
@@ -75,7 +75,7 @@ namespace helpers {
         if (threadIdx.x == 0) {
             z[zIndex] = val[segment];
         }
-
+        __syncthreads();
     }
     // -------------------------------------------------------------------------------------------------------------- //
     template
@@ -256,6 +256,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
@@ -298,6 +299,7 @@ namespace helpers {
         gradLen = shape::length(epsShape);
         currentLen = shape::length(outTad);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
         auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu
index 875f63e77..4141cefba 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu
@@ -168,6 +168,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu
index 1d9d983ef..37dacee09 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu
@@ -256,6 +256,7 @@ namespace helpers {
         gradOut = reinterpret_cast<T*>(eps);
         gradLen = shape::length(epsShape);
     }
+    __syncthreads();
 
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = gridDim.x * blockDim.x;
@@ -292,6 +293,7 @@ namespace helpers {
         gradLen = shape::length(epsShape);
         currentLen = shape::length(outTad);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
         auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu
index 411d7eac1..7318dbaea 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu
@@ -36,6 +36,7 @@ namespace helpers {
         inputLen = shape::length(inputShape);
         outputLen = shape::length(outputShape);
     }
+    __syncthreads();
 
     for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x)
         for(auto k = threadIdx.x; k < inputLen; k += blockDim.x)
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu b/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu
index 6a9fd28e6..3b854415b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu
@@ -375,8 +375,9 @@ namespace nd4j {
             }
 
             if (threadIdx.x == 0) {
                 if (hasError)
-                neu1[0] = DataTypeUtils::infOrMax<T>();
+                    neu1[0] = DataTypeUtils::infOrMax<T>();
             }
+            __syncthreads();
         }
 
         template
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
index 150c616a6..5c00244f8 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
@@ -149,7 +149,6 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
         totalThreads = gridDim.x * blockDim.x;
     }
-
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
index 5406e8bbd..7e35ec819 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
@@ -135,6 +135,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz,
         if (threadIdx.x == 0)
             z[zOffset] = *sharedMem;
+        __syncthreads();
     }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu
index feb6ce6e5..94464bbbc 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu
@@ -39,8 +39,7 @@ __global__ static void zetaCuda(const void *vx, const Nd4jLong *xShapeInfo,
     if (threadIdx.x == 0)
         len = shape::length(xShapeInfo);
-
-        __syncthreads();
+    __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
     const auto totalThreads = gridDim.x * blockDim.x;
diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
index e0d53a66f..d05fd1682 100644
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
@@ -18208,6 +18208,24 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
     }
 // #endif
 
+// #if NOT_EXCLUDED(OP_space_to_batch_nd)
+        @Namespace("nd4j::ops") public static class space_to_batch_nd extends DeclarableCustomOp {
+            static { Loader.load(); }
+            /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+            public space_to_batch_nd(Pointer p) { super(p); }
+            /** Native array allocator. Access with {@link Pointer#position(long)}. */
+            public space_to_batch_nd(long size) { super((Pointer)null); allocateArray(size); }
+            private native void allocateArray(long size);
+            @Override public space_to_batch_nd position(long position) {
+                return (space_to_batch_nd)super.position(position);
+            }
+
+            public space_to_batch_nd() { super((Pointer)null); allocate(); }
+            private native void allocate();
+            public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
+        }
+// #endif
+
 /**
  *
  *