[WIP] minor (#218)

* initial docs commit; merge
* cuda fix — Signed-off-by: raver119 <raver119@gmail.com>
* one more fix — Signed-off-by: raver119 <raver119@gmail.com>
* one more fix — Signed-off-by: raver119 <raver119@gmail.com>
2019-09-02 11:25:48 +03:00 · 2019-09-02 11:25:48 +03:00 · e42c34ca55
commit e42c34ca55
parent 65c9f2a888
19 changed files with 89 additions and 45 deletions
--- a/libnd4j/blas/NDArray.h
+++ b/libnd4j/blas/NDArray.h
@ -273,9 +273,11 @@ namespace nd4j {
         * @param writeList
         * @param readList
         */
+         // TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
        static void registerSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList);
        static void prepareSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false);

+        // TODO: it would be nice to have NDArray::registerPrimaryUse signature that accepts something else beyond initializer_list
        static void registerPrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList);
        static void preparePrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false);

--- a/libnd4j/include/loops/cuda/transform/transform_bool.cu
+++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu
@ -96,13 +96,13 @@ namespace functions {
 		        }
 		        else {			        
 					if(vx == vz) {
-						for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+						for (Nd4jLong i = tid; i < length; i+= totalThreads) {
 							auto xOffset = shape::getIndexOffset(i, xShapeInfo,  length);						
 	    			    	z[xOffset] = OpType::op(x[xOffset], params);
 		    	    	}		    	    
 					}
 					else {
-		    	    	for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+		    	    	for (Nd4jLong i = tid; i < length; i+= totalThreads) {
 							auto xOffset = shape::getIndexOffset(i, xShapeInfo,  length);
 							auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);				        
 	    			    	z[zOffset] = OpType::op(x[xOffset], params); 							
--- a/libnd4j/include/loops/cuda/transform/transform_same.cu
+++ b/libnd4j/include/loops/cuda/transform/transform_same.cu
@ -94,13 +94,13 @@ namespace functions {
 		        }
 		        else {
 					if(vx == vz) {
-						for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+						for (Nd4jLong i = tid; i < length; i+= totalThreads) {
 							auto xOffset = shape::getIndexOffset(i, xShapeInfo,  length);
 	    			    	z[xOffset] = OpType::op(x[xOffset], params);
 		    	    	}
 					}
 					else {
-		    	    	for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+		    	    	for (Nd4jLong i = tid; i < length; i+= totalThreads) {
 							auto xOffset = shape::getIndexOffset(i, xShapeInfo,  length);
 							auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
 	    			    	z[zOffset] = OpType::op(x[xOffset], params);
--- a/libnd4j/include/loops/cuda/transform/transform_strict.cu
+++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu
@ -96,13 +96,13 @@ namespace functions {
 		        }
 		        else {
 					if(vx == vz) {
-						for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+						for (Nd4jLong i = tid; i < length; i+= totalThreads) {
 							auto xOffset = shape::getIndexOffset(i, xShapeInfo,  length);
 	    			    	z[xOffset] = OpType::op(x[xOffset], params);
 		    	    	}
 					}
 					else {
-		    	    	for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+		    	    	for (Nd4jLong i = tid; i < length; i+= totalThreads) {
 							auto xOffset = shape::getIndexOffset(i, xShapeInfo,  length);
 							auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
 	    			    	z[zOffset] = OpType::op(x[xOffset], params);
--- a/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu
@ -30,7 +30,7 @@ namespace nd4j {

                shared[threadIdx.x] = 0;

-
+                // each thread will compare 2 elements: E and E+1
                for (int e = tid; e < length - 1; e += blockDim.x * gridDim.x) {
                    auto val0 = x[shape::getIndexOffset(e, xShapeInfo, length)];
                    auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo, length)];
@ -41,11 +41,12 @@ namespace nd4j {
                    else
                        v = val1 >= val0;

+                    // store comparison result in shared memory
                    shared[threadIdx.x] += v ? 0 : 1;
                }
                __syncthreads();

-                // aggregate sum
+                // aggregate sums in shared memory
                for (uint activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
                    if (threadIdx.x < activeThreads)
                        shared[threadIdx.x] += shared[threadIdx.x + activeThreads];
@ -53,7 +54,7 @@ namespace nd4j {
                }


-                // store over the grid
+                // store over the grid if we have more than 1 block
                if (gridDim.x > 1) {

                    auto tc = reinterpret_cast<unsigned int *>(reductionBuffer);
@ -96,7 +97,7 @@ namespace nd4j {
                    }
                }
                else {
-
+                    // if we have only 1 block, we just store results right away
                    if (threadIdx.x == 0) {
                        auto tc = reinterpret_cast<unsigned int*>(reductionBuffer);
                        tc[16384] = 0;
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@ -424,7 +424,7 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
    }
    __syncthreads();

-    int tid = blockIdx.x * gridDim.x + threadIdx.x;
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {

@ -519,7 +519,7 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
    }
    __syncthreads();

-    int tid = blockIdx.x * gridDim.x + threadIdx.x;
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {

@ -610,7 +610,7 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
    }
    __syncthreads();

-    int tid = blockIdx.x * gridDim.x + threadIdx.x;
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {

--- a/libnd4j/include/ops/declarable/helpers/cuda/diag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/diag.cu
@ -39,7 +39,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
    }
    __syncthreads();

-    const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    const auto step = gridDim.x * blockDim.x;
    for (int t = tid; t < inputLength; t += step) {
        z[shape::getIndexOffset(t * (inputLength + 1), outputShape, outputLength)] = x[shape::getIndexOffset(t, inputShape, inputLength)]; //tX];
@ -59,7 +59,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
        }
        __syncthreads();

-        const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+        const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;
        Nd4jLong i = threadIdx.x * (outputLength + 1);
        for (int t = tid; t < outputLength && i < inputLength; t += step) {
--- a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu
@ -35,9 +35,11 @@ namespace helpers {
        T const* input = reinterpret_cast<T const*>(inputBuf);
        T* output = reinterpret_cast<T*>(outputBuf);

+        // trivial idea: loop through all elements, get independent probability for each element to be nullified
        for (Nd4jLong e = 0; e < inLen; ++e) {
            T val = nodeRng->relativeT(e, T(0.f), T(1.f));

+            // if probability is ok - we're saving scaled value
            if (double(val) < probVal)
                output[shape::getIndexOffset(e, outputShape, inLen)] = T(input[shape::getIndexOffset(e, inputShape, inLen)] / probVal);
        }
@ -80,7 +82,7 @@ namespace helpers {
            std::vector<Nd4jLong> dims(reduceShape->lengthOf());
            reduceShape->syncToHost(); // to ensure that follows are actual
            bool fit = true;
-//            PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
+
            for( int i = 0; i < dims.size(); i++ ) {
                if (fit) {
                    dims[i] = reduceShape->e<Nd4jLong>(i);
@ -96,8 +98,7 @@ namespace helpers {
            REQUIRE_TRUE(fit, 0, "dropout: Noise shape should fit to input rank.");
            std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
            chunk->assign(1.f);
-            //chunk->applyRandom<randomOps::DropOutInverted<T>>(rng, nullptr, chunk.get(), &probValue);
-            //NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
+
            dropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), probValue, seed);
            // broadcast chunk to full matrix
            std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
@ -105,6 +106,7 @@ namespace helpers {

            *dropOutMultiplier += *chunk;

+            // FIXME: we could do this in one step, couldn't we?
            output->assign(*input * *dropOutMultiplier); //input->applyPairwiseTransform(pairwise::Multiply, dropOutMultiplier.get(), output, nullptr);
        }

@ -113,8 +115,11 @@ namespace helpers {

    // Type-dispatching entry point for dropout: selects the _dropOutFunctor instantiation that matches
    // the data type of `input` (restricted to FLOAT_TYPES) and forwards all arguments to it.
    int dropOutFunctor(graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
        auto xType = input->dataType();
+        NDArray::prepareSpecialUse({output}, {input});

        // NOTE(review): the selector is handed `return _dropOutFunctor`, so presumably control leaves this
        // function inside the macro and the registerSpecialUse call below is never reached - confirm against
        // the BUILD_SINGLE_SELECTOR expansion, and capture the result in a local first if so.
        BUILD_SINGLE_SELECTOR(xType, return _dropOutFunctor, (context, input, output, reduceShape, seed, probValue), FLOAT_TYPES);
+
+        NDArray::registerSpecialUse({output}, {input});
    }

 /////////////////////////////////// backpropagation ///////////////////////////////////////////////
@ -136,6 +141,8 @@ namespace helpers {

        for (int e = tid; e < len; e += step) {
            const auto zOffset = shape::getIndexOffset(e, outputShape, len);
+
+            // if probability was non-zero on FF step, we'll scale grads back
            if (output[zOffset] != T(0.))
                output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape, len)] / probValue);

@ -143,12 +150,17 @@ namespace helpers {
    }
    // Backward pass for dropout: re-runs the forward dropOutFunctor into `output`, then (only if that
    // succeeded) launches dropoutBPKernel, which receives the buffers of `output` and `gradOut` plus probValue.
    // Returns the status code produced by the forward run.
    template <typename T>
    static int dropOutFunctorBP_(graph::Context& context, NDArray* input, NDArray* gradOut, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
+        // we're making additional FF run to see how probabilities played out with given seeds
        int res = dropOutFunctor(context, input, output, reduceShape, seed, probValue);
        auto stream = context.launchContext()->getCudaStream();

        // NOTE(review): prepareSpecialUse / registerSpecialUse are called regardless of `res`,
        // i.e. also when the kernel below is skipped - confirm this is intended.
+        NDArray::prepareSpecialUse({output}, {input, gradOut});
+
        // fixed launch configuration: 128 blocks x 256 threads, 1024 bytes of dynamic shared memory, on the context's stream
        if (ND4J_STATUS_OK == res)
            dropoutBPKernel<T><<<128, 256, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), probValue);

+        NDArray::registerSpecialUse({output}, {input, gradOut});
+
        return res;
    }

@ -239,6 +251,7 @@ namespace helpers {

        int res = alphaDropOutFunctor(context, input, output, reduceShape, seed, probValue, alpha, alpha1, beta);
        if (res == ND4J_STATUS_OK) {
+            // FIXME: can we make it single-loop?
            (*output) *= alpha;
            (*output) *= (*gradOut); //->applyPairwiseTransform<transform::Multiply>(gradOut, output, nullptr);
        }
--- a/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu
@ -43,7 +43,7 @@ namespace nd4j {
                }
                __syncthreads();

-
+                // we run things in blocks, 1 partition per block of threads
                for (Nd4jLong o = blockIdx.x; o < numOutputs; o += gridDim.x) {
                    auto z = reinterpret_cast<X*>(vz[o]);

@ -89,9 +89,11 @@ namespace nd4j {
                auto x = reinterpret_cast<X*>(vx);
                auto indices = reinterpret_cast<Y*>(vindices);

+                // we run things in blocks, 1 partition per block of threads
                for (int i = blockIdx.x; i < numOutputs; i += gridDim.x) {
                    auto z = reinterpret_cast<X*>(vz[i]);

+                    // each thread has own counter for partitions
                    int outCnt = 0;

                    for (Nd4jLong e = 0; e < iLength; e++) {
@ -145,6 +147,7 @@ namespace nd4j {
                        tadOffsets[i] = packZ.platformOffsets();
                    }

+                    // we copy pointers to device
                    auto dOutBuffers = reinterpret_cast<void **>(pm.replicatePointer(outBuffers.data(), outBuffers.size() * sizeof(void *)));
                    auto dOutTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadShapes.data(), tadShapes.size() * sizeof(Nd4jLong *)));
                    auto dOutTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadOffsets.data(), tadOffsets.size() * sizeof(Nd4jLong *)));
@ -248,6 +251,7 @@ namespace nd4j {
                        indicesShapes[e] = indices.at(e)->getSpecialShapeInfo();
                    }

+                    // copying pointers to buffers to device
                    auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *)));
                    auto dIndicesBuffers = reinterpret_cast<void **>(pm.replicatePointer(indicesBuffers.data(), inputSize * sizeof(void *)));
                    auto dInputShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputShapes.data(), inputSize * sizeof(Nd4jLong *)));
@ -283,6 +287,7 @@ namespace nd4j {
                        inputTadOffsets[e] = packX.platformOffsets();
                    }

+                    // copying pointers to buffers to device
                    auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *)));
                    auto dInputTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadShapes.data(), inputSize * sizeof(Nd4jLong *)));
                    auto dInputTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadOffsets.data(), inputSize * sizeof(Nd4jLong *)));
@ -313,6 +318,7 @@ namespace nd4j {

                NDArray::registerSpecialUse({}, {indices, input});

+                // TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
                for (auto v:outputList) {
                    v->tickWriteDevice();
                }
--- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
@ -29,6 +29,7 @@ namespace nd4j {

                Nd4jLong xCoord[MAX_RANK];

+                // each block of threads works on 1 input array
                for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
                    auto z = reinterpret_cast<T*>(zBuffer) + offsets[e];

@ -39,6 +40,7 @@ namespace nd4j {
                    auto xRank = shape::rank(xShapeInfo);
                    auto xLength = shape::length(xShapeInfo);

+                    // each element of this input array has own place within common output array
                    for (uint i = threadIdx.x; i < xLength; i += blockDim.x) {
                        shape::index2coords(xRank, xShape, i, xLength, xCoord, order);
                        auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank);
@ -65,6 +67,7 @@ namespace nd4j {
                    hdShapes[e] = inputs[e]->specialShapeInfo();
                }

+                // copying pointers to device
                auto dBuffers = (void **) pm.replicatePointer(hdBuffers.data(), inputs.size() * sizeof(void*));
                auto dShapes = (Nd4jLong **)pm.replicatePointer(hdShapes.data(), inputs.size() * sizeof(Nd4jLong*));
                auto dOffsets = (Nd4jLong *) pm.replicatePointer(hOffsets.data(), inputs.size() * sizeof(Nd4jLong));
@ -76,6 +79,7 @@ namespace nd4j {
            }

            void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
+                // FIXME: we want NDArray::prepareSpecialUse here eventually
                for (auto v:inputs)
                    v->syncToDevice();

--- a/libnd4j/include/ops/declarable/helpers/cuda/gradient.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gradient.cu
@ -26,6 +26,7 @@ namespace ops {
 namespace helpers {
 template <typename T>
 void applyGradientDescent_(LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) {
+    // classic gradient descent update: _x - _y * weight
    auto lambda = LAMBDA_TT(_x, _y, weight) {
        return _x - (_y * weight);
    };
--- a/libnd4j/include/ops/declarable/helpers/cuda/histogram.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/histogram.cu
@ -44,6 +44,7 @@ namespace nd4j {

                X binSize = X((*max_val - *min_val) / numBins);

+                // nullify bins
                for (int e = threadIdx.x; e < numBins; e += blockDim.x) {
                    bins[e] = (Z) 0;
                }
@ -53,14 +54,12 @@ namespace nd4j {
                    int idx = int((dx[e] - *min_val) / binSize);
                    idx = math::nd4j_max(idx, 0); //atomicMax(&idx, 0);//atomicMax(&idx, 0);
                    idx = math::nd4j_min(idx, int(numBins - 1)); //atomicMin(&idx, int(numBins - 1));
-                    nd4j::math::atomics::nd4j_atomicAdd(&bins[idx], (Z)1);
-//                    bins[idx]++;
+                    nd4j::math::atomics::nd4j_atomicAdd<Z>(&bins[idx], (Z)1);
                }
                __syncthreads();
+                // at this point all bins in shared memory are calculated, so we aggregate them now via threadfence trick

                // transfer shared memory to reduction memory
-
-
                if (gridDim.x > 1) {
                    unsigned int *tc = (unsigned int *)reductionPointer;
                    __shared__ bool amLast;
--- a/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu
@ -64,6 +64,7 @@ static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray*

        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size());

+        // we launch legacy IndexMax op, to get indices of max values along dimension
        auto indexMaxArr = input->applyIndexReduce(indexreduce::IndexMax, dimensions);

        dim3 launchDims(256, 256, 16384);
--- a/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu
@ -41,12 +41,12 @@ namespace helpers {
        const T tbeta  = static_cast<T>(beta);
        const T talpha = static_cast<T>(alpha);

-
+        // one block of threads processes 1 example within batch
        for (uint i = blockIdx.x; i < numTads; i += gridDim.x) {
            auto x = reinterpret_cast<T*>(vx) + xTadOffsets[i];
            auto z = reinterpret_cast<T*>(vz) + zTadOffsets[i];

-            // load everything into shared memory
+            // load everything into shared memory, so we'll operate on shared memory from now on
            shared[threadIdx.x] = x[threadIdx.x * xEws];
            __syncthreads();

@ -94,7 +94,7 @@ namespace helpers {
            sharedY[threadIdx.x] = 0.f;
            __syncthreads();

-
+            // we're operating in shared memory
            for (int s = begin; s < end; s++)
                sharedY[threadIdx.x] = sharedY[threadIdx.x] + sharedX[s] * sharedX[s];
            __syncthreads();
--- a/libnd4j/include/ops/declarable/helpers/cuda/merge.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/merge.cu
@ -37,7 +37,7 @@ namespace nd4j {
            static __global__ void global_mergeMaxIndex_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
                auto output = reinterpret_cast<Z*>(voutput);

-                const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+                const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
                const auto step = gridDim.x * blockDim.x;

                for (Nd4jLong e = tid; e < length; e += step) {
@ -81,7 +81,13 @@ namespace nd4j {
            }

            // Host-side entry point: dispatches mergeMaxIndex_ on the data type of the first input
            // (LIBND4J_TYPES) and the data type of `output` (INDEXING_TYPES).
            // NOTE(review): inArrs[0] is read unconditionally - assumes inArrs is non-empty; confirm callers guarantee it.
            void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
+                NDArray::prepareSpecialUse({&output}, {});
                // the read list passed above is empty, so every input is synced to device explicitly here
+                for (auto v:inArrs)
+                    v->syncToDevice();
+
                BUILD_DOUBLE_SELECTOR(inArrs[0]->dataType(), output.dataType(), mergeMaxIndex_, (context, inArrs, output), LIBND4J_TYPES, INDEXING_TYPES);
+
                // NOTE(review): the read list is empty here as well, so the inputs are not handed to registerSpecialUse - confirm that is intended
+                NDArray::registerSpecialUse({&output}, {});
            }


@ -90,7 +96,7 @@ namespace nd4j {
            static __global__ void global_mergeMax_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
                auto output = reinterpret_cast<T*>(voutput);

-                const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+                const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
                const auto step = gridDim.x * blockDim.x;

                for (Nd4jLong e = tid; e < length; e += step) {
@ -131,7 +137,12 @@ namespace nd4j {
            }

            // Host-side entry point: dispatches mergeMax_ on the data type of `output` (LIBND4J_TYPES).
            void mergeMax(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
+                NDArray::prepareSpecialUse({&output}, {});
                // the read list passed above is empty, so every input is synced to device explicitly here
+                for (auto v:inArrs)
+                    v->syncToDevice();
+
                BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (context, inArrs, output), LIBND4J_TYPES);
                // NOTE(review): the read list is empty here as well, so the inputs are not handed to registerSpecialUse - confirm that is intended
+                NDArray::registerSpecialUse({&output}, {});
            }

            //////////////////////////////////////////////////////////////////////////
@ -139,7 +150,7 @@ namespace nd4j {
            static __global__ void global_mergeAvg_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
                auto output = reinterpret_cast<T*>(voutput);

-                const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+                const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
                const auto step = gridDim.x * blockDim.x;

                for (Nd4jLong e = tid; e < length; e += step) {
@ -178,7 +189,13 @@ namespace nd4j {
            }

            // Host-side entry point: dispatches mergeAvg_ on the data type of `output` (FLOAT_TYPES only).
            void mergeAvg(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
+                NDArray::prepareSpecialUse({&output}, {});
                // the read list passed above is empty, so every input is synced to device explicitly here
+                for (auto v:inArrs)
+                    v->syncToDevice();
+
                BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (context, inArrs, output), FLOAT_TYPES);
+
                // NOTE(review): the read list is empty here as well, so the inputs are not handed to registerSpecialUse - confirm that is intended
+                NDArray::registerSpecialUse({&output}, {});
            }

            //////////////////////////////////////////////////////////////////////////
@ -186,7 +203,7 @@ namespace nd4j {
            static __global__ void global_mergeAdd_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
                auto output = reinterpret_cast<T*>(voutput);

-                const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+                const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
                const auto step = gridDim.x * blockDim.x;

                for (Nd4jLong e = tid; e < length; e += step) {
@ -226,7 +243,13 @@ namespace nd4j {
            BUILD_SINGLE_TEMPLATE(template void mergeAdd_, (nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output), NUMERIC_TYPES);

            // Host-side entry point: dispatches mergeAdd_ on the data type of `output` (NUMERIC_TYPES).
            void mergeAdd(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
+                NDArray::prepareSpecialUse({&output}, {});
                // the read list passed above is empty, so every input is synced to device explicitly here
+                for (auto v:inArrs)
+                    v->syncToDevice();
+
                BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (context, inArrs, output), NUMERIC_TYPES);
+
                // NOTE(review): the read list is empty here as well, so the inputs are not handed to registerSpecialUse - confirm that is intended
+                NDArray::registerSpecialUse({&output}, {});
            }
        }
    }
--- a/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu
@ -31,18 +31,18 @@ namespace helpers {

    template <typename T>
    static __global__ void fillUpElementKernel(void* outputBuffer, Nd4jLong* outputShapeInfo, void* inputBuffer, Nd4jLong* inputShapeInfo, Nd4jLong* pTadShape, Nd4jLong* pTadOffsets, Nd4jLong n) {
-        __shared__ T *z, *x;
        __shared__ Nd4jLong bufferLength, arrLen;

+        auto z = reinterpret_cast<T*>(outputBuffer);
+        auto x = reinterpret_cast<T*>(inputBuffer);
+
        if (threadIdx.x == 0) {
-            z = reinterpret_cast<T*>(outputBuffer);
-            x = reinterpret_cast<T*>(inputBuffer);
            arrLen = shape::length(pTadShape);
            bufferLength = shape::length(outputShapeInfo);
        }
        __syncthreads();

-        const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+        const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;
        for (int t = tid; t < bufferLength; t += step) {
            auto tX = x + pTadOffsets[t];
@ -77,8 +77,6 @@ namespace helpers {
 //            manager.synchronize();
            sortedVals.tickWriteDevice();
            sortedVals.syncToHost();
-            sortedVals.printIndexedBuffer("Hello");
-            sortedVals.printBuffer("Hello line");
            auto stream = context->getCudaStream();
            fillUpElementKernel<T><<<32, 64, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), pTadShape, pTadOffsets, n);
        }
--- a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu
@ -74,17 +74,14 @@ static void polyGammaCudaLauncher(const int blocksPerGrid, const int threadsPerB
 ///////////////////////////////////////////////////////////////////
 void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) {

-    if(!n.isActualOnDeviceSide()) n.syncToDevice();
-    if(!x.isActualOnDeviceSide()) x.syncToDevice();    
+    NDArray::prepareSpecialUse({&z}, {&n, &x});
        
    int threadsPerBlock = MAX_NUM_THREADS;
    int blocksPerGrid = (z.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
    
    BUILD_SINGLE_SELECTOR(n.dataType(), polyGammaCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), n.getSpecialBuffer(), n.getSpecialShapeInfo(), x.getSpecialBuffer(), x.getSpecialShapeInfo(), z.getSpecialBuffer(), z.getSpecialShapeInfo()), FLOAT_TYPES);

-    n.tickReadHost();
-    x.tickReadHost();
-    z.tickWriteDevice();
+    NDArray::registerSpecialUse({&z}, {&n, &x});
 }

 BUILD_SINGLE_TEMPLATE(template void polyGammaCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, const void *vn, const Nd4jLong *nShapeInfo, const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo), FLOAT_TYPES);
--- a/libnd4j/include/ops/declarable/helpers/cuda/range.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/range.cu
@ -28,7 +28,7 @@ namespace helpers {
    template <typename T>
    static __global__ void global_range(void *output, Nd4jLong length, T start, T delta) {
        auto buff = reinterpret_cast<T*>(output);
-        const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+        const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;

        for(Nd4jLong i = tid; i < length; i += step)
@ -43,10 +43,11 @@ namespace helpers {
    }

    // Host-side entry point: dispatches _range on the data type of `outVector` (LIBND4J_TYPES),
    // bracketed by prepareSpecialUse / registerSpecialUse with outVector as the written array
    // and start / delta as the read arrays.
    void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) {
+        NDArray::prepareSpecialUse({&outVector}, {&start, &delta});
        BUILD_SINGLE_SELECTOR(outVector.dataType(), _range, (context, start, delta, outVector), LIBND4J_TYPES);
+        NDArray::registerSpecialUse({&outVector}, {&start, &delta});
    }

-    BUILD_SINGLE_TEMPLATE(template void _range, (nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector), NUMERIC_TYPES);
 }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/toggle_bits.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/toggle_bits.cu
@ -26,13 +26,11 @@ namespace nd4j {
        namespace helpers {
            // Writes the bitwise complement (~) of every element of `in` into `out` via NDArray::applyLambda.
            // NOTE(review): this patch drops the explicit prepareSpecialUse / registerSpecialUse bracket;
            // presumably applyLambda does its own buffer bookkeeping - confirm.
            template<typename T>
            void toggle_bits__(NDArray &in, NDArray &out) {
-                NDArray::prepareSpecialUse({&out}, {&in});
                auto lambda = LAMBDA_T(_x) {
                    return ~_x;//eUtils::flip_bits(_x);
                };

                in.applyLambda(lambda, &out);
-                NDArray::registerSpecialUse({&out}, {&in});
            }
            BUILD_SINGLE_TEMPLATE(template void toggle_bits__, (NDArray &in, NDArray &out), INTEGER_TYPES);