Shugeo cuda doc2 (#255)

* Added comments to tileKernel routine. * Refactored kernel and added doc to it. * Refactored setDiagonal kernel and added doc for it. * Added doc for tnse cuda helpers. * Added doc for diag kernels. * Added doc for kernel. * Refactored code with fake quantization. * Added docs for image resize and crop kernels. * Added docs for image suppression helpers. * Added docs to matrix_band helpers. * Added docs for matrix_diag_part and nth_element helpers. * Fixed syntax error and refactored getIndexOffset usage.
2019-09-11 21:04:43 +03:00 · 2019-09-11 21:04:43 +03:00 · e1a7460f8e
commit e1a7460f8e
parent 589401477d
13 changed files with 336 additions and 263 deletions
--- a/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu
+++ b/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu
@ -23,15 +23,28 @@
 namespace nd4j {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // set up given value to upper diagonal given
 // buffer - input buffer
 // shape - input shape
 // value - given value
 // diagonal - given upper diagonal (acceptable negative values also, 0 - the main diagonal)
 // row, cols - height and width of given matrix (MxN, rows = M, cols = N)
 //
    template <typename T>
    static __global__ void setDiagValueUpperKernel(void* buffer, Nd4jLong* shape, T value, int diagonal, Nd4jLong rows,
            Nd4jLong cols) {
-        Nd4jLong  rank = shape::rank(shape);
+        __shared__ Nd4jLong  rank;
-        int totalThreads = blockDim.x;
+        __shared__ T* array;
-        T* array = reinterpret_cast<T*>(buffer);
+
        if (0 == threadIdx.x) {
            rank = shape::rank(shape);
            array = reinterpret_cast<T *>(buffer);
        }
        __syncthreads();
        for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) {
-            for (int j = threadIdx.x; j < cols; j += totalThreads) {
+            for (int j = threadIdx.x; j < cols; j += blockDim.x) {
                Nd4jLong coords[2] = {i, j};
                Nd4jLong xOffset = shape::getOffset(shape, coords);
                if (i + diagonal <= j)
@ -40,6 +53,13 @@ namespace nd4j {
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // set up given value to lower given diagonal
 // buffer - input buffer
 // shape - input shape
 // value - given value
 // diagonal - given lower diagonal (acceptable negative values also, 0 - the main diagonal)
 // row, cols - height and width of given matrix (MxN, rows = M, cols = N)
 //
    template <typename T>
    static __global__ void setDiagValueLowerKernel(void* buffer, Nd4jLong* shape, T value, int diagonal, Nd4jLong rows, Nd4jLong cols) {
@ -96,7 +116,4 @@ namespace nd4j {
            int diagonal, Nd4jLong rows, Nd4jLong cols, cudaStream_t& stream), LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
--- a/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu
+++ b/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu
@ -23,22 +23,31 @@
 namespace nd4j {
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // kernel to swap two NDArrays vals as linear sequences
    // input - theSecondBuffer/Shape from input NDArray
    // output - theFirstBuffer/Shape from input NDArray
    template <typename T>
    static __global__ void swapUnsafeKernel(void* theFirstBuffer, Nd4jLong* theFirstShape, void* theSecondBuffer, Nd4jLong* theSecondShape) {
        auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        int totalThreads = gridDim.x * blockDim.x;
-        Nd4jLong resultLength = shape::length(theFirstShape);
+
-        //const auto resultLength = shape::length(outputShape);
+        __shared__ Nd4jLong resultLength;
-//        if (shape::order(outputShape) == 'c') {           //  ews == 1 always here
+        __shared__ T* input;
        __shared__ T* output;
        if (0 == threadIdx.x) {
           resultLength = shape::length(theFirstShape);
           input = reinterpret_cast<T*>(theSecondBuffer);
           output = reinterpret_cast<T*>(theFirstBuffer);
        }
        __syncthreads();
        for (int i = tid; i < resultLength; i += totalThreads) {
            auto xEws = shape::order(theFirstShape)  == 'c'? shape::elementWiseStride(theFirstShape) :1;
            auto yEws = shape::order(theSecondShape) == 'c'? shape::elementWiseStride(theSecondShape):1;
-            //if (shape::order(theFirstShape) ==)
+
            auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape);
            auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape);
-            T temp = *(reinterpret_cast<T*>(theFirstBuffer) + xOffset);
+            nd4j::math::nd4j_swap(output[xOffset], input[yOffset]);
            *(reinterpret_cast<T*>(theFirstBuffer) + xOffset) = *(reinterpret_cast<T*>(theSecondBuffer) + yOffset);
            *(reinterpret_cast<T*>(theSecondBuffer) + yOffset) = temp;
        }
    }
--- a/libnd4j/include/loops/cuda/specials/tileKernel.cu
+++ b/libnd4j/include/loops/cuda/specials/tileKernel.cu
@ -33,16 +33,19 @@ namespace nd4j {
        return shape::length(shapeInfo);
    }
-////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //  tileKernel:
 //  input: (inputBuffer and inputShape) - NDArray buffer and shape to tile
 //  output: (outputBuffer and outputShape) - NDArray to tile input
 //  resultLength - length for output array
    template<typename T>
    static __global__ void
    tileKernel(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape,
               Nd4jLong resultLength) {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //        Original code to transform in cuda-based
-        auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+        auto tid = blockIdx.x * blockDim.x + threadIdx.x; // copy linear sequence of elements, so one-level threading
        int totalThreads = gridDim.x * blockDim.x;
        //const auto resultLength = shape::length(outputShape);
        if (shape::order(outputShape) == 'c') {           //  ews == 1 always here
            for (int i = tid; i < resultLength; i += totalThreads) {
                auto yOffset = _subArrayOffset(i, outputShape, inputShape);
@ -60,6 +63,7 @@ namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template __global__ void tileKernel,(void const* inputBuffer, Nd4jLong* inputShape, void* outputBuffer, Nd4jLong* outputShape, Nd4jLong resultLength), LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template<typename T>
    void tileKernelH(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape, Nd4jLong resultLength, cudaStream_t *stream) {
        dim3 launchDims(256, 512, 8192);
@ -68,6 +72,8 @@ namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template void tileKernelH, (void const* inputBuffer, Nd4jLong* inputShape, void* outputBuffer, Nd4jLong* outputShape, Nd4jLong resultLength, cudaStream_t *stream), LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // enhancement for tileKernel to different input and output data types: X - output type, Y - input type
    template<typename X, typename Y>
    static __global__ void
    tileKernelDouble(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape, Nd4jLong resultLength, Nd4jLong ews) {
--- a/libnd4j/include/ops/declarable/helpers/cuda/BarnesHutTsne.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/BarnesHutTsne.cu
@ -23,7 +23,12 @@
 namespace nd4j {
 namespace ops {
 namespace helpers {
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // count rows kernel - count input pRows and pCols and put result onto pRowCounts
 // pRowCounts - array of ints, with length N
 // pRows - array of ints with length N, vals from 0 to N-1
 // pCols - array of ints with length < N and vals between 0 and max(pRows)
 //
    static __global__ void countRowsKernel(int* pRowCounts, int const* pRows, int const* pCols, Nd4jLong N) {
        auto start = blockIdx.x * blockDim.x;
        auto step = blockDim.x * gridDim.x;
@ -32,19 +37,22 @@ namespace helpers {
            int end = pRows[n + 1];//rowP->e<int>(n + 1);
            for (int i = begin; i < end; i++) {
                bool present = false;
                // loop between near pRows
                for (int m = pRows[pCols[i]]; m < pRows[pCols[i] + 1]; m++)
-                    if (pCols[m] == n) {
+                    if (pCols[m] == n) { // mark index as existed with columns array
                        present = true;
                        break;
                    }
                atomicAdd(&pRowCounts[n], 1);
-                if (!present)
+                if (!present) // increment row counter for given index
                    atomicAdd(&pRowCounts[pCols[i]], 1);
            }
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // row counter caller
    Nd4jLong barnes_row_count(const NDArray* rowP, const NDArray* colP, Nd4jLong N, NDArray& rowCounts) {
        int* pRowCounts = reinterpret_cast<int*>(rowCounts.specialBuffer());
@ -58,12 +66,18 @@ namespace helpers {
        return numElements;
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // extend symRowP with pRowCounts array vals
 //  pRowCounts - int array with length N
 //  symRowP - int array with length N+1
 //  N - given array length
 //
    static __global__ void fillUpsymRow(int const* pRowCounts, int* symRowP, int N) {
        auto start = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = blockDim.x * gridDim.x;
-        for (int n = start; n < N + 1; n += step) {
+        for (int n = start; n < N + 1; n += step) { // to avoid race condition use shift only for given index
            symRowP[n] = 0;
            for (int i = 0; i < n; i++)
                atomicAdd(&symRowP[n], pRowCounts[i]);
@ -71,6 +85,17 @@ namespace helpers {
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //  symmetrize routine kernel
 // pRows - rows buffer (ints)
 // pCols - column buffer (ints) with vals between 0 and max(pRows)
 // pVals - values vector (floats)
 // symRowP - ints, shifted pRows
 // symColP - ints, shifted pCols,
 // offset - ints, shitfs
 // pOutput - result matrix (floats)
 // N - pRows length
 //
    template <typename T>
    static __global__ void symmetrizeKernel(int const* pRows, int const* pCols, T const* pVals, int* symRowP, int* symColP, int* offset, T* pOutput, int N) {
        auto start = blockIdx.x * blockDim.x + threadIdx.x;
@ -86,7 +111,6 @@ namespace helpers {
                int start = pRows[colPI];
                int end = pRows[colPI + 1];
                //PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(offset))
                for (int m = start; m < end; m++) {
                    if (pCols[m] == n) {
                        present = true;
@ -101,14 +125,10 @@ namespace helpers {
                // If (colP[i], n) is not present, there is no addition involved
                if (!present) {
                    //int colPI = pCols[i];
                    //if (n <= colPI) {
                    symColP[symRowP[n] + offset[n]] = colPI;
                    symColP[symRowP[pCols[i]] + offset[colPI]] = n;
                    pOutput[symRowP[n] + offset[n]] = pVals[i];
                    pOutput[symRowP[colPI] + offset[colPI]] = pVals[i];
                    //}
                }
                // Update offsets
                if (!present || (present && n <= colPI)) {
@ -119,16 +139,18 @@ namespace helpers {
                }
            }
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // symmetrize algorithm itself
 //
    template <typename T>
    static void barnes_symmetrize_(const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts) {
        int const* pRows = reinterpret_cast<int const*>(rowP->getSpecialBuffer());
        int* symRowP = reinterpret_cast<int*>(outputRows->specialBuffer());
        int* pRowCounts = reinterpret_cast<int*>(rowCounts->specialBuffer());
        auto stream = outputCols->getContext()->getCudaStream();
-
+        // fill up syRowP array
        fillUpsymRow<<<1, N, 128, *stream>>>(pRowCounts, symRowP, N);
        outputRows->syncToHost();
 //        outputRows->printBuffer("output rows");
@ -140,15 +162,23 @@ namespace helpers {
        //std::vector<int> rowCountsV = rowCounts->getBufferAsVector<int>();
        auto offsetArr = NDArrayFactory::create<int>('c', {N});
        int* offset = reinterpret_cast<int*>(offsetArr.specialBuffer());
        // symmetrize itself
        symmetrizeKernel<T><<<1, 1, 1024, *stream>>>(pRows, pCols, pVals, symRowP, symColP, offset, pOutput, N);
 //PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(guided) shared(offset))
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // symmetrize caller and adoption
 //
    void barnes_symmetrize(const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts) {
        BUILD_SINGLE_SELECTOR(valP->dataType(), barnes_symmetrize_, (rowP, colP, valP, N, outputRows, outputCols, outputVals, rowCounts), NUMERIC_TYPES);
        *outputVals /= 2.0;
    }
    BUILD_SINGLE_TEMPLATE(template void barnes_symmetrize_, (const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts), NUMERIC_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // edge forces implementation
 //
    template <typename T>
    static __global__ void edgeForcesKernel(int const* pRows, int const* pCols, T const* dataP, T const* vals, T* outputP, int N, int colCount, int rowSize) {
 //        std::vector<T> buffer(colCount);
@ -172,10 +202,12 @@ namespace helpers {
                for (int k = 0; k < colCount; k++)
                    math::atomics::nd4j_atomicAdd(&outputP[shift + k], T((dataP[shift + k] - thisSlice[k]) * res));
            }
            //atomicAdd(&shift, colCount);
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // edge forces algorithm
 //
    template <typename T>
    static void barnes_edge_forces_(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray const* data, NDArray* output) {
        NDArray::prepareSpecialUse({output}, {data, rowP, colP, valP, valP});
@ -191,18 +223,22 @@ namespace helpers {
        edgeForcesKernel<T><<<1, 128, 1024, *stream>>>(pRows, pCols, dataP, vals, outputP, N, colCount, rowSize);
        NDArray::registerSpecialUse({output}, {rowP, colP, valP, data});
    }
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // edge forces caller
 //
    void barnes_edge_forces(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray* output, NDArray const& data) {
        // Loop over all edges in the graph
        BUILD_SINGLE_SELECTOR(output->dataType(), barnes_edge_forces_, (rowP, colP, valP, N, &data, output), FLOAT_TYPES);
    }
    BUILD_SINGLE_TEMPLATE(template void barnes_edge_forces_, (const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray const* data, NDArray* output), FLOAT_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // gains - run a function T((x + 2.) * nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps));
 // for all members in input and put all in output
 //
    template <typename T>
    void barnes_gains_(NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output) {
        auto gainsInternal = LAMBDA_TTT(x, grad, eps) {
 //            return T((x + 2.) * nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps));
            //return T((x + 2.) * nd4j::math::nd4j_sign<T,T>(grad) == nd4j::math::nd4j_sign<T,T>(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign<T,T>(grad) == nd4j::math::nd4j_sign<T,T>(eps));
            T res = nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps) ? x + T(.2) : x * T(.8);
            if(res < .01) res = .01;
            return res;
@ -211,14 +247,20 @@ namespace helpers {
        input->applyTriplewiseLambda(gradX, epsilon, gainsInternal, output);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // gains caller
    void barnes_gains(NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output) {
        BUILD_SINGLE_SELECTOR(input->dataType(), barnes_gains_, (input, gradX, epsilon, output), NUMERIC_TYPES);
    }
    BUILD_SINGLE_TEMPLATE(template void barnes_gains_, (NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output), NUMERIC_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // cell contains - check cells for given point
 //
    bool cell_contains(NDArray* corner, NDArray* width, NDArray* point, Nd4jLong dimension) {
        auto  cornerMinusWidth = *corner - *width;
        auto cornerPlusWidth = *corner + *width;
        // executes on host side, so sync all to host memory
        cornerMinusWidth.syncToHost();
        cornerPlusWidth.syncToHost();
        for (Nd4jLong i = 0; i < dimension; i++) {
--- a/libnd4j/include/ops/declarable/helpers/cuda/diag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/diag.cu
@ -24,7 +24,14 @@
 namespace nd4j {
 namespace ops {
 namespace helpers {
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // diag functor cuda kernel
 // outputBuffer - output tensor buffer
 // outputShape - output tensor shape
 // inputBuffer - input tensor buffer - this tensor should be placed on diagonal position of output
 // inputShape - input tensor shape
 // inputLength - length for input tensor
 //
 template <typename T>
 static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputShape, void const* inputBuffer, Nd4jLong* inputShape, Nd4jLong inputLength) {
    __shared__ T *z;
@ -41,12 +48,22 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    const auto step = gridDim.x * blockDim.x;
-    for (int t = tid; t < inputLength; t += step) {
+
    for (int t = tid; t < inputLength; t += step) { // for all vals in input, put all on diagonal position to output
        z[shape::getIndexOffset(t * (inputLength + 1), outputShape)] = x[shape::getIndexOffset(t, inputShape)]; //tX];
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // diag part functor cuda kernel
 // outputBuffer - output tensor buffer - linear sequence of diagonal values
 // outputShape - output tensor shape
 // inputBuffer - input tensor buffer - this tensor should be placed on diagonal position of output
 // inputShape - input tensor shape
 // outputLength - given length of output
 // inputLength - given length for input tensor
 //
    template <typename T>
    static __global__ void diagPartFunctorKernel(void* outputBuffer, Nd4jLong* outputShape, void const* inputBuffer, Nd4jLong* inputShape, Nd4jLong outputLength, Nd4jLong inputLength) {
        __shared__ T *z;
@ -61,10 +78,11 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
        const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;
-        Nd4jLong i = threadIdx.x * (outputLength + 1);
+        Nd4jLong i = threadIdx.x * (outputLength + 1); // pos to diagonal value
-        for (int t = tid; t < outputLength && i < inputLength; t += step) {
+        for (int t = tid; t < outputLength && i < inputLength; t += step) { // loop by output, but input matrix may not be square
-            z[shape::getIndexOffset(t, outputShape)] = x[shape::getIndexOffset(i, inputShape)]; //tX];
+            // put diagonal val from input onto output
-            i += outputLength + 1;
+            z[shape::getIndexOffset(t, outputShape)] = x[shape::getIndexOffset(i, inputShape)]; 
            i += outputLength + 1; // shift to next diagonal value
        }
    }
@ -81,6 +99,8 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
        diagFunctorKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), inputLength);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // diagFunctor - caller for diag functor processor
    void diagFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
        auto xType = input->dataType();
@ -89,6 +109,8 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
    BUILD_SINGLE_TEMPLATE(template void _diagFunctor, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output);, LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // diagPartFunctor - caller for diag part functor kernel
    template <typename T>
    void _diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) {
        const int outLen = output->lengthOf();
@ -102,7 +124,8 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
        diagPartFunctorKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), outLen, inLen);
    }
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // diagPartFunctor - caller for diag part functor processor
    void diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) {
        auto zType = output->dataType();
        BUILD_SINGLE_SELECTOR(zType, _diagPartFunctor, (context, input, output), NUMERIC_TYPES);
--- a/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu
@ -28,69 +28,31 @@
 namespace nd4j {
 namespace ops {
 namespace helpers {
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//    template <typename T>
+// extract patches kernel
-//    static __global__ void globalExtractPatchesKernel(bool theSame, int batchCount, int sizeRow, int sizeCol, int rowDim, int colDim, int outRowDim, int outColDim, int strideRow, int strideCol, int rateRow, int rateCol, int rowCast, int colCast, int lastDim, T* input, Nd4jLong* patchShape, Nd4jLong* inputOffsets, T* output, Nd4jLong* outTadShape, Nd4jLong* outputOffsets) {
+//      - theSame - SAME or VALID - output format
-//    //globalExtractPatches_(void *vinput, Nd4jLong *xTadShape, Nd4jLong *xTadOffsets, void *voutput, Nd4jLong *zTadShape, Nd4jLong *zTadOffsets, const int numTads, const int sizeRow, const int sizeCol, const int stradeRow, const int stradeCol, const int rateRow, const int rateCol, const bool theSame, const int lastDim, const int rowDim, const int colDim) {
+//      - batchCount - batches - the first dimension of input
-//        const int warpSize = lastDim;
+//      - sizeRow, sizeCol - rows and cols sizes for batch
-//        const int tid = blockIdx.x * gridDim.x + threadIdx.x;
+//      - rowDim, colDim - rows and cols dimensions for input patches
-//        const int warpIdx = tid / warpSize;
+//      - outRowDim, outColDim - rows and cols dimensions for output patches
-//        const int warpPos = tid % warpSize;
+//      - strideRow, strideCol - step between input elements with patches
-//        const int numWarps = 1; //(gridDim.x * blockDim.x) / warpSize;
+//      - rateRow, rateCol - counts for input patches
-//        const int patchLength = shape::length(outTadShape);
+//      - rowCast, colCast - shifts for output placement (1 or 0)
 //      - lastDim - last dimension of input/output
 //      - input - input tensor buffer
 //      - patchShape - input patch TAD shape
 //      - inputOffsets - input TAD offsets
 //      - output - output tensor buffer
 //      - outTadShape - output TAD shape
 //      - outputOffsets - output TAD offsets
 //
 //        auto xShape = shape::shapeOf(patchShape);
 //        auto xStride = shape::stride(patchShape);
 //        auto xRank = shape::rank(patchShape);
 //
 //        for (int e = 0; e < batchCount; e += numWarps) {
 //            auto patch = input + inputOffsets[e];
 //            auto matrix = output + outputOffsets[e];
 //            int iter = 0;
 //
 //            for (Nd4jLong i = 0; i < outRowDim; i++) {
 //                for (Nd4jLong j = 0; j < outColDim; j++) {
 //                    Nd4jLong pos = 0;
 //                    //for (Nd4jLong k = 0; k < outputLastDim; k++) {
 //                    auto rowStart = i * strideRow - (theSame?rowCast:0);
 //                    auto colStart = j * strideCol - (theSame?colCast:0);
 //                    auto rowEnd = rowStart + sizeRow * rateRow;
 //                    auto colEnd = colStart + sizeCol * rateCol;
 //                    if (!theSame) {
 //                        rowEnd = math::nd4j_min(int(rowStart + sizeRow * rateRow), rowDim);
 //                        colEnd = math::nd4j_min(int(colStart + sizeCol * rateCol), colDim);
 //                    }
 //                    //auto pixel = 0LL;
 //                    for (auto row = rowStart; row < rowEnd; row += rateRow)
 //                        for (auto col = colStart; col < colEnd; col += rateCol)
 //                            for (auto pixel = 0; pixel < lastDim; pixel++) {
 //                                Nd4jLong zPos[] = {i, j, pos};
 //                                Nd4jLong xPos[] = {row, col, pixel};
 //                                auto zIndex = shape::getOffset(outTadShape, zPos);
 //                                auto xIndex = shape::getOffset(patchShape, xPos);
 //                                if (theSame) { // SAME case
 //                                    if (row >= 0 && col >= 0 && row < rowDim && col < colDim)
 //                                        matrix[zIndex] = patch[xIndex]; //outMatrix->p<T>(i, j, pos, patch->e<T>(row, col, pixel));
 //                                    //pos++;
 //                                }
 //                                else { // VALID case
 //                                    matrix[zIndex] = patch[xIndex]; //outMatrix->p<T>(i, j, pos++, patch->e<T>(row, col, pixel));
 //                                }
 //                                pos++;
 //                            }
 //                }
 //            }
 //            __syncthreads();
 //        }
 //    }
    template <typename T>
    static __global__ void globalExtractPatchesKernel(bool theSame, int batchCount, int sizeRow, int sizeCol, int rowDim, int colDim, int outRowDim, int outColDim, int strideRow, int strideCol, int rateRow, int rateCol, int rowCast, int colCast, int lastDim, T* input, Nd4jLong* patchShape, Nd4jLong* inputOffsets, T* output, Nd4jLong* outTadShape, Nd4jLong* outputOffsets) {
        auto start = threadIdx.x + blockIdx.x * blockDim.x;
        auto step = blockDim.x * gridDim.x;
-
+        // batch  input by 3 last dims and extrapole input onto output with outColDim/outRowDim
        for (Nd4jLong batch = start; batch < batchCount; batch += step) {
            auto patch = input + inputOffsets[batch];// listOfMatricies->at(batch);
            auto outMatrix = output + outputOffsets[batch]; //listOfOutputs->at(batch);
@ -98,7 +60,6 @@ namespace helpers {
            for (Nd4jLong i = 0; i < outRowDim; i++) {
                for (Nd4jLong j = 0; j < outColDim; j++) {
                    Nd4jLong pos = 0;
                    //for (Nd4jLong k = 0; k < outputLastDim; k++) {
                    auto rowStart = i * strideRow - (theSame?rowCast:0);
                    auto colStart = j * strideCol - (theSame?colCast:0);
                    auto rowEnd = rowStart + sizeRow * rateRow;
@ -107,25 +68,29 @@ namespace helpers {
                        rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, Nd4jLong (rowDim));
                        colEnd = math::nd4j_min(colStart + sizeCol * rateCol, Nd4jLong (colDim));
                    }
-                    //auto pixel = 0LL;
+
-                    for (auto row = rowStart; row < rowEnd; row += rateRow)
+                    for (auto row = rowStart; row < rowEnd; row += rateRow) {
-                        for (auto col = colStart; col < colEnd; col += rateCol)
+                        for (auto col = colStart; col < colEnd; col += rateCol) {
                            for (auto pixel = 0; pixel < lastDim; pixel++) {
                                Nd4jLong zPos[] = {i, j, pos};
                                Nd4jLong xPos[] = {row, col, pixel};
-                                bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame);
+                                bool setUp =
                                        (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame);
                                if (setUp) { // VALID or SAME cases
                                    outMatrix[shape::getOffset(outTadShape, zPos)] = patch[shape::getOffset(patchShape, xPos)];
                                }
                                pos++;
                            }
                        }
                    }
                }
            }
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template <typename T>
    static void _extractPatches(nd4j::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int strideRow, int strideCol, int rateRow, int rateCol, bool theSame){
        NDArray::prepareSpecialUse({output}, {images});
@ -141,36 +106,29 @@ namespace helpers {
        Nd4jLong colDim = images->sizeAt(2);
        Nd4jLong outRowDim = output->sizeAt(1);
        Nd4jLong outColDim = output->sizeAt(2);
-        auto rowCast = 1; //(sizeRow - 1)*rateRow < outRowDim/sizeRow  ?0:1;///(ksize * lastDim > rowDim * ksizeColsEffective + lastDim?1:0);
+        auto rowCast = 1;
-        auto colCast = 1; //colDim / ksizeColsEffective +2 <= sizeCol?0:1;//(ksize * lastDim > ksizeRowsEffective * colDim + lastDim?1:0);
+        auto colCast = 1;
        // validate shifts
        if (sizeRow * rateRow < 3)
            rowCast = 0;
        if (sizeCol * rateCol < 3)
            colCast = 0;
        //images->tickReadDevice();
        //if (images->isActualOnDeviceSide())
        //images->syncToDevice();
        auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(images->getShapeInfo(), restDims.data(), restDims.size());
        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), restDims.data(), restDims.size());
-        int batchCount = packX.numberOfTads(); //'listOfMatricies->size(); //lengthOf() / ksize;
+        int batchCount = packX.numberOfTads();
-        //printf("Batch Count is %d\n", batchCount);
+
        //shape::printShapeInfo(packX.primaryShapeInfo());
        //NDArray::prepareSpecialUse({output}, {images});
        PointersManager manager(context, "helpers::extractPatches");
        auto stream = context->getCudaStream();
        auto imagesBuffer = reinterpret_cast<T*>(images->specialBuffer());
        auto outputBuffer = reinterpret_cast<T*>(output->specialBuffer());
-        //images->printIndexedBuffer("INPUT");
+
 //        globalExtractPatchesKernel<T><<<512, 512, 1024, *context->getCudaStream()>>>(theSame, batchCount, sizeRow, sizeCol,
        globalExtractPatchesKernel<T><<<128, 128, 1024, *stream>>>(theSame, batchCount, sizeRow, sizeCol,
                rowDim, colDim, outRowDim, outColDim, strideRow, strideCol, rateRow, rateCol, rowCast, colCast, lastDim,
                imagesBuffer, packX.specialShapeInfo(), packX.specialOffsets(), outputBuffer, packZ.specialShapeInfo(),
                packZ.specialOffsets());
-        //extractPatchesKernel<T><<<batchCount, 512, 1024, *stream>>>(theSame, batchCount, sizeRow, sizeCol, rowDim, colDim, outRowDim, outColDim, stradeRow, stradeCol, rateRow, rateCol, rowCast, colCast, lastDim, imagesBuffer, packX.specialShapeInfo(), packX.platformOffsets(), outputBuffer, packZ.specialShapeInfo(), packZ.platformOffsets());
+
        //output->tickWriteDevice();
        //output->printIndexedBuffer("OUTPUT");
        manager.synchronize();
        NDArray::registerSpecialUse({output}, {images});
    }
@ -183,76 +141,6 @@ namespace helpers {
        BUILD_SINGLE_SELECTOR(xType, _extractPatches, (context, images, output, sizeRow, sizeCol, stradeRow, stradeCol, rateRow, rateCol, theSame), LIBND4J_TYPES);
    }
 //        std::vector<int> restDims({1, 2, 3}); // the first and the last dims
 //        std::unique_ptr<ResultSet> listOfMatricies(images->allTensorsAlongDimension(restDims));
 //        std::unique_ptr<ResultSet> listOfOutputs(output->allTensorsAlongDimension(restDims));
 //        // 3D matricies - 2D matricies of vectors (if last dim is greater than 1)
 //        //int e = 0;
 //        const int ksizeRowsEffective = sizeRow + (sizeRow - 1) * (rateRow - 1);
 //        const int ksizeColsEffective = sizeCol + (sizeCol - 1) * (rateCol - 1);
 //        const int ksize = ksizeRowsEffective * ksizeColsEffective;
 //        int batchCount = listOfMatricies->size(); //lengthOf() / ksize;
 //        Nd4jLong lastDim = images->sizeAt(3);
 //        Nd4jLong outLastDim = output->sizeAt(3);
 //        Nd4jLong rowDim = images->sizeAt(1);
 //        Nd4jLong colDim = images->sizeAt(2);
 //        Nd4jLong outRowDim = output->sizeAt(1);
 //        Nd4jLong outColDim = output->sizeAt(2);
 //        auto rowCast = 1; //(sizeRow - 1)*rateRow < outRowDim/sizeRow  ?0:1;///(ksize * lastDim > rowDim * ksizeColsEffective + lastDim?1:0);
 //        auto colCast = 1; //colDim / ksizeColsEffective +2 <= sizeCol?0:1;//(ksize * lastDim > ksizeRowsEffective * colDim + lastDim?1:0);
 //        if (sizeRow * rateRow < 3)
 //            rowCast = 0;
 //        if (sizeCol * rateCol < 3)
 //            colCast = 0;
 //        //Nd4jLong outputLastDim = output->sizeAt(3);
 //       PRAGMA_OMP_PARALLEL_FOR
 //        for (Nd4jLong batch = 0; batch < batchCount; batch++) {
 //            auto patch = listOfMatricies->at(batch);
 //            auto outMatrix = listOfOutputs->at(batch);
 //            //auto patchBorder = patch->sizeAt(0);
 //            if (theSame) { // SAME case
 //                for (Nd4jLong i = 0; i < outRowDim; i++) {
 //                    for (Nd4jLong j = 0; j < outColDim; j++) {
 //                        Nd4jLong pos = 0;
 //                        //for (Nd4jLong k = 0; k < outputLastDim; k++) {
 //                        auto rowStart = i * strideRow - rowCast;
 //                        auto colStart = j * strideCol - colCast;
 //                        auto rowEnd = rowStart + sizeRow * rateRow;
 //                        auto colEnd = colStart + sizeCol * rateCol;
 //                        auto pixel = 0LL;
 //                        for (auto row = rowStart; row < rowEnd; row += rateRow)
 //                            for (auto col = colStart; col < colEnd; col += rateCol)
 //                                for (auto pixel = 0; pixel < lastDim; pixel++) {
 //                                    if (row >=0 && col >= 0 && row < rowDim && col < colDim)
 //                                    outMatrix->p<T>(i, j, pos, patch->e<T>(row, col, pixel));
 //                                    pos++;
 //                                }
 //                        //}
 //                    }
 //                }
 //
 //            } else { // VALID case
 //                for (Nd4jLong i = 0; i < outRowDim; i++) {
 //                    for (Nd4jLong j = 0; j < outColDim; j++) {
 //                        Nd4jLong pos = 0;
 //                        //for (Nd4jLong k = 0; k < outputLastDim; k++) {
 //                            auto rowStart = i * strideRow;
 //                            auto colStart = j * strideCol;
 //                            auto rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, rowDim);
 //                            auto colEnd = math::nd4j_min(colStart + sizeCol * rateCol, colDim);
 //                            auto pixel = 0LL;
 //                            for (auto row = rowStart; row < rowEnd; row += rateRow)
 //                                for (auto col = colStart; col < colEnd; col += rateCol)
 //                                    for (auto pixel = 0; pixel < lastDim; pixel++)
 //                                        outMatrix->p<T>(i,j,pos++, patch->e<T>(row, col, pixel));
 //                        //}
 //                    }
 //                }
 //            }
 //        }
 //
 //
 //
 }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
@ -24,70 +24,68 @@
 namespace nd4j {
 namespace ops {
 namespace helpers {
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // fakeQuantWithMinMaxVars_
 // input - input tensor
 // min - min scalar tensor
 // max - max scalar tensor
 // numBits - (default 16bit)
 // narrowed - shrink is true
 // output - output tensor
 //
    template <typename T>
    void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
        int lowIntBound = narrowed?1:0;
        int upperIntBound = 1 << numBits - 1;
-
+        min->syncToHost();
        max->syncToHost();
        const float quant_min_float = static_cast<float>(lowIntBound);
        const float quant_max_float = static_cast<float>(upperIntBound);
        T scale = (max->t<T>(0) - min->t<T>(0)) / (quant_max_float - quant_min_float);
-        const T zero_point_from_min = quant_min_float - min->e<T>(0) / scale;
+        const T zero_point_from_min = quant_min_float - min->t<T>(0) / scale;
        const uint16_t nudged_zero_point = [zero_point_from_min, lowIntBound,
                                        quant_min_float, upperIntBound,
                                        quant_max_float] {
-        if (zero_point_from_min < quant_min_float) {
+            if (zero_point_from_min < quant_min_float) {
-          return static_cast<uint16_t>(lowIntBound);
+                return static_cast<uint16_t>(lowIntBound);
-        }
+            }
-        if (zero_point_from_min > quant_max_float) {
+            if (zero_point_from_min > quant_max_float) {
-          return static_cast<uint16_t>(upperIntBound);
+                return static_cast<uint16_t>(upperIntBound);
-        }
+            }
-        return static_cast<uint16_t>(roundf(zero_point_from_min));
+            return static_cast<uint16_t>(roundf(zero_point_from_min));
        }();
        auto nudged_min = (quant_min_float - nudged_zero_point) * (scale);
        auto nudged_max = (quant_max_float - nudged_zero_point) * (scale);
-        //input->applyScalar(scalar::CompareAndSet, nudged_max, clamped, nullptr); //.cwiseMin(nudged_max).cwiseMax(nudged_min);
+
        //input->applyScalar(scalar::CompareAndSet, nudged_min, clamped, nullptr); //.cwiseMin(nudged_max).cwiseMax(nudged_min);
        auto wiseMax = LAMBDA_T(x, nudged_min) {
            if (x < nudged_min) {
                return nudged_min;
            }
            return x;
        };
        auto wiseMin = LAMBDA_T(x, nudged_max) {
            if (x > nudged_max) {
                return nudged_max;
            }
            return x;
        };
-        auto scaleTensor(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace());
+
-        auto clamped(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace());
+        auto scaleTensor(*input);
        auto clamped(*input);
        scaleTensor.assign(scale);
        input->applyLambda(wiseMin, &clamped);
-//        const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
+
        clamped.applyLambda(wiseMax, output);
 //        const auto clamped_shifted = clamped - nudged_min;
        *output -= nudged_min;
-       // auto nudgedScale = scale;
+
        (*output) /= scaleTensor;
        (*output) += T(0.5f);
        output->applyTransform(transform::Floor, nullptr, nullptr);
        (*output) *= scaleTensor;
        (*output) += nudged_min;
        //output->printIndexedBuffer("FAKE QUANTED");
        /*
    const auto nudged_scale_repl = inputs.constant(nudged_scale);
    const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
    const auto clamped_shifted = clamped - nudged_min;
    *output = (clamped_shifted / nudged_scale_repl + 0.5f).floor() *
                            nudged_scale_repl +
                        nudged_min;
 */
    }
    void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
--- a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu
@ -32,7 +32,13 @@ namespace helpers {
        // https://en.wikipedia.org/wiki/Bilinear_interpolation)
        double interpolarValue;
    };
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // computeInterpolationWeights kernel
 //      outSize - output length
 //      inSize - input size
 //      scale - input scale
 //      interporationData - result
 //
    static __global__ void computeInterpolationWeights(Nd4jLong outSize,
                                              Nd4jLong inSize,
                                              double scale,
@ -54,21 +60,26 @@ namespace helpers {
            }
        }
    }
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // resize image with bilinear interpolation algorithm
 //
    static void resizeImage(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight,
                     Nd4jLong outWidth, Nd4jLong channels,
                     BilinearInterpolationData* xs_,
                     BilinearInterpolationData* ys_,
                     NDArray* output);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // resize image with bilinear interpolation algorithm kernel
 //
    template <typename T>
    static __global__ void resizeImageKernel(T const* input, Nd4jLong const* inputShape, T* outputYptr, Nd4jLong* outputShape, Nd4jLong batchSize,
                                             Nd4jLong outWidth, Nd4jLong outHeight, Nd4jLong channels, Nd4jLong inRowSize, Nd4jLong outRowSize, Nd4jLong inBatchNumValues,
                                             BilinearInterpolationData* xs_, BilinearInterpolationData* ys_) {
-        if (blockIdx.x < batchSize) {
+        if (blockIdx.x < batchSize) { // blockIdx.x as batch index
            auto pX = input + blockIdx.x * inBatchNumValues;
-            //auto pZ = output_y_ptr;
+
            auto channelStart = blockIdx.z * blockDim.z + threadIdx.z;
            auto step = blockDim.z * gridDim.z;
            for (Nd4jLong y = threadIdx.x; y < outHeight; y += blockDim.x) {
@ -80,6 +91,7 @@ namespace helpers {
                    auto xsBottom = xs_[x].bottomIndex;
                    auto xsTop = xs_[x].topIndex;
                    auto xVal = xs_[x].interpolarValue;
                    // process interpolation for all channels
                    for (int c = channelStart; c < channels; c += step) {
                        double topLeft(ys_input_lower_ptr[xsBottom + c]);
                        double topRight(ys_input_lower_ptr[xsTop + c]);
@ -87,13 +99,15 @@ namespace helpers {
                        double bottomRight(ys_input_upper_ptr[xsTop + c]);
                        double top = topLeft + (topRight - topLeft) * xVal;
                        double bottom = bottomLeft + (bottomRight - bottomLeft) * xVal;
-                        pZ[x * channels + c] = top + (bottom - top) * yVal;
+                        pZ[x * channels + c] = T(top + (bottom - top) * yVal);
                    }
                }
            }
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // resize image with
    template <typename T>
    static void resizeImage_(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight,
                     Nd4jLong outWidth, Nd4jLong channels,
@ -111,7 +125,8 @@ namespace helpers {
                outWidth, outHeight, channels, inRowSize, outRowSize, inBatchNumValues, xs_, ys_);
    }
-    template <typename T>
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        template <typename T>
    static int resizeBilinearFunctor_(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) {
        const Nd4jLong batchSize = images->sizeAt(0);
        const Nd4jLong inHeight = images->sizeAt(1);
@ -174,6 +189,10 @@ namespace helpers {
        return Status::OK();
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // resize by interpolation nearest neighbor algorithm kernel
 //
    template <typename T>
    static __global__ void resizeNeighborKernel(T const* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape,
            Nd4jLong batchSize, Nd4jLong inWidth, Nd4jLong inHeight, Nd4jLong outWidth, Nd4jLong outHeight, Nd4jLong channels, double widthScale, double heightScale, bool center) {
@ -206,6 +225,9 @@ namespace helpers {
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // resizeNeighborFunctor - main algorithm by nearest neighbor
 //
    template <typename T>
    int resizeNeighborFunctor_(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) {
        const Nd4jLong batchSize = images->sizeAt(0);
@ -243,10 +265,11 @@ namespace helpers {
                batchSize, inWidth, inHeight, outWidth, outHeight, channels, widthScale, heightScale, center);
        NDArray::registerSpecialUse({output}, {images});
        return ND4J_STATUS_OK;
        return Status::OK();
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // resizeImage - resize bilinear algorithm caller
 //
    void resizeImage(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight,
            Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth, Nd4jLong channels, BilinearInterpolationData* xs_,
            BilinearInterpolationData* ys_, NDArray* output) {
@ -257,21 +280,25 @@ namespace helpers {
            Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth,
            Nd4jLong channels, BilinearInterpolationData* xs_, BilinearInterpolationData* ys_, NDArray* output), LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    int resizeBilinearFunctor(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) {
        BUILD_SINGLE_SELECTOR(images->dataType(), return resizeBilinearFunctor_, (context, images, width, height, center, output), LIBND4J_TYPES);
    }
    BUILD_SINGLE_TEMPLATE(template int resizeBilinearFunctor_, (nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output), LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    int resizeNeighborFunctor(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) {
        BUILD_SINGLE_SELECTOR(images->dataType(), return resizeNeighborFunctor_, (context, images, width, height, center, output), LIBND4J_TYPES);
    }
    BUILD_SINGLE_TEMPLATE(template int resizeNeighborFunctor_, (nd4j::LaunchContext* context, NDArray const* images,
            int width, int height, bool center, NDArray* output), LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // --------------------------------------------------------------------------------------------------------------- //
    // Crop and Resize helper implementation
    // --------------------------------------------------------------------------------------------------------------- //
-    ///////
+    // cropAndResize kernel
    //
    template <typename T, typename Z, typename I>
    static __global__ void cropAndResizeKernel(T const *images, Nd4jLong* imagesShape, Z const* boxes, Nd4jLong* boxesShape,
            I const* indices, Nd4jLong* indexShape, I const* cropSize, Nd4jLong* cropShape, int method,
@ -297,7 +324,6 @@ namespace helpers {
            Z heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / Z(cropHeight - 1) : Z(0);
            Z widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / Z(cropWidth - 1) : Z(0);
 //            PRAGMA_OMP_PARALLEL_FOR_SIMD
            for (int y = threadIdx.x; y < cropHeight; y += blockDim.x) {
                const float inY = (cropHeight > 1)
                                  ? y1 * (imageHeight - 1) + y * heightScale
@ -315,6 +341,7 @@ namespace helpers {
                    }
                    continue;
                }
                if (method == 0 /* bilinear */) {
                    const int topYIndex = nd4j::math::p_floor(inY);
                    const int bottomYIndex = nd4j::math::p_ceil(inY);
@ -355,7 +382,6 @@ namespace helpers {
                            Nd4jLong zPos[] = {b, y, x, d};
                            auto zIndex = shape::getOffset(outputShape, zPos);
                            output[zIndex] = Z(top + (bottom - top) * y_lerp);
 //                            crops->p(b, y, x, d, top + (bottom - top) * y_lerp);
                        }
                    }
                } else {  // method is "nearest neighbor"
@ -383,7 +409,6 @@ namespace helpers {
                            auto zIndex = shape::getOffset(outputShape, zPos);
                            auto xIndex = shape::getOffset(imagesShape, xPos);
                            output[zIndex] = images[xIndex];
 //                            crops->p(b, y, x, d, images->e<T>(bIn, closestYIndex, closestXIndex, d));
                        }
                    }
                }
@ -392,6 +417,17 @@ namespace helpers {
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // cropAndResizeFunctor main algorithm
 //      context - launch context
 //      images - batch of images (4D tensor - [batch, width, height, pixels])
 //      boxes - 2D tensor with boxes for crop
 //      indices - 2D int tensor with indices of boxes to crop
 //      cropSize - 2D int tensor with crop box sizes
 //      method - (one of 0 - bilinear, 1 - nearest)
 //      extrapolationVal - double value of extrapolation
 //      crops - output (4D tensor - [batch, outWidth, outHeight, pixels])
 //
    template <typename T, typename Z, typename I>
    static void cropAndResizeFunctor_(nd4j::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices,
                                      NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) {
@ -416,6 +452,7 @@ namespace helpers {
        NDArray::registerSpecialUse({crops}, {images, boxes, indices, cropSize});
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) {
        BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_,
                              (context, images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES);
--- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu
@ -26,7 +26,16 @@
 namespace nd4j {
 namespace ops {
 namespace helpers {
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // needToSuppressWithThreshold - predicate for suppression
 //      boxes - boxes tensor buffer
 //      boxesShape boxes tensor shape
 //      previousIndex - index for current pos value
 //      nextIndex - index for neighbor pos value
 //      threshold - threashold value to suppress
 //
 //      return value: true, if threshold is overcome, false otherwise
 //
    template <typename T>
    static __device__ bool needToSuppressWithThreshold(T* boxes, Nd4jLong* boxesShape, int previousIndex, int nextIndex, T threshold) {
        Nd4jLong previous0[] = {previousIndex, 0};
@ -38,6 +47,8 @@ namespace helpers {
        Nd4jLong next2[] = {nextIndex, 2};
        Nd4jLong next3[] = {nextIndex, 3};
        // we have rectangle with given max values. Compute vexes of rectangle first
        T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]);
        T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]);
        T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]);
@ -47,11 +58,14 @@ namespace helpers {
        T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)],     boxes[shape::getOffset(boxesShape, next2)]);
        T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)],     boxes[shape::getOffset(boxesShape, next3)]);
        // compute areas for comparation
        T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev);
        T areaNext = (maxYNext - minYNext) * (maxXNext - minXNext);
        // of course, areas should be positive
        if (areaNext <= T(0.f) || areaPrev <= T(0.f)) return false;
        // compute intersection of rectangles
        T minIntersectionY = nd4j::math::nd4j_max(minYPrev, minYNext);
        T minIntersectionX = nd4j::math::nd4j_max(minXPrev, minXNext);
        T maxIntersectionY = nd4j::math::nd4j_min(maxYPrev, maxYNext);
@ -60,9 +74,15 @@ namespace helpers {
                nd4j::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) *
                nd4j::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f));
        T intersectionValue = intersectionArea / (areaPrev + areaNext - intersectionArea);
        // final check
        return intersectionValue > threshold;
-    };
+    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // shouldSelectKernel - compute status for all selected rectangles (boxes)
 //
 // we compute boolean flag as shared uint32 and return it on final only for the first thread
 //
    template <typename T, typename I>
    static __global__ void shouldSelectKernel(T* boxesBuf, Nd4jLong* boxesShape, I* indexBuf, I* selectedIndicesData, double threshold, int numSelected, int i, bool* shouldSelect) {
        auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@ -76,15 +96,20 @@ namespace helpers {
            if (shouldSelectShared) {
                if (needToSuppressWithThreshold(boxesBuf, boxesShape, indexBuf[i],
                                                                  indexBuf[selectedIndicesData[j]], T(threshold)))
-                    atomicCAS(&shouldSelectShared, 1, 0);
+                    atomicCAS(&shouldSelectShared, 1, 0); // exchange only when need to suppress
            }
        }
        __syncthreads();
        // final move: collect result
        if (threadIdx.x == 0) {
            *shouldSelect = shouldSelectShared > 0;
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // indices - type depended, indicesLong - type defined (only 64bit integers)
 //
    template <typename I>
    static __global__ void copyIndices(void* indices,  void* indicesLong, Nd4jLong len) {
        I* indexBuf = reinterpret_cast<I*>(indices);
@ -96,7 +121,9 @@ namespace helpers {
        for (auto i = tid; i < len; i += step)
            indexBuf[i] = (I)srcBuf[i];
    }
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // nonMaxSuppressionV2 algorithm - given from TF NonMaxSuppressionV2 implementation
 //
    template <typename T, typename I>
    static void nonMaxSuppressionV2_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) {
        auto stream = context->getCudaStream();
@ -109,8 +136,7 @@ namespace helpers {
        Nd4jPointer extras[2] = {nullptr, stream};
        sortByValue(extras, indices->buffer(), indices->shapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), scores.buffer(), scores.shapeInfo(), scores.specialBuffer(), scores.specialShapeInfo(), true);
-        // TO DO: sort indices using scales as value row
+
        //std::sort(indices.begin(), indices.end(), [scales](int i, int j) {return scales->e<T>(i) > scales->e<T>(j);});
        auto indexBuf = reinterpret_cast<I*>(indices->specialBuffer());
        NDArray selectedIndices = NDArrayFactory::create<I>('c', {output->lengthOf()});
@ -154,6 +180,7 @@ namespace helpers {
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void nonMaxSuppressionV2(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) {
        BUILD_DOUBLE_SELECTOR(boxes->dataType(), output->dataType(), nonMaxSuppressionV2_, (context, boxes, scales, maxSize, threshold, output), FLOAT_TYPES, INDEXING_TYPES);
--- a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu
@ -26,6 +26,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template <typename T>
    linkage void cubeDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) {
        auto functor = LAMBDA_TT(x, y){
@ -35,10 +36,12 @@ namespace helpers {
        input->applyPairwiseLambda(epsilon, functor, output);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void cubeDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) {
        BUILD_SINGLE_SELECTOR(theFirst->dataType(), cubeDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //return (x >= X(0.f) ? y: -y);
    template <typename T>
    linkage void reduceNorm1_(NDArray* input, NDArray* epsilon, NDArray* output) {
@ -49,10 +52,12 @@ namespace helpers {
        input->applyPairwiseLambda(epsilon, functor, output);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void reduceNorm1(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) {
        BUILD_SINGLE_SELECTOR(theFirst->dataType(), reduceNorm1_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////
    template <typename T>
    linkage void sigmCrossEntropy_(NDArray* logits, NDArray* labels, NDArray* output) {
@ -63,10 +68,12 @@ namespace helpers {
        logits->applyPairwiseLambda(labels, functor, output);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void sigmCrossEntropy(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) {
        BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropy_, (logits, labels, output), FLOAT_TYPES);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////
    template <typename T>
    linkage void sigmCrossEntropyGrad_(NDArray* logits, NDArray* labels, NDArray* output) {
@ -80,14 +87,15 @@ namespace helpers {
        logits->applyPairwiseLambda(labels, functor, output);
    }
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void sigmCrossEntropyGrad(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) {
        BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropyGrad_, (logits, labels, output), FLOAT_TYPES);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //            X f = (X) 1.0f + nd4j::math::nd4j_abs<X>(d1);
    //            return (X) d2 * ((X) 1.0f / (f * f));
-
+    //
    template <typename T>
    linkage void softSignDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) {
        auto functor = LAMBDA_TT(x, y){
@ -98,10 +106,12 @@ namespace helpers {
        input->applyPairwiseLambda(epsilon, functor, output);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void softSignDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) {
        BUILD_SINGLE_SELECTOR(theFirst->dataType(), softSignDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template <typename T>
    linkage void softPlusDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) {
        auto functor = LAMBDA_TT(x, y){
@ -115,10 +125,11 @@ namespace helpers {
    void softPlusDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) {
        BUILD_SINGLE_SELECTOR(theFirst->dataType(), softPlusDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ///
-/// \param theFirst
+/// \param input
-/// \param theSecond
+/// \param epsilon
-/// \param theOutput
+/// \param output
    template <typename T>
    linkage void sigmoidDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) {
        auto functor = LAMBDA_TT(x, y){
@ -146,6 +157,7 @@ namespace helpers {
        BUILD_SINGLE_SELECTOR(theFirst->dataType(), hardSigmoidDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template <typename T>
    linkage void logSumExp_(NDArray* input, NDArray* axis, NDArray* output) {
        // reduce along axis with
@ -178,15 +190,17 @@ namespace helpers {
        output->applyTransform(transform::Log, nullptr, nullptr);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output) {
        BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, axis, output), FLOAT_TYPES);
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* subtrah, NDArray* axis, NDArray* output) {
        BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, subtrah, axis, output), FLOAT_TYPES);
    }
-//////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template <typename T>
    void weightedCrossEntropyWithLogitsFunctor_(NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) {
@ -220,15 +234,14 @@ namespace helpers {
            const_cast<NDArray*>(input)->applyTriplewiseLambda(const_cast<NDArray*>(targets), targetTensor.get(), mainRoutineT2, output);
        }
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) {
        NDArray::prepareSpecialUse({output}, {targets, input, weights});
-void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) {
+        BUILD_SINGLE_SELECTOR(targets->dataType(), weightedCrossEntropyWithLogitsFunctor_, (targets, input, weights, output), FLOAT_TYPES);
    NDArray::prepareSpecialUse({output}, {targets, input, weights});
    BUILD_SINGLE_SELECTOR(targets->dataType(), weightedCrossEntropyWithLogitsFunctor_, (targets, input, weights, output), FLOAT_TYPES);
    NDArray::registerSpecialUse({output}, {targets, input, weights});
 }
        NDArray::registerSpecialUse({output}, {targets, input, weights});
    }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu
@ -26,7 +26,22 @@
 namespace nd4j {
 namespace ops {
 namespace helpers {
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // matrix band kernel
 //
 // inputBuffer - buffer of input tensor
 // inputShape - shape of input tensor
 // outputBuffer - buffer of output tensor
 // outputShape - shape of output tensor
 // lowerBand - lower band of matrix
 // upperBand - upper band of matrix
 // tadOnlyInputShapeInfo - TAD shape for input
 // tadInputOffsets - TAD offsets for input
 // tadOnlyOutputShapeInfo - TAD output shape
 // tadOutputOffsets - TAD output offsets
 // numTads - number of subarrays
 // inputLength - input subarray length
 //
    template <typename T>
    static __global__ void matrixBandKernel(void* inputBuffer, Nd4jLong* inputShape,
            void* outputBuffer, Nd4jLong* outputShape, Nd4jLong lowerBand, Nd4jLong upperBand, Nd4jLong* tadOnlyInputShapeInfo,  Nd4jLong* tadInputOffsets,
@ -42,7 +57,7 @@ namespace helpers {
                    Nd4jLong coords[2] = {i, j};
                    Nd4jLong tadOffsetOut = shape::getOffset(tadOnlyOutputShapeInfo, coords);
                    Nd4jLong tadOffsetIn = shape::getOffset(tadOnlyInputShapeInfo, coords);
-                    //shape::getIndexOffset(j, tadOnlyOutputShapeInfo)
+
                    if (i >= j) { // check lower diagonals
                        if (lowerBand > 0) {
                            if ((i - j) > lowerBand)
@ -59,16 +74,14 @@ namespace helpers {
                                *(reinterpret_cast<T *>(outputBuffer) + xOffset + tadOffsetOut) = *(
                                        reinterpret_cast<T const *>(inputBuffer) + yOffset + tadOffsetIn);
                    }
 //                if ((i >= j) && (i - j) <= lowerBand && (j - i) <= upperBand) // with in band
 //                    *(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffsetOut) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + tadOffsetIn);
                    //else
                    //    *(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffsetOut) = T(0);
                }
            }
        }
    }
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // matrixBandPart_ - main algorithm caller
 //
    template <typename T>
    void matrixBandPart_(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) {
        dim3 launchDims(256, 512, 8192);
@ -82,17 +95,14 @@ namespace helpers {
        const Nd4jLong numTads = packX.numberOfTads();
-        if (!input->isActualOnDeviceSide())
+        NDArray::prepareSpecialUse({output}, {input});
            input->syncToDevice();
        if (!input->isActualOnDeviceSide())
            input->syncToDevice();
        matrixBandKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(),
                input->getSpecialShapeInfo(), output->getSpecialBuffer(), output->getSpecialShapeInfo(),
                lowerBand, upperBand, packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets(), numTads, input->lengthOf());
        NDArray::registerSpecialUse({output}, {input});
    }
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void matrixBandPart(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) {
        BUILD_SINGLE_SELECTOR(input->dataType(), matrixBandPart_, (context, input, output, lowerBand, upperBand), FLOAT_TYPES);
    }
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu
@ -31,6 +31,8 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // put diagonals from input batched matricies to output batched vectors
    template <typename T>
    static __global__ void matrixDiagPartKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength,
                                            Nd4jLong* tadOnlyInputShapeInfo,  Nd4jLong *tadInputOffsets,
@ -42,7 +44,6 @@ namespace helpers {
            for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) {
                Nd4jLong coords[2] = {j, j};
                Nd4jLong tadOffset = shape::getOffset(tadOnlyInputShapeInfo, coords);
                //shape::getIndexOffset(j, tadOnlyOutputShapeInfo)
                *(reinterpret_cast<T*>(outputBuffer) + xOffset + shape::getIndexOffset(j, tadOnlyOutputShapeInfo)) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + tadOffset);
            }
        }
@ -51,6 +52,7 @@ namespace helpers {
 //////////////////////////////////////////////////////////////////////////
 // Returns a batched matrix tensor with new batched diagonal values.
 // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
 //
    template <typename T>
    int _matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
        auto stream = context->getCudaStream();
@ -86,6 +88,9 @@ namespace helpers {
        return Status::OK();
    }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // caller for _matrixDiagPart
 //
    int matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
        BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiagPart, (context, input, output), LIBND4J_TYPES);
    }
--- a/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu
@ -57,7 +57,7 @@ namespace helpers {
        Nd4jPointer params[2];
        params[0] = context;
        params[1] = context->getCudaStream();
-
+        // Nth element in sorted sequence : basic algorithm sort and retrieve nth element in sorted
        if (input->isVector()) {
            sort(params, nullptr, sortedVals.shapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), reverse);
@ -71,9 +71,7 @@ namespace helpers {
            auto pTadShape = packX.specialShapeInfo();
            auto pTadShapeH = packX.primaryShapeInfo();
            auto pTadOffsets = packX.specialOffsets();
 //            auto pLastDimData = (int*) manager.replicatePointer(lastDims.data(), lastDims.size() * sizeof(int));
            sortTad(params, sortedVals.buffer(), sortedVals.shapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), lastDims.data(), lastDims.size(), pTadShape, pTadOffsets, reverse);
 //            manager.synchronize();
            sortedVals.tickWriteDevice();
            sortedVals.syncToHost();
            auto stream = context->getCudaStream();