Shyrma docs (#222)

* - documenting and profiling matrix_set_diag cuda kernel Signed-off-by: Yurii <yurii@skymind.io> * - correct formula of pnorm pooling in cuda 2d/3d kernels - remove helper matrix_diag which duplicates work of helper matrix_set_diag Signed-off-by: Yurii <yurii@skymind.io>
2019-09-02 16:25:58 +03:00 · 2019-09-02 16:25:58 +03:00 · cb4c9377b1
commit cb4c9377b1
parent 106524663b
13 changed files with 190 additions and 320 deletions
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
@ -15,7 +15,7 @@
 ******************************************************************************/
 //
-// @author Yurii Shyrma (iuriish@yahoo.com), created on 07.12.2017
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //
 #include <op_boilerplate.h>
@ -38,10 +38,9 @@ CONFIGURABLE_OP_IMPL(matrix_set_diag, 2, 1, false, 0, 0) {
    for(int i = 0;  i < diagonal->rankOf() - 1; ++i)
        REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str());
-    REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min<Nd4jLong>(input->sizeAt(-1), input->sizeAt(-2)),
+    REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min<Nd4jLong>(input->sizeAt(-1), input->sizeAt(-2)), 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
        0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
-    helpers::matrixSetDiag(block.launchContext(), input, diagonal, output);
+    helpers::matrixSetDiag(block.launchContext(), *input, *diagonal, *output, false);
    return Status::OK();
 }
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
@ -15,49 +15,53 @@
 ******************************************************************************/
 //
-// Created to use with batched tensor by GS <sgazeos@gmail.com> 3/21/2018
+// @author GS <sgazeos@gmail.com> 3/21/2018
 // @author Yurii Shyrma (iuriish@yahoo.com)
 //
 #include <ops/declarable/CustomOperations.h>
-#include <ops/declarable/helpers/matrix_diag.h>
+#include <ops/declarable/helpers/matrixSetDiag.h>
 namespace nd4j {
-    namespace ops {
+namespace ops  {
        CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
            auto input  = INPUT_VARIABLE(0);
            auto output = OUTPUT_VARIABLE(0);
-            REQUIRE_TRUE(!input->isScalar(), 0, "CUSTOM_OP matrix_diag: input array must be at list a vector, but scalar was given!");
+CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
-            output->nullify();
+    auto diagonal = INPUT_VARIABLE(0);
-            return helpers::matrixDiag(block.launchContext(), input, output);
+    auto output   = OUTPUT_VARIABLE(0);
        }
-        DECLARE_SHAPE_FN(matrix_diag) {
+    REQUIRE_TRUE(!diagonal->isScalar(), 0, "CUSTOM_OP matrix_diag: input diagonal array must be at list a vector, but scalar was given!");
            Nd4jLong* outShapeInfo = nullptr;
            auto in = inputShape->at(0);
            int inRank = shape::rank(in);
-            int outRank = inRank + 1;
+    helpers::matrixSetDiag(block.launchContext(), *output, *diagonal, *output, true);
            auto lastDimension = shape::sizeAt(in, -1);
-            ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
+    return Status::OK();
-            outShapeInfo[0] = outRank;
+}
            for(int i = 0; i < inRank; ++i)
                outShapeInfo[i + 1] = shape::sizeAt(in, i);
            outShapeInfo[outRank] = lastDimension;
-            ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
+DECLARE_SHAPE_FN(matrix_diag) {
-            return SHAPELIST(CONSTANT(outShapeInfo));
+    Nd4jLong* outShapeInfo = nullptr;
-        }
+    auto in = inputShape->at(0);
    int inRank = shape::rank(in);
-        DECLARE_TYPES(matrix_diag) {
+    int outRank = inRank + 1;
-            getOpDescriptor()
+    auto lastDimension = shape::sizeAt(in, -1);
-                    ->setAllowedInputTypes(nd4j::DataType::ANY)
+
-                    ->setSameMode(true);
+    ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
-        }
+    outShapeInfo[0] = outRank;
    for(int i = 0; i < inRank; ++i)
        outShapeInfo[i + 1] = shape::sizeAt(in, i);
    outShapeInfo[outRank] = lastDimension;
    ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
    return SHAPELIST(CONSTANT(outShapeInfo));
 }
 DECLARE_TYPES(matrix_diag) {
    getOpDescriptor()
            ->setAllowedInputTypes(nd4j::DataType::ANY)
            ->setSameMode(true);
 }
 }
 }
--- a/libnd4j/include/ops/declarable/headers/parity_ops.h
+++ b/libnd4j/include/ops/declarable/headers/parity_ops.h
@ -76,8 +76,20 @@ namespace nd4j {
        #endif
        /**
-         * Returns a batched matrix tensor with new batched diagonal values.
+        * Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array
-         */
+        *
        * Input arrays:
        *    input:    input array, considered as batch of matrices
        *    diagonal: array containing elements to be inserted into input array,
        *              following rank condition should be satisfied: diagonal_rank = input_rank - 1,
        *              the shapes of diagonal and input arrays must be equal except last dimension of input array,
        *              for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C],
        *              also last dimension of diagonal array should be equal to smaller of last and last but one input dimensions
        *              that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2])
        *
        * Output array:
        *    has the same shape as input, corresponding diagonal elements are substituted
        */
        #if NOT_EXCLUDED(OP_matrix_set_diag)
        DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
        #endif
--- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
@ -2411,7 +2411,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                                    pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
+                                                    pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f) *  nd4j::math::nd4j_sgn<T,T>(pIn[kd + kh + kw]);
                                    }
                                    else {
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
@ -15,7 +15,7 @@
 ******************************************************************************/
 //
-// Created by Yurii Shyrma on 07.12.2017.
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //
 #include "ResultSet.h"
@ -27,31 +27,48 @@ namespace helpers {
 //////////////////////////////////////////////////////////////////////////
-// Returns a batched matrix tensor with new batched diagonal values.
+template<typename T>
-// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
+void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
 template <typename T>
 static void _matrixSetDiag(const NDArray* input, const NDArray* diagonal, NDArray* output) {
-    *output = *input;
+    // input and output are the same array (x == z) when zeroPad = true
    // xRank = zRank, xRank = yRank + 1
    // xLen = zLen
-    const int lastDimSize = input->sizeAt(-1);
+    const T* x = input.bufferAsT<T>();
-    const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
+    const T* y = diagonal.bufferAsT<T>();
-    const int lastSmallDim = diagonal->sizeAt(-1);
+          T* z = output.bufferAsT<T>();
    const int batchSize = input->lengthOf()/last2DimSize;
-    for(int i = 0; i < batchSize; ++i )
+    const Nd4jLong* xShapeInfo = input.getShapeInfo();
-        for(int j = 0; j < lastSmallDim; ++j) {
+    const Nd4jLong* yShapeInfo = diagonal.getShapeInfo();
-            output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
+    const Nd4jLong* zShapeInfo = output.getShapeInfo();
        }
    const bool areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
    const int xRank = input.rankOf();
    const auto xLen = input.lengthOf();
    std::vector<Nd4jLong> coords(xRank);  // we use the same coordinates storage both for input and output since their ranks are the same
    PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
    for (Nd4jLong i = 0; i < xLen; ++i) {
        shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data());
        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank);
        const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank);
        // condition to be on diagonal of innermost matrix
        if(coords[xRank - 2] == coords[xRank - 1])
            z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)];
        else
            z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
    }
 }
-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
+//////////////////////////////////////////////////////////////////////////
-        BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (input, diagonal, output), LIBND4J_TYPES);
+void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
-    }
+    BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiag_, (input, diagonal, output, zeroPad), LIBND4J_TYPES);
-
+}
    BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp
@ -1,65 +0,0 @@
 /*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 // Created by GS <sgazeos@gmail.com> on 3/21/2018.
 //
 #include "ResultSet.h"
 #include <ops/declarable/helpers/matrix_diag.h>
 #include <Status.h>
 namespace nd4j {
 namespace ops {
 namespace helpers {
 //////////////////////////////////////////////////////////////////////////
 // Returns a batched matrix tensor with new batched diagonal values.
 // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
 template <typename T>
 static int _matrixDiag(const NDArray* input, NDArray* output) {
    auto listOut  = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
    auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
    if (listOut->size() != listDiag->size()) {
        nd4j_printf("matrix_diag: Input matrix has wrong shape.", "");
        return ND4J_STATUS_VALIDATION;
    }
    int lastDimension = input->sizeAt(-1);
    // TODO: tune this properlys
    int lO = listOut->size();
    PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold())
    for(int i = 0; i < lO; ++i)
        for (int e = 0; e < lastDimension; e++)
            listOut->at(i)->p(e, e, listDiag->at(i)->e<T>(e));
    delete listOut;
    delete listDiag;
    return Status::OK();
 }
    int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
        BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (input, output), LIBND4J_TYPES);
    }
    BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (const NDArray* input, NDArray* output), LIBND4J_TYPES);
 }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@ -957,9 +957,13 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
            val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
-            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
+            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
-                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
+                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) {
-                    nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
+                    const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
                    const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
                    nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
                }
            }
        }
        break;
    }
@ -1123,10 +1127,15 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
            val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
-            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
+            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
-                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
+                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
-                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
+                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
-                        nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
+                        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
                        const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
                        nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
                    }
                }
            }
        }
        break;
    }
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@ -15,63 +15,87 @@
 ******************************************************************************/
 //
-// Created by Yurii Shyrma on 07.12.2017.
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //
 #include "ResultSet.h"
 #include <ops/declarable/helpers/matrixSetDiag.h>
 #include <PointersManager.h>
-namespace nd4j {
+namespace nd4j    {
-namespace ops {
+namespace ops     {
 namespace helpers {
 ///////////////////////////////////////////////////////////////////
 template<typename T>
 __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
-    template <typename T>
+    // x - input,    shape [A,B,C]
-    static __global__ void matrixSetDiagKernel(void* outputBuffer, Nd4jLong* outputShape, void const* diagonalBuffer, Nd4jLong* diagonalShape, Nd4jLong lastDimSize, Nd4jLong last2DimSize, Nd4jLong lastSmallDim, Nd4jLong batchSize) {
+    // y - diagonal, shape [A,B]
-        __shared__ T* z;
+    // z - output,   shape [A,B,C]
-        __shared__ T const* x;
+    // input and output are the same array (x == z) when zeroPad = true
        __shared__ Nd4jLong outLength, diagonalLen;
        if (threadIdx.x == 0) {
            z = reinterpret_cast<T*>(outputBuffer);
            x = reinterpret_cast<T const*>(diagonalBuffer);
            outLength = shape::length(outputShape);
            diagonalLen = shape::length(diagonalShape);
        }
        __syncthreads();
-        for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
+    const auto x = reinterpret_cast<const T*>(vx);
-            for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
+    const auto y = reinterpret_cast<const T*>(vy);
-//                z[i * last2DimSize + j * (lastDimSize + 1)] = x[i * lastSmallDim + j];
+          auto z = reinterpret_cast<T*>(vz);
                z[shape::getIndexOffset(i * last2DimSize + j * (lastDimSize + 1), outputShape, outLength)] = x[shape::getIndexOffset(i * lastSmallDim + j, diagonalShape, diagonalLen)];
            }
    }
    //////////////////////////////////////////////////////////////////////////
    // Returns a batched matrix tensor with new batched diagonal values.
    // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
    template <typename T>
    static void _matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
        *output = *input;
-        const int lastDimSize = input->sizeAt(-1);
+    __shared__ int xRank;       // xRank = zRank, xRank = yRank + 1
-        const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
+    __shared__ Nd4jLong xLen, *sharedMem;   // xLen = zLen
-        const int lastSmallDim = diagonal->sizeAt(-1);
+    __shared__ bool areSameOffsets;
        const int batchSize = input->lengthOf()/last2DimSize;
        auto stream = context->getCudaStream();
        dim3 launchDims(256, 512, 8192);
        matrixSetDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), diagonal->getSpecialBuffer(), diagonal->getSpecialShapeInfo(), lastDimSize, last2DimSize, lastSmallDim, batchSize);
 //// #pragma omp parallel for if(batchSize > Environment::getInstance()->elementwiseThreshold()) schedule(static)
 //        for(int i = 0; i < batchSize; ++i )
 //            for(int j = 0; j < lastSmallDim; ++j) {
 //                output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
 //            }
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
        areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
        xRank = shape::rank(xShapeInfo);
        xLen  = shape::length(xShapeInfo);
    }
-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
+    __syncthreads();
        BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (context, input, diagonal, output), LIBND4J_TYPES);
    }
-    BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
+    auto coords = sharedMem + threadIdx.x * xRank;               // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
        shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords);
        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank);
        const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank);
        // condition to be on diagonal of innermost matrix
        if(coords[xRank - 2] == coords[xRank - 1])
            z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords, xRank - 1)];
        else
            z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
    }
 }
 ///////////////////////////////////////////////////////////////////
 template<typename T>
 static void matrixSetDiagCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,  const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
    matrixSetDiagCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, zeroPad);
 }
 ///////////////////////////////////////////////////////////////////
 void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
    const int threadsPerBlock = MAX_NUM_THREADS / 2;
    const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
    PointersManager manager(context, "matrixSetDiag");
    NDArray::prepareSpecialUse({&output}, {&input, &diagonal});
    BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiagCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), diagonal.getSpecialBuffer(), diagonal.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), zeroPad), LIBND4J_TYPES);
    NDArray::registerSpecialUse({&output}, {&input, &diagonal});
    manager.synchronize();
 }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu
@ -1,95 +0,0 @@
 /*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 // Created by GS <sgazeos@gmail.com> on 3/21/2018.
 //
 #include "ResultSet.h"
 #include <ops/declarable/helpers/matrix_diag.h>
 #include <Status.h>
 #include <ShapeUtils.h>
 #include <ShapeUtils.h>
 #include <TAD.h>
 #include <cuda_exception.h>
 #include <helpers/ConstantTadHelper.h>
 namespace nd4j {
 namespace ops {
 namespace helpers {
    template <typename T>
    static __global__ void matrixDiagKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength,
                                       Nd4jLong* tadOnlyInputShapeInfo,  Nd4jLong *tadInputOffsets,
                                       Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets) {
        int totalThreads = blockDim.x;
        for (Nd4jLong i = blockIdx.x; i < numTads; i += gridDim.x) {
            auto yOffset = tadInputOffsets[i];
            auto xOffset = tadOutputOffsets[i];
            for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) {
                Nd4jLong coords[2] = {j, j};
                Nd4jLong tadOffset = shape::getOffset(0, shape::shapeOf(tadOnlyOutputShapeInfo), shape::stride(tadOnlyOutputShapeInfo), coords, 2);
                //shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength)
                *(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffset) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + shape::getIndexOffset(j, tadOnlyInputShapeInfo, inputLength));
            }
        }
    }
    //////////////////////////////////////////////////////////////////////////
    // Returns a batched matrix tensor with new batched diagonal values.
    // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
    template <typename T>
    static int _matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
        cudaStream_t* stream = context->getCudaStream();
        //auto listOut  = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
        //auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
        //auto repeatDelta = shape::prodLong(newShape.data(), rank) / this->lengthOf();
        std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(input->rankOf(), {input->rankOf() - 1});
        const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); //this->tensorsAlongDimension({dimension});
        //printf("Repeat delta %lld, numTads %lld\n", repeatDelta, numTads);
        //tadOnlyInputShapeInfo, tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets;
        std::vector<int> inputDims({input->rankOf() - 1});
        std::vector<int> outputDims({output->rankOf() - 2, output->rankOf() - 1});
        auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), inputDims);
        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), outputDims);
        if (!input->isActualOnDeviceSide())
            input->syncToDevice();
        if (!output->isActualOnDeviceSide())
            output->syncToDevice();
        // create cuda stream and LaunchContext
        cudaError_t cudaResult;
        dim3 launchDims(256, 512, 8192);
        matrixDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(), output->getSpecialBuffer(), numTads, input->sizeAt(-1), packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets());
        return Status::OK();
    }
    int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
        BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (context, input, output), LIBND4J_TYPES);
    }
    BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output), LIBND4J_TYPES);
 }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h
+++ b/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h
@ -28,8 +28,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output);
+    void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad);
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/matrix_diag.h
+++ b/libnd4j/include/ops/declarable/helpers/matrix_diag.h
@ -1,34 +0,0 @@
 /*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 //  @author GS <sgazeos@gmail.com>
 //
 #ifndef __MATRIX_DIAG_HELPERS__
 #define __MATRIX_DIAG_HELPERS__
 #include <op_boilerplate.h>
 #include <NDArray.h>
 namespace nd4j {
 namespace ops {
 namespace helpers {
    int matrixDiag(nd4j::LaunchContext * context, NDArray const* input, NDArray* output);
 }
 }
 }
 #endif
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp
@ -117,9 +117,9 @@ TEST_F(DeclarableOpsTests3, Test_Unique_1) {
    auto v = result->at(0);
    auto i = result->at(1);
-    v->printIndexedBuffer("Values");
+    // v->printIndexedBuffer("Values");
-    i->printIndexedBuffer("Indices");
+    // i->printIndexedBuffer("Indices");
-    i->printShapeInfo("Indices shape");
+    // i->printShapeInfo("Indices shape");
    ASSERT_TRUE(expV.isSameShape(v));
    ASSERT_TRUE(expV.equalsTo(v));
@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests3, Test_Unique_2) {
    auto i = result->at(1);
    auto c = result->at(2);
-     v->printShapeInfo();
+     // v->printShapeInfo();
-     v->printIndexedBuffer("Values");
+     // v->printIndexedBuffer("Values");
-     i->printShapeInfo();
+     // i->printShapeInfo();
-     i->printIndexedBuffer("Indices");
+     // i->printIndexedBuffer("Indices");
-     c->printShapeInfo();
+     // c->printShapeInfo();
-     c->printIndexedBuffer("Counts");
+     // c->printIndexedBuffer("Counts");
    ASSERT_TRUE(expV.isSameShape(v));
    ASSERT_TRUE(expV.equalsTo(v));
@ -200,11 +200,11 @@ TEST_F(DeclarableOpsTests3, Test_Norm_1) {
    auto result1 = op.execute({&x}, {1.}, {1});
    ASSERT_EQ(result1->status(), ND4J_STATUS_OK);
    auto z1 = result1->at(0);
-    z1->printIndexedBuffer("Z1");
+    // z1->printIndexedBuffer("Z1");
    auto exp1 = x.reduceAlongDims(reduce::Norm2, dims, false, false);
-    exp1.printIndexedBuffer("EXP1");
+    // exp1.printIndexedBuffer("EXP1");
-    z1->printShapeInfo("Z1 shape");
+    // z1->printShapeInfo("Z1 shape");
-    exp1.printShapeInfo("EXP1 shape");
+    // exp1.printShapeInfo("EXP1 shape");
    ASSERT_TRUE(exp1.isSameShape(z1));
    ASSERT_TRUE(exp1.equalsTo(z1));
@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_7) {
    auto exp = MmulHelper::mmul(&x, &y);
-    exp->printShapeInfo("exp shape");
+    // exp->printShapeInfo("exp shape");
    nd4j::ops::batched_gemm op;
    auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3});
--- a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu
+++ b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu
@ -79,7 +79,7 @@ TEST_F(SortCudaTests, test_linear_sort_by_val_2) {
    sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true);
    k.tickWriteDevice();
    v.tickWriteDevice();
-    k.printIndexedBuffer("KEYS");
+    // k.printIndexedBuffer("KEYS");
    ASSERT_EQ(ek, k);
    ASSERT_EQ(ev, v);
 }
@ -98,8 +98,8 @@ TEST_F(SortCudaTests, test_tad_sort_by_key_1) {
    k.tickWriteDevice();
    v.tickWriteDevice();
-    k.printIndexedBuffer("k");
+    // k.printIndexedBuffer("k");
-    v.printIndexedBuffer("v");
+    // v.printIndexedBuffer("v");
    ASSERT_EQ(ek, k);
    ASSERT_EQ(ev, v);