diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
index f63469817..3a52057a5 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
@@ -15,7 +15,7 @@
  ******************************************************************************/

 //
-// @author Yurii Shyrma (iuriish@yahoo.com), created on 07.12.2017
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include
@@ -38,10 +38,9 @@ CONFIGURABLE_OP_IMPL(matrix_set_diag, 2, 1, false, 0, 0) {
     for(int i = 0; i < diagonal->rankOf() - 1; ++i)
         REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str());

-    REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min(input->sizeAt(-1), input->sizeAt(-2)),
-                 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
+    REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min(input->sizeAt(-1), input->sizeAt(-2)), 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));

-    helpers::matrixSetDiag(block.launchContext(), input, diagonal, output);
+    helpers::matrixSetDiag(block.launchContext(), *input, *diagonal, *output, false);

     return Status::OK();
 }
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
index 8fa5bfa41..c430fd4d2 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
@@ -15,49 +15,53 @@
  ******************************************************************************/

 //
-// Created to use with batched tensor by GS 3/21/2018
+// @author GS 3/21/2018
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include
-#include
-
+#include

 namespace nd4j {
-    namespace ops {
-        CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
-            auto input  = INPUT_VARIABLE(0);
-            auto output = OUTPUT_VARIABLE(0);
+namespace ops {

-            REQUIRE_TRUE(!input->isScalar(), 0, "CUSTOM_OP matrix_diag: input array must be at list a vector, but scalar was given!");
+CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {

-            output->nullify();
-            return helpers::matrixDiag(block.launchContext(), input, output);
-        }
+    auto diagonal = INPUT_VARIABLE(0);
+    auto output   = OUTPUT_VARIABLE(0);

-        DECLARE_SHAPE_FN(matrix_diag) {
-            Nd4jLong* outShapeInfo = nullptr;
-            auto in = inputShape->at(0);
-            int inRank = shape::rank(in);
+    REQUIRE_TRUE(!diagonal->isScalar(), 0, "CUSTOM_OP matrix_diag: input diagonal array must be at least a vector, but scalar was given!");

-            int outRank = inRank + 1;
-            auto lastDimension = shape::sizeAt(in, -1);
+    helpers::matrixSetDiag(block.launchContext(), *output, *diagonal, *output, true);

-            ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
-            outShapeInfo[0] = outRank;
-            for(int i = 0; i < inRank; ++i)
-                outShapeInfo[i + 1] = shape::sizeAt(in, i);
-            outShapeInfo[outRank] = lastDimension;
+    return Status::OK();
+}

-            ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
+DECLARE_SHAPE_FN(matrix_diag) {

-            return SHAPELIST(CONSTANT(outShapeInfo));
-        }
+    Nd4jLong* outShapeInfo = nullptr;
+    auto in = inputShape->at(0);
+    int inRank = shape::rank(in);

-        DECLARE_TYPES(matrix_diag) {
-            getOpDescriptor()
-                ->setAllowedInputTypes(nd4j::DataType::ANY)
-                ->setSameMode(true);
-        }
+    int outRank = inRank + 1;
+    auto lastDimension = shape::sizeAt(in, -1);
+
+    ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
+    outShapeInfo[0] = outRank;
+    for(int i = 0; i < inRank; ++i)
+        outShapeInfo[i + 1] = shape::sizeAt(in, i);
+    outShapeInfo[outRank] = lastDimension;
+
+    ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
+
+    return SHAPELIST(CONSTANT(outShapeInfo));
+}
+
+DECLARE_TYPES(matrix_diag) {
+    getOpDescriptor()
+        ->setAllowedInputTypes(nd4j::DataType::ANY)
+        ->setSameMode(true);
+}

 }
 }
diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h
index f9278fb36..c86f28499 100644
--- a/libnd4j/include/ops/declarable/headers/parity_ops.h
+++ b/libnd4j/include/ops/declarable/headers/parity_ops.h
@@ -76,8 +76,20 @@ namespace nd4j {
         #endif

         /**
-         * Returns a batched matrix tensor with new batched diagonal values.
-         */
+         * Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array
+         *
+         * Input arrays:
+         *   input:    input array, considered as batch of matrices
+         *   diagonal: array containing elements to be inserted into input array,
+         *             the following rank condition must be satisfied: diagonal_rank = input_rank - 1,
+         *             the shapes of diagonal and input arrays must be equal except for the last dimension of input,
+         *             for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C],
+         *             also the last dimension of diagonal array must be equal to the smaller of the last two input dimensions,
+         *             that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2])
+         *
+         * Output array:
+         *   has the same shape as input, with the corresponding diagonal elements substituted
+         */
         #if NOT_EXCLUDED(OP_matrix_set_diag)
         DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
         #endif
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
index dd5516461..3d04bc129 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
@@ -2411,7 +2411,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                     for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
                         for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
                             for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
+                                pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kd + kh + kw]);
                 }
                 else {
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
index 7180a88b3..e974755ac 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
@@ -15,7 +15,7 @@
  ******************************************************************************/

 //
-// Created by Yurii Shyrma on 07.12.2017.
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include "ResultSet.h"
@@ -27,31 +27,48 @@ namespace helpers {

 //////////////////////////////////////////////////////////////////////////
-// Returns a batched matrix tensor with new batched diagonal values.
-// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
-template <typename T>
-static void _matrixSetDiag(const NDArray* input, const NDArray* diagonal, NDArray* output) {
+template <typename T>
+void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {

-    *output = *input;
+    // input and output are the same array (x == z) when zeroPad = true
+    // xRank = zRank, xRank = yRank + 1
+    // xLen = zLen

-    const int lastDimSize = input->sizeAt(-1);
-    const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
-    const int lastSmallDim = diagonal->sizeAt(-1);
-    const int batchSize = input->lengthOf()/last2DimSize;
+    const T* x = input.bufferAsT<T>();
+    const T* y = diagonal.bufferAsT<T>();
+          T* z = output.bufferAsT<T>();

-    for(int i = 0; i < batchSize; ++i )
-        for(int j = 0; j < lastSmallDim; ++j) {
-            output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
-        }
-
+    const Nd4jLong* xShapeInfo = input.getShapeInfo();
+    const Nd4jLong* yShapeInfo = diagonal.getShapeInfo();
+    const Nd4jLong* zShapeInfo = output.getShapeInfo();
+
+    const bool areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
+
+    const int  xRank = input.rankOf();
+    const auto xLen  = input.lengthOf();
+
+    std::vector<Nd4jLong> coords(xRank);   // we use the same coordinates storage both for input and output since their ranks are the same
+
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
+    for (Nd4jLong i = 0; i < xLen; ++i) {
+
+        shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data());
+
+        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank);
+        const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank);
+
+        // condition to be on diagonal of innermost matrix
+        if(coords[xRank - 2] == coords[xRank - 1])
+            z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)];
+        else
+            z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
+    }
 }

-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
-        BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (input, diagonal, output), LIBND4J_TYPES);
-    }
-
-    BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
+//////////////////////////////////////////////////////////////////////////
+void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
+    BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiag_, (input, diagonal, output, zeroPad), LIBND4J_TYPES);
+}

 }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp
deleted file mode 100644
index 3f9883b54..000000000
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-//
-// Created by GS on 3/21/2018.
-//
-
-#include "ResultSet.h"
-#include
-#include
-
-namespace nd4j {
-namespace ops {
-namespace helpers {
-
-
-//////////////////////////////////////////////////////////////////////////
-// Returns a batched matrix tensor with new batched diagonal values.
-// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
-template <typename T>
-static int _matrixDiag(const NDArray* input, NDArray* output) {
-
-    auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
-    auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
-
-    if (listOut->size() != listDiag->size()) {
-        nd4j_printf("matrix_diag: Input matrix has wrong shape.", "");
-        return ND4J_STATUS_VALIDATION;
-    }
-    int lastDimension = input->sizeAt(-1);
-    // TODO: tune this properlys
-    int lO = listOut->size();
-    PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold())
-    for(int i = 0; i < lO; ++i)
-        for (int e = 0; e < lastDimension; e++)
-            listOut->at(i)->p(e, e, listDiag->at(i)->e<T>(e));
-
-    delete listOut;
-    delete listDiag;
-
-    return Status::OK();
-}
-
-    int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
-        BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (input, output), LIBND4J_TYPES);
-    }
-
-    BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (const NDArray* input, NDArray* output), LIBND4J_TYPES);
-
-}
-}
-}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
index 87e7c4f08..c08551318 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@@ -957,9 +957,13 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
                     val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);

-                    for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
-                        for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
-                            nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
+                    for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
+                        for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) {
+                            const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
+                            const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
+                            nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
+                        }
+                    }
                 }
                 break;
             }
@@ -1123,10 +1127,15 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
                     val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);

-                    for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
-                        for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
-                            for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
-                                nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
+                    for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
+                        for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
+                            for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
+                                const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
+                                const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
+                                nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
+                            }
+                        }
+                    }
                 }
                 break;
             }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
index 95eb5f439..01baaffb4 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@@ -15,63 +15,87 @@
  ******************************************************************************/

 //
-// Created by Yurii Shyrma on 07.12.2017.
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include "ResultSet.h"
 #include
+#include

-namespace nd4j {
-namespace ops {
+namespace nd4j {
+namespace ops {
 namespace helpers {

+///////////////////////////////////////////////////////////////////
+template <typename T>
+__global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {

-    template <typename T>
-    static __global__ void matrixSetDiagKernel(void* outputBuffer, Nd4jLong* outputShape, void const* diagonalBuffer, Nd4jLong* diagonalShape, Nd4jLong lastDimSize, Nd4jLong last2DimSize, Nd4jLong lastSmallDim, Nd4jLong batchSize) {
-        __shared__ T* z;
-        __shared__ T const* x;
-        __shared__ Nd4jLong outLength, diagonalLen;
-        if (threadIdx.x == 0) {
-            z = reinterpret_cast<T*>(outputBuffer);
-            x = reinterpret_cast<T const*>(diagonalBuffer);
-            outLength = shape::length(outputShape);
-            diagonalLen = shape::length(diagonalShape);
-        }
-        __syncthreads();
+    // x - input, shape [A,B,C]
+    // y - diagonal, shape [A,B]
+    // z - output, shape [A,B,C]
+    // input and output are the same array (x == z) when zeroPad = true

-        for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
-            for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
-//                z[i * last2DimSize + j * (lastDimSize + 1)] = x[i * lastSmallDim + j];
-                z[shape::getIndexOffset(i * last2DimSize + j * (lastDimSize + 1), outputShape, outLength)] = x[shape::getIndexOffset(i * lastSmallDim + j, diagonalShape, diagonalLen)];
-            }
-    }
-    //////////////////////////////////////////////////////////////////////////
-    // Returns a batched matrix tensor with new batched diagonal values.
-    // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
-    template <typename T>
-    static void _matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
-        *output = *input;
+    const auto x = reinterpret_cast<const T*>(vx);
+    const auto y = reinterpret_cast<const T*>(vy);
+          auto z = reinterpret_cast<T*>(vz);

-        const int lastDimSize = input->sizeAt(-1);
-        const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
-        const int lastSmallDim = diagonal->sizeAt(-1);
-        const int batchSize = input->lengthOf()/last2DimSize;
-        auto stream = context->getCudaStream();
-        dim3 launchDims(256, 512, 8192);
-        matrixSetDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), diagonal->getSpecialBuffer(), diagonal->getSpecialShapeInfo(), lastDimSize, last2DimSize, lastSmallDim, batchSize);
-////        #pragma omp parallel for if(batchSize > Environment::getInstance()->elementwiseThreshold()) schedule(static)
-//        for(int i = 0; i < batchSize; ++i )
-//            for(int j = 0; j < lastSmallDim; ++j) {
-//                output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e(i*lastSmallDim + j));
-//            }
+    __shared__ int xRank;                    // xRank = zRank, xRank = yRank + 1
+    __shared__ Nd4jLong xLen, *sharedMem;    // xLen = zLen
+    __shared__ bool areSameOffsets;
+
+    if (threadIdx.x == 0) {
+
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
+
+        xRank = shape::rank(xShapeInfo);
+        xLen  = shape::length(xShapeInfo);
     }
-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
-        BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (context, input, diagonal, output), LIBND4J_TYPES);
-    }
+    __syncthreads();

-    BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
+    auto coords = sharedMem + threadIdx.x * xRank;    // each thread gets its own chunk of xRank * sizeof(Nd4jLong) bytes of shared memory for its coordinates
+    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
+
+        shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords);
+
+        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank);
+        const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank);
+
+        // condition to be on diagonal of innermost matrix
+        if(coords[xRank - 2] == coords[xRank - 1])
+            z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords, xRank - 1)];
+        else
+            z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+template <typename T>
+static void matrixSetDiagCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
+
+    matrixSetDiagCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, zeroPad);
+}
+
+///////////////////////////////////////////////////////////////////
+void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
+
+    const int threadsPerBlock = MAX_NUM_THREADS / 2;
+    const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
+
+    PointersManager manager(context, "matrixSetDiag");
+
+    NDArray::prepareSpecialUse({&output}, {&input, &diagonal});
+    BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiagCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), diagonal.getSpecialBuffer(), diagonal.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), zeroPad), LIBND4J_TYPES);
+    NDArray::registerSpecialUse({&output}, {&input, &diagonal});
+
+    manager.synchronize();
+}

 }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu
deleted file mode 100644
index 78304510d..000000000
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu
+++ /dev/null
@@ -1,95 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-//
-// Created by GS on 3/21/2018.
-//
-
-#include "ResultSet.h"
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace nd4j {
-namespace ops {
-namespace helpers {
-
-
-    template <typename T>
-    static __global__ void matrixDiagKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength,
-                                            Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets,
-                                            Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets) {
-        int totalThreads = blockDim.x;
-        for (Nd4jLong i = blockIdx.x; i < numTads; i += gridDim.x) {
-            auto yOffset = tadInputOffsets[i];
-            auto xOffset = tadOutputOffsets[i];
-            for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) {
-                Nd4jLong coords[2] = {j, j};
-                Nd4jLong tadOffset = shape::getOffset(0, shape::shapeOf(tadOnlyOutputShapeInfo), shape::stride(tadOnlyOutputShapeInfo), coords, 2);
-                //shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength)
-                *(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffset) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + shape::getIndexOffset(j, tadOnlyInputShapeInfo, inputLength));
-            }
-        }
-    }
-    //////////////////////////////////////////////////////////////////////////
-    // Returns a batched matrix tensor with new batched diagonal values.
-    // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
-
-    template <typename T>
-    static int _matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
-        cudaStream_t* stream = context->getCudaStream();
-        //auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
-        //auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
-
-        //auto repeatDelta = shape::prodLong(newShape.data(), rank) / this->lengthOf();
-        std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(input->rankOf(), {input->rankOf() - 1});
-        const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); //this->tensorsAlongDimension({dimension});
-        //printf("Repeat delta %lld, numTads %lld\n", repeatDelta, numTads);
-        //tadOnlyInputShapeInfo, tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets;
-        std::vector<int> inputDims({input->rankOf() - 1});
-        std::vector<int> outputDims({output->rankOf() - 2, output->rankOf() - 1});
-
-        auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), inputDims);
-        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), outputDims);
-
-        if (!input->isActualOnDeviceSide())
-            input->syncToDevice();
-
-        if (!output->isActualOnDeviceSide())
-            output->syncToDevice();
-
-        // create cuda stream and LaunchContext
-        cudaError_t cudaResult;
-
-        dim3 launchDims(256, 512, 8192);
-        matrixDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(), output->getSpecialBuffer(), numTads, input->sizeAt(-1), packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets());
-
-        return Status::OK();
-    }
-
-    int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
-        BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (context, input, output), LIBND4J_TYPES);
-    }
-
-    BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output), LIBND4J_TYPES);
-
-}
-}
-}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h b/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h
index ea5a1a4ad..fb7d57d18 100644
--- a/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h
+++ b/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h
@@ -28,8 +28,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {

-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output);
-
+    void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad);
 }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/matrix_diag.h b/libnd4j/include/ops/declarable/helpers/matrix_diag.h
deleted file mode 100644
index 0cbbcef16..000000000
--- a/libnd4j/include/ops/declarable/helpers/matrix_diag.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-//
-// @author GS
-//
-#ifndef __MATRIX_DIAG_HELPERS__
-#define __MATRIX_DIAG_HELPERS__
-#include
-#include
-
-namespace nd4j {
-namespace ops {
-namespace helpers {
-
-    int matrixDiag(nd4j::LaunchContext * context, NDArray const* input, NDArray* output);
-
-}
-}
-}
-#endif
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp
index 1ec9650f9..7d166f831 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp
@@ -117,9 +117,9 @@ TEST_F(DeclarableOpsTests3, Test_Unique_1) {
     auto v = result->at(0);
     auto i = result->at(1);

-    v->printIndexedBuffer("Values");
-    i->printIndexedBuffer("Indices");
-    i->printShapeInfo("Indices shape");
+    // v->printIndexedBuffer("Values");
+    // i->printIndexedBuffer("Indices");
+    // i->printShapeInfo("Indices shape");

     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests3, Test_Unique_2) {
     auto i = result->at(1);
     auto c = result->at(2);

-    v->printShapeInfo();
-    v->printIndexedBuffer("Values");
-    i->printShapeInfo();
-    i->printIndexedBuffer("Indices");
-    c->printShapeInfo();
-    c->printIndexedBuffer("Counts");
+    // v->printShapeInfo();
+    // v->printIndexedBuffer("Values");
+    // i->printShapeInfo();
+    // i->printIndexedBuffer("Indices");
+    // c->printShapeInfo();
+    // c->printIndexedBuffer("Counts");

     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -200,11 +200,11 @@ TEST_F(DeclarableOpsTests3, Test_Norm_1) {
     auto result1 = op.execute({&x}, {1.}, {1});
     ASSERT_EQ(result1->status(), ND4J_STATUS_OK);
     auto z1 = result1->at(0);
-    z1->printIndexedBuffer("Z1");
+    // z1->printIndexedBuffer("Z1");
     auto exp1 = x.reduceAlongDims(reduce::Norm2, dims, false, false);
-    exp1.printIndexedBuffer("EXP1");
-    z1->printShapeInfo("Z1 shape");
-    exp1.printShapeInfo("EXP1 shape");
+    // exp1.printIndexedBuffer("EXP1");
+    // z1->printShapeInfo("Z1 shape");
+    // exp1.printShapeInfo("EXP1 shape");
     ASSERT_TRUE(exp1.isSameShape(z1));
     ASSERT_TRUE(exp1.equalsTo(z1));
@@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_7) {

     auto exp = MmulHelper::mmul(&x, &y);

-    exp->printShapeInfo("exp shape");
+    // exp->printShapeInfo("exp shape");

     nd4j::ops::batched_gemm op;
     auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3});
diff --git a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu
index 49c1f7a95..6913722be 100644
--- a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu
+++ b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu
@@ -79,7 +79,7 @@ TEST_F(SortCudaTests, test_linear_sort_by_val_2) {
     sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true);
     k.tickWriteDevice();
     v.tickWriteDevice();

-    k.printIndexedBuffer("KEYS");
+    // k.printIndexedBuffer("KEYS");

     ASSERT_EQ(ek, k);
     ASSERT_EQ(ev, v);
 }
@@ -98,8 +98,8 @@ TEST_F(SortCudaTests, test_tad_sort_by_key_1) {
     k.tickWriteDevice();
     v.tickWriteDevice();

-    k.printIndexedBuffer("k");
-    v.printIndexedBuffer("v");
+    // k.printIndexedBuffer("k");
+    // v.printIndexedBuffer("v");

     ASSERT_EQ(ek, k);
     ASSERT_EQ(ev, v);
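
Note (illustration only, not part of the patch): the parity_ops.h comment above pins down the shape contract of matrix_set_diag. In plain C++ (hypothetical names, no NDArray machinery), its per-matrix semantics amount to:

    #include <cstddef>
    #include <vector>

    // Reference semantics for one rows x cols row-major matrix. diag holds
    // min(rows, cols) values, matching the shape rule in the header comment.
    // With zeroPad = true the off-diagonal entries are zeroed, which is exactly
    // how the patched matrix_diag op reuses helpers::matrixSetDiag on itself.
    std::vector<float> setDiagSketch(const std::vector<float>& in, std::size_t rows, std::size_t cols,
                                     const std::vector<float>& diag, bool zeroPad) {
        std::vector<float> out(rows * cols);
        for (std::size_t r = 0; r < rows; ++r)
            for (std::size_t c = 0; c < cols; ++c)
                out[r * cols + c] = (r == c) ? diag[r] : (zeroPad ? 0.f : in[r * cols + c]);
        return out;
    }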
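
Note (illustration only, not part of the patch): the convolutions.cpp and convolutions.cu hunks append an nd4j_sgn factor to the pnorm pooling backward pass because d/dx |x|^p = p * |x|^(p-1) * sgn(x); the old code dropped the sign and produced wrong gradients wherever the pooled input was negative. A minimal sketch of the corrected term, with the p factor written out explicitly (in the patch it is carried via val and extraParam0):

    #include <cmath>

    // Derivative of |x|^p with respect to x; the sgn factor is what the patch adds.
    float pnormBackwardTerm(float x, float p) {
        const float sgn = (x > 0.f) - (x < 0.f);
        return p * std::pow(std::fabs(x), p - 1.f) * sgn;
    }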
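
Note (illustration only, not part of the patch): both new matrixSetDiag helpers share one technique: walk the flat element index, convert it to coordinates, and treat an element as diagonal exactly when coords[rank - 2] == coords[rank - 1]; the replacement value is then read from the diagonal array via the first rank - 1 coordinates. A standalone sketch of that index-to-coordinates step for a dense C-ordered array (unit strides assumed, unlike the stride-aware shape::index2coords/shape::getOffset pair used in the patch):

    #include <cstdint>
    #include <vector>

    // Decompose linear index i into coordinates for a C-ordered shape;
    // the last dimension varies fastest, mirroring shape::index2coords.
    void index2coordsSketch(int64_t i, const std::vector<int64_t>& shape, std::vector<int64_t>& coords) {
        for (int d = static_cast<int>(shape.size()) - 1; d >= 0; --d) {
            coords[d] = i % shape[d];
            i /= shape[d];
        }
    }

On the CUDA side the same loop is distributed with a grid-stride pattern, and each thread keeps its private coords slice in dynamic shared memory, which is why the launcher requests threadsPerBlock * sizeof(Nd4jLong) * rank bytes of shared memory per block (plus a little slack).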