Shyrma temp (#131)

* - specifying template instantiation for certain types in float16 and bfloat16

Signed-off-by: Yurii <iuriish@yahoo.com>

* - polishing bfloat16 and float16 member functions template specialization

Signed-off-by: Yurii <iuriish@yahoo.com>

* - rewrite and overload array +-*/ scalar and scalar +-*/ array operators in the NDArray class

Signed-off-by: Yurii <iuriish@yahoo.com>

* - make corrections related to rvalue/lvalue conversions

Signed-off-by: Yurii <iuriish@yahoo.com>

* - provide move semantics in NDArray operators array +-*/ array

Signed-off-by: Yurii <iuriish@yahoo.com>
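The overload pattern behind the two commits above, shown with a minimal self-contained toy type rather than the actual NDArray code (names and internals here are illustrative only):

    #include <utility>
    #include <vector>

    // Toy stand-in for an array type that owns a buffer.
    struct Arr {
        std::vector<float> data;
    };

    // lvalue operand: the result needs its own freshly allocated buffer.
    Arr operator+(const Arr& a, float s) {
        Arr r{a.data};                 // copy the buffer
        for (auto& v : r.data) v += s;
        return r;
    }

    // rvalue operand: reuse the temporary's buffer instead of allocating.
    Arr operator+(Arr&& a, float s) {
        for (auto& v : a.data) v += s; // write in place
        return std::move(a);           // hand the buffer on to the result
    }

With both overloads present, a chained expression such as a + 1.f + 2.f allocates only once, because the intermediate temporary is consumed by the rvalue overload.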

* float16/bfloat16 tweaks

Signed-off-by: raver119 <raver119@gmail.com>

* one more tweak

Signed-off-by: raver119 <raver119@gmail.com>

* - make float16 and bfloat16 compile successfully on CUDA

Signed-off-by: Yurii <iuriish@yahoo.com>

* - do not reuse the resources of view-like arrays when move semantics is applied

Signed-off-by: Yurii <iuriish@yahoo.com>

* - get rid of pointers in signatures of NDArray methods, part 1

Signed-off-by: Yurii <iuriish@yahoo.com>

* - correction of signature of NDArray::dup method

Signed-off-by: Yurii <iuriish@yahoo.com>

* - correction of signature of NDArray::reduceAlongDimension method

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::applyIndexReduce and applyTrueBroadcast methods

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::applyReduce3 and varianceAlongDimension methods

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::tensorsAlongDimension and diagonal methods

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::allTensorsAlongDimension

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::reduceAlongDimension 2

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::applyTransform 2

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::applyPairwiseTransform 2

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::applyBroadcast 2

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::applyTrueBroadcast 2

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::applyScalar and applyScalarArr

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::lambda methods

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::reduce3 methods 2

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of the following NDArray methods: add/sub/mul/div row/column and fillAsTriangular

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::tileToShape methods

Signed-off-by: Yurii <iuriish@yahoo.com>

* - signature correction of NDArray::isShapeSameStrict method

Signed-off-by: Yurii <iuriish@yahoo.com>

* minor corrections in tests

Signed-off-by: Yurii <iuriish@yahoo.com>

* - replace reduce op in batchnorm mkldnn

Signed-off-by: Yurii <iuriish@yahoo.com>

* - add explicit template instantiations for operator+(NDArray&&, const scalar)

Signed-off-by: Yurii <iuriish@yahoo.com>
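Explicit instantiation of such an operator template follows the usual C++ pattern; a sketch of the idea (the declaration below is simplified, and the exact scalar types emitted by this commit are not visible here):

    // Header: templated operator declared once.
    template <typename T>
    ND4J_EXPORT NDArray operator+(NDArray&& arr, const T& scalar);

    // One translation unit: explicit instantiations for the supported scalar types,
    // so the definition does not have to live in the header.
    template ND4J_EXPORT NDArray operator+(NDArray&& arr, const double& scalar);
    template ND4J_EXPORT NDArray operator+(NDArray&& arr, const float& scalar);
    template ND4J_EXPORT NDArray operator+(NDArray&& arr, const int& scalar);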

* - corrections of casts in float16/bfloat16

Signed-off-by: Yurii <iuriish@yahoo.com>

* - provide move semantics in the following NDArray methods: transform, applyTrueBroadcast, transpose, reshape, permute

Signed-off-by: Yurii <iuriish@yahoo.com>

* - get rid of the duplicate of input array A in the svd CUDA op

Signed-off-by: Yurii <iuriish@yahoo.com>

* - work around a known bug in the svd CUDA API

Signed-off-by: Yurii <iuriish@yahoo.com>

* - add a temporary global memory buffer in svd CUDA when calcUV = false and m != n

Signed-off-by: Yurii <iuriish@yahoo.com>

* - remove test with bfloat16 type for betainc

Signed-off-by: Yurii <iuriish@yahoo.com>

* - resolve conflicts after master has been merged in

Signed-off-by: Yurii <iuriish@yahoo.com>

* - changed type of affected input array in fused_batch_norm

Signed-off-by: Yurii <iuriish@yahoo.com>

* - add several explicit type castings

Signed-off-by: Yurii <iuriish@yahoo.com>

* - add ND4J_EXPORT to operators

Signed-off-by: Yurii <iuriish@yahoo.com>
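ND4J_EXPORT is the library's symbol-visibility macro; conceptually it expands to the usual export attributes (this is a sketch of the common pattern, not the repository's exact definition):

    #if defined(_MSC_VER)
        #define ND4J_EXPORT __declspec(dllexport)
    #else
        #define ND4J_EXPORT __attribute__((visibility("default")))
    #endif

    // Applied to the free arithmetic operators so they are exported from the shared library:
    template <typename T>
    ND4J_EXPORT NDArray operator+(const NDArray& arr, const T& scalar);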

* - add explicit template types in instantiations of the templated arithmetic operators of the NDArray class

Signed-off-by: Yurii <iuriish@yahoo.com>

* - one more test fix

Signed-off-by: Yurii <iuriish@yahoo.com>

Co-authored-by: raver119 <raver119@gmail.com>
Branch: master
Yurii Shyrma, 2019-12-20 21:35:39 +02:00, committed by raver119
parent 3e0afadea1
commit 5d9b2a16e5
237 changed files with 5235 additions and 6513 deletions

(Two file diffs are suppressed because they are too large to display.)

@@ -133,7 +133,7 @@ namespace graph {
 if (variableSpace->hasVariable(v->getName())) {
 // symbolic feeder
 auto array = variableSpace->getVariable(v->getName())->getNDArray();
-auto vr = array->dup();
+auto vr = new NDArray(array->dup());
 // deletables.push_back(vr);
 v->setNDArray(vr);
 } else {
@@ -145,7 +145,7 @@ namespace graph {
 // if we're not using symbolic lookup - we'll use sequential approach then
 auto p = node->input()->at(cnt);
 auto array = variableSpace->getVariable(p)->getNDArray();
-auto vr = array->dup();
+auto vr = new NDArray(array->dup());
 //deletables.push_back(vr);
 v->setNDArray(vr);
 }
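This hunk reflects the changed NDArray::dup signature from the commit list above: dup() now returns an NDArray by value rather than a heap-allocated pointer, so call sites that still need a raw pointer wrap the result, roughly:

    NDArray copy = array->dup();          // value return, no manual delete
    auto vr = new NDArray(array->dup());  // heap copy where a pointer is still required;
                                          // assumes the temporary is moved into the new object
    v->setNDArray(vr);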


@@ -71,44 +71,41 @@ void NDArray::makeBothBuffersActual() const { }
 ////////////////////////////////////////////////////////////////////////
 template <typename T>
-void NDArray::fillAsTriangular(const float val, int lower, int upper, const char direction, NDArray* target) {
+void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& target, const char direction) {
 if (isS())
 throw std::runtime_error("NDArray::fillArrayAsTriangular: you can't use this method on String array!");
-if(target == nullptr)
-target = this;
-if(!isSameShape(target) && !(rankOf() == 1 && target->rankOf() == 2 && sizeAt(0) == target->sizeAt(0) && sizeAt(0) == target->sizeAt(1)))
+if(!isSameShape(target) && !(rankOf() == 1 && target.rankOf() == 2 && sizeAt(0) == target.sizeAt(0) && sizeAt(0) == target.sizeAt(1)))
 throw std::string("NDArray::fillArrayAsTriangular method: wrong shape of target array !");
 if (direction == 'u')
-lower = -target->sizeAt(-2);
+lower = -target.sizeAt(-2);
 else if (direction == 'l')
-upper = target->sizeAt(-1);
+upper = target.sizeAt(-1);
 const T value = static_cast<T>(val);
 const auto x = reinterpret_cast<const T*>(getBuffer());
-auto z = reinterpret_cast<T*>(target->getBuffer());
+auto z = reinterpret_cast<T*>(target.getBuffer());
 const int xRank = rankOf();
-const int zRank = target->rankOf();
-const auto zLen = target->lengthOf();
-const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target->getShapeInfo());
+const int zRank = target.rankOf();
+const auto zLen = target.lengthOf();
+const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target.getShapeInfo());
 auto func = PRAGMA_THREADS_FOR {
 Nd4jLong coords[MAX_RANK];
 for (auto i = start; i < stop; i += increment) {
-shape::index2coords(i, target->getShapeInfo(), coords);
-const auto zOffset = shape::getOffset(target->getShapeInfo(), coords);
+shape::index2coords(i, target.getShapeInfo(), coords);
+const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
 // if( (row + upper < col) || (row + lower > col) )
 if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
 z[zOffset] = value;
-else if (this != target) { // when this and target are different arrays
+else if (this != &target) { // when this and target are different arrays
 if (xRank != zRank)
 coords[0] = coords[1];
@@ -120,7 +117,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char
 samediff::Threads::parallel_for(func, 0, zLen);
 }
-BUILD_SINGLE_TEMPLATE(template void NDArray::fillAsTriangular, (const float val, int lower, int upper, const char direction, NDArray* target), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void NDArray::fillAsTriangular, (const float val, int lower, int upper, NDArray& target, const char direction), LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////
 void NDArray::setIdentity() {
@@ -405,11 +402,11 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
 //////////////////////////////////////////////////////////////////////////
 // create new array by repeating it the number of times given by repeats
-NDArray* NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
-auto output = new NDArray('c', ShapeUtils::evalRepeatShape(axis, repeats, *this), dataType(), getContext());
-BUILD_SINGLE_SELECTOR_TWICE(dataType(), repeat_, (*this, *output, repeats, axis), LIBND4J_TYPES);
+NDArray NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
+NDArray output('c', ShapeUtils::evalRepeatShape(axis, repeats, *this), dataType(), getContext());
+BUILD_SINGLE_SELECTOR_TWICE(dataType(), repeat_, (*this, output, repeats, axis), LIBND4J_TYPES);
 return output;
 }
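Same pattern for NDArray::repeat, which now returns its result by value; given some existing array x, a usage sketch under the new signature:

    NDArray r = x.repeat(0, {2});   // result by value; previously a heap pointer that had to be deleted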


@@ -2,35 +2,24 @@
 template<typename T>
-void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<T(T, T, T)>& func, NDArray* target) {
-if (target == nullptr)
-target = this;
-if (second == nullptr) {
-nd4j_printf("applyTriplewiseLambda requires three operands to be valid NDArrays, but Second is NULL\n","");
-throw std::runtime_error("second is null");
-}
-if (third == nullptr) {
-nd4j_printf("applyTriplewiseLambda requires three operands to be valid NDArrays, but Third is NULL\n","");
-throw std::runtime_error("third is null");
-}
+void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::function<T(T, T, T)>& func, NDArray& target) {
 if(dataType() != DataTypeUtils::fromT<T>())
 throw std::runtime_error("NDArray::applyTriplewiseLambda<T> method: wrong template parameter T, its type should be the same as type of this array!");
-if(dataType() != second->dataType() || dataType() != third->dataType() || dataType() != target->dataType())
+if(dataType() != second.dataType() || dataType() != third.dataType() || dataType() != target.dataType())
 throw std::runtime_error("NDArray::applyTriplewiseLambda<T> method: bother four arrays (this, second, third, target) should have the same type !");
-if (this->lengthOf() != second->lengthOf() || this->lengthOf() != third->lengthOf() || !this->isSameShape(second) || !this->isSameShape(third)) {
+if (this->lengthOf() != second.lengthOf() || this->lengthOf() != third.lengthOf() || !this->isSameShape(second) || !this->isSameShape(third)) {
 nd4j_printf("applyPairwiseLambda requires both operands to have the same shape\n","");
 throw std::runtime_error("Shapes mismach");
 }
 auto f = this->bufferAsT<T>();
-auto s = second->bufferAsT<T>();
-auto t = third->bufferAsT<T>();
-auto z = target->bufferAsT<T>();
-if (this->ordering() == second->ordering() && this->ordering() == third->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == second->ews() && this->ews() == third->ews()) {
+auto s = second.bufferAsT<T>();
+auto t = third.bufferAsT<T>();
+auto z = target.bufferAsT<T>();
+if (this->ordering() == second.ordering() && this->ordering() == third.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment)
@@ -44,8 +33,8 @@ void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment) {
 auto tOffset = this->getOffset(e);
-auto uOffset = second->getOffset(e);
-auto vOffset = third->getOffset(e);
+auto uOffset = second.getOffset(e);
+auto vOffset = third.getOffset(e);
 f[tOffset] = func(f[tOffset], s[uOffset], t[vOffset]);
 }
@@ -57,9 +46,9 @@ void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment) {
 auto tOffset = this->getOffset(e);
-auto uOffset = second->getOffset(e);
-auto vOffset = third->getOffset(e);
-auto zOffset = target->getOffset(e);
+auto uOffset = second.getOffset(e);
+auto vOffset = third.getOffset(e);
+auto zOffset = target.getOffset(e);
 z[zOffset] = func(f[tOffset], s[uOffset], t[vOffset]);
 }
@@ -69,46 +58,39 @@ void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::
 }
 }
 }
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<double (double, double, double)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<float (float, float, float)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<float16 (float16, float16, float16)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<bfloat16 (bfloat16, bfloat16, bfloat16)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<Nd4jLong (Nd4jLong, Nd4jLong, Nd4jLong)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<int (int, int, int)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<int16_t (int16_t, int16_t, int16_t)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<uint8_t (uint8_t, uint8_t, uint8_t)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<uint16_t (uint16_t, uint16_t, uint16_t)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<uint32_t (uint32_t, uint32_t, uint32_t)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<uint64_t (uint64_t, uint64_t, uint64_t)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<int8_t (int8_t, int8_t, int8_t)>& func, NDArray* target);
-template void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std::function<bool (bool, bool, bool)>& func, NDArray* target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<double (double, double, double)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<float (float, float, float)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<float16 (float16, float16, float16)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<bfloat16 (bfloat16, bfloat16, bfloat16)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<Nd4jLong (Nd4jLong, Nd4jLong, Nd4jLong)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<int (int, int, int)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<int16_t (int16_t, int16_t, int16_t)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<uint8_t (uint8_t, uint8_t, uint8_t)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<uint16_t (uint16_t, uint16_t, uint16_t)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<uint32_t (uint32_t, uint32_t, uint32_t)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<uint64_t (uint64_t, uint64_t, uint64_t)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<int8_t (int8_t, int8_t, int8_t)>& func, NDArray& target);
+template void NDArray::applyTriplewiseLambda(NDArray& second, NDArray &third, const std::function<bool (bool, bool, bool)>& func, NDArray& target);
 //////////////////////////////////////////////////////////////////////////
 template<typename T>
-void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<T(T, T)>& func, NDArray* target) {
-if (target == nullptr)
-target = this;
-if (other == nullptr) {
-nd4j_printf("applyPairwiseLambda requires both operands to be valid NDArrays, but Y is NULL\n","");
-throw std::runtime_error("Other is null");
-}
+void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T, T)>& func, NDArray& target) {
 if(dataType() != DataTypeUtils::fromT<T>())
 throw std::runtime_error("NDArray::applyPairwiseLambda<T> method: wrong template parameter T, its type should be the same as type of this array!");
-if(dataType() != other->dataType() || dataType() != target->dataType())
+if(dataType() != other.dataType() || dataType() != target.dataType())
 throw std::runtime_error("NDArray::applyPairwiseLambda<T> method: all three arrays (this, other, target) must have the same type !");
-if (this->lengthOf() != other->lengthOf()) {
+if (this->lengthOf() != other.lengthOf()) {
 nd4j_printf("applyPairwiseLambda requires both operands to have the same shape\n","");
 throw std::runtime_error("Shapes mismach");
 }
 auto f = this->bufferAsT<T>();
-auto s = other->bufferAsT<T>();
-auto z = target->bufferAsT<T>();
-if (this->ordering() == other->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == other->ews()) {
+auto s = other.bufferAsT<T>();
+auto z = target.bufferAsT<T>();
+if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment)
@@ -122,7 +104,7 @@ void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<T(T,
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment) {
 auto xOffset = this->getOffset(e);
-auto yOffset = other->getOffset(e);
+auto yOffset = other.getOffset(e);
 f[xOffset] = func(f[xOffset], s[yOffset]);
 }
@@ -134,8 +116,8 @@ void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<T(T,
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment) {
 auto xOffset = this->getOffset(e);
-auto yOffset = other->getOffset(e);
-auto zOffset = target->getOffset(e);
+auto yOffset = other.getOffset(e);
+auto zOffset = target.getOffset(e);
 z[zOffset] = func(f[xOffset], s[yOffset]);
 }
@@ -145,35 +127,33 @@ void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<T(T,
 }
 }
 }
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<double (double, double)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<float (float, float)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<float16 (float16, float16)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<bfloat16 (bfloat16, bfloat16)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<Nd4jLong (Nd4jLong, Nd4jLong)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<int (int, int)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<int16_t (int16_t, int16_t)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<uint8_t (uint8_t, uint8_t)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<uint16_t (uint16_t, uint16_t)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<uint32_t (uint32_t, uint32_t)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<uint64_t (uint64_t, uint64_t)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<int8_t (int8_t, int8_t)>& func, NDArray* target);
-template void NDArray::applyPairwiseLambda(const NDArray* other, const std::function<bool (bool, bool)>& func, NDArray* target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<double (double, double)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<float (float, float)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<float16 (float16, float16)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<bfloat16 (bfloat16, bfloat16)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<Nd4jLong (Nd4jLong, Nd4jLong)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<int (int, int)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<int16_t (int16_t, int16_t)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<uint8_t (uint8_t, uint8_t)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<uint16_t (uint16_t, uint16_t)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<uint32_t (uint32_t, uint32_t)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<uint64_t (uint64_t, uint64_t)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<int8_t (int8_t, int8_t)>& func, NDArray& target);
+template void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<bool (bool, bool)>& func, NDArray& target);
 //////////////////////////////////////////////////////////////////////////
 template<typename T>
-void NDArray::applyLambda(const std::function<T(T)>& func, NDArray* target) {
-if (target == nullptr)
-target = this;
+void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
 if(dataType() != DataTypeUtils::fromT<T>())
 throw std::runtime_error("NDArray::applyLambda<T> method: wrong template parameter T, its type should be the same as type of this array!");
-if(dataType() != target->dataType())
+if(dataType() != target.dataType())
 throw std::runtime_error("NDArray::applyLambda<T> method: types of this and target array should match !");
 auto f = this->bufferAsT<T>();
-auto z = target->bufferAsT<T>();
-if (this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1)) {
+auto z = target.bufferAsT<T>();
+if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment)
@@ -198,7 +178,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray* target) {
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment) {
 auto xOffset = this->getOffset(e);
-auto zOffset = target->getOffset(e);
+auto zOffset = target.getOffset(e);
 z[zOffset] = func(f[xOffset]);
 }
@@ -208,35 +188,33 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray* target) {
 }
 }
 }
-template void NDArray::applyLambda(const std::function<double(double)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<float(float)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<float16(float16)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<bfloat16(bfloat16)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<Nd4jLong(Nd4jLong)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<int16_t(int16_t)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<int32_t(int32_t)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<uint8_t(uint8_t)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<uint16_t(uint16_t)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<uint32_t(uint32_t)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<uint64_t(uint64_t)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<int8_t(int8_t)>& func, NDArray* target);
-template void NDArray::applyLambda(const std::function<bool(bool)>& func, NDArray* target);
+template void NDArray::applyLambda(const std::function<double(double)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<float(float)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<float16(float16)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<bfloat16(bfloat16)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<Nd4jLong(Nd4jLong)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<int16_t(int16_t)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<int32_t(int32_t)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<uint8_t(uint8_t)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<uint16_t(uint16_t)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<uint32_t(uint32_t)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<uint64_t(uint64_t)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<int8_t(int8_t)>& func, NDArray& target);
+template void NDArray::applyLambda(const std::function<bool(bool)>& func, NDArray& target);
 //////////////////////////////////////////////////////////////////////////
 template<typename T>
-void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDArray* target) {
-if (target == nullptr)
-target = this;
+void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDArray& target) {
 if(dataType() != DataTypeUtils::fromT<T>())
 throw std::runtime_error("NDArray::applyIndexedLambda<T> method: wrong template parameter T, its type should be the same as type of this array!");
-if(dataType() != target->dataType())
+if(dataType() != target.dataType())
 throw std::runtime_error("NDArray::applyIndexedLambda<T> method: types of this and target array should match !");
 auto f = this->bufferAsT<T>();
-auto z = target->bufferAsT<T>();
-if (this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1)) {
+auto z = target.bufferAsT<T>();
+if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment)
@@ -261,7 +239,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment) {
 auto xOffset = this->getOffset(e);
-auto zOffset = target->getOffset(e);
+auto zOffset = target.getOffset(e);
 z[zOffset] = func(e, f[xOffset]);
 }
@@ -271,44 +249,38 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
 }
 }
 }
-template void NDArray::applyIndexedLambda(const std::function<double(Nd4jLong, double)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<float(Nd4jLong, float)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<float16(Nd4jLong, float16)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<bfloat16(Nd4jLong, bfloat16)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<Nd4jLong(Nd4jLong, Nd4jLong)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<int(Nd4jLong, int)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<int16_t(Nd4jLong, int16_t)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<uint8_t (Nd4jLong, uint8_t)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<uint16_t (Nd4jLong, uint16_t)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<uint32_t (Nd4jLong, uint32_t)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<uint64_t (Nd4jLong, uint64_t)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<int8_t(Nd4jLong, int8_t)>& func, NDArray* target);
-template void NDArray::applyIndexedLambda(const std::function<bool(Nd4jLong, bool)>& func, NDArray* target);
+template void NDArray::applyIndexedLambda(const std::function<double(Nd4jLong, double)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<float(Nd4jLong, float)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<float16(Nd4jLong, float16)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<bfloat16(Nd4jLong, bfloat16)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<Nd4jLong(Nd4jLong, Nd4jLong)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<int(Nd4jLong, int)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<int16_t(Nd4jLong, int16_t)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<uint8_t (Nd4jLong, uint8_t)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<uint16_t (Nd4jLong, uint16_t)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<uint32_t (Nd4jLong, uint32_t)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<uint64_t (Nd4jLong, uint64_t)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<int8_t(Nd4jLong, int8_t)>& func, NDArray& target);
+template void NDArray::applyIndexedLambda(const std::function<bool(Nd4jLong, bool)>& func, NDArray& target);
 //////////////////////////////////////////////////////////////////////////
 template<typename T>
-void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<T(Nd4jLong, T, T)>& func, NDArray* target) {
-if (target == nullptr)
-target = this;
-if (other == nullptr) {
-nd4j_printf("applyIndexedPairwiseLambda requires both operands to be valid NDArrays, but Y is NULL\n","");
-throw std::runtime_error("Other is null");
-}
+void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(Nd4jLong, T, T)>& func, NDArray& target) {
 if(dataType() != DataTypeUtils::fromT<T>())
 throw std::runtime_error("NDArray::applyIndexedPairwiseLambda<T> method: wrong template parameter T, its type should be the same as type of this array!");
-if(dataType() != target->dataType())
+if(dataType() != target.dataType())
 throw std::runtime_error("NDArray::applyIndexedPairwiseLambda<T> method: types of this and target array should match !");
-if (this->lengthOf() != other->lengthOf()) {
+if (this->lengthOf() != other.lengthOf()) {
 nd4j_printf("applyIndexedPairwiseLambda requires both operands to have the same shape\n","");
 throw std::runtime_error("Shapes mismach");
 }
 auto f = this->bufferAsT<T>();
-auto s = other->bufferAsT<T>();
-auto z = target->bufferAsT<T>();
-if (this->ordering() == other->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == other->ews()) {
+auto s = other.bufferAsT<T>();
+auto z = target.bufferAsT<T>();
+if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment)
@@ -322,7 +294,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<T(N
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment) {
 auto xOffset = this->getOffset(e);
-auto yOffset = other->getOffset(e);
+auto yOffset = other.getOffset(e);
 f[xOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]);
 }
@@ -334,8 +306,8 @@ void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<T(N
 auto loop = PRAGMA_THREADS_FOR {
 for (auto e = start; e < stop; e += increment) {
 auto xOffset = this->getOffset(e);
-auto yOffset = other->getOffset(e);
-auto zOffset = target->getOffset(e);
+auto yOffset = other.getOffset(e);
+auto zOffset = target.getOffset(e);
 z[zOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]);
 }
@@ -345,16 +317,16 @@ void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<T(N
 }
 }
 }
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<double (Nd4jLong, double, double)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<float (Nd4jLong, float, float)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<float16 (Nd4jLong, float16, float16)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<bfloat16 (Nd4jLong, bfloat16, bfloat16)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<Nd4jLong (Nd4jLong, Nd4jLong, Nd4jLong)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<int (Nd4jLong, int, int)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<int16_t (Nd4jLong, int16_t, int16_t)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<uint8_t (Nd4jLong, uint8_t, uint8_t)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<uint16_t (Nd4jLong, uint16_t, uint16_t)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<uint32_t (Nd4jLong, uint32_t, uint32_t)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<uint64_t (Nd4jLong, uint64_t, uint64_t)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<int8_t (Nd4jLong, int8_t, int8_t)>& func, NDArray* target);
-template void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::function<bool (Nd4jLong, bool, bool)>& func, NDArray* target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<double (Nd4jLong, double, double)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<float (Nd4jLong, float, float)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<float16 (Nd4jLong, float16, float16)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<bfloat16 (Nd4jLong, bfloat16, bfloat16)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<Nd4jLong (Nd4jLong, Nd4jLong, Nd4jLong)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<int (Nd4jLong, int, int)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<int16_t (Nd4jLong, int16_t, int16_t)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<uint8_t (Nd4jLong, uint8_t, uint8_t)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<uint16_t (Nd4jLong, uint16_t, uint16_t)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<uint32_t (Nd4jLong, uint32_t, uint32_t)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<uint64_t (Nd4jLong, uint64_t, uint64_t)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<int8_t (Nd4jLong, int8_t, int8_t)>& func, NDArray& target);
+template void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<bool (Nd4jLong, bool, bool)>& func, NDArray& target);
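Under the reference-based CPU signature shown in these hunks, a pairwise lambda call reads roughly as follows (array construction omitted; the explicit template argument lets a plain lambda convert to the expected std::function):

    // x, y and z are float NDArrays of the same shape.
    x.applyPairwiseLambda<float>(y, [](float a, float b) { return a * b + 1.f; }, z);
    // In-place variant: pass the array itself as the target.
    x.applyPairwiseLambda<float>(y, [](float a, float b) { return a + b; }, x);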


@@ -2717,25 +2717,25 @@ static void _scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSub
 switch (opCode) {
 case 0:
-inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr);
+inSubArr.applyPairwiseTransform(pairwise::Add, updSubArr, inSubArr);
 break;
 case 1:
-inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr);
+inSubArr.applyPairwiseTransform(pairwise::Subtract, updSubArr, inSubArr);
 break;
 case 2:
-inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr);
+inSubArr.applyPairwiseTransform(pairwise::Multiply, updSubArr, inSubArr);
 break;
 case 3:
-inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr);
+inSubArr.applyPairwiseTransform(pairwise::Divide, updSubArr, inSubArr);
 break;
 case 4:
-inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr);
+inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, updSubArr, inSubArr);
 break;
 case 5:
-inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr);
+inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, updSubArr, inSubArr);
 break;
 case 6:
-inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr);
+inSubArr.applyPairwiseTransform(pairwise::CopyPws, updSubArr, inSubArr);
 break;
 default:
 continue;


@@ -122,35 +122,32 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
 ///////////////////////////////////////////////////////////////////
 template<typename T>
-void NDArray::fillAsTriangular(const float val, int lower, int upper, const char direction, NDArray* target) {
+void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& target, const char direction) {
 if (isS())
 throw std::runtime_error("NDArray::fillAsTriangular: you can't use this method on String array!");
-if(target == nullptr)
-target = this;
-if(!isSameShape(target) && !(rankOf() == 1 && target->rankOf() == 2 && sizeAt(0) == target->sizeAt(0) && sizeAt(0) == target->sizeAt(1)))
+if(!isSameShape(target) && !(rankOf() == 1 && target.rankOf() == 2 && sizeAt(0) == target.sizeAt(0) && sizeAt(0) == target.sizeAt(1)))
 throw std::string("NDArray::fillAsTriangular method: wrong shape of target array !");
 if (direction == 'u')
-lower = -target->sizeAt(-2);
+lower = -target.sizeAt(-2);
 else if (direction == 'l')
-upper = target->sizeAt(-1);
+upper = target.sizeAt(-1);
 const int threadsPerBlock = MAX_NUM_THREADS / 4;
-const int blocksPerGrid = (target->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-const int sharedMem = threadsPerBlock * sizeof(decltype(*target->getShapeInfo())) * target->rankOf() + 128;
+const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+const int sharedMem = threadsPerBlock * sizeof(decltype(*target.getShapeInfo())) * target.rankOf() + 128;
 PointersManager manager(getContext(), "NDArray::fillAsTriangular");
-NDArray::prepareSpecialUse({target}, {this});
-fillAsTriangularCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *getContext()->getCudaStream()>>>(getPlatformBuffer(), getPlatformShapeInfo(), target->getPlatformBuffer(), target->getPlatformShapeInfo(), static_cast<T>(val), lower, upper);
-NDArray::registerSpecialUse({target}, {this});
+NDArray::prepareSpecialUse({&target}, {this});
+fillAsTriangularCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *getContext()->getCudaStream()>>>(getPlatformBuffer(), getPlatformShapeInfo(), target.getPlatformBuffer(), target.getPlatformShapeInfo(), static_cast<T>(val), lower, upper);
+NDArray::registerSpecialUse({&target}, {this});
 manager.synchronize();
 }
-BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT void NDArray::fillAsTriangular, (const float val, int lower, int upper, const char direction, NDArray* target), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT void NDArray::fillAsTriangular, (const float val, int lower, int upper, NDArray& target, const char direction), LIBND4J_TYPES);
 ////////////////////////////////////////////////////////////////////////
 template<typename T>
@@ -457,21 +454,21 @@ BUILD_DOUBLE_TEMPLATE(template void repeatCudaLauncher, (const int blocksPerGrid
 //////////////////////////////////////////////////////////////////////////
 // create new array by repeating it the number of times given by repeats
-NDArray* NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
-auto output = new NDArray('c', ShapeUtils::evalRepeatShape(axis, repeats, *this), dataType(), getContext());
+NDArray NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
+NDArray output('c', ShapeUtils::evalRepeatShape(axis, repeats, *this), dataType(), getContext());
 const int threadsPerBlock = MAX_NUM_THREADS / 2;
-const int blocksPerGrid = (output->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-const int sharedMem = output->rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+const int sharedMem = output.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
 PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");
 const int* reps = reinterpret_cast<int*>(manager.replicatePointer(repeats.data(), repeats.size() * sizeof(int)));
-prepareSpecialUse({output}, {this});
-BUILD_SINGLE_SELECTOR_TWICE(dataType(), repeatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, getContext()->getCudaStream(), getSpecialBuffer(), getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), reps, repeats.size(), axis), LIBND4J_TYPES);
-prepareSpecialUse({output}, {this});
+prepareSpecialUse({&output}, {this});
+BUILD_SINGLE_SELECTOR_TWICE(dataType(), repeatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, getContext()->getCudaStream(), getSpecialBuffer(), getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), reps, repeats.size(), axis), LIBND4J_TYPES);
+prepareSpecialUse({&output}, {this});
 manager.synchronize();


@ -247,73 +247,73 @@ static _CUDA_G void lambdaTriplewiseKernel(void* vw, Nd4jLong *wShapeInfo, void*
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<typename Lambda> template<typename Lambda>
void NDArray::applyLambda(Lambda func, NDArray* target) { void NDArray::applyLambda(Lambda func, NDArray& target) {
auto result = target == nullptr ? this : target;
auto dtype = this->dataType(); auto dtype = this->dataType();
if (dtype != result->dataType()) if (dtype != target.dataType())
throw std::runtime_error("NDArray::applyLambda X/Z data types must be the same"); throw std::runtime_error("NDArray::applyLambda X/Z data types must be the same");
//throw datatype_exception::build("NDArray::applyLambda X/Z data types must be the same", dtype, result->dataType()); //throw datatype_exception::build("NDArray::applyLambda X/Z data types must be the same", dtype, target.dataType());
prepareSpecialUse({result}, {this}); prepareSpecialUse({&target}, {this});
BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), result->specialBuffer(), result->specialShapeInfo(), func), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), target.specialBuffer(), target.specialShapeInfo(), func), LIBND4J_TYPES);
registerSpecialUse({result}, {this}); registerSpecialUse({&target}, {this});
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<typename Lambda> template<typename Lambda>
void NDArray::applyPairwiseLambda(const NDArray* other, Lambda func, NDArray* target) { void NDArray::applyPairwiseLambda(const NDArray& other, Lambda func, NDArray& target) {
auto result = target == nullptr ? this : target;
auto dtype = this->dataType(); auto dtype = this->dataType();
if (dtype != result->dataType() || dtype != other->dataType()) if (dtype != target.dataType() || dtype != other.dataType())
throw std::runtime_error("NDArray::applyPairwiseLambda X/Y/Z data types must be the same"); throw std::runtime_error("NDArray::applyPairwiseLambda X/Y/Z data types must be the same");
//throw datatype_exception::build("NDArray::applyLambda X/Z data types must be the same", dtype, result->dataType()); //throw datatype_exception::build("NDArray::applyLambda X/Z data types must be the same", dtype, target.dataType());
prepareSpecialUse({result}, {this, other}); prepareSpecialUse({&target}, {this, &other});
BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaPairwiseLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), other->getSpecialBuffer(), other->getSpecialShapeInfo(), result->specialBuffer(), result->specialShapeInfo(), func), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaPairwiseLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), target.specialBuffer(), target.specialShapeInfo(), func), LIBND4J_TYPES);
registerSpecialUse({result}, {this, other}); registerSpecialUse({&target}, {this, &other});
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <typename Lambda> template <typename Lambda>
void NDArray::applyIndexedLambda(Lambda func, NDArray* target) { void NDArray::applyIndexedLambda(Lambda func, NDArray& target) {
auto result = target == nullptr ? this : target;
auto dtype = this->dataType(); auto dtype = this->dataType();
if (dtype != result->dataType()) if (dtype != target.dataType())
throw std::runtime_error("NDArray::applyIndexedLambda X/Z data types must be the same"); throw std::runtime_error("NDArray::applyIndexedLambda X/Z data types must be the same");
prepareSpecialUse({result}, {this}); prepareSpecialUse({&target}, {this});
BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaIndexedLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), result->specialBuffer(), result->specialShapeInfo(), func), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaIndexedLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), target.specialBuffer(), target.specialShapeInfo(), func), LIBND4J_TYPES);
registerSpecialUse({result}, {this}); registerSpecialUse({&target}, {this});
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <typename Lambda> template <typename Lambda>
void NDArray::applyIndexedPairwiseLambda(NDArray* other, Lambda func, NDArray* target) { void NDArray::applyIndexedPairwiseLambda(NDArray& other, Lambda func, NDArray& target) {
auto result = target == nullptr ? this : target;
auto dtype = this->dataType(); auto dtype = this->dataType();
if (dtype != result->dataType() || dtype != other->dataType()) if (dtype != target.dataType() || dtype != other.dataType())
throw std::runtime_error("NDArray::applyIndexedPairwiseLambda X/Y/Z data types must be the same"); throw std::runtime_error("NDArray::applyIndexedPairwiseLambda X/Y/Z data types must be the same");
prepareSpecialUse({result}, {this, other}); prepareSpecialUse({&target}, {this, &other});
BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaIndexedPairwiseLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), other->getSpecialBuffer(), other->getSpecialShapeInfo(), result->specialBuffer(), result->specialShapeInfo(), func), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaIndexedPairwiseLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), target.specialBuffer(), target.specialShapeInfo(), func), LIBND4J_TYPES);
registerSpecialUse({result}, {this, other}); registerSpecialUse({&target}, {this, &other});
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <typename Lambda> template <typename Lambda>
void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, Lambda func, NDArray* target) { void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, Lambda func, NDArray& target) {
auto result = target == nullptr ? this : target;
auto dtype = this->dataType(); auto dtype = this->dataType();
if (dtype != result->dataType() || dtype != second->dataType() || dtype != third->dataType()) if (dtype != target.dataType() || dtype != second.dataType() || dtype != third.dataType())
throw std::runtime_error("NDArray::applyTriplewiseLambda X/Y/Z data types must be the same"); throw std::runtime_error("NDArray::applyTriplewiseLambda X/Y/Z data types must be the same");
prepareSpecialUse({result}, {this, second, third}); prepareSpecialUse({&target}, {this, &second, &third});
BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaTriplewiseLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), second->specialBuffer(), second->specialShapeInfo(), third->specialBuffer(), third->specialShapeInfo(), result->specialBuffer(), result->specialShapeInfo(), func), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(dtype, LambdaHelper ,::lambdaTriplewiseLauncher(this->_context->getCudaStream(), this->specialBuffer(), this->specialShapeInfo(), second.specialBuffer(), second.specialShapeInfo(), third.specialBuffer(), third.specialShapeInfo(), target.specialBuffer(), target.specialShapeInfo(), func), LIBND4J_TYPES);
registerSpecialUse({result}, {this, second, third}); registerSpecialUse({&target}, {this, &second, &third});
} }
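
The lambda helpers above now take both their operands and the write target by reference, so the old convention of passing a nullptr target to mean "apply in place" is gone: in-place application passes the array itself as the target. A minimal usage sketch on the CPU backend, assuming the code base's <NDArray.h>/<NDArrayFactory.h> includes (the CUDA backend shown in these hunks would additionally require a device-compatible functor); the values are illustrative.

#include <NDArray.h>
#include <NDArrayFactory.h>
using namespace nd4j;

void pairwiseLambdaSketch() {
    auto x = NDArrayFactory::create<float>('c', {2, 2}, {1.f, 2.f, 3.f, 4.f});
    auto y = NDArrayFactory::create<float>('c', {2, 2}, {5.f, 6.f, 7.f, 8.f});
    auto z = NDArrayFactory::create<float>('c', {2, 2});

    // out-of-place: the target is a reference, never a pointer that may be nullptr
    x.applyPairwiseLambda(y, [](float a, float b) { return a + b; }, z);

    // in-place: the array itself is passed as its own target
    x.applyPairwiseLambda(y, [](float a, float b) { return a * b; }, x);
}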


@ -91,6 +91,10 @@ namespace nd4j {
template <typename T> template <typename T>
FORCEINLINE static bool castShapeInfo(const Nd4jLong *originalShapeInfo, T *newShapeInfo); FORCEINLINE static bool castShapeInfo(const Nd4jLong *originalShapeInfo, T *newShapeInfo);
template<typename T>
// struct scalarTypesForNDarray { static bool const value = std::is_same<double, T>::value || std::is_same<float, T>::value || std::is_same<int, T>::value || std::is_same<bfloat16, T>::value || std::is_same<float16, T>::value || std::is_same<long long, T>::value; };
struct scalarTypesForNDarray { static bool const value = std::is_same<double, T>::value || std::is_same<float, T>::value || std::is_same<int, T>::value || std::is_same<unsigned int, T>::value || std::is_same<long long, T>::value || std::is_same<unsigned long long, T>::value || std::is_same<long int, T>::value || std::is_same<long unsigned int, T>::value || std::is_same<int8_t, T>::value || std::is_same<uint8_t, T>::value || std::is_same<int16_t, T>::value || std::is_same<uint16_t, T>::value || std::is_same<bool, T>::value || std::is_same<bfloat16, T>::value || std::is_same<float16, T>::value; };
}; };
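
The widened scalarTypesForNDarray whitelist above enumerates the C++ scalar types the NDArray scalar-operator templates are meant to accept. A stand-alone illustration of how such a trait is typically consumed as an SFINAE gate; FakeArray, the shortened list, and the operator below are stand-ins for this sketch, not declarations from the code base.

#include <cstdint>
#include <type_traits>

// Shortened stand-in mirroring the shape of scalarTypesForNDarray.
template <typename T>
struct scalarWhitelistSketch {
    static bool const value = std::is_same<double, T>::value || std::is_same<float, T>::value ||
                              std::is_same<int, T>::value    || std::is_same<long long, T>::value ||
                              std::is_same<bool, T>::value   || std::is_same<int8_t, T>::value;
};

struct FakeArray { double v; };

// The overload participates in resolution only for whitelisted scalar types.
template <typename T, typename std::enable_if<scalarWhitelistSketch<T>::value, int>::type = 0>
FakeArray operator+(const FakeArray& arr, const T scalar) {
    return FakeArray{ arr.v + static_cast<double>(scalar) };
}

// FakeArray{1.0} + 2      -> compiles, int is in the list
// FakeArray{1.0} + "str"  -> rejected at overload resolution, const char* is not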


@ -44,7 +44,7 @@ namespace nd4j {
} }
NDArray* NDArrayList::read(int idx) { NDArray* NDArrayList::read(int idx) {
return readRaw(idx)->dup(); return new NDArray(readRaw(idx)->dup());
} }
nd4j::DataType NDArrayList::dataType() { nd4j::DataType NDArrayList::dataType() {
@ -136,11 +136,10 @@ namespace nd4j {
std::vector<int> args({axis}); std::vector<int> args({axis});
auto newAxis = ShapeUtils::evalDimsToExclude(array->rankOf(), args); auto newAxis = ShapeUtils::evalDimsToExclude(array->rankOf(), args);
auto result = array->allTensorsAlongDimension(newAxis); auto result = array->allTensorsAlongDimension(newAxis);
for (int e = 0; e < result->size(); e++) { for (int e = 0; e < result.size(); e++) {
auto chunk = result->at(e);//->dup(array->ordering()); auto chunk = result.at(e);//->dup(array->ordering());
write(e, chunk->dup(array->ordering())); write(e, new NDArray(chunk->dup(array->ordering())));
} }
delete result;
} }
NDArray* NDArrayList::stack() { NDArray* NDArrayList::stack() {
@ -161,7 +160,7 @@ namespace nd4j {
auto result = op.execute(inputs, {}, {}, {}); auto result = op.execute(inputs, {}, {}, {});
auto array = result->at(0)->dup(); auto array = new NDArray(result->at(0)->dup());
delete result; delete result;
@ -214,13 +213,11 @@ namespace nd4j {
auto tads = array->allTensorsAlongDimension(axis); auto tads = array->allTensorsAlongDimension(axis);
int indicesSize = indices.size(); int indicesSize = indices.size();
if (tads->size() != indicesSize) if (tads.size() != indicesSize)
throw std::runtime_error("Number of TADs should match number of indices"); throw std::runtime_error("Number of TADs should match number of indices");
for (int e = 0; e < indicesSize; e++) for (int e = 0; e < indicesSize; e++)
tads->at(e)->assign(_chunks[indices[e]]); tads.at(e)->assign(_chunks[indices[e]]);
delete tads;
return array; return array;
} }
@ -234,7 +231,7 @@ namespace nd4j {
list->_elements.store(_elements.load()); list->_elements.store(_elements.load());
for (auto const& v : _chunks) { for (auto const& v : _chunks) {
list->_chunks[v.first] = v.second->dup(); list->_chunks[v.first] = new NDArray(v.second->dup());
} }
return list; return list;
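
The NDArrayList changes above all stem from dup() and allTensorsAlongDimension() returning values rather than owning pointers: the explicit deletes disappear, and results are wrapped in new NDArray(...) only where an owning pointer is still required. A minimal sketch of the pattern, assuming the code base's <NDArray.h>; the dimension and fill value are illustrative.

#include <NDArray.h>
using namespace nd4j;

void ownershipSketch(NDArray* source) {
    // dup() returns by value now; wrap it only where an owning pointer is still needed
    NDArray  copyByValue = source->dup();
    NDArray* heapCopy    = new NDArray(source->dup());

    // the tensor set is a value type as well, so there is nothing to delete afterwards
    auto tads = source->allTensorsAlongDimension({0});
    for (int e = 0; e < tads.size(); e++)
        tads.at(e)->assign(0.);

    delete heapCopy;   // only the explicit heap copy needs manual cleanup
}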


@ -48,7 +48,7 @@ namespace nd4j {
} else { } else {
// FIXME: in some cases it's possible to have no NDArray // FIXME: in some cases it's possible to have no NDArray
if (inputVar->hasNDArray()) if (inputVar->hasNDArray())
innerVar->setNDArray(inputVar->getNDArray()->dup()); innerVar->setNDArray(new NDArray(inputVar->getNDArray()->dup()));
} }
} }


@ -56,7 +56,7 @@ namespace nd4j {
} else { } else {
// FIXME: in some cases it's possible to have no NDArray // FIXME: in some cases it's possible to have no NDArray
if (inputVar->hasNDArray()) if (inputVar->hasNDArray())
innerVar->setNDArray(inputVar->getNDArray()->dup()); innerVar->setNDArray(new NDArray(inputVar->getNDArray()->dup()));
} }
} }


@ -40,7 +40,7 @@ namespace nd4j {
result->setIndex(this->_index); result->setIndex(this->_index);
if (this->_ndarray != nullptr) if (this->_ndarray != nullptr)
result->setNDArray(this->_ndarray->template asT<N>()); result->setNDArray(new NDArray(this->_ndarray->template asT<N>()));
// FIXME: add support for ArrayList // FIXME: add support for ArrayList
if (this->_list != nullptr) { if (this->_list != nullptr) {
@ -61,7 +61,7 @@ namespace nd4j {
result->_index = this->_index; result->_index = this->_index;
if (this->_ndarray != nullptr) if (this->_ndarray != nullptr)
result->_ndarray = this->_ndarray->dup(this->_ndarray->ordering()); result->_ndarray = new NDArray(this->_ndarray->dup(this->_ndarray->ordering()));
if (this->_list != nullptr) if (this->_list != nullptr)
result->_list = this->_list->clone(); result->_list = this->_list->clone();


@ -93,7 +93,7 @@ namespace nd4j {
} }
OpBenchmark* clone() override { OpBenchmark* clone() override {
return new ScalarBenchmark((scalar::Ops) _opNum, _testName, _x == nullptr ? _x : _x->dup() , _y == nullptr ? _y : _y->dup(), _z == nullptr ? _z : _z->dup()); return new ScalarBenchmark((scalar::Ops) _opNum, _testName, _x == nullptr ? _x : new NDArray(_x->dup()) , _y == nullptr ? _y : new NDArray(_y->dup()), _z == nullptr ? _z : new NDArray(_z->dup()));
} }
}; };
} }


@ -230,17 +230,17 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, con
bool cNcont = N == 1 || C->strideAt(1) == 1; bool cNcont = N == 1 || C->strideAt(1) == 1;
if(!aMcont && !aKcont) { if(!aMcont && !aKcont) {
pA = A->dup('f'); pA = new NDArray(A->dup('f'));
toDelete.push_back(pA); toDelete.push_back(pA);
aMcont = true; aMcont = true;
} }
if(!bKcont && !bNcont) { if(!bKcont && !bNcont) {
pB = B->dup('f'); pB = new NDArray(B->dup('f'));
toDelete.push_back(pB); toDelete.push_back(pB);
bKcont = true; bKcont = true;
} }
if(!cMcont && !cNcont) { if(!cMcont && !cNcont) {
pC = C->dup('f'); pC = new NDArray(C->dup('f'));
toDelete.push_back(pC); toDelete.push_back(pC);
cMcont = true; cMcont = true;
} }
@ -332,7 +332,7 @@ NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, nd4j::NDArray*
bool aNcont = N == 1 || A->strideAt(1) == 1; bool aNcont = N == 1 || A->strideAt(1) == 1;
if(!aMcont && !aNcont) { if(!aMcont && !aNcont) {
pA = A->dup('f'); pA = new NDArray(A->dup('f'));
aMcont = true; aMcont = true;
} }
const CBLAS_ORDER blasOrder = aMcont ? CblasColMajor : CblasRowMajor; const CBLAS_ORDER blasOrder = aMcont ? CblasColMajor : CblasRowMajor;


@ -60,11 +60,10 @@ NDArray Householder<T>::evalHHmatrix(const NDArray& x) {
w.p(Nd4jLong(0), 1.f); w.p(Nd4jLong(0), 1.f);
wT.assign(&w); wT.assign(&w);
auto identity = NDArrayFactory::create(x.ordering(), {(int)x.lengthOf(), (int)x.lengthOf()}, x.dataType(), x.getContext()); NDArray identity = NDArrayFactory::create(x.ordering(), {(int)x.lengthOf(), (int)x.lengthOf()}, x.dataType(), x.getContext());
identity.setIdentity(); // identity matrix identity.setIdentity(); // identity matrix
return identity - mmul(w, wT) * coeff; return identity - mmul(w, wT) * coeff;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@ -95,9 +94,9 @@ void Householder<T>::evalHHmatrixData(const NDArray& x, NDArray& tail, T& coeff,
coeff = -u0 / normX; coeff = -u0 / normX;
if(x.isRowVector()) if(x.isRowVector())
tail.assign(x({0,0, 1,-1}) / u0); tail.assign(static_cast<const NDArray&>(x({0,0, 1,-1})) / u0);
else else
tail.assign(x({1,-1, 0,0,}) / u0); tail.assign(static_cast<const NDArray&>(x({1,-1, 0,0,})) / u0);
} }
} }
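
The static_cast<const NDArray&> insertions above force the const-lvalue overloads of the arithmetic operators onto sub-array views, presumably so a view produced by operator()(...) is never consumed as a movable temporary by the new rvalue overloads. A short sketch of the same pattern, assuming the code base's <NDArray.h>; the interval indices are taken from the hunk above.

#include <NDArray.h>
using namespace nd4j;

void viewArithmeticSketch(NDArray& x, NDArray& tail, float u0) {
    // x({0,0, 1,-1}) is a view into x; the cast selects the lvalue operator/ overload
    tail.assign(static_cast<const NDArray&>(x({0,0, 1,-1})) / u0);
}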


@ -269,7 +269,7 @@ void JacobiSVD<T>::evalData(const NDArray& matrix) {
HHcolPivQR qr(matrix / scale); HHcolPivQR qr(matrix / scale);
_m.assign(qr._qr({0,_cols, 0,_cols})); _m.assign(qr._qr({0,_cols, 0,_cols}));
_m.fillAsTriangular<T>(0., 0, 0, 'l'); _m.fillAsTriangular<T>(0., 0, 0, _m, 'l');
HHsequence hhSeg(qr._qr, qr._coeffs, 'u'); HHsequence hhSeg(qr._qr, qr._coeffs, 'u');
@ -288,7 +288,7 @@ void JacobiSVD<T>::evalData(const NDArray& matrix) {
auto matrixT = matrix.transpose(); auto matrixT = matrix.transpose();
HHcolPivQR qr(matrixT / scale); HHcolPivQR qr(matrixT / scale);
_m.assign(qr._qr({0,_rows, 0,_rows})); _m.assign(qr._qr({0,_rows, 0,_rows}));
_m.fillAsTriangular<T>(0., 0, 0, 'l'); _m.fillAsTriangular<T>(0., 0, 0, _m, 'l');
_m.transposei(); _m.transposei();
HHsequence hhSeg(qr._qr, qr._coeffs, 'u'); // type = 'u' is not mistake here ! HHsequence hhSeg(qr._qr, qr._coeffs, 'u'); // type = 'u' is not mistake here !
@ -305,7 +305,7 @@ void JacobiSVD<T>::evalData(const NDArray& matrix) {
} }
else { else {
_m.assign(matrix({0,_diagSize, 0,_diagSize}) / scale); _m.assign(static_cast<const NDArray&>(matrix({0,_diagSize, 0,_diagSize})) / scale);
if(_calcU) if(_calcU)
_u.setIdentity(); _u.setIdentity();
@ -366,7 +366,7 @@ void JacobiSVD<T>::evalData(const NDArray& matrix) {
_s.p(i, math::nd4j_abs<T>(_m.e<T>(i,i))); _s.p(i, math::nd4j_abs<T>(_m.e<T>(i,i)));
if(_calcU && _m.e<T>(i,i) < (T)0.) { if(_calcU && _m.e<T>(i,i) < (T)0.) {
auto temp = _u({0,0, i,i+1}, true); auto temp = _u({0,0, i,i+1}, true);
temp.applyTransform(transform::Neg, &temp, nullptr); temp.applyTransform(transform::Neg, temp, nullptr);
} }
} }


@ -223,26 +223,26 @@ void SVD<T>::deflation(int col1, int col2, int ind, int row1W, int col1W, int sh
const T almostZero = DataTypeUtils::min<T>(); const T almostZero = DataTypeUtils::min<T>();
T maxElem; T maxElem;
if(len == 1) if(len == 1)
maxElem = math::nd4j_abs<T>(diagInterval->template e<T>(0)); maxElem = math::nd4j_abs<T>(diagInterval.template e<T>(0));
else else
maxElem = (*diagInterval)({1,-1, 0,0}, true).reduceNumber(reduce::AMax).template e<T>(0); maxElem = diagInterval({1,-1, 0,0}, true).reduceNumber(reduce::AMax).template e<T>(0);
T maxElem0 = colVec0->reduceNumber(reduce::AMax).template e<T>(0); T maxElem0 = colVec0->reduceNumber(reduce::AMax).template e<T>(0);
T eps = math::nd4j_max<T>(almostZero, DataTypeUtils::eps<T>() * maxElem); T eps = math::nd4j_max<T>(almostZero, DataTypeUtils::eps<T>() * maxElem);
T epsBig = (T)8. * DataTypeUtils::eps<T>() * math::nd4j_max<T>(maxElem0, maxElem); T epsBig = (T)8. * DataTypeUtils::eps<T>() * math::nd4j_max<T>(maxElem0, maxElem);
if(diagInterval->template e<T>(0) < epsBig) if(diagInterval.template e<T>(0) < epsBig)
diagInterval->p(Nd4jLong(0), epsBig); diagInterval.p(Nd4jLong(0), epsBig);
for(int i=1; i < len; ++i) for(int i=1; i < len; ++i)
if(math::nd4j_abs<T>(colVec0->template e<T>(i)) < eps) if(math::nd4j_abs<T>(colVec0->template e<T>(i)) < eps)
colVec0->p(i, 0.f); colVec0->p(i, 0.f);
for(int i=1; i < len; i++) for(int i=1; i < len; i++)
if(diagInterval->template e<T>(i) < epsBig) { if(diagInterval.template e<T>(i) < epsBig) {
deflation1(col1, shift, i, len); deflation1(col1, shift, i, len);
for(int i = 0; i < len; ++i) for(int i = 0; i < len; ++i)
diagInterval->p(i, _m.e<T>(col1+shift+i,col1+shift+i)); diagInterval.p(i, _m.e<T>(col1+shift+i,col1+shift+i));
} }
{ {
@ -261,7 +261,7 @@ void SVD<T>::deflation(int col1, int col2, int ind, int row1W, int col1W, int sh
int p = 1; int p = 1;
for(int i=1; i<len; ++i) for(int i=1; i<len; ++i)
if(math::nd4j_abs<T>(diagInterval->template e<T>(i)) < almostZero) if(math::nd4j_abs<T>(diagInterval.template e<T>(i)) < almostZero)
permut[p++] = i; permut[p++] = i;
int k = 1, m = ind+1; int k = 1, m = ind+1;
@ -271,7 +271,7 @@ void SVD<T>::deflation(int col1, int col2, int ind, int row1W, int col1W, int sh
permut[p] = m++; permut[p] = m++;
else if(m >= len) else if(m >= len)
permut[p] = k++; permut[p] = k++;
else if(diagInterval->template e<T>(k) < diagInterval->template e<T>(m)) else if(diagInterval.template e<T>(k) < diagInterval.template e<T>(m))
permut[p] = m++; permut[p] = m++;
else else
permut[p] = k++; permut[p] = k++;
@ -281,7 +281,7 @@ void SVD<T>::deflation(int col1, int col2, int ind, int row1W, int col1W, int sh
if(totDefl) { if(totDefl) {
for(int i=1; i<len; ++i) { for(int i=1; i<len; ++i) {
int ki = permut[i]; int ki = permut[i];
if(math::nd4j_abs<T>(diagInterval->template e<T>(ki)) < almostZero || diagInterval->template e<T>(0) < diagInterval->template e<T>(ki)) if(math::nd4j_abs<T>(diagInterval.template e<T>(ki)) < almostZero || diagInterval.template e<T>(0) < diagInterval.template e<T>(ki))
permut[i-1] = permut[i]; permut[i-1] = permut[i];
else { else {
permut[i-1] = 0; permut[i-1] = 0;
@ -303,10 +303,10 @@ void SVD<T>::deflation(int col1, int col2, int ind, int row1W, int col1W, int sh
const int ki = permut[len - (totDefl ? i+1 : i)]; const int ki = permut[len - (totDefl ? i+1 : i)];
const int jac = tCol[ki]; const int jac = tCol[ki];
T _e0 = diagInterval->template e<T>(jac); T _e0 = diagInterval.template e<T>(jac);
//math::nd4j_swap<T>(diagInterval)(i), (*diagInterval)(jac)); //math::nd4j_swap<T>(diagInterval)(i), (*diagInterval)(jac));
diagInterval->p(jac, diagInterval->template e<T>(i)); diagInterval.p(jac, diagInterval.template e<T>(i));
diagInterval->p(i, _e0); diagInterval.p(i, _e0);
if(i!=0 && jac!=0) { if(i!=0 && jac!=0) {
_e0 = colVec0->template e<T>(jac); _e0 = colVec0->template e<T>(jac);
@ -315,9 +315,8 @@ void SVD<T>::deflation(int col1, int col2, int ind, int row1W, int col1W, int sh
colVec0->p(i, _e0); colVec0->p(i, _e0);
} }
NDArray* temp1 = nullptr, *temp2 = nullptr;
if (_calcU) { if (_calcU) {
auto temp1 = _u({col1,col1+len+1, col1+i, col1+i+1}, true); auto temp1 = _u({col1,col1+len+1, col1+i, col1+i+1}, true);
auto temp2 = _u({col1,col1+len+1, col1+jac,col1+jac+1}, true); auto temp2 = _u({col1,col1+len+1, col1+jac,col1+jac+1}, true);
auto temp3 = temp1; auto temp3 = temp1;
temp1.assign(temp2); temp1.assign(temp2);
@ -352,12 +351,12 @@ void SVD<T>::deflation(int col1, int col2, int ind, int row1W, int col1W, int sh
{ {
int i = len-1; int i = len-1;
while(i > 0 && (math::nd4j_abs<T>(diagInterval->template e<T>(i)) < almostZero || math::nd4j_abs<T>(colVec0->template e<T>(i)) < almostZero)) while(i > 0 && (math::nd4j_abs<T>(diagInterval.template e<T>(i)) < almostZero || math::nd4j_abs<T>(colVec0->template e<T>(i)) < almostZero))
--i; --i;
for(; i > 1; --i) { for(; i > 1; --i) {
if( (diagInterval->template e<T>(i) - diagInterval->template e<T>(i-1)) < DataTypeUtils::eps<T>()*maxElem ) { if( (diagInterval.template e<T>(i) - diagInterval.template e<T>(i-1)) < DataTypeUtils::eps<T>()*maxElem ) {
if (math::nd4j_abs<T>(diagInterval->template e<T>(i) - diagInterval->template e<T>(i-1)) >= epsBig) if (math::nd4j_abs<T>(diagInterval.template e<T>(i) - diagInterval.template e<T>(i-1)) >= epsBig)
throw std::runtime_error("ops::helpers::SVD::deflation: diagonal elements are not properly sorted !"); throw std::runtime_error("ops::helpers::SVD::deflation: diagonal elements are not properly sorted !");
deflation2(col1, col1 + shift, row1W, col1W, i-1, i, len); deflation2(col1, col1 + shift, row1W, col1W, i-1, i, len);
} }
@ -365,7 +364,6 @@ void SVD<T>::deflation(int col1, int col2, int ind, int row1W, int col1W, int sh
} }
delete colVec0; delete colVec0;
delete diagInterval;
} }
@ -609,9 +607,7 @@ void SVD<T>::calcBlockSVD(int col1, int size, NDArray& U, NDArray& singVals, NDA
const T almostZero = DataTypeUtils::min<T>(); const T almostZero = DataTypeUtils::min<T>();
auto col0 = _m({col1, col1+size, col1, col1+1}, true); auto col0 = _m({col1, col1+size, col1, col1+1}, true);
auto diagP = _m({col1, col1+size, col1, col1+size}, true).diagonal('c'); auto diag = static_cast<const NDArray&>(_m({col1, col1+size, col1, col1+size}, true).diagonal('c'));
auto diag = *diagP;
delete diagP;
diag.p(Nd4jLong(0), T(0)); diag.p(Nd4jLong(0), T(0));
singVals = NDArrayFactory::create<T>(_m.ordering(), {size, 1}, _m.getContext()); singVals = NDArrayFactory::create<T>(_m.ordering(), {size, 1}, _m.getContext());
@ -730,8 +726,7 @@ void SVD<T>::DivideAndConquer(int col1, int col2, int row1W, int col1W, int shif
auto temp = _m({col1+shift,col1+shift+n+1, col1+shift,col1+shift+n}, true); auto temp = _m({col1+shift,col1+shift+n+1, col1+shift,col1+shift+n}, true);
temp.assign(0.); temp.assign(0.);
auto diag = _m.diagonal('c'); auto diag = _m.diagonal('c');
(*diag)({col1+shift, col1+shift+n, 0,0}, true).assign(jac._s({0,n, 0,0}, true)); diag({col1+shift, col1+shift+n, 0,0}, true).assign(jac._s({0,n, 0,0}, true));
delete diag;
return; return;
} }
@ -762,11 +757,6 @@ void SVD<T>::DivideAndConquer(int col1, int col2, int row1W, int col1W, int shif
f.assign(_u({0,1, col1+k+1, col1+n}, true)); f.assign(_u({0,1, col1+k+1, col1+n}, true));
} }
// UofSVD.printIndexedBuffer();
// VofSVD.printIndexedBuffer();
// singVals.printIndexedBuffer();
// printf("!! \n");
if (_calcV) if (_calcV)
_v.p(row1W+k, col1W, 1.f); _v.p(row1W+k, col1W, 1.f);
@ -789,14 +779,10 @@ void SVD<T>::DivideAndConquer(int col1, int col2, int row1W, int col1W, int shif
temp.assign(_u({col1, col1+k+1, i, i+1}, true)); temp.assign(_u({col1, col1+k+1, i, i+1}, true));
} }
auto temp1 = _u({col1,col1+k+1, col1,col1+1}, true); _u({col1,col1+k+1, col1,col1+1}, true).assign(q1 * c0);
temp1.assign(q1 * c0); _u({col1,col1+k+1, col2+1,col2+2}, true).assign(q1 * (-s0));
auto temp2 = _u({col1,col1+k+1, col2+1,col2+2}, true); _u({col1+k+1,col1+n+1, col1, col1+1}, true).assign(static_cast<const NDArray&>(_u({col1+k+1, col1+n+1, col2+1, col2+2}, true)) * s0);
temp2.assign(q1 * (-s0)); _u({col1+k+1,col1+n+1, col2+1,col2+2}, true) *= c0;
auto temp3 = _u({col1+k+1,col1+n+1, col1, col1+1}, true);
temp3.assign(_u({col1+k+1, col1+n+1, col2+1, col2+2}, true) * s0);
auto temp4 =_u({col1+k+1,col1+n+1, col2+1,col2+2}, true);
temp4 *= c0;
} }
else { else {
@ -844,8 +830,7 @@ void SVD<T>::DivideAndConquer(int col1, int col2, int row1W, int col1W, int shif
auto blockM = _m({col1+shift,col1+shift+n, col1+shift,col1+shift+n}, true); auto blockM = _m({col1+shift,col1+shift+n, col1+shift,col1+shift+n}, true);
blockM = 0.f; blockM = 0.f;
auto diag = blockM.diagonal('c'); auto diag = blockM.diagonal('c');
diag->assign(singVals); diag.assign(singVals);
delete diag;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
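
diagonal('c') now hands back a view by value instead of an owning pointer, which is what removes the delete diag lines in the SVD code above. A minimal sketch, assuming the code base's <NDArray.h>.

#include <NDArray.h>
using namespace nd4j;

void diagonalSketch(NDArray& m, const NDArray& singVals) {
    auto diag = m.diagonal('c');   // view over the main diagonal, returned by value
    diag.assign(singVals);         // writes through to m; nothing to delete afterwards
}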


@ -285,17 +285,17 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, dou
bool cNcont = N == 1 || C->strideAt(1) == 1; bool cNcont = N == 1 || C->strideAt(1) == 1;
if(!aMcont && !aKcont) { if(!aMcont && !aKcont) {
pA = A->dup('f'); pA = new NDArray(A->dup('f'));
toDelete.push_back(pA); toDelete.push_back(pA);
aMcont = true; aMcont = true;
} }
if(!bKcont && !bNcont) { if(!bKcont && !bNcont) {
pB = B->dup('f'); pB = new NDArray(B->dup('f'));
toDelete.push_back(pB); toDelete.push_back(pB);
bKcont = true; bKcont = true;
} }
if(!cMcont) { if(!cMcont) {
pC = C->dup('f'); pC = new NDArray(C->dup('f'));
toDelete.push_back(pC); toDelete.push_back(pC);
cMcont = true; cMcont = true;
} }
@ -418,7 +418,7 @@ NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, nd4j::NDArray*
bool aNcont = N == 1 || A->strideAt(1) == 1; bool aNcont = N == 1 || A->strideAt(1) == 1;
if(!aMcont && !aNcont) { if(!aMcont && !aNcont) {
pA = A->dup('f'); pA = new NDArray(A->dup('f'));
aMcont = true; aMcont = true;
} }
@ -866,12 +866,12 @@ NDArray* MmulHelper::mmulNxNold2(const NDArray* A, const NDArray* B, NDArray* C,
bool cNcont = N == 1 || C->strideAt(-1) == 1; bool cNcont = N == 1 || C->strideAt(-1) == 1;
if(!aMcont && !aKcont) { if(!aMcont && !aKcont) {
pA = A->dup('c'); pA = new NDArray(A->dup('c'));
toDelete.push_back(pA); toDelete.push_back(pA);
aKcont = true; aKcont = true;
} }
if(!bKcont && !bNcont) { if(!bKcont && !bNcont) {
pB = B->dup('c'); pB = new NDArray(B->dup('c'));
toDelete.push_back(pB); toDelete.push_back(pB);
bNcont = true; bNcont = true;
} }


@ -82,7 +82,7 @@ namespace nd4j {
// now we actually apply quantization // now we actually apply quantization
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e += increment) {
rz[e] = static_cast<char>(nd4j::math::nd4j_round<float, char>(1.0f * x[e] / nd4j::math::nd4j_max<float>(amax, amin) * max_byte)); rz[e] = static_cast<char>(nd4j::math::nd4j_round<float, char>( 1.0f * static_cast<float>(x[e]) / nd4j::math::nd4j_max<float>(amax, amin) * max_byte));
} }
}; };
@ -180,7 +180,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e += increment) {
int el = x[e]; int el = x[e];
int ael = nd4j::math::nd4j_abs<int>(el) - 1; int ael = nd4j::math::nd4j_abs<int>(el) - 1;
z[ael] += el > 0 ? threshold : -threshold; z[ael] += el > 0 ? static_cast<T>(threshold) : static_cast<T>(-threshold);
} }
}; };


@ -32,21 +32,19 @@ namespace nd4j {
REQUIRE_TRUE(x->isR(), 0, "CRELU: input must be real type"); REQUIRE_TRUE(x->isR(), 0, "CRELU: input must be real type");
auto tmp = x->dup(); auto tmp = x->dup();
tmp->applyTransform(nd4j::transform::Neg, nullptr, nullptr); tmp.applyTransform(nd4j::transform::Neg, tmp);
auto z = OUTPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0);
helpers::concat(block.launchContext(), {x, tmp}, *z, x->rankOf()-1); helpers::concat(block.launchContext(), {x, &tmp}, *z, x->rankOf()-1);
// NDArrayFactory<T>::concat({x, tmp}, -1, z); // NDArrayFactory<T>::concat({x, tmp}, -1, z);
// TODO: make this configurable? // TODO: make this configurable?
double threshold = 0.0; double threshold = 0.0;
z->applyScalar(nd4j::scalar::RELU, threshold); z->applyScalar(nd4j::scalar::RELU, threshold, *z);
STORE_RESULT(z); STORE_RESULT(z);
delete tmp;
return Status::OK(); return Status::OK();
} }
@ -94,7 +92,7 @@ namespace nd4j {
auto pos = dec->at(0); auto pos = dec->at(0);
auto neg = dec->at(1); auto neg = dec->at(1);
pos->applyPairwiseTransform(nd4j::pairwise::Subtract, neg, epsilon, nullptr); pos->applyPairwiseTransform(nd4j::pairwise::Subtract, *neg, *epsilon);
delete tmpResult; delete tmpResult;
delete dec; delete dec;


@ -31,7 +31,7 @@ namespace nd4j {
auto input = INPUT_VARIABLE(0); auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0);
input->applyTransform(nd4j::transform::Cube, output, nullptr); input->applyTransform(nd4j::transform::Cube, *output);
STORE_RESULT(output); STORE_RESULT(output);
return Status::OK(); return Status::OK();


@ -32,7 +32,7 @@ namespace nd4j {
const auto alpha = block.numT() > 0 ? T_ARG(0) : 1.f; const auto alpha = block.numT() > 0 ? T_ARG(0) : 1.f;
input->applyScalar(nd4j::scalar::ELU, alpha, output); input->applyScalar(nd4j::scalar::ELU, alpha, *output);
return Status::OK(); return Status::OK();
} }


@ -30,7 +30,7 @@ namespace nd4j {
auto input = INPUT_VARIABLE(0); auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0);
input->applyTransform(nd4j::transform::HardSigmoid, output, nullptr); input->applyTransform(nd4j::transform::HardSigmoid, *output);
STORE_RESULT(output); STORE_RESULT(output);
return Status::OK(); return Status::OK();


@ -30,7 +30,7 @@ namespace nd4j {
auto input = INPUT_VARIABLE(0); auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0);
input->applyTransform(nd4j::transform::HardTanh, output, nullptr); input->applyTransform(nd4j::transform::HardTanh, *output);
STORE_RESULT(output); STORE_RESULT(output);
return Status::OK(); return Status::OK();


@ -30,7 +30,7 @@ namespace nd4j {
auto z = this->getZ(block); auto z = this->getZ(block);
// just for lulz // just for lulz
first->applyTransform(nd4j::transform::Identity, z, nullptr); first->applyTransform(nd4j::transform::Identity, *z);
STORE_RESULT(*z); STORE_RESULT(*z);


@ -33,7 +33,7 @@ namespace nd4j {
auto x = INPUT_VARIABLE(i); auto x = INPUT_VARIABLE(i);
auto z = OUTPUT_VARIABLE(i); auto z = OUTPUT_VARIABLE(i);
x->applyTransform(transform::Identity, z, nullptr); x->applyTransform(transform::Identity, *z);
} }
} }


@ -31,7 +31,7 @@ namespace nd4j {
float alpha = block.numT() > 0 ? T_ARG(0) : 0.01f; float alpha = block.numT() > 0 ? T_ARG(0) : 0.01f;
input->applyScalar(nd4j::scalar::LeakyRELU, alpha, output); input->applyScalar(nd4j::scalar::LeakyRELU, alpha, *output);
STORE_RESULT(output); STORE_RESULT(output);
return Status::OK(); return Status::OK();


@ -30,7 +30,7 @@ namespace nd4j {
auto input = INPUT_VARIABLE(0); auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0);
input->applyTransform(nd4j::transform::RationalTanh, output, nullptr); input->applyTransform(nd4j::transform::RationalTanh, *output);
STORE_RESULT(output); STORE_RESULT(output);
return Status::OK(); return Status::OK();


@ -30,7 +30,7 @@ namespace nd4j {
auto input = INPUT_VARIABLE(0); auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0);
input->applyTransform(nd4j::transform::RectifiedTanh, output, nullptr); input->applyTransform(nd4j::transform::RectifiedTanh, *output);
STORE_RESULT(output); STORE_RESULT(output);
return Status::OK(); return Status::OK();


@ -32,7 +32,7 @@ namespace nd4j {
auto scalar = block.numT() > 0 ? block.getTArguments()->at(0) : 0.0; auto scalar = block.numT() > 0 ? block.getTArguments()->at(0) : 0.0;
first->applyScalar(nd4j::scalar::RELU, scalar, z); first->applyScalar(nd4j::scalar::RELU, scalar, *z);
STORE_RESULT(*z); STORE_RESULT(*z);


@ -33,7 +33,7 @@ CONFIGURABLE_OP_IMPL(relu6, 1, 1, true, 1, 0) {
auto input = INPUT_VARIABLE(0); auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0);
input->applyScalar(nd4j::scalar::RELU6, T_ARG(0), output); input->applyScalar(nd4j::scalar::RELU6, T_ARG(0), *output);
return Status::OK(); return Status::OK();
} }


@ -30,7 +30,7 @@ namespace nd4j {
auto first = INPUT_VARIABLE(0); auto first = INPUT_VARIABLE(0);
auto z = OUTPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0);
first->applyTransform(nd4j::transform::SELU, z, nullptr); first->applyTransform(nd4j::transform::SELU, *z);
STORE_RESULT(*z); STORE_RESULT(*z);


@ -29,7 +29,7 @@ namespace nd4j {
auto first = INPUT_VARIABLE(0); auto first = INPUT_VARIABLE(0);
auto z = OUTPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0);
first->applyTransform(nd4j::transform::Sigmoid, z, nullptr); first->applyTransform(nd4j::transform::Sigmoid, *z);
STORE_RESULT(*z); STORE_RESULT(*z);


@ -30,7 +30,7 @@ namespace nd4j {
auto first = INPUT_VARIABLE(0); auto first = INPUT_VARIABLE(0);
auto z = OUTPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0);
first->applyTransform(nd4j::transform::SoftPlus, z, nullptr); first->applyTransform(nd4j::transform::SoftPlus, *z);
STORE_RESULT(*z); STORE_RESULT(*z);


@ -30,7 +30,7 @@ namespace nd4j {
auto first = INPUT_VARIABLE(0); auto first = INPUT_VARIABLE(0);
auto z = OUTPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0);
first->applyTransform(nd4j::transform::SoftSign, z, nullptr); first->applyTransform(nd4j::transform::SoftSign, *z);
STORE_RESULT(*z); STORE_RESULT(*z);


@ -30,7 +30,7 @@ namespace nd4j {
auto first = INPUT_VARIABLE(0); auto first = INPUT_VARIABLE(0);
auto z = OUTPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0);
first->applyTransform(nd4j::transform::Tanh, z, nullptr); first->applyTransform(nd4j::transform::Tanh, *z);
STORE_RESULT(*z); STORE_RESULT(*z);
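
The activation ops above all switch to the reference-target forms of applyTransform and applyScalar, passing *output (or the array itself) instead of a pointer that could be nullptr. A condensed sketch of the three call shapes, assuming the code base's <NDArray.h>; the ops and the scalar value are illustrative.

#include <NDArray.h>
using namespace nd4j;

void activationSketch(NDArray* input, NDArray* output) {
    input->applyTransform(nd4j::transform::Neg, *output);    // out-of-place into the output
    input->applyTransform(nd4j::transform::Neg, *input);     // in-place: the array is its own target
    input->applyScalar(nd4j::scalar::RELU, 0.0, *output);    // scalar op, target passed by reference too
}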


@ -34,7 +34,7 @@ namespace nd4j {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::IntOps::IntAnd, pairwise::IntOps::IntAnd, broadcast::IntOps::IntAnd), y, z, false); x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::IntOps::IntAnd, pairwise::IntOps::IntAnd, broadcast::IntOps::IntAnd), *y, *z, false);
return Status::OK(); return Status::OK();
} }


@ -34,7 +34,7 @@ namespace nd4j {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::IntOps::IntOr, pairwise::IntOps::IntOr, broadcast::IntOps::IntOr), y, z, false); x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::IntOps::IntOr, pairwise::IntOps::IntOr, broadcast::IntOps::IntOr), *y, *z, false);
return Status::OK(); return Status::OK();
} }


@ -34,7 +34,7 @@ namespace nd4j {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::IntOps::IntXor, pairwise::IntOps::IntXor, broadcast::IntOps::IntXor), y, z, false); x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::IntOps::IntXor, pairwise::IntOps::IntXor, broadcast::IntOps::IntXor), *y, *z, false);
return Status::OK(); return Status::OK();
} }


@ -34,7 +34,7 @@ namespace nd4j {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::CyclicShiftRight, pairwise::CyclicShiftRight, broadcast::CyclicShiftRight), y, z, false); x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::CyclicShiftRight, pairwise::CyclicShiftRight, broadcast::CyclicShiftRight), *y, *z, false);
return Status::OK(); return Status::OK();
} }


@ -34,7 +34,7 @@ namespace nd4j {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::CyclicShiftLeft, pairwise::CyclicShiftLeft, broadcast::CyclicShiftLeft), y, z, false); x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::CyclicShiftLeft, pairwise::CyclicShiftLeft, broadcast::CyclicShiftLeft), *y, *z, false);
return Status::OK(); return Status::OK();
} }


@ -34,7 +34,7 @@ namespace nd4j {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::ShiftRight, pairwise::ShiftRight, broadcast::ShiftRight), y, z, false); x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::ShiftRight, pairwise::ShiftRight, broadcast::ShiftRight), *y, *z, false);
return Status::OK(); return Status::OK();
} }


@ -34,7 +34,7 @@ namespace nd4j {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::ShiftLeft, pairwise::ShiftLeft, broadcast::ShiftLeft), y, z, false); x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::ShiftLeft, pairwise::ShiftLeft, broadcast::ShiftLeft), *y, *z, false);
return Status::OK(); return Status::OK();
} }
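
The bitwise and shift ops above now hand the second operand and the output to applyTrueBroadcast by reference. A sketch mirroring that call; it assumes the header set of the surrounding op files (NDArray, BroadcastIntOpsTuple and the op enums in the nd4j namespace), and the trailing bool is kept exactly as in the calls above.

#include <NDArray.h>

namespace nd4j {
    static void intBroadcastSketch(NDArray* x, NDArray* y, NDArray* z) {
        // second operand and target are dereferenced into references
        x->applyTrueBroadcast(BroadcastIntOpsTuple::custom(scalar::IntOps::IntAnd, pairwise::IntOps::IntAnd, broadcast::IntOps::IntAnd), *y, *z, false);
    }
}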


@ -44,7 +44,7 @@ namespace nd4j {
ExtraArguments arguments({a}); ExtraArguments arguments({a});
y->applyPairwiseTransform(pairwise::Axpy, x, z, &arguments); y->applyPairwiseTransform(pairwise::Axpy, *x, *z, &arguments);
return ND4J_STATUS_OK; return ND4J_STATUS_OK;
} }


@ -33,8 +33,12 @@ CUSTOM_OP_IMPL(svd, 1, 1, false, 0, 3) {
const int rank = x->rankOf(); const int rank = x->rankOf();
REQUIRE_TRUE(rank >= 2 , 0, "SVD OP: the rank of input array must be >=2, but got %i instead!", rank); REQUIRE_TRUE(rank >= 2 , 0, "SVD OP: the rank of input array must be >=2, but got %i instead!", rank);
const bool fullUV = (bool)INT_ARG(0); bool fullUV = (bool)INT_ARG(0);
const bool calcUV = (bool)INT_ARG(1); const bool calcUV = (bool)INT_ARG(1);
if(calcUV == false)
fullUV = false;
const int switchNum = INT_ARG(2); const int switchNum = INT_ARG(2);
// #ifndef __CUDABLAS__ // #ifndef __CUDABLAS__


@ -29,7 +29,7 @@ namespace nd4j {
auto x = INPUT_VARIABLE(0); auto x = INPUT_VARIABLE(0);
auto z = OUTPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0);
x->applyTransform(transform::Not, z, nullptr); x->applyTransform(transform::Not, *z);
return Status::OK(); return Status::OK();
} }


@ -70,17 +70,13 @@ namespace nd4j {
auto tadsY = y->allTensorsAlongDimension(dims); auto tadsY = y->allTensorsAlongDimension(dims);
auto tadsZ = z->allTensorsAlongDimension(dims); auto tadsZ = z->allTensorsAlongDimension(dims);
for (int e = 0; e < tadsX->size(); e++) { for (int e = 0; e < tadsX.size(); e++) {
if (!cond->e<bool>(e)) { if (!cond->e<bool>(e)) {
tadsZ->at(e)->assign(tadsY->at(e)); tadsZ.at(e)->assign(tadsY.at(e));
} else { } else {
tadsZ->at(e)->assign(tadsX->at(e)); tadsZ.at(e)->assign(tadsX.at(e));
} }
} }
delete tadsX;
delete tadsY;
delete tadsZ;
} }
} }


@ -59,17 +59,13 @@ namespace nd4j {
auto tadsY = y->allTensorsAlongDimension(dims); auto tadsY = y->allTensorsAlongDimension(dims);
auto tadsZ = z->allTensorsAlongDimension(dims); auto tadsZ = z->allTensorsAlongDimension(dims);
for (int e = 0; e < tadsX->size(); e++) { for (int e = 0; e < tadsX.size(); e++) {
if (!condition->e<bool>(e)) { if (!condition->e<bool>(e)) {
tadsZ->at(e)->assign(tadsY->at(e)); tadsZ.at(e)->assign(tadsY.at(e));
} else { } else {
tadsZ->at(e)->assign(tadsX->at(e)); tadsZ.at(e)->assign(tadsX.at(e));
} }
} }
delete tadsX;
delete tadsY;
delete tadsZ;
} }
} else { } else {
// in this case we return 2D matrix, which basically contains coordinates fo true // in this case we return 2D matrix, which basically contains coordinates fo true


@ -89,16 +89,12 @@ namespace nd4j {
auto tadsY = y->allTensorsAlongDimension(dims); auto tadsY = y->allTensorsAlongDimension(dims);
auto tadsZ = z->allTensorsAlongDimension(dims); auto tadsZ = z->allTensorsAlongDimension(dims);
for (int e = 0; e < tadsX->size(); e++) { for (int e = 0; e < tadsX.size(); e++) {
if (!condition->e<bool>(e)) if (!condition->e<bool>(e))
tadsZ->at(e)->assign(tadsY->at(e)); tadsZ.at(e)->assign(tadsY.at(e));
else else
tadsZ->at(e)->assign(tadsX->at(e)); tadsZ.at(e)->assign(tadsX.at(e));
} }
delete tadsX;
delete tadsY;
delete tadsZ;
} }
} else { } else {
// in this case we return 2D matrix, which basically contains coordinates fo true // in this case we return 2D matrix, which basically contains coordinates fo true


@ -82,14 +82,12 @@ namespace nd4j {
if (axisX.size() > 0) { if (axisX.size() > 0) {
auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisX); auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisX);
gradX->assign(sum); gradX->assign(sum);
delete sum;
} else } else
gradX->assign(epsNext); gradX->assign(epsNext);
if (axisY.size() > 0) { if (axisY.size() > 0) {
auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisY); auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisY);
gradY->assign(sum); gradY->assign(sum);
delete sum;
} else } else
gradY->assign(epsNext); gradY->assign(epsNext);
} }


@ -80,7 +80,6 @@ namespace nd4j {
if (axisY.size() > 0) { if (axisY.size() > 0) {
auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisY); auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisY);
gradY->assign(sum); gradY->assign(sum);
delete sum;
} else } else
gradY->assign(epsNext); gradY->assign(epsNext);
} }
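
The removed delete sum lines above follow from reduceAlongDimension returning the reduced array by value. A small sketch of the resulting pattern, assuming the code base's <NDArray.h>.

#include <NDArray.h>
#include <vector>
using namespace nd4j;

void reduceSumSketch(NDArray* epsNext, NDArray* grad, const std::vector<int>& axes) {
    if (!axes.empty()) {
        auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axes);   // a value, not an owning pointer
        grad->assign(sum);                                                   // no delete afterwards
    } else {
        grad->assign(epsNext);
    }
}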


@ -36,7 +36,7 @@ BROADCASTABLE_OP_IMPL(tf_atan2, 0, 0) {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
// auto tZ = BroadcastHelper<T>::template broadcastApply<simdOps::Atan2<T>>(y, x, z); // auto tZ = BroadcastHelper<T>::template broadcastApply<simdOps::Atan2<T>>(y, x, z);
x->applyTrueBroadcast(nd4j::BroadcastOpsTuple::custom(scalar::Atan2, pairwise::Atan2, broadcast::Atan2), y, z, true); x->applyTrueBroadcast(nd4j::BroadcastOpsTuple::custom(scalar::Atan2, pairwise::Atan2, broadcast::Atan2), *y, *z, true);
// if (tZ == nullptr) // if (tZ == nullptr)
// return ND4J_STATUS_KERNEL_FAILURE; // return ND4J_STATUS_KERNEL_FAILURE;


@ -81,7 +81,7 @@ namespace nd4j {
// Y gradient // Y gradient
//epsNext->applyTriplewiseLambda(x, y, lambdaY, gradY); //epsNext->applyTriplewiseLambda(x, y, lambdaY, gradY);
gradY->assign((*epsNext) * (*x) / ((*y) * (*y))); gradY->assign((*epsNext) * (*x) / ((*y) * (*y)));
gradY->applyTransform(transform::Neg, nullptr, nullptr); gradY->applyTransform(transform::Neg, *gradY);
} else if (y->isScalar()) { } else if (y->isScalar()) {
// scalar case // scalar case
@ -91,17 +91,17 @@ namespace nd4j {
//tmpX.printBuffer("SumX"); //tmpX.printBuffer("SumX");
//tmp.printBuffer("Sum Eps"); //tmp.printBuffer("Sum Eps");
gradY->assign(tmp * tmpX / ((*y) * (*y))); gradY->assign(tmp * tmpX / ((*y) * (*y)));
gradY->applyTransform(transform::Neg, nullptr, nullptr); gradY->applyTransform(transform::Neg, *gradY);
//epsNext->applyLambda(lambdaS, gradX); //epsNext->applyLambda(lambdaS, *gradX);
epsNext->applyScalarArr(scalar::Divide, y, gradX, nullptr); epsNext->applyScalarArr(scalar::Divide, *y, *gradX);
} else { } else {
// broadcast case // broadcast case
auto preX = *epsNext / *y; auto preX = *epsNext / *y;
NDArray negX(*x); NDArray negX(*x);
x->applyTransform(transform::Neg, &negX); x->applyTransform(transform::Neg, negX);
auto preY = *epsNext * negX / ((*y) * (*y)); auto preY = *epsNext * negX / ((*y) * (*y));
auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo()); auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo());
@ -110,14 +110,12 @@ namespace nd4j {
if (axisX.size() > 0) { if (axisX.size() > 0) {
auto sum = preX.reduceAlongDimension(reduce::Sum, axisX); auto sum = preX.reduceAlongDimension(reduce::Sum, axisX);
gradX->assign(sum); gradX->assign(sum);
delete sum;
} else } else
gradX->assign(preX); gradX->assign(preX);
if (axisY.size() > 0) { if (axisY.size() > 0) {
auto sum = preY.reduceAlongDimension(reduce::Sum, axisY); auto sum = preY.reduceAlongDimension(reduce::Sum, axisY);
gradY->assign(sum); gradY->assign(sum);
delete sum;
} else } else
gradY->assign(preY); gradY->assign(preY);
} }


@ -69,7 +69,7 @@ namespace nd4j {
std::unique_ptr<ResultSet> tmpResult(op.execute({x, y}, {}, {}, {})); std::unique_ptr<ResultSet> tmpResult(op.execute({x, y}, {}, {}, {}));
if (gradY->rankOf() == gradX->rankOf()) if (gradY->rankOf() == gradX->rankOf())
epsNext->applyPairwiseTransform(pairwise::Multiply, tmpResult->at(0), gradY, nullptr); epsNext->applyPairwiseTransform(pairwise::Multiply, *tmpResult->at(0), *gradY);
else // epsNext is greater than gradY else // epsNext is greater than gradY
{ {
std::vector<Nd4jLong> dims(epsNext->rankOf() * 2); std::vector<Nd4jLong> dims(epsNext->rankOf() * 2);
@ -78,7 +78,7 @@ namespace nd4j {
dims[d * 2 + 1] = 1; dims[d * 2 + 1] = 1;
} }
auto tempIn((*tmpResult->at(0))(dims)); auto tempIn((*tmpResult->at(0))(dims));
(*epsNext)(dims).applyPairwiseTransform(pairwise::Multiply, &tempIn, gradY, nullptr); (*epsNext)(dims).applyPairwiseTransform(pairwise::Multiply, tempIn, *gradY);
} }
return Status::OK(); return Status::OK();
} }
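
applyPairwiseTransform likewise takes its second operand and the target by reference now, and the ExtraArguments pointer is dropped where it was always nullptr. A one-call sketch, assuming the code base's <NDArray.h>.

#include <NDArray.h>
using namespace nd4j;

void pairwiseSketch(NDArray* epsNext, NDArray* other, NDArray* grad) {
    epsNext->applyPairwiseTransform(nd4j::pairwise::Multiply, *other, *grad);
}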


@ -79,24 +79,24 @@ CUSTOM_OP_IMPL(multiply_bp, 3, 2, false, 0, 0) {
const Nd4jLong yLen = y->lengthOf(); const Nd4jLong yLen = y->lengthOf();
if(x->isScalar() && y->isScalar()) { // both are scalars if(x->isScalar() && y->isScalar()) { // both are scalars
y->applyPairwiseTransform(pairwise::Multiply, dLdz, dLdx, nullptr); y->applyPairwiseTransform(pairwise::Multiply, *dLdz, *dLdx);
x->applyPairwiseTransform(pairwise::Multiply, dLdz, dLdy, nullptr); x->applyPairwiseTransform(pairwise::Multiply, *dLdz, *dLdy);
//dLdx->assign((*y) * (*dLdz)); //dLdx->assign((*y) * (*dLdz));
//dLdy->assign((*x) * (*dLdz)); //dLdy->assign((*x) * (*dLdz));
} }
else if(x->isScalar()) { // x is scalar and y is not else if(x->isScalar()) { // x is scalar and y is not
dLdx->assign((*y * *dLdz).reduceNumber(reduce::Sum)); dLdx->assign((*y * *dLdz).reduceNumber(reduce::Sum));
dLdz->applyScalarArr(scalar::Multiply, x, dLdy, nullptr); dLdz->applyScalarArr(scalar::Multiply, *x, *dLdy);
//dLdz->applyTrueBroadcast(broadcast::Multiply, x, dLdy, true); //dLdz->applyTrueBroadcast(broadcast::Multiply, x, dLdy, true);
} }
else if(y->isScalar()) { // y is scalar and x is not else if(y->isScalar()) { // y is scalar and x is not
dLdy->assign((*x * *dLdz).reduceNumber(reduce::Sum)); dLdy->assign((*x * *dLdz).reduceNumber(reduce::Sum));
dLdz->applyScalarArr(scalar::Multiply, y, dLdx); dLdz->applyScalarArr(scalar::Multiply, *y, *dLdx);
} }
else if(x->isSameShape(y)) { else if(x->isSameShape(y)) {
x->applyPairwiseTransform(pairwise::Multiply, dLdz, dLdy, nullptr); x->applyPairwiseTransform(pairwise::Multiply, *dLdz, *dLdy);
y->applyPairwiseTransform(pairwise::Multiply, dLdz, dLdx, nullptr); y->applyPairwiseTransform(pairwise::Multiply, *dLdz, *dLdx);
} }
else if (x->isSameShape(dLdz)) { else if (x->isSameShape(dLdz)) {
@ -104,8 +104,8 @@ CUSTOM_OP_IMPL(multiply_bp, 3, 2, false, 0, 0) {
y->tile(yTiled); y->tile(yTiled);
std::vector<int> axesForY = ShapeUtils::evalBroadcastBackwardAxis(y->getShapeInfo(), dLdz->getShapeInfo()); std::vector<int> axesForY = ShapeUtils::evalBroadcastBackwardAxis(y->getShapeInfo(), dLdz->getShapeInfo());
dLdy->assign( (*x * *dLdz).reduceAlongDims(reduce::Sum, axesForY) ); dLdy->assign( (*x * *dLdz).reduceAlongDimension(reduce::Sum, axesForY) );
yTiled.applyPairwiseTransform(pairwise::Multiply, dLdz, dLdx, nullptr); yTiled.applyPairwiseTransform(pairwise::Multiply, *dLdz, *dLdx);
} }
else if (y->isSameShape(dLdz)) { else if (y->isSameShape(dLdz)) {
@ -113,8 +113,8 @@ CUSTOM_OP_IMPL(multiply_bp, 3, 2, false, 0, 0) {
x->tile(xTiled); x->tile(xTiled);
std::vector<int> axesForX = ShapeUtils::evalBroadcastBackwardAxis(x->getShapeInfo(), dLdz->getShapeInfo()); std::vector<int> axesForX = ShapeUtils::evalBroadcastBackwardAxis(x->getShapeInfo(), dLdz->getShapeInfo());
dLdx->assign( (*y * *dLdz).reduceAlongDims(reduce::Sum, axesForX) ); dLdx->assign( (*y * *dLdz).reduceAlongDimension(reduce::Sum, axesForX) );
xTiled.applyPairwiseTransform(pairwise::Multiply, dLdz, dLdy, nullptr); xTiled.applyPairwiseTransform(pairwise::Multiply, *dLdz, *dLdy);
} }
else { else {
@ -125,8 +125,8 @@ CUSTOM_OP_IMPL(multiply_bp, 3, 2, false, 0, 0) {
std::vector<int> axesForX = ShapeUtils::evalBroadcastBackwardAxis(x->getShapeInfo(), dLdz->getShapeInfo()); std::vector<int> axesForX = ShapeUtils::evalBroadcastBackwardAxis(x->getShapeInfo(), dLdz->getShapeInfo());
std::vector<int> axesForY = ShapeUtils::evalBroadcastBackwardAxis(y->getShapeInfo(), dLdz->getShapeInfo()); std::vector<int> axesForY = ShapeUtils::evalBroadcastBackwardAxis(y->getShapeInfo(), dLdz->getShapeInfo());
dLdx->assign( (*y * *dLdz).reduceAlongDims(reduce::Sum, axesForX) ); dLdx->assign( (*y * *dLdz).reduceAlongDimension(reduce::Sum, axesForX) );
dLdy->assign( (*x * *dLdz).reduceAlongDims(reduce::Sum, axesForY) ); dLdy->assign( (*x * *dLdz).reduceAlongDimension(reduce::Sum, axesForY) );
} }
return Status::OK(); return Status::OK();
@ -182,7 +182,7 @@ DECLARE_SHAPE_FN(multiply_bp) {
T tmpX = x->template reduceNumber<simdOps::Sum<T>>(); T tmpX = x->template reduceNumber<simdOps::Sum<T>>();
gradY->assign(tmpX); gradY->assign(tmpX);
epsNext->applyLambda(lambdaS, gradX); epsNext->applyLambda(lambdaS, *gradX);
} else { } else {
// broadcast case // broadcast case


@ -71,7 +71,7 @@ namespace nd4j {
// X gradient // X gradient
//epsNext->applyPairwiseLambda(y, lambdaX, gradX); //epsNext->applyPairwiseLambda(y, lambdaX, gradX);
epsNext->applyPairwiseTransform(pairwise::Divide, y, gradX, nullptr); epsNext->applyPairwiseTransform(pairwise::Divide, *y, *gradX);
// Y gradient // Y gradient
//epsNext->applyTriplewiseLambda(x, y, lambdaY, gradY); //epsNext->applyTriplewiseLambda(x, y, lambdaY, gradY);
@ -86,14 +86,14 @@ namespace nd4j {
gradY->assign(tmp * -tmpX / ((*y) * (*y))); gradY->assign(tmp * -tmpX / ((*y) * (*y)));
//epsNext->applyLambda(lambdaS, gradX); //epsNext->applyLambda(lambdaS, gradX);
epsNext->applyScalarArr(scalar::Divide, y, gradX, nullptr); epsNext->applyScalarArr(scalar::Divide, *y, *gradX);
} else { } else {
// broadcast case // broadcast case
auto preX = *epsNext / *y; auto preX = *epsNext / *y;
NDArray negX(*x); NDArray negX(*x);
x->applyTransform(transform::Neg, &negX); x->applyTransform(transform::Neg, negX);
auto preY = *epsNext * negX / ((*y) * (*y)); auto preY = *epsNext * negX / ((*y) * (*y));
auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo()); auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo());
@ -102,14 +102,12 @@ namespace nd4j {
if (axisX.size() > 0) { if (axisX.size() > 0) {
auto sum = preX.reduceAlongDimension(reduce::Sum, axisX); auto sum = preX.reduceAlongDimension(reduce::Sum, axisX);
gradX->assign(sum); gradX->assign(sum);
delete sum;
} else } else
gradX->assign(preX); gradX->assign(preX);
if (axisY.size() > 0) { if (axisY.size() > 0) {
auto sum = preY.reduceAlongDimension(reduce::Sum, axisY); auto sum = preY.reduceAlongDimension(reduce::Sum, axisY);
gradY->assign(sum); gradY->assign(sum);
delete sum;
} else } else
gradY->assign(preY); gradY->assign(preY);
} }


@ -34,7 +34,7 @@ namespace nd4j {
BROADCAST_CHECK_EMPTY(x,y,z); BROADCAST_CHECK_EMPTY(x,y,z);
REQUIRE_TRUE(!x->isB(), 0, "REVERSEDIVIDE OP: you can't divide by bool array!"); REQUIRE_TRUE(!x->isB(), 0, "REVERSEDIVIDE OP: you can't divide by bool array!");
x->applyTrueBroadcast(BROADCAST(ReverseDivide), y, z, true); x->applyTrueBroadcast(BROADCAST(ReverseDivide), *y, *z, true);
return Status::OK(); return Status::OK();
} }
@ -67,7 +67,7 @@ namespace nd4j {
// X gradient // X gradient
//epsNext->applyTriplewiseLambda(x, y, lambdaX, gradX); //epsNext->applyTriplewiseLambda(x, y, lambdaX, gradX);
gradX->assign((*epsNext) * (*y) / ((*x) * (*x))); gradX->assign((*epsNext) * (*y) / ((*x) * (*x)));
gradX->applyTransform(transform::Neg, nullptr, nullptr); gradX->applyTransform(transform::Neg, *gradX);
// Y gradient // Y gradient
//epsNext->applyPairwiseLambda(x, lambdaY, gradY); //epsNext->applyPairwiseLambda(x, lambdaY, gradY);
gradY->assign((*epsNext) / (*x)); gradY->assign((*epsNext) / (*x));
@ -78,14 +78,14 @@ namespace nd4j {
gradY->assign(tmp / tmpX); gradY->assign(tmp / tmpX);
gradX->assign((*epsNext) * (*y) / ((*x) * (*x))); gradX->assign((*epsNext) * (*y) / ((*x) * (*x)));
gradX->applyTransform(transform::Neg, nullptr, nullptr); gradX->applyTransform(transform::Neg, *gradX);
} else { } else {
// broadcast case // broadcast case
auto preY = (*epsNext) / (*x); auto preY = (*epsNext) / (*x);
auto preX = *epsNext * (*y) / ((*x) * (*x)); auto preX = *epsNext * (*y) / ((*x) * (*x));
preX.applyTransform(transform::Neg, nullptr, nullptr); preX.applyTransform(transform::Neg, preX);
auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo()); auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo());
auto axisY = ShapeUtils::evalBroadcastBackwardAxis(y->shapeInfo(), epsNext->shapeInfo()); auto axisY = ShapeUtils::evalBroadcastBackwardAxis(y->shapeInfo(), epsNext->shapeInfo());
@ -93,14 +93,12 @@ namespace nd4j {
if (axisX.size() > 0) { if (axisX.size() > 0) {
auto sum = preX.reduceAlongDimension(reduce::Sum, axisX); auto sum = preX.reduceAlongDimension(reduce::Sum, axisX);
gradX->assign(sum); gradX->assign(sum);
delete sum;
} else } else
gradX->assign(preX); gradX->assign(preX);
if (axisY.size() > 0) { if (axisY.size() > 0) {
auto sum = preY.reduceAlongDimension(reduce::Sum, axisY); auto sum = preY.reduceAlongDimension(reduce::Sum, axisY);
gradY->assign(sum); gradY->assign(sum);
delete sum;
} else } else
gradY->assign(preY); gradY->assign(preY);
} }


@ -61,13 +61,13 @@ namespace nd4j {
if (x->isSameShape(y)) { if (x->isSameShape(y)) {
// PWT case case // PWT case case
epsNext->applyTransform(transform::Neg, gradX, nullptr); epsNext->applyTransform(transform::Neg, *gradX);
gradY->assign(epsNext); gradY->assign(epsNext);
} else if (y->isScalar()) { } else if (y->isScalar()) {
// scalar case // scalar case
auto tmp = epsNext->reduceNumber(reduce::Sum); auto tmp = epsNext->reduceNumber(reduce::Sum);
gradY->assign(tmp); gradY->assign(tmp);
epsNext->applyTransform(transform::Neg, gradX, nullptr); epsNext->applyTransform(transform::Neg, *gradX);
} else { } else {
// broadcastable // broadcastable
auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo()); auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo());
@ -75,16 +75,14 @@ namespace nd4j {
if (axisX.size() > 0) { if (axisX.size() > 0) {
auto sum = epsNext->reduceAlongDimension(reduce::Sum, axisX); auto sum = epsNext->reduceAlongDimension(reduce::Sum, axisX);
sum->applyTransform(transform::Neg, gradX); sum.applyTransform(transform::Neg, *gradX);
delete sum;
} else { } else {
epsNext->applyTransform(transform::Neg, gradX, nullptr); epsNext->applyTransform(transform::Neg, *gradX);
} }
if (axisY.size() > 0) { if (axisY.size() > 0) {
auto sum = epsNext->reduceAlongDimension(reduce::Sum, axisY); auto sum = epsNext->reduceAlongDimension(reduce::Sum, axisY);
gradY->assign(sum); gradY->assign(sum);
delete sum;
} else { } else {
gradY->assign(epsNext); gradY->assign(epsNext);
} }


@ -98,37 +98,31 @@ namespace nd4j {
auto targetShape = epsNext->getShapeAsVector(); auto targetShape = epsNext->getShapeAsVector();
preX->tileToShape(targetShape); preX.tileToShape(targetShape, preX);
preY->tileToShape(targetShape); preY.tileToShape(targetShape, preY);
//epsNext->applyTriplewiseLambda(x, y, lambdaX, preX); //epsNext->applyTriplewiseLambda(x, y, lambdaX, preX);
//epsNext->applyTriplewiseLambda(x, y, lambdaY, preY); //epsNext->applyTriplewiseLambda(x, y, lambdaY, preY);
auto resX = (*epsNext) * ts * ((*x) - (*y)); auto resX = (*epsNext) * ts * ((*x) - (*y));
preX->assign(resX); preX.assign(resX);
auto resY = (*epsNext) * ts * ((*y) - (*x)); auto resY = (*epsNext) * ts * ((*y) - (*x));
preY->assign(resY); preY.assign(resY);
auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo()); auto axisX = ShapeUtils::evalBroadcastBackwardAxis(x->shapeInfo(), epsNext->shapeInfo());
auto axisY = ShapeUtils::evalBroadcastBackwardAxis(y->shapeInfo(), epsNext->shapeInfo()); auto axisY = ShapeUtils::evalBroadcastBackwardAxis(y->shapeInfo(), epsNext->shapeInfo());
if (axisX.size() > 0) { if (axisX.size() > 0) {
auto sum = preX->reduceAlongDimension(reduce::Sum, axisX); auto sum = preX.reduceAlongDimension(reduce::Sum, axisX);
gradX->assign(sum); gradX->assign(sum);
delete sum;
} else } else
gradX->assign(preX); gradX->assign(preX);
if (axisY.size() > 0) { if (axisY.size() > 0) {
auto sum = preY->reduceAlongDimension(reduce::Sum, axisY); auto sum = preY.reduceAlongDimension(reduce::Sum, axisY);
gradY->assign(sum); gradY->assign(sum);
delete sum;
} else } else
gradY->assign(preY); gradY->assign(preY);
delete preX;
delete preY;
} }
return Status::OK(); return Status::OK();
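
tileToShape now writes into a reference target, and the intermediate preX/preY arrays above are plain values, which removes the trailing deletes. A sketch, assuming the code base's <NDArray.h>.

#include <NDArray.h>
#include <vector>
using namespace nd4j;

void tileSketch(NDArray& pre, const std::vector<Nd4jLong>& targetShape) {
    pre.tileToShape(targetShape, pre);   // tile in place up to the broadcast target shape
}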


@ -62,7 +62,7 @@ namespace nd4j {
if (x->isSameShape(y)) { if (x->isSameShape(y)) {
// PWT case case // PWT case case
epsNext->applyTransform(transform::Neg, gradY, nullptr); epsNext->applyTransform(transform::Neg, *gradY);
gradX->assign(epsNext); gradX->assign(epsNext);
} else if (y->isScalar()) { } else if (y->isScalar()) {
// scalar case // scalar case
@ -77,16 +77,14 @@ namespace nd4j {
if (axisX.size() > 0) { if (axisX.size() > 0) {
auto sum = epsNext->reduceAlongDimension(reduce::Sum, axisX); auto sum = epsNext->reduceAlongDimension(reduce::Sum, axisX);
gradX->assign(sum); gradX->assign(sum);
delete sum;
} else } else
gradX->assign(epsNext); gradX->assign(epsNext);
if (axisY.size() > 0) { if (axisY.size() > 0) {
auto sum = epsNext->reduceAlongDimension(reduce::Sum, axisY); auto sum = epsNext->reduceAlongDimension(reduce::Sum, axisY);
sum->applyTransform(transform::Neg, gradY); sum.applyTransform(transform::Neg, *gradY);
delete sum;
} else { } else {
epsNext->applyTransform(transform::Neg, gradY); epsNext->applyTransform(transform::Neg, *gradY);
} }
} }


@ -41,10 +41,10 @@ namespace nd4j {
// but we'll ensure only one node is active, and other is disabled // but we'll ensure only one node is active, and other is disabled
if (condition->e<int>(0) == 0) { if (condition->e<int>(0) == 0) {
block.setBranch(0); block.setBranch(0);
this->storeResult(block, 0, input->dup()); this->storeResult(block, 0, new NDArray(input->dup()));
} else { } else {
block.setBranch(1); block.setBranch(1);
this->storeResult(block, 1, *input->dup()); this->storeResult(block, 1, new NDArray(input->dup()));
} }
return Status::OK(); return Status::OK();


@ -42,34 +42,34 @@ namespace nd4j {
std::unique_ptr<NDArray> ptr; std::unique_ptr<NDArray> ptr;
if (!Environment::getInstance()->isExperimentalBuild()) { if (!Environment::getInstance()->isExperimentalBuild()) {
if (y->dataType() != x->dataType()) { if (y->dataType() != x->dataType()) {
y = y->cast(x->dataType()); y = new NDArray(y->cast(x->dataType()));
std::unique_ptr<NDArray> ptr2(y); std::unique_ptr<NDArray> ptr2(y);
ptr.swap(ptr2); ptr.swap(ptr2);
} }
} }
if (!x->isScalar() && !y->isScalar() && x->isSameShape(y)) { if (!x->isScalar() && !y->isScalar() && x->isSameShape(y)) {
x->applyPairwiseTransform(op.p, y, z, nullptr); x->applyPairwiseTransform(op.p, *y, *z);
} else if (!x->isScalar() && y->isScalar()) { } else if (!x->isScalar() && y->isScalar()) {
x->applyScalarArr(op.s, const_cast<const NDArray*>(y), z); x->applyScalarArr(op.s, const_cast<const NDArray&>(*y), *z);
} else if (x->isScalar() && !y->isScalar()) { } else if (x->isScalar() && !y->isScalar()) {
if (z->isSameShape(y)) { if (z->isSameShape(y)) {
if (op.s == scalar::Add || op.s == scalar::Multiply ) { if (op.s == scalar::Add || op.s == scalar::Multiply ) {
y->applyScalarArr(op.s, x, z, nullptr); y->applyScalarArr(op.s, *x, *z);
} else if (op.s == scalar::SquaredSubtract) { } else if (op.s == scalar::SquaredSubtract) {
y->applyScalarArr(scalar::SquaredReverseSubtract, x, z, nullptr); y->applyScalarArr(scalar::SquaredReverseSubtract, *x, *z);
} else if (op.s == scalar::Subtract) { } else if (op.s == scalar::Subtract) {
y->applyScalarArr(scalar::ReverseSubtract, x, z, nullptr); y->applyScalarArr(scalar::ReverseSubtract, *x, *z);
} else if (op.s == scalar::Divide) { } else if (op.s == scalar::Divide) {
y->applyScalarArr(scalar::ReverseDivide, x, z, nullptr); y->applyScalarArr(scalar::ReverseDivide, *x, *z);
} else if (op.s == scalar::Pow) { } else if (op.s == scalar::Pow) {
y->applyScalarArr(scalar::ReversePow, x, z, nullptr); y->applyScalarArr(scalar::ReversePow, *x, *z);
} else if (op.s == scalar::ReverseSubtract) { } else if (op.s == scalar::ReverseSubtract) {
y->applyScalarArr(scalar::Subtract, x, z, nullptr); y->applyScalarArr(scalar::Subtract, *x, *z);
} else if (op.s == scalar::ReverseDivide) { } else if (op.s == scalar::ReverseDivide) {
y->applyScalarArr(scalar::Divide, x, z, nullptr); y->applyScalarArr(scalar::Divide, *x, *z);
} else if (op.s == scalar::MaxPairwise || op.s == scalar::MinPairwise || op.s == scalar::AMaxPairwise || op.s == scalar::AMinPairwise) { } else if (op.s == scalar::MaxPairwise || op.s == scalar::MinPairwise || op.s == scalar::AMaxPairwise || op.s == scalar::AMinPairwise) {
y->applyScalarArr(op.s, x, z, nullptr); y->applyScalarArr(op.s, *x, *z);
} else if (op.s == scalar::CopyPws) { } else if (op.s == scalar::CopyPws) {
z->assign(y); z->assign(y);
} else { } else {
@ -84,9 +84,9 @@ namespace nd4j {
return tZ; return tZ;
} }
} else if (x->isScalar() && y->isScalar()) { // x->isScalar() && y->isScalar() } else if (x->isScalar() && y->isScalar()) { // x->isScalar() && y->isScalar()
x->applyScalarArr(op.s, const_cast<const NDArray*>(y), z, nullptr); x->applyScalarArr(op.s, const_cast<const NDArray&>(*y), *z);
} else if (ShapeUtils::areShapesBroadcastable(*x, *y)) { } else if (ShapeUtils::areShapesBroadcastable(*x, *y)) {
x->applyTrueBroadcast(op, y, z, true, extraArgs); x->applyTrueBroadcast(op, *y, *z, true, extraArgs);
return z; return z;
} else { } else {
auto sx = ShapeUtils::shapeAsString(x); auto sx = ShapeUtils::shapeAsString(x);
@ -107,16 +107,16 @@ namespace nd4j {
} }
if (!x->isScalar() && !y->isScalar() && x->isSameShape(y)) { if (!x->isScalar() && !y->isScalar() && x->isSameShape(y)) {
x->applyPairwiseTransform(op.p, y, z, nullptr); x->applyPairwiseTransform(op.p, *y, *z);
} else if (ShapeUtils::areShapesBroadcastable(*x, *y)) { } else if (ShapeUtils::areShapesBroadcastable(*x, *y)) {
x->applyTrueBroadcast(op, y, z, true, extraArgs); x->applyTrueBroadcast(op, *y, *z, true, extraArgs);
return z; return z;
} else if (!x->isScalar() && y->isScalar()) { } else if (!x->isScalar() && y->isScalar()) {
x->applyScalarArr(op.s, const_cast<const NDArray*>(y), z); x->applyScalarArr(op.s, const_cast<const NDArray&>(*y), *z);
} else if (x->isScalar() && !y->isScalar()) { } else if (x->isScalar() && !y->isScalar()) {
if (z->isSameShape(y)) { if (z->isSameShape(y)) {
//z->assign(x); //z->assign(x);
x->applyPairwiseTransform(op.p, y, z, extraArgs); x->applyPairwiseTransform(op.p, *y, *z, extraArgs);
return z; return z;
} else { } else {
auto v = y->getShapeAsVector(); auto v = y->getShapeAsVector();
@ -125,9 +125,9 @@ namespace nd4j {
return tZ; return tZ;
} }
} else if (x->isScalar() && y->isScalar()) { // x->isScalar() && y->isScalar() } else if (x->isScalar() && y->isScalar()) { // x->isScalar() && y->isScalar()
x->applyScalarArr(op.s, const_cast<const NDArray*>(y), z, nullptr); x->applyScalarArr(op.s, const_cast<const NDArray&>(*y), *z);
} else if (ShapeUtils::areShapesBroadcastable(*x, *y)) { } else if (ShapeUtils::areShapesBroadcastable(*x, *y)) {
x->applyTrueBroadcast(op, y, z, true, extraArgs); x->applyTrueBroadcast(op, *y, *z, true, extraArgs);
return z; return z;
} else { } else {
auto sx = ShapeUtils::shapeAsString(x); auto sx = ShapeUtils::shapeAsString(x);
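Every branch of this dispatcher now passes operands and the output by reference and drops the trailing nullptr extra-arguments parameter. A condensed, illustrative sketch of the same dispatch; the tuple type name BroadcastOpsTuple and the defaulted extra-arguments parameter are assumptions:

    #include <NDArray.h>   // assumed libnd4j headers
    #include <stdexcept>

    using namespace nd4j;

    void applyBroadcastableOp(const BroadcastOpsTuple& op, NDArray& x, NDArray& y, NDArray& z) {
        if (!x.isScalar() && !y.isScalar() && x.isSameShape(&y))
            x.applyPairwiseTransform(op.p, y, z);          // both operands by reference
        else if (!x.isScalar() && y.isScalar())
            x.applyScalarArr(op.s, y, z);                  // scalar operand is a const NDArray&
        else if (ShapeUtils::areShapesBroadcastable(x, y))
            x.applyTrueBroadcast(op, y, z, true);          // extra args omitted, assuming a default
        else
            throw std::runtime_error("shapes are neither equal nor broadcastable");
    }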

View File

@@ -51,12 +51,12 @@ namespace nd4j {
     std::vector<int> axis = ShapeUtils::evalDimsToExclude(array->rankOf(), {0});
     auto tads = array->allTensorsAlongDimension(axis);
-    for (int e = 0; e < tads->size(); e++) {
+    for (int e = 0; e < tads.size(); e++) {
         auto idx = indices->e<int>(e);
-        if (idx >= tads->size())
+        if (idx >= tads.size())
             return ND4J_STATUS_BAD_ARGUMENTS;
-        auto arr = tads->at(e)->dup(array->ordering());
+        auto arr = new NDArray(tads.at(e)->dup(array->ordering()));
         auto res = list->write(idx, arr);
         if (res != ND4J_STATUS_OK)
             return res;
@@ -65,7 +65,6 @@ namespace nd4j {
     if (!hasList)
         //OVERWRITE_RESULT(list);
         setupResultList(list, block);
-    delete tads;
     return Status::OK();
 }
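allTensorsAlongDimension now returns its result set by value, which is why the loop switches from tads-> to tads. and the trailing delete disappears. A sketch of the new lifetime with an illustrative helper that collects heap copies of each TAD:

    #include <NDArray.h>   // assumed libnd4j headers
    #include <vector>

    using namespace nd4j;

    void duplicateTads(NDArray& array, std::vector<NDArray*>& out) {
        std::vector<int> axis = ShapeUtils::evalDimsToExclude(array.rankOf(), {0});
        auto tads = array.allTensorsAlongDimension(axis);   // value-typed result set
        for (int e = 0; e < tads.size(); e++)
            out.push_back(new NDArray(tads.at(e)->dup(array.ordering())));
    }   // no `delete tads;` here, the result set cleans itself up on scope exit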

View File

@@ -66,7 +66,7 @@ namespace nd4j {
     auto subarray = (*array)(indices);
-    auto status = list->write(e, subarray.dup(array->ordering()));
+    auto status = list->write(e, new NDArray(subarray.dup(array->ordering())));
     if (status != ND4J_STATUS_OK)
         return status;

View File

@@ -39,7 +39,7 @@ namespace nd4j {
     //nd4j_printf("Writing [%i]:\n", idx->e<int>(0));
     //input->printShapeInfo("input shape");
     //input->printIndexedBuffer("input buffer");
-    Nd4jStatus result = list->write(idx->e<int>(0), input->dup());
+    Nd4jStatus result = list->write(idx->e<int>(0), new NDArray(input->dup()));
     auto res = NDArrayFactory::create_(list->counter(), block.launchContext());
     //res->printShapeInfo("Write_list 2 output shape");
@@ -52,7 +52,7 @@ namespace nd4j {
     auto input = INPUT_VARIABLE(1);
     auto idx = INT_ARG(0);
-    Nd4jStatus result = list->write(idx, input->dup());
+    Nd4jStatus result = list->write(idx, new NDArray(input->dup()));
     auto res = NDArrayFactory::create_(list->counter(), block.launchContext());
     //res->printShapeInfo("Write_list 1 output shape");

View File

@ -169,10 +169,10 @@ CUSTOM_OP_IMPL(absolute_difference_loss_grad, 3, 3, false, 0, 1) {
NDArray E = *predictions - *labels; NDArray E = *predictions - *labels;
// dE_i/dp_i = sign(p_i - y_i) // dE_i/dp_i = sign(p_i - y_i)
E.applyTransform(nd4j::transform::Sign, dLdp); // dE/dp E.applyTransform(nd4j::transform::Sign, *dLdp); // dE/dp
// dE_i/dy_i = -sign(p_i - y_i) // dE_i/dy_i = -sign(p_i - y_i)
E.applyTransform(nd4j::transform::Abs); E.applyTransform(nd4j::transform::Abs, E);
switch (reductionMode) { switch (reductionMode) {
@ -184,7 +184,7 @@ CUSTOM_OP_IMPL(absolute_difference_loss_grad, 3, 3, false, 0, 1) {
dLdw->assign(E.reduceNumber(reduce::Sum)); dLdw->assign(E.reduceNumber(reduce::Sum));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
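The dLdw branch above is the weight-gradient pattern repeated across all the loss backward ops in this PR: when the weights were broadcast, the per-element loss E is summed back over the broadcast axes directly into dLdw, now passed by reference. A sketch of just that step (helper name is illustrative, flag order copied from the hunk above):

    #include <NDArray.h>   // assumed libnd4j headers
    #include <vector>

    using namespace nd4j;

    void reduceWeightGradient(NDArray& E, NDArray* weights, NDArray* weightsBroad, NDArray& dLdw) {
        if (weights == weightsBroad) {
            dLdw.assign(E);      // nothing was broadcast
        } else {
            std::vector<int> axes = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
            E.reduceAlongDimension(reduce::Sum, dLdw, axes, true, false, false);   // same flags as above
        }
    }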
@ -210,7 +210,7 @@ CUSTOM_OP_IMPL(absolute_difference_loss_grad, 3, 3, false, 0, 1) {
*dLdw = 0.; *dLdw = 0.;
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -238,7 +238,7 @@ CUSTOM_OP_IMPL(absolute_difference_loss_grad, 3, 3, false, 0, 1) {
dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights)); dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeightsScalar; *dLdw /= numOfNonZeroWeightsScalar;
} }
else else

View File

@ -56,7 +56,7 @@ CUSTOM_OP_IMPL(cosine_distance_loss, 3, 1, false, 0, 2) {
REQUIRE_TRUE(weights->isScalar() || ShapeUtils::areShapesBroadcastable(*weights, *output), 0, "COSINE_DISTANCE_LOSS OP: shapes of weights and output arrays should be broadcastable, but got weights = %s and output = %s instead!", ShapeUtils::shapeAsString(weights).c_str(), ShapeUtils::shapeAsString(labels).c_str()); REQUIRE_TRUE(weights->isScalar() || ShapeUtils::areShapesBroadcastable(*weights, *output), 0, "COSINE_DISTANCE_LOSS OP: shapes of weights and output arrays should be broadcastable, but got weights = %s and output = %s instead!", ShapeUtils::shapeAsString(weights).c_str(), ShapeUtils::shapeAsString(labels).c_str());
} }
NDArray E = 1. - (*predictions * *labels).reduceAlongDims(reduce::Sum, {dim}, true); NDArray E = 1. - (*predictions * *labels).reduceAlongDimension(reduce::Sum, {dim}, true);
// perform weights broadcasting/tile to E if it is necessary // perform weights broadcasting/tile to E if it is necessary
auto weightsBroad = weights; auto weightsBroad = weights;
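The only change in this hunk is the rename reduceAlongDims to reduceAlongDimension, which returns the reduced array by value. A compact illustration of the same expression, assuming the arithmetic operator overloads used above:

    #include <NDArray.h>   // assumed libnd4j headers

    using namespace nd4j;

    // E = 1 - sum_dim(predictions * labels), keeping the reduced dimension
    NDArray cosineDistanceError(NDArray& predictions, NDArray& labels, int dim) {
        return 1. - (predictions * labels).reduceAlongDimension(reduce::Sum, {dim}, true);
    }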
@ -194,7 +194,7 @@ CUSTOM_OP_IMPL(cosine_distance_loss_grad, 3, 3, false, 0, 2) {
// input dimension can't be larger than labels/predictions/weights rank // input dimension can't be larger than labels/predictions/weights rank
REQUIRE_TRUE(dim < labels->rankOf(), 0, "COSINE_DISTANCE_LOSS_GRAD OP: input reduction dimension (got %i) must be < labels rank %i!", dim, labels->rankOf()); REQUIRE_TRUE(dim < labels->rankOf(), 0, "COSINE_DISTANCE_LOSS_GRAD OP: input reduction dimension (got %i) must be < labels rank %i!", dim, labels->rankOf());
NDArray E = 1. - (*predictions * *labels).reduceAlongDims(reduce::Sum, {dim}, true); NDArray E = 1. - (*predictions * *labels).reduceAlongDimension(reduce::Sum, {dim}, true);
// perform weights broadcasting/tile to E if it is necessary // perform weights broadcasting/tile to E if it is necessary
auto weightsBroad = weights; auto weightsBroad = weights;
@ -216,7 +216,7 @@ CUSTOM_OP_IMPL(cosine_distance_loss_grad, 3, 3, false, 0, 2) {
else { else {
if(weights != weightsBroad) { if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
@ -249,7 +249,7 @@ CUSTOM_OP_IMPL(cosine_distance_loss_grad, 3, 3, false, 0, 2) {
if(weights != weightsBroad) { if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -284,7 +284,7 @@ CUSTOM_OP_IMPL(cosine_distance_loss_grad, 3, 3, false, 0, 2) {
if(weights != weightsBroad) { if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeights; *dLdw /= numOfNonZeroWeights;
} }
else else

View File

@ -52,7 +52,7 @@ namespace nd4j {
// We first need to convert binary labels to -1/1 labels (as floats) // We first need to convert binary labels to -1/1 labels (as floats)
NDArray E = 1.f - (*labels * 2.f - 1.f) * (*logits); NDArray E = 1.f - (*labels * 2.f - 1.f) * (*logits);
E.applyScalar(scalar::RELU, 0.0f, &E); E.applyScalar(scalar::RELU, 0.0f, E);
// multiply E on weights // multiply E on weights
E *= *weightsBroad; E *= *weightsBroad;
@ -172,11 +172,11 @@ namespace nd4j {
NDArray z = (*labels * 2.f - 1.f); NDArray z = (*labels * 2.f - 1.f);
NDArray E = 1.f - z * (*logits); NDArray E = 1.f - z * (*logits);
E.applyScalar(scalar::RELU, 0.0f, &E); E.applyScalar(scalar::RELU, 0.0f, E);
// turn E into gradient mask // turn E into gradient mask
NDArray gradientMask(E.getShapeInfo(), block.getWorkspace()); NDArray gradientMask(E.getShapeInfo(), block.getWorkspace());
E.applyTransform(nd4j::transform::Sign, &gradientMask); E.applyTransform(nd4j::transform::Sign, gradientMask);
dLdp->assign(-z * gradientMask); dLdp->assign(-z * gradientMask);
dLdl->assign(-2.f * (*logits) * gradientMask); dLdl->assign(-2.f * (*logits) * gradientMask);
@ -192,7 +192,7 @@ namespace nd4j {
dLdw->assign(E.reduceNumber(reduce::Sum)); dLdw->assign(E.reduceNumber(reduce::Sum));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
@ -220,7 +220,7 @@ namespace nd4j {
*dLdw = 0.; *dLdw = 0.;
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -249,7 +249,7 @@ namespace nd4j {
dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights)); dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeightsScalar; *dLdw /= numOfNonZeroWeightsScalar;
} }
else else

View File

@ -53,9 +53,9 @@ CUSTOM_OP_IMPL(huber_loss, 3, 1, false, 1, 1) {
weightsBroad = new NDArray(weights->tileToShape(predictions->getShapeInfo())); weightsBroad = new NDArray(weights->tileToShape(predictions->getShapeInfo()));
auto error = *predictions - *labels; auto error = *predictions - *labels;
error.applyTransform(transform::Abs); error.applyTransform(transform::Abs, error);
NDArray quadratic(error.getShapeInfo(), block.getWorkspace()); NDArray quadratic(error.getShapeInfo(), block.getWorkspace());
error.applyScalar(scalar::MinPairwise, delta, &quadratic); error.applyScalar(scalar::MinPairwise, delta, quadratic);
NDArray E = quadratic * quadratic * 0.5f + (error - quadratic)*delta; NDArray E = quadratic * quadratic * 0.5f + (error - quadratic)*delta;
@ -173,24 +173,24 @@ DECLARE_SHAPE_FN(huber_loss) {
NDArray diff = *predictions - *labels; NDArray diff = *predictions - *labels;
NDArray absDiff(diff); NDArray absDiff(diff);
absDiff.applyTransform(transform::Abs); absDiff.applyTransform(transform::Abs, absDiff);
NDArray quadratic(absDiff); NDArray quadratic(absDiff);
absDiff.applyScalar(scalar::MinPairwise, delta, &quadratic); absDiff.applyScalar(scalar::MinPairwise, delta, quadratic);
NDArray E = quadratic * quadratic * 0.5f + (absDiff - quadratic)*delta; NDArray E = quadratic * quadratic * 0.5f + (absDiff - quadratic)*delta;
NDArray lteMask(diff.getShapeInfo(), BOOL, true, block.launchContext()); NDArray lteMask(diff.getShapeInfo(), BOOL, true, block.launchContext());
absDiff.applyScalar(scalar::LessThanOrEqual, delta, &lteMask); absDiff.applyScalar(scalar::LessThanOrEqual, delta, lteMask);
NDArray gtMask(diff.getShapeInfo(), BOOL, true, block.launchContext()); NDArray gtMask(diff.getShapeInfo(), BOOL, true, block.launchContext());
absDiff.applyScalar(scalar::GreaterThan, delta, &gtMask); absDiff.applyScalar(scalar::GreaterThan, delta, gtMask);
NDArray signDiff(diff); NDArray signDiff(diff);
diff.applyTransform(transform::Sign, &signDiff); diff.applyTransform(transform::Sign, signDiff);
auto gtMaskFloat = *gtMask.cast(diff.dataType()); auto gtMaskFloat = gtMask.cast(diff.dataType());
auto lteMaskFloat = *lteMask.cast(diff.dataType()); auto lteMaskFloat = lteMask.cast(diff.dataType());
dLdp->assign( lteMaskFloat * diff + gtMaskFloat * delta * signDiff); dLdp->assign( lteMaskFloat * diff + gtMaskFloat * delta * signDiff);
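Two API changes meet here: applyScalar and applyTransform take their output by reference, and cast() returns a new NDArray by value, so the extra dereference disappears. A sketch of the mask-based Huber gradient under those assumptions; the helper name and the explicit LaunchContext parameter are illustrative:

    #include <NDArray.h>   // assumed libnd4j headers

    using namespace nd4j;

    void huberGradWrtPredictions(NDArray& diff, double delta, NDArray& dLdp, LaunchContext* ctx) {
        NDArray absDiff(diff);
        absDiff.applyTransform(transform::Abs, absDiff);                  // in place, explicit target
        NDArray lteMask(diff.getShapeInfo(), DataType::BOOL, true, ctx);
        absDiff.applyScalar(scalar::LessThanOrEqual, delta, lteMask);     // |d| <= delta
        NDArray gtMask(diff.getShapeInfo(), DataType::BOOL, true, ctx);
        absDiff.applyScalar(scalar::GreaterThan, delta, gtMask);          // |d| >  delta
        NDArray signDiff(diff);
        diff.applyTransform(transform::Sign, signDiff);
        auto gtMaskFloat  = gtMask.cast(diff.dataType());                 // by value, no dereference
        auto lteMaskFloat = lteMask.cast(diff.dataType());
        dLdp.assign(lteMaskFloat * diff + gtMaskFloat * delta * signDiff);
    }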
@ -207,7 +207,7 @@ DECLARE_SHAPE_FN(huber_loss) {
dLdw->assign(E.reduceNumber(reduce::Sum)); dLdw->assign(E.reduceNumber(reduce::Sum));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
@ -235,7 +235,7 @@ DECLARE_SHAPE_FN(huber_loss) {
*dLdw = 0.; *dLdw = 0.;
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -264,7 +264,7 @@ DECLARE_SHAPE_FN(huber_loss) {
dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights)); dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeightsScalar; *dLdw /= numOfNonZeroWeightsScalar;
} }
else else

View File

@ -181,7 +181,7 @@ CUSTOM_OP_IMPL(log_loss_grad, 3, 3, false, 1, 1) {
// dE_i/dp_i = (1-y_i)/(1-p_i+eps) - y_i/(p_i+eps) // dE_i/dp_i = (1-y_i)/(1-p_i+eps) - y_i/(p_i+eps)
dLdp->assign(oneMinusLabels / onePlusEpsMinusPredict - *labels / predictPlusEps); // dE/dp dLdp->assign(oneMinusLabels / onePlusEpsMinusPredict - *labels / predictPlusEps); // dE/dp
// dE_i/dy_i = log((1+2eps)/(p_i+eps) - 1) // dE_i/dy_i = log((1+2eps)/(p_i+eps) - 1)
((1. + 2. * epsilon) / predictPlusEps - 1.).applyTransform(transform::Log, dLdl); // dE/dy ((1. + 2. * epsilon) / predictPlusEps - 1.).applyTransform(transform::Log, *dLdl); // dE/dy
NDArray E = -(*labels) * predictPlusEps.transform(transform::Log) - oneMinusLabels * onePlusEpsMinusPredict.transform(transform::Log); NDArray E = -(*labels) * predictPlusEps.transform(transform::Log) - oneMinusLabels * onePlusEpsMinusPredict.transform(transform::Log);
@ -196,7 +196,7 @@ CUSTOM_OP_IMPL(log_loss_grad, 3, 3, false, 1, 1) {
dLdw->assign(E.reduceNumber(reduce::Sum)); dLdw->assign(E.reduceNumber(reduce::Sum));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
@ -226,7 +226,7 @@ CUSTOM_OP_IMPL(log_loss_grad, 3, 3, false, 1, 1) {
*dLdw = 0.; *dLdw = 0.;
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -254,7 +254,7 @@ CUSTOM_OP_IMPL(log_loss_grad, 3, 3, false, 1, 1) {
dLdw->assign(E.reduceNumber(reduce::Sum) / numOfNonZeroWeights); dLdw->assign(E.reduceNumber(reduce::Sum) / numOfNonZeroWeights);
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeightsScalar; *dLdw /= numOfNonZeroWeightsScalar;
} }
else else

View File

@ -55,9 +55,9 @@ namespace ops {
NDArray E(labels->getShapeInfo(), block.getWorkspace()); NDArray E(labels->getShapeInfo(), block.getWorkspace());
if (computeFullLoss) if (computeFullLoss)
labels->applyPairwiseTransform(pairwise::LogPoissonLossFull, log_predictions, &E, nullptr); labels->applyPairwiseTransform(pairwise::LogPoissonLossFull, *log_predictions, E);
else else
labels->applyPairwiseTransform(pairwise::LogPoissonLoss, log_predictions, &E, nullptr); labels->applyPairwiseTransform(pairwise::LogPoissonLoss, *log_predictions, E);
// multiply E on weights // multiply E on weights
@ -176,13 +176,13 @@ namespace ops {
NDArray E(labels->getShapeInfo(), block.getWorkspace()); NDArray E(labels->getShapeInfo(), block.getWorkspace());
if (computeFullLoss) { if (computeFullLoss) {
labels->applyPairwiseTransform(pairwise::LogPoissonLossFull, log_predictions, &E, nullptr); labels->applyPairwiseTransform(pairwise::LogPoissonLossFull, *log_predictions, E);
NDArray rDiv(labels->getShapeInfo(), block.getWorkspace()); NDArray rDiv(labels->getShapeInfo(), block.getWorkspace());
labels->applyScalar(scalar::ReverseDivide, 0.5f, &rDiv); labels->applyScalar(scalar::ReverseDivide, 0.5f, rDiv);
dLdl->assign(rDiv + labels->transform(transform::Log) + -(*log_predictions)); dLdl->assign(rDiv + labels->transform(transform::Log) + -(*log_predictions));
} else { } else {
labels->applyPairwiseTransform(pairwise::LogPoissonLoss, log_predictions, &E, nullptr); labels->applyPairwiseTransform(pairwise::LogPoissonLoss, *log_predictions, E);
dLdl->assign(-(*log_predictions)); dLdl->assign(-(*log_predictions));
} }
@ -200,7 +200,7 @@ namespace ops {
dLdw->assign(E.reduceNumber(reduce::Sum)); dLdw->assign(E.reduceNumber(reduce::Sum));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
@ -228,7 +228,7 @@ namespace ops {
*dLdw = 0.; *dLdw = 0.;
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -257,7 +257,7 @@ namespace ops {
dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights)); dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeightsScalar; *dLdw /= numOfNonZeroWeightsScalar;
} }
else else

View File

@ -112,10 +112,10 @@ namespace nd4j {
auto n = double(labels->sizeAt(1)); auto n = double(labels->sizeAt(1));
auto diffs = *predictions - *labels; auto diffs = *predictions - *labels;
auto sumOfSquares = (diffs * diffs).reduceAlongDims(reduce::Sum, reductionIdx, true); auto sumOfSquares = (diffs * diffs).reduceAlongDimension(reduce::Sum, reductionIdx, true);
auto squareOfSum = diffs.reduceAlongDims(reduce::Sum, reductionIdx, true); auto squareOfSum = diffs.reduceAlongDimension(reduce::Sum, reductionIdx, true);
squareOfSum.applyScalar(scalar::Pow, 2); squareOfSum.applyScalar(scalar::Pow, 2, squareOfSum);
auto E = ((sumOfSquares * n) - squareOfSum) * (4/(n*(n-1))); auto E = ((sumOfSquares * n) - squareOfSum) * (4/(n*(n-1)));
@ -240,15 +240,15 @@ namespace nd4j {
auto diffs = *predictions - *labels; auto diffs = *predictions - *labels;
std::vector<int> reductionIdx = ShapeUtils::evalDimsToExclude(labels->rankOf(), {0}); std::vector<int> reductionIdx = ShapeUtils::evalDimsToExclude(labels->rankOf(), {0});
auto sumOfSquares = (diffs * diffs).reduceAlongDims(reduce::Sum, reductionIdx, true); auto sumOfSquares = (diffs * diffs).reduceAlongDimension(reduce::Sum, reductionIdx, true);
auto squareOfSum = diffs.reduceAlongDims(reduce::Sum, reductionIdx, true); auto squareOfSum = diffs.reduceAlongDimension(reduce::Sum, reductionIdx, true);
squareOfSum.applyScalar(scalar::Pow, 2); squareOfSum.applyScalar(scalar::Pow, 2, squareOfSum);
auto E = ((sumOfSquares * n) - squareOfSum) * (4/(n*(n-1))); auto E = ((sumOfSquares * n) - squareOfSum) * (4/(n*(n-1)));
auto sumPred = predictions->reduceAlongDims(reduce::Sum, reductionIdx, true); auto sumPred = predictions->reduceAlongDimension(reduce::Sum, reductionIdx, true);
auto sumLabel = labels->reduceAlongDims(reduce::Sum, reductionIdx, true); auto sumLabel = labels->reduceAlongDimension(reduce::Sum, reductionIdx, true);
dLdp->assign(((diffs * n) - sumPred + sumLabel)*(8/(n*(n-1)))); dLdp->assign(((diffs * n) - sumPred + sumLabel)*(8/(n*(n-1))));
@ -273,7 +273,7 @@ namespace nd4j {
dLdw->assign(E.reduceNumber(reduce::Sum)); dLdw->assign(E.reduceNumber(reduce::Sum));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
@ -299,7 +299,7 @@ namespace nd4j {
*dLdw = 0.; *dLdw = 0.;
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -327,7 +327,7 @@ namespace nd4j {
dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights)); dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeightsScalar; *dLdw /= numOfNonZeroWeightsScalar;
} }
else else

View File

@ -51,7 +51,7 @@ CUSTOM_OP_IMPL(mean_sqerr_loss, 3, 1, false, 0, 1) {
weightsBroad = new NDArray(weights->tileToShape(predictions->getShapeInfo())); weightsBroad = new NDArray(weights->tileToShape(predictions->getShapeInfo()));
NDArray E(labels->getShapeInfo(), false, block.launchContext()); NDArray E(labels->getShapeInfo(), false, block.launchContext());
predictions->applyPairwiseTransform(pairwise::SquaredSubtract, labels, &E, nullptr); predictions->applyPairwiseTransform(pairwise::SquaredSubtract, *labels, E);
// multiply E on weights // multiply E on weights
E *= (*weightsBroad); E *= (*weightsBroad);
@ -191,7 +191,7 @@ CUSTOM_OP_IMPL(mean_sqerr_loss_grad, 3, 3, false, 0, 1) {
dLdw->assign(E.reduceNumber(reduce::Sum)); dLdw->assign(E.reduceNumber(reduce::Sum));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
@ -217,7 +217,7 @@ CUSTOM_OP_IMPL(mean_sqerr_loss_grad, 3, 3, false, 0, 1) {
*dLdw = 0.; *dLdw = 0.;
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -245,7 +245,7 @@ CUSTOM_OP_IMPL(mean_sqerr_loss_grad, 3, 3, false, 0, 1) {
dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights)); dLdw->assign(E.reduceNumber(reduce::Sum) / double(numOfNonZeroWeights));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeightsScalar; *dLdw /= numOfNonZeroWeightsScalar;
} }
else else

View File

@ -56,7 +56,7 @@ CUSTOM_OP_IMPL(sigm_cross_entropy_loss, 3, 1, false, 1, 1) {
auto newLabels = labels; auto newLabels = labels;
if(labelsSmoothing != 0.) { if(labelsSmoothing != 0.) {
newLabels = new NDArray(*labels); newLabels = new NDArray(*labels);
newLabels->applyScalar(scalar::SXELogitsSmoother, labelsSmoothing, newLabels, nullptr); newLabels->applyScalar(scalar::SXELogitsSmoother, labelsSmoothing, *newLabels);
} }
NDArray E(labels, false, block.launchContext()); NDArray E(labels, false, block.launchContext());
@ -186,7 +186,7 @@ CUSTOM_OP_IMPL(sigm_cross_entropy_loss_grad, 3, 3, false, 1, 1) {
auto newLabels = labels; auto newLabels = labels;
if(labelsSmoothing.e<float>(0) != 0.f) { if(labelsSmoothing.e<float>(0) != 0.f) {
newLabels = new NDArray(*labels); newLabels = new NDArray(*labels);
newLabels->applyScalar(scalar::SXELogitsSmoother, labelsSmoothing.e<float>(0), newLabels, nullptr); newLabels->applyScalar(scalar::SXELogitsSmoother, labelsSmoothing.e<float>(0), *newLabels);
} }
NDArray E(labels, false, block.launchContext()); NDArray E(labels, false, block.launchContext());
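Label smoothing is now applied in place by naming the target explicitly and dropping the trailing nullptr. A sketch of that step using the same ownership convention as the op, where the caller deletes the result only when it differs from the input (helper name is illustrative):

    #include <NDArray.h>   // assumed libnd4j headers

    using namespace nd4j;

    NDArray* smoothLabels(NDArray* labels, float labelsSmoothing) {
        if (labelsSmoothing == 0.f)
            return labels;                                   // reuse the original array
        auto newLabels = new NDArray(*labels);
        newLabels->applyScalar(scalar::SXELogitsSmoother, labelsSmoothing, *newLabels);
        return newLabels;                                    // caller owns and deletes this copy
    }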
@ -211,7 +211,7 @@ CUSTOM_OP_IMPL(sigm_cross_entropy_loss_grad, 3, 3, false, 1, 1) {
dLdw->assign(E.reduceNumber(reduce::Sum)); dLdw->assign(E.reduceNumber(reduce::Sum));
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
@ -239,7 +239,7 @@ CUSTOM_OP_IMPL(sigm_cross_entropy_loss_grad, 3, 3, false, 1, 1) {
*dLdw = 0.; *dLdw = 0.;
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum * sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum * sum));
@ -267,7 +267,7 @@ CUSTOM_OP_IMPL(sigm_cross_entropy_loss_grad, 3, 3, false, 1, 1) {
dLdw->assign(E.reduceNumber(reduce::Sum) / numOfNonZeroWeightsScalar); dLdw->assign(E.reduceNumber(reduce::Sum) / numOfNonZeroWeightsScalar);
else if(weights != weightsBroad) { else if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeightsScalar; *dLdw /= numOfNonZeroWeightsScalar;
} }
else else

View File

@ -54,11 +54,11 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss, 3, 1, false, 1, 1) {
// If label_smoothing is nonzero, smooth the labels towards 1/num_classes: new_onehot_labels = onehot_labels * (1 - label_smoothing) + label_smoothing / num_classes // If label_smoothing is nonzero, smooth the labels towards 1/num_classes: new_onehot_labels = onehot_labels * (1 - label_smoothing) + label_smoothing / num_classes
// num_classes = labels->sizeAt(1) // num_classes = labels->sizeAt(1)
auto cLabels = labels->cast(weights->dataType()); NDArray* cLabels = new NDArray(labels->cast(weights->dataType()));
auto newLabels = cLabels; NDArray* newLabels = cLabels;
if(labelsSmoothing != 0.) { if(labelsSmoothing != 0.) {
newLabels = new NDArray(cLabels); newLabels = new NDArray(cLabels);
*newLabels = (1.f - labelsSmoothing) * *cLabels + labelsSmoothing / cLabels->sizeAt(1); newLabels->assign((1.f - labelsSmoothing) * *cLabels + labelsSmoothing / cLabels->sizeAt(1));
} }
// main formula: result = - sum_i(lables_i * log(softmax_i)) - sum over last dimension // main formula: result = - sum_i(lables_i * log(softmax_i)) - sum over last dimension
@ -70,9 +70,9 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss, 3, 1, false, 1, 1) {
std::vector<int> dimensions = {-1}; std::vector<int> dimensions = {-1};
NDArray shiftedLogits = *logits - logits->reduceAlongDims(reduce::Max, dimensions, true); NDArray shiftedLogits = *logits - logits->reduceAlongDimension(reduce::Max, dimensions, true);
NDArray logSumExp = shiftedLogits.transform(transform::Exp).reduceAlongDims(reduce::Sum, dimensions, true).transform(transform::Log); NDArray logSumExp = shiftedLogits.transform(transform::Exp).reduceAlongDimension(reduce::Sum, dimensions, true).transform(transform::Log);
NDArray E = (*newLabels * (logSumExp - shiftedLogits)).reduceAlongDims(reduce::Sum, dimensions); NDArray E = (*newLabels * (logSumExp - shiftedLogits)).reduceAlongDimension(reduce::Sum, dimensions);
// perform weights broadcasting/tile to E if it is necessary // perform weights broadcasting/tile to E if it is necessary
auto weightsBroad = weights; auto weightsBroad = weights;
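The loss term itself is the shifted-logits (log-sum-exp) form, now spelled with value-returning reduceAlongDimension calls. A sketch of the same expression as a standalone helper (name is illustrative):

    #include <NDArray.h>   // assumed libnd4j headers
    #include <vector>

    using namespace nd4j;

    // E = sum_dim( labels * (logSumExp - shiftedLogits) ), reduced over the last dimension
    NDArray softmaxCrossEntropy(NDArray& logits, NDArray& labels) {
        std::vector<int> dims = {-1};
        NDArray shiftedLogits = logits - logits.reduceAlongDimension(reduce::Max, dims, true);
        NDArray logSumExp = shiftedLogits.transform(transform::Exp)
                                         .reduceAlongDimension(reduce::Sum, dims, true)
                                         .transform(transform::Log);
        return (labels * (logSumExp - shiftedLogits)).reduceAlongDimension(reduce::Sum, dims);
    }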
@ -217,25 +217,25 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_grad, 3, 3, false, 1, 1) {
// If label_smoothing is nonzero, smooth the labels towards 1/num_classes: new_onehot_labels = onehot_labels * (1 - label_smoothing) + label_smoothing / num_classes // If label_smoothing is nonzero, smooth the labels towards 1/num_classes: new_onehot_labels = onehot_labels * (1 - label_smoothing) + label_smoothing / num_classes
// num_classes = labels->sizeAt(1) // num_classes = labels->sizeAt(1)
auto cLabels = labels->cast(weights->dataType()); NDArray* cLabels = new NDArray(labels->cast(weights->dataType()));
auto newLabels = cLabels; NDArray* newLabels = cLabels;
if(labelsSmoothing != 0.) { if(labelsSmoothing != 0.) {
newLabels = new NDArray(labels->getShapeInfo(), dLdl->dataType(), false, block.launchContext()); newLabels = new NDArray(labels->getShapeInfo(), dLdl->dataType(), false, block.launchContext());
newLabels->assign((1.f - labelsSmoothing) * *cLabels + labelsSmoothing / cLabels->sizeAt(1)); newLabels->assign((1.f - labelsSmoothing) * *cLabels + labelsSmoothing / cLabels->sizeAt(1));
} }
NDArray softmax = (*logits - logits->reduceAlongDims(reduce::Max, dimensions, true)).transform(transform::Exp); NDArray softmax = (*logits - logits->reduceAlongDimension(reduce::Max, dimensions, true)).transform(transform::Exp);
softmax /= softmax.reduceAlongDims(reduce::Sum, dimensions, true); softmax /= softmax.reduceAlongDimension(reduce::Sum, dimensions, true);
// dEdp = softmax * sum_i(lables_i) - labels // dEdp = softmax * sum_i(lables_i) - labels
dLdp->assign(softmax * newLabels->reduceAlongDims(reduce::Sum, dimensions, true) - *newLabels); dLdp->assign(softmax * newLabels->reduceAlongDimension(reduce::Sum, dimensions, true) - *newLabels);
// dEdl = -log(softmax) // dEdl = -log(softmax)
dLdl->assign(-softmax.transform(transform::Log)* (1.f - labelsSmoothing)); dLdl->assign(-softmax.transform(transform::Log)* (1.f - labelsSmoothing));
NDArray shiftedLogits = *logits - logits->reduceAlongDims(reduce::Max, dimensions, true); NDArray shiftedLogits = *logits - logits->reduceAlongDimension(reduce::Max, dimensions, true);
NDArray logSumExp = shiftedLogits.transform(transform::Exp).reduceAlongDims(reduce::Sum, dimensions, true).transform(transform::Log); NDArray logSumExp = shiftedLogits.transform(transform::Exp).reduceAlongDimension(reduce::Sum, dimensions, true).transform(transform::Log);
NDArray E = (*newLabels * (logSumExp - shiftedLogits)).reduceAlongDims(reduce::Sum, dimensions); NDArray E = (*newLabels * (logSumExp - shiftedLogits)).reduceAlongDimension(reduce::Sum, dimensions);
// perform weights broadcasting/tile to E if it is necessary // perform weights broadcasting/tile to E if it is necessary
auto weightsBroad = weights; auto weightsBroad = weights;
@ -253,12 +253,12 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_grad, 3, 3, false, 1, 1) {
*dLdl *= *weights; *dLdl *= *weights;
} }
else { else {
dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, weightsBroad); dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, *weightsBroad, *dLdp);
dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, weightsBroad); dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, *weightsBroad, *dLdl);
if(weights != weightsBroad) { if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign(E); dLdw->assign(E);
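When the weights had to be broadcast, the gradients are rescaled with applyBroadcast, which now takes the broadcast operand and the output array by reference, here updating the outputs in place. A minimal sketch:

    #include <NDArray.h>   // assumed libnd4j headers
    #include <vector>

    using namespace nd4j;

    void scaleGradsByWeights(NDArray& dLdp, NDArray& dLdl, const NDArray& weightsBroad,
                             const std::vector<int>& dimensions) {
        dLdp.applyBroadcast(broadcast::Multiply, dimensions, weightsBroad, dLdp);   // in place
        dLdl.applyBroadcast(broadcast::Multiply, dimensions, weightsBroad, dLdl);
    }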
@ -289,12 +289,12 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_grad, 3, 3, false, 1, 1) {
else { else {
NDArray temp = *weightsBroad / sum; NDArray temp = *weightsBroad / sum;
dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, &temp); dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, temp, *dLdp);
dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, &temp); dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, temp, *dLdl);
if(weights != weightsBroad) { if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); ((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)).reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
} }
else else
dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum)); dLdw->assign((E * sum - (E * *weightsBroad).reduceNumber(reduce::Sum)) / (sum*sum));
@ -326,12 +326,12 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_grad, 3, 3, false, 1, 1) {
} }
else { else {
NDArray temp = *weightsBroad / numOfNonZeroWeights; NDArray temp = *weightsBroad / numOfNonZeroWeights;
dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, &temp); dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, temp, *dLdp);
dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, &temp); dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, temp, *dLdl);
if(weights != weightsBroad) { if(weights != weightsBroad) {
std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); std::vector<int> axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo());
E.reduceAlongDimension(reduce::Sum, dLdw, axesToReduceAlong, true, false, false); E.reduceAlongDimension(reduce::Sum, *dLdw, axesToReduceAlong, true, false, false);
*dLdw /= numOfNonZeroWeights; *dLdw /= numOfNonZeroWeights;
} }
else else

View File

@ -41,11 +41,11 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_with_logits, 2, 1, false, 0, 0) {
std::vector<int> dimension = {classesDim}; std::vector<int> dimension = {classesDim};
auto maxAlongDim = logits->reduceAlongDims(reduce::Max, {classesDim}, true); auto maxAlongDim = logits->reduceAlongDimension(reduce::Max, {classesDim}, true);
auto logExp = (*logits - maxAlongDim).transform(transform::Exp); auto logExp = (*logits - maxAlongDim).transform(transform::Exp);
auto logSoftMax = ( logExp / logExp.reduceAlongDims(reduce::Sum, {classesDim}, true) ).transform(transform::Log); auto logSoftMax = ( logExp / logExp.reduceAlongDimension(reduce::Sum, {classesDim}, true) ).transform(transform::Log);
(-(*labels) * logSoftMax).reduceAlongDimension(reduce::Sum, output, dimension); (-(*labels) * logSoftMax).reduceAlongDimension(reduce::Sum, *output, dimension);
return Status::OK(); return Status::OK();
} }
@ -97,14 +97,14 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_with_logits_grad, 2, 2, false, 0, 0) {
std::vector<int> dimension = {classesDim}; std::vector<int> dimension = {classesDim};
NDArray softmax = (*logits - logits->reduceAlongDims(reduce::Max, dimension, true)).transform(transform::Exp); NDArray softmax = (*logits - logits->reduceAlongDimension(reduce::Max, dimension, true)).transform(transform::Exp);
softmax /= softmax.reduceAlongDims(reduce::Sum, dimension, true); softmax /= softmax.reduceAlongDimension(reduce::Sum, dimension, true);
// dEdp = softmax * sum_i(labels_i) - labels // dEdp = softmax * sum_i(labels_i) - labels
dLdp->assign(softmax * labels->reduceAlongDims(reduce::Sum, dimension, true) - *labels); dLdp->assign(softmax * labels->reduceAlongDimension(reduce::Sum, dimension, true) - *labels);
// dEdl = -log(softmax) // dEdl = -log(softmax)
(-softmax).applyTransform(transform::Log, dLdl); (-softmax).applyTransform(transform::Log, *dLdl);
return Status::OK(); return Status::OK();
} }

View File

@ -50,9 +50,9 @@ CUSTOM_OP_IMPL(sparse_softmax_cross_entropy_loss_with_logits, 2, 1, false, 0, 0)
std::vector<int> dimension = {-1}; std::vector<int> dimension = {-1};
auto maxAlongDim = logits->reduceAlongDims(reduce::Max, dimension, true); auto maxAlongDim = logits->reduceAlongDimension(reduce::Max, dimension, true);
auto logitsExp = (*logits - maxAlongDim).transform(transform::Exp, nullptr); auto logitsExp = (*logits - maxAlongDim).transform(transform::Exp, nullptr);
auto logSoftMax = -(( logitsExp / logitsExp.reduceAlongDims(reduce::Sum, dimension, true) ).transform(transform::Log)); auto logSoftMax = -(( logitsExp / logitsExp.reduceAlongDimension(reduce::Sum, dimension, true) ).transform(transform::Log));
helpers::scatterForLoss(block.launchContext(), *labels, logSoftMax, *output, false); helpers::scatterForLoss(block.launchContext(), *labels, logSoftMax, *output, false);
@ -117,8 +117,8 @@ CUSTOM_OP_IMPL(sparse_softmax_cross_entropy_loss_with_logits_grad, 2, 1, false,
std::vector<int> dimension = {-1}; std::vector<int> dimension = {-1};
NDArray softmax = (*logits - logits->reduceAlongDims(reduce::Max, dimension, true)).transform(transform::Exp); NDArray softmax = (*logits - logits->reduceAlongDimension(reduce::Max, dimension, true)).transform(transform::Exp);
softmax /= softmax.reduceAlongDims(reduce::Sum, dimension, true); softmax /= softmax.reduceAlongDimension(reduce::Sum, dimension, true);
// dEdp = softmax - 1 (or 0) // dEdp = softmax - 1 (or 0)
dLdp->assign(softmax); dLdp->assign(softmax);

View File

@ -229,19 +229,19 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
// input - mean // input - mean
NDArray xMinusMean(input); // empty array with same shape as input NDArray xMinusMean(input); // empty array with same shape as input
input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMinusMean); input->applyBroadcast(nd4j::broadcast::Subtract, axes, *mean, xMinusMean);
// stdInv // stdInv
NDArray stdInv = *variance + epsilon; NDArray stdInv = *variance + epsilon;
stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) stdInv.applyTransform(transform::Reciprocal, stdInv); // 1 / (variance + epsilon)
stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 stdInv.applyTransform(transform::Sqrt, stdInv); // 1 / (variance + epsilon)^0.5
// dvdm (use dLdM as storage for dvdm) // dvdm (use dLdM as storage for dvdm)
xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, dLdM, excludedAxes, keepUnitiesInShape); xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, *dLdM, excludedAxes, keepUnitiesInShape);
*dLdM *= -Ninv; *dLdM *= -Ninv;
// g_sum // g_sum
auto gSum = dLdO->reduceAlongDims(nd4j::reduce::Sum, excludedAxes, keepUnitiesInShape); auto gSum = dLdO->reduceAlongDimension(nd4j::reduce::Sum, excludedAxes, keepUnitiesInShape);
// dLdB // dLdB
if(applyOffset) if(applyOffset)
@ -249,11 +249,11 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
// stdInv * (g - g_sum/N) (use dLdI as storage for this expression) // stdInv * (g - g_sum/N) (use dLdI as storage for this expression)
gSum *= Ninv; gSum *= Ninv;
dLdO->applyBroadcast(nd4j::broadcast::Subtract, axes, &gSum, dLdI); dLdO->applyBroadcast(nd4j::broadcast::Subtract, axes, gSum, *dLdI);
dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, &stdInv); dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, stdInv, *dLdI);
// dLdV <- [g*(x - m)]_sum // dLdV <- [g*(x - m)]_sum
(xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, *dLdV, excludedAxes, keepUnitiesInShape);
// dLdG // dLdG
*dLdV *= stdInv; *dLdV *= stdInv;
@ -265,13 +265,13 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
*dLdV *= -Ninv; // -0.5f * (2 / N); *dLdV *= -Ninv; // -0.5f * (2 / N);
// dfdv * (dvdm + (x - m)) (use xMinusMean as storage for this expression) // dfdv * (dvdm + (x - m)) (use xMinusMean as storage for this expression)
xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, dLdM); xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, *dLdM, xMinusMean);
xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, dLdV); xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, *dLdV, xMinusMean);
// dLdI // dLdI
*dLdI += xMinusMean; *dLdI += xMinusMean;
if(applyScale) if(applyScale)
dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, gamma); dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, *gamma, *dLdI);
*dLdM = 0; // put zeros so far *dLdM = 0; // put zeros so far
*dLdV = 0; // put zeros so far *dLdV = 0; // put zeros so far
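The batchnorm backward pass builds the inverse standard deviation with two in-place transforms, both of which now name their target explicitly. A compact sketch of that piece (helper name is illustrative):

    #include <NDArray.h>   // assumed libnd4j headers

    using namespace nd4j;

    NDArray inverseStd(NDArray& variance, double epsilon) {
        NDArray stdInv = variance + epsilon;                   // variance + eps
        stdInv.applyTransform(transform::Reciprocal, stdInv);  // 1 / (variance + eps)
        stdInv.applyTransform(transform::Sqrt, stdInv);        // 1 / sqrt(variance + eps)
        return stdInv;
    }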

View File

@@ -240,7 +240,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
     if(gradB) {
         if(gradB->rankOf() == 2)
             gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
-        gradO->reduceAlongDimension(reduce::Sum, gradB, gradOaxesForDot);      // sum over bS oD oH oW
+        gradO->reduceAlongDimension(reduce::Sum, *gradB, gradOaxesForDot);     // sum over bS oD oH oW
         if(gradB != OUTPUT_VARIABLE(2))
             delete gradB;
     }

View File

@@ -234,7 +234,7 @@ CUSTOM_OP_IMPL(deconv2d_bp, 3, 2, false, 0, 9) {
     if(gradB) {
         if(gradB->rankOf() == 2)
             gradB = new NDArray(gradB->reshape(gradB->ordering(), {gradB->lengthOf()}));
-        gradO->reduceAlongDimension(reduce::Sum, gradB, {0, 2, 3});      // sum over bS, oH, oW
+        gradO->reduceAlongDimension(reduce::Sum, *gradB, {0, 2, 3});     // sum over bS, oH, oW
         if(gradB != OUTPUT_VARIABLE(2))
             delete gradB;
     }

View File

@@ -244,7 +244,7 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) {
     if(gradB) {
         if(gradB->rankOf() == 2)
             gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
-        gradO->reduceAlongDimension(reduce::Sum, gradB, {0, 2, 3, 4});      // sum over bS, oD, oH, oW
+        gradO->reduceAlongDimension(reduce::Sum, *gradB, {0, 2, 3, 4});     // sum over bS, oD, oH, oW
         if(gradB != OUTPUT_VARIABLE(2))
             delete gradB;
     }

View File

@@ -84,7 +84,7 @@ CUSTOM_OP_IMPL(fused_batch_norm, 3, 3, false, 0, 2) {
         epsilon = 0.001;
     const int restSize = x->lengthOf() / iD;
-    auto xAffected = NDArrayFactory::create(x->ordering(), {restSize, iD}, x->dataType(), block.launchContext());
+    auto xAffected = NDArrayFactory::create(x->ordering(), {restSize, iD}, mean->dataType(), block.launchContext());
     xAffected.assign(x);
     const int restSizeMinusOne = (restSize > 1) ? (restSize - 1) : 1;
@@ -93,7 +93,7 @@ CUSTOM_OP_IMPL(fused_batch_norm, 3, 3, false, 0, 2) {
     const double restSizeAdjust = (double)restSize / restSizeMinusOne;
     if(isTraining) {
-        auto sum = xAffected.reduceAlongDims(reduce::Sum, {0});
+        auto sum = xAffected.reduceAlongDimension(reduce::Sum, {0});
         sum *= restSizeInv;
         mean->assign(sum);
         *batchMean = *mean;
@@ -106,8 +106,8 @@ CUSTOM_OP_IMPL(fused_batch_norm, 3, 3, false, 0, 2) {
     if(isTraining) {
         int power = 2;
-        xAffected.applyScalar(scalar::Pow, power);
-        auto sum = xAffected.reduceAlongDims(reduce::Sum, {0});
+        xAffected.applyScalar(scalar::Pow, power, xAffected);
+        auto sum = xAffected.reduceAlongDimension(reduce::Sum, {0});
         sum *= restSizeInv;
         variance->assign(sum);
         *batchVar = (*variance) * restSizeAdjust;
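In training mode the batch moments are plain column-wise reductions over the {restSize, iD} view, now written with reduceAlongDimension. A sketch of the two steps, assuming the op has already subtracted the mean from the affected view before the variance pass (as it does between the hunks above); helper names are illustrative:

    #include <NDArray.h>   // assumed libnd4j headers

    using namespace nd4j;

    // per-channel mean of the {restSize, iD} view
    void batchMean(NDArray& xAffected, double restSizeInv, NDArray& mean) {
        auto sum = xAffected.reduceAlongDimension(reduce::Sum, {0});
        sum *= restSizeInv;
        mean.assign(sum);
    }

    // per-channel variance, assuming the mean was already subtracted from xCentered
    void batchVariance(NDArray& xCentered, double restSizeInv, NDArray& variance) {
        xCentered.applyScalar(scalar::Pow, 2, xCentered);   // square in place
        auto sumSq = xCentered.reduceAlongDimension(reduce::Sum, {0});
        sumSq *= restSizeInv;
        variance.assign(sumSq);
    }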

@@ -68,7 +68,7 @@ CONFIGURABLE_OP_IMPL(log_softmax_bp, 2, 1, true, 0, 0) {
     helpers::softmax(block.launchContext(), *input, *gradI, dim);
-    gradI->assign( *gradO - (*gradI * *gradO).reduceAlongDims(reduce::Sum, {dim}, true) );
+    gradI->assign( *gradO - (*gradI * *gradO).reduceAlongDimension(reduce::Sum, {dim}, true) );
     return Status::OK();
 }

@@ -46,7 +46,7 @@ namespace nd4j {
     auto scalar = block.numT() > 0 ? block.getTArguments()->at(0) : 0.0;
     auto xw = result->at(0);
-    xw->applyScalar(nd4j::scalar::RELU, scalar, output);
+    xw->applyScalar(nd4j::scalar::RELU, scalar, *output);
     return Status::OK();
 }

@@ -62,7 +62,7 @@ CONFIGURABLE_OP_IMPL(softmax_bp, 2, 1, true, 0, 0) {
     helpers::softmax(block.launchContext(), *input, *gradI, dim);
-    auto sumAlongDim = (*gradI * *gradO).reduceAlongDims(reduce::Sum, {dim}, true);
+    auto sumAlongDim = (*gradI * *gradO).reduceAlongDimension(reduce::Sum, {dim}, true);
     gradI->assign(*gradI * (*gradO - sumAlongDim));
     return Status::OK();

@@ -56,7 +56,7 @@ CONFIGURABLE_OP_IMPL(adjust_contrast, 1, 1, true, 0, 0) {
         axes[i] = i;
     // mean as reduction for last dimension set
-    auto mean = input->reduceAlongDims(reduce::Mean, axes);
+    auto mean = input->reduceAlongDimension(reduce::Mean, axes);
     // this is contrast calculation
     output->assign((*input - mean) * (*factor) + mean);
@@ -104,13 +104,13 @@ CONFIGURABLE_OP_IMPL(adjust_contrast_v2, 1, 1, true, 0, 0) {
     std::vector<int> axes({1}); // dim 1 of pseudoresult
     // mean as reduction for last dimension set over size (dim 1) of result3D
-    auto mean = input3D.reduceAlongDims(reduce::Mean, axes);
+    auto mean = input3D.reduceAlongDimension(reduce::Mean, axes);
     // result as (x - mean) * factor + mean
     auto temp = input3D.ulike();
-    input3D.applyBroadcast(broadcast::Subtract, {0, 2}, &mean, &temp, nullptr);
-    temp.applyScalarArr(scalar::Multiply, factor);
-    temp.applyBroadcast(broadcast::Add, {0, 2}, &mean, &output3D);
+    input3D.applyBroadcast(broadcast::Subtract, {0, 2}, mean, temp);
+    temp.applyScalarArr(scalar::Multiply, *factor, temp);
+    temp.applyBroadcast(broadcast::Add, {0, 2}, mean, output3D);
     output->assign(output3D);
     if(block.width() == 1)
         delete factor;
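The adjust_contrast_v2 hunk shows applyBroadcast and applyScalarArr after the signature change: the operand and target arrays are passed by reference and the trailing extra-arguments pointer disappears. A hedged sketch of that call shape (names, shapes, and the scalar factor are illustrative):

```cpp
// Sketch of the reference-based applyBroadcast / applyScalarArr calls.
#include <NDArray.h>
#include <NDArrayFactory.h>

using namespace nd4j;

static void contrastLikeSketch() {
    auto input3D = NDArrayFactory::create<float>('c', {2, 5, 3});  // [bS, length, channels], illustrative
    auto factor  = NDArrayFactory::create<float>(1.5f);            // scalar NDArray

    auto mean = input3D.reduceAlongDimension(reduce::Mean, {1});   // mean over the middle axis
    auto temp = input3D.ulike();                                   // empty array of same shape/type
    auto out  = input3D.ulike();

    input3D.applyBroadcast(broadcast::Subtract, {0, 2}, mean, temp);  // x - mean
    temp.applyScalarArr(scalar::Multiply, factor, temp);              // (x - mean) * factor
    temp.applyBroadcast(broadcast::Add, {0, 2}, mean, out);           // ... + mean
}
```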

@@ -44,11 +44,11 @@ namespace nd4j {
         auto axisVector = INPUT_VARIABLE(1);
         helpers::adjustAxis(input->rankOf(), axisVector, axis);
-        input->applyIndexReduce(indexreduce::IndexMax, output, axis);
+        input->applyIndexReduce(indexreduce::IndexMax, *output, axis);
     } else {
         helpers::adjustAxis(input->rankOf(), axis);
-        input->applyIndexReduce(indexreduce::IndexMax, output, axis);
+        input->applyIndexReduce(indexreduce::IndexMax, *output, axis);
     }
     STORE_RESULT(output);
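Here (and in the matching argmin hunk below) the index output is handed to applyIndexReduce by reference. A minimal hedged sketch of that call, with illustrative names and shapes:

```cpp
// Sketch: applyIndexReduce writing its indices into an output passed as NDArray&.
#include <NDArray.h>
#include <NDArrayFactory.h>

using namespace nd4j;

static void rowArgMaxSketch() {
    auto input  = NDArrayFactory::create<float>('c', {3, 4});
    auto output = NDArrayFactory::create<Nd4jLong>('c', {3});      // one index per row, illustrative dtype

    std::vector<int> axis = {1};
    input.applyIndexReduce(indexreduce::IndexMax, output, axis);   // output taken by reference
}
```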

@@ -44,11 +44,11 @@ namespace nd4j {
         auto axisVector = INPUT_VARIABLE(1);
         helpers::adjustAxis(input->rankOf(), axisVector, axis);
-        input->applyIndexReduce(indexreduce::IndexMin, output, axis);
+        input->applyIndexReduce(indexreduce::IndexMin, *output, axis);
     } else {
         helpers::adjustAxis(input->rankOf(), axis);
-        input->applyIndexReduce(indexreduce::IndexMin, output, axis);
+        input->applyIndexReduce(indexreduce::IndexMin, *output, axis);
     }
     STORE_RESULT(output);

@@ -82,7 +82,7 @@ CUSTOM_OP_IMPL(biasadd_bp, 3, 2, false, 0, 0) {
     gradI->assign(gradO);
-    gradO->reduceAlongDimension(nd4j::reduce::Sum, gradB, ShapeUtils::evalDimsToExclude(gradO->rankOf(), {channelDim}));
+    gradO->reduceAlongDimension(nd4j::reduce::Sum, *gradB, ShapeUtils::evalDimsToExclude(gradO->rankOf(), {channelDim}));
     return ND4J_STATUS_OK;
 }

@@ -45,7 +45,7 @@ CUSTOM_OP_IMPL(embedding_lookup, 2, 1, false, 0, 1) {
         v = i++;
     }
-    std::unique_ptr<ResultSet> outputView(output->allTensorsAlongDimension(dims));
+    ResultSet outputView = output->allTensorsAlongDimension(dims);
     REQUIRE_TRUE(block.width() > output->sizeAt(0), 0, "embedding_lookup: input list should be greater then %i, but %i given.",
         output->sizeAt(0), block.width()
     );
@@ -53,7 +53,7 @@ CUSTOM_OP_IMPL(embedding_lookup, 2, 1, false, 0, 1) {
         Nd4jLong thisIndex = (*indeces).e<Nd4jLong>(e);
         input = INPUT_VARIABLE(thisIndex); // lookup param
-        outputView->at(e)->assign(input);
+        outputView.at(e)->assign(input);
         }
     }
     else {
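embedding_lookup now receives the per-index views as a ResultSet value rather than a heap-allocated pointer wrapped in unique_ptr. A hedged sketch of that ownership pattern (shapes and the assigned source are illustrative):

```cpp
// Sketch: allTensorsAlongDimension returning ResultSet by value, so no wrapper
// or manual delete is needed for the view collection.
#include <NDArray.h>
#include <NDArrayFactory.h>

using namespace nd4j;

static void assignRowViewsSketch() {
    auto output = NDArrayFactory::create<float>('c', {4, 3});
    auto source = NDArrayFactory::create<float>('c', {3}, {1.f, 2.f, 3.f});

    ResultSet rows = output.allTensorsAlongDimension({1});   // one view per row, owned by the ResultSet
    for (int e = 0; e < rows.size(); e++)
        rows.at(e)->assign(source);                          // views are still accessed as pointers
}
```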

@@ -49,8 +49,8 @@ namespace nd4j {
     }
     std::vector<int>& dims = axis;
-    input->varianceAlongDimension(variance::SummaryStatsVariance, variances, false, axis);
-    input->reduceAlongDimension(reduce::Mean, means, axis, keepDims);
+    input->varianceAlongDimension(variance::SummaryStatsVariance, *variances, false, axis);
+    input->reduceAlongDimension(reduce::Mean, *means, axis, keepDims);
     return Status::OK();
 }

@@ -52,31 +52,31 @@ namespace nd4j {
         case 0: {
             REQUIRE_TRUE(dims.size() == 2 || (input->rankOf() == 2 && dims.size() == 0), 0, "Norm: Frobenius is defined for 2D matrices or TADS only");
             // fro
-            input->reduceAlongDimension(reduce::NormFrobenius, output, dims, false, output->rankOf() == 2);
+            input->reduceAlongDimension(reduce::NormFrobenius, *output, dims, false, output->rankOf() == 2);
         }
         break;
         case 1: {
             // euclidean
             if ((input->rankOf() == 2 && dims.size() == 0) || dims.size() == 2) {
-                input->reduceAlongDimension(reduce::NormFrobenius, output, dims, false, output->rankOf() == 2);
+                input->reduceAlongDimension(reduce::NormFrobenius, *output, dims, false, output->rankOf() == 2);
             } else {
-                input->reduceAlongDimension(reduce::Norm2, output, dims, false, output->rankOf() == 2);
+                input->reduceAlongDimension(reduce::Norm2, *output, dims, false, output->rankOf() == 2);
             }
         }
         break;
         case 2: {
             // 1
-            input->reduceAlongDimension(reduce::Norm1, output, dims, false, output->rankOf() == 2);
+            input->reduceAlongDimension(reduce::Norm1, *output, dims, false, output->rankOf() == 2);
         }
         break;
         case 3: {
             // 2
-            input->reduceAlongDimension(reduce::Norm2, output, dims, false, output->rankOf() == 2);
+            input->reduceAlongDimension(reduce::Norm2, *output, dims, false, output->rankOf() == 2);
         }
         break;
         case 4: {
             // inf-norm
-            input->reduceAlongDimension(reduce::NormMax, output, dims, false, output->rankOf() == 2);
+            input->reduceAlongDimension(reduce::NormMax, *output, dims, false, output->rankOf() == 2);
         }
         break;
         default: {
@@ -84,7 +84,7 @@ namespace nd4j {
             REQUIRE_TRUE(block.getIArguments()->size() > 1, 0, "P-Norm reductions requires 2 TArguments, but only 1 was provided");
             // FIXME: p is required here
             //T p = T_ARG(1);
-            input->reduceAlongDimension(reduce::NormP, output, dims, false, output->rankOf() == 2);
+            input->reduceAlongDimension(reduce::NormP, *output, dims, false, output->rankOf() == 2);
         }
     }

@@ -40,23 +40,20 @@ namespace nd4j {
         shift.assign(T_ARG(0));
     }
-    means->applyScalarArr(scalar::Divide, counts, resMeans, nullptr);
-    NDArray* squareMeans = resMeans->dup('c');
-    NDArray* tempVariances = resVariances->dup('c');
-    squareMeans->applyTransform(transform::Square, squareMeans, nullptr);
-    variances->applyScalarArr(scalar::Divide, counts, tempVariances, nullptr);
-    // tempVariances->printIndexedBuffer("varianced divided by count");
-    tempVariances->applyPairwiseTransform(pairwise::Subtract, squareMeans, resVariances, nullptr);
+    means->applyScalarArr(scalar::Divide, *counts, *resMeans);
+    NDArray squareMeans = resMeans->dup('c');
+    NDArray tempVariances = resVariances->dup('c');
+    squareMeans.applyTransform(transform::Square, squareMeans, nullptr);
+    variances->applyScalarArr(scalar::Divide, *counts, tempVariances);
+    // tempVariances.printIndexedBuffer("varianced divided by count");
+    tempVariances.applyPairwiseTransform(pairwise::Subtract, squareMeans, *resVariances);
     if (shift.e<double>(0) != 0) {
-        resMeans->applyScalarArr(scalar::Add, &shift, resMeans, nullptr);
+        resMeans->applyScalarArr(scalar::Add, shift, *resMeans);
     }
-    delete squareMeans;
-    delete tempVariances;
     return Status::OK();
 }
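normalize_moments also picks up the NDArray::dup signature change: the duplicate now comes back by value, which is why the two delete statements drop out. A hedged sketch of the new ownership (the array contents are illustrative):

```cpp
// Sketch: value-returning NDArray::dup, no new/delete pair required.
#include <NDArray.h>
#include <NDArrayFactory.h>

using namespace nd4j;

static void squareOfMeansSketch() {
    auto resMeans = NDArrayFactory::create<float>('c', {4}, {1.f, 2.f, 3.f, 4.f});

    NDArray squareMeans = resMeans.dup('c');                     // a value, not an NDArray*
    squareMeans.applyTransform(transform::Square, squareMeans);  // element-wise square into itself
    // squareMeans is released automatically when it goes out of scope
}
```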

@@ -47,7 +47,7 @@ CUSTOM_OP_IMPL(reduce_mean, 1, 1, false, 0, 0) {
     for(const auto& item : dimensions)
         REQUIRE_TRUE(item >= -input->rankOf() && item < input->rankOf(), 0, "REDUCE_MEAN OP: the input dimension to reduce along must be in range [-%i, %i), but got %i instead !" , input->rankOf(), input->rankOf(), item);
-    input->reduceAlongDimension(reduce::Mean, output, dimensions, keepDims);
+    input->reduceAlongDimension(reduce::Mean, *output, dimensions, keepDims);
     return Status::OK();
 }

@@ -55,7 +55,7 @@ CUSTOM_OP_IMPL(reduce_stdev, 1, 1, false, 0, 0) {
     for(const auto& item : dimensions)
         REQUIRE_TRUE(item >= -input->rankOf() && item < input->rankOf(), 0, "REDUCE_STDEV OP: the input dimension to reduce along must be in range [-%i, %i), but got %i instead !" , input->rankOf(), input->rankOf(), item);
-    input->varianceAlongDimension(variance::SummaryStatsStandardDeviation, output, biasCorrected, dimensions);
+    input->varianceAlongDimension(variance::SummaryStatsStandardDeviation, *output, biasCorrected, dimensions);
     return Status::OK();
 }
@@ -130,10 +130,10 @@ CUSTOM_OP_IMPL(reduce_stdev_bp, 2, 1, false, 0, 0) {
     const Nd4jLong N = input->lengthOf() / gradO->lengthOf();
     const Nd4jLong NminusOne = biasCorrected ? N - 1 : N;
-    auto mean = input->reduceAlongDims(reduce::Mean, dimensions, true);
+    auto mean = input->reduceAlongDimension(reduce::Mean, dimensions, true);
     NDArray variance(mean.getShapeInfo(), true, block.launchContext());    // create empty array with shape matching shape of mean array
-    input->varianceAlongDimension(variance::SummaryStatsStandardDeviation, &variance, biasCorrected, dimensions);
+    input->varianceAlongDimension(variance::SummaryStatsStandardDeviation, variance, biasCorrected, dimensions);
     gradI->assign( (*input - mean) / (variance * NminusOne));              // automatic broadcasting happens here

@@ -55,7 +55,7 @@ CUSTOM_OP_IMPL(reduce_variance, 1, 1, false, 0, 0) {
     for(const auto& item : dimensions)
         REQUIRE_TRUE(item >= -input->rankOf() && item < input->rankOf(), 0, "REDUCE_VARIANCE OP: the input dimension to reduce along must be in range [-%i, %i), but got %i instead !" , input->rankOf(), input->rankOf(), item);
-    input->varianceAlongDimension(variance::SummaryStatsVariance, output, biasCorrected, dimensions);
+    input->varianceAlongDimension(variance::SummaryStatsVariance, *output, biasCorrected, dimensions);
     return Status::OK();
 }
@@ -129,7 +129,7 @@ CUSTOM_OP_IMPL(reduce_variance_bp, 2, 1, false, 0, 0) {
     const double factor1 = 2.0 / NminusOne;
     const double factor2 = 2.0 / (N * NminusOne);
-    auto mean = input->reduceAlongDims(reduce::Mean, dimensions, true);
+    auto mean = input->reduceAlongDimension(reduce::Mean, dimensions, true);
     gradI->assign( (*input - mean) * (2.0f / NminusOne));    // automatic broadcasting happens here

@@ -45,9 +45,9 @@ namespace ops {
     //void* whereMax = (void*)();
     auto internal = (*input);
     internal -= maxVals;
-    internal.applyTransform(transform::Exp, nullptr, nullptr);
-    internal.reduceAlongDimension(reduce::Sum, output, axes, keepDims, false); //, (void*)&maxVals);
-    output->applyTransform(transform::Log, nullptr, nullptr);
+    internal.applyTransform(transform::Exp, internal);
+    internal.reduceAlongDimension(reduce::Sum, *output, axes, keepDims, false); //, (void*)&maxVals);
+    output->applyTransform(transform::Log, *output);
     (*output) += maxVals;
     return ND4J_STATUS_OK;
 }

@@ -52,7 +52,7 @@ CUSTOM_OP_IMPL(reduce_max, 1, 1, false, 0, 0) {
     else if (block.getTArguments()->size() > 0)
         keepDims = (bool)T_ARG(0);
-    input->reduceAlongDimension(reduce::Max, output, dimensions, keepDims);
+    input->reduceAlongDimension(reduce::Max, *output, dimensions, keepDims);
     return Status::OK();
 }
@@ -122,8 +122,7 @@ CUSTOM_OP_IMPL(reduce_max_bp, 2, 1, false, 0, 0) {
     else {
         auto indicesArr = input->applyIndexReduce(nd4j::indexreduce::IndexMax, dimensions);
-        helpers::scatterSimple(block.launchContext(), 6, *gradI, *gradO, *indicesArr, ShapeUtils::evalDimsToExclude(gradI->rankOf(), dimensions));  // 6 corresponds to copy operation
-        delete indicesArr;
+        helpers::scatterSimple(block.launchContext(), 6, *gradI, *gradO, indicesArr, ShapeUtils::evalDimsToExclude(gradI->rankOf(), dimensions));   // 6 corresponds to copy operation
     }
     return Status::OK();

@@ -52,7 +52,7 @@ CUSTOM_OP_IMPL(reduce_min, 1, 1, false, 0, 0) {
     else if (block.getTArguments()->size() > 0)
         keepDims = (bool)T_ARG(0);
-    input->reduceAlongDimension(reduce::Min, output, dimensions, keepDims);
+    input->reduceAlongDimension(reduce::Min, *output, dimensions, keepDims);
     return Status::OK();
 }
@@ -125,8 +125,7 @@ CUSTOM_OP_IMPL(reduce_min_bp, 2, 1, false, 0, 0) {
     else {
         auto indicesArr = input->applyIndexReduce(nd4j::indexreduce::IndexMin, dimensions);
-        helpers::scatterSimple(block.launchContext(), 6, *gradI, *gradO, *indicesArr, ShapeUtils::evalDimsToExclude(gradI->rankOf(), dimensions));  // 6 corresponds to copy operation
-        delete indicesArr;
+        helpers::scatterSimple(block.launchContext(), 6, *gradI, *gradO, indicesArr, ShapeUtils::evalDimsToExclude(gradI->rankOf(), dimensions));   // 6 corresponds to copy operation
     }
     return Status::OK();
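In both backprop hunks the dimension-only applyIndexReduce overload now returns the index array by value, so it can be passed straight to scatterSimple and the trailing delete goes away. A hedged sketch of that usage (names and shapes are illustrative):

```cpp
// Sketch: applyIndexReduce(op, dims) returning the index array by value.
#include <NDArray.h>
#include <NDArrayFactory.h>

using namespace nd4j;

static void rowArgMinSketch() {
    auto input = NDArrayFactory::create<float>('c', {3, 4});

    std::vector<int> dimensions = {1};
    auto indicesArr = input.applyIndexReduce(indexreduce::IndexMin, dimensions);  // NDArray by value
    indicesArr.printIndexedBuffer("per-row argmin");   // illustrative use; freed automatically
}
```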
