[WIP] cuda concat (#107)

* - correct cuda concat Signed-off-by: Yurii <yurii@skymind.io> * - pooling 2d/3d : take into account possible case when input and gradI have different strides Signed-off-by: Yurii <yurii@skymind.io> * master pulled in Signed-off-by: raver119 <raver119@gmail.com> * floordiv_bp test reverted Signed-off-by: raver119 <raver119@gmail.com> * - add NDArray::printLinearBuffer method Signed-off-by: Yurii <yurii@skymind.io>
2019-08-08 18:05:21 +03:00 · 2019-08-08 18:05:21 +03:00 · 7fa01288bb
commit 7fa01288bb
parent 62a025439b
6 changed files with 282 additions and 178 deletions
--- a/libnd4j/blas/NDArray.h
+++ b/libnd4j/blas/NDArray.h
@ -477,6 +477,11 @@ namespace nd4j {
        */
        void printBuffer(const char* msg = nullptr, Nd4jLong limit = -1, const bool sync = true) const;

+        /**
+        * print element by element consequently in a way they (elements) are stored in physical memory
+        */
+        void printLinearBuffer() const;
+
        /**
        *  prints _buffer (if host = true) or _bufferD (if host = false) as it is, that is in current state without checking buffer status
        */
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@ -1137,6 +1137,39 @@ void NDArray::printBuffer(const char* msg, Nd4jLong limit, const bool sync) cons
    fflush(stdout);
 }

+//////////////////////////////////////////////////////////////////////////
+// print element by element consequently in a way they (elements) are stored in physical memory
+void NDArray::printLinearBuffer() const {
+
+    syncToHost();
+
+    const auto ews = this->ews() > 0 ? this->ews() : 1;
+    const auto len = this->lengthOf();
+
+    printf("[");
+
+    if (this->dataType() == nd4j::DataType::INT32) {
+        for(Nd4jLong e = 0; e < len; e++)
+            printf("%d, ", this->bufferAsT<int>()[e * ews]);
+    }
+    else if(this->dataType() == nd4j::DataType::INT64) {
+        for(Nd4jLong e = 0; e < len; e++)
+            printf("%lld, ", this->bufferAsT<Nd4jLong>()[e * ews]);
+    }
+    else if(this->dataType() == nd4j::DataType::FLOAT32) {
+        for(Nd4jLong e = 0; e < len; e++)
+            printf("%.3f, ", this->bufferAsT<float>()[e * ews]);
+    }
+    else if(this->dataType() == nd4j::DataType::DOUBLE) {
+        for(Nd4jLong e = 0; e < len; e++)
+            printf("%.3f, ", this->bufferAsT<double>()[e * ews]);
+    }
+    else
+        throw std::invalid_argument("NDArray::printLinearBuffer: not implemented yet for this data type !");
+
+    printf("]\n");
+    fflush(stdout);
+}
 //////////////////////////////////////////////////////////////////////////
 static void printFormatted(NDArray const* arr, int depth, int limit) {

--- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
@ -1863,17 +1863,25 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
 #endif
            nd4j_debug("MKL-DNN is not used for pooling2d_bp!\n", 0);

-            const Nd4jLong iStride0 = gradI.stridesOf()[0];
-            const Nd4jLong iStride1 = gradI.stridesOf()[1];
-            const Nd4jLong iStride2 = gradI.stridesOf()[2];
-            const Nd4jLong iStride3 = gradI.stridesOf()[3];
-            const Nd4jLong oStride0 = gradO.stridesOf()[0];
-            const Nd4jLong oStride1 = gradO.stridesOf()[1];
-            const Nd4jLong oStride2 = gradO.stridesOf()[2];
-            const Nd4jLong oStride3 = gradO.stridesOf()[3];
-            const Nd4jLong iStep2   = dH*iStride2;
-            const Nd4jLong iStep3   = dW*iStride3;
-            const int      kProd    = kH*kW;
+            const Nd4jLong iStride0  = input.stridesOf()[0];
+            const Nd4jLong iStride1  = input.stridesOf()[1];
+            const Nd4jLong iStride2  = input.stridesOf()[2];
+            const Nd4jLong iStride3  = input.stridesOf()[3];
+            const Nd4jLong gIStride0 = gradI.stridesOf()[0];
+            const Nd4jLong gIStride1 = gradI.stridesOf()[1];
+            const Nd4jLong gIStride2 = gradI.stridesOf()[2];
+            const Nd4jLong gIStride3 = gradI.stridesOf()[3];
+            const Nd4jLong oStride0  = gradO.stridesOf()[0];
+            const Nd4jLong oStride1  = gradO.stridesOf()[1];
+            const Nd4jLong oStride2  = gradO.stridesOf()[2];
+            const Nd4jLong oStride3  = gradO.stridesOf()[3];
+            const Nd4jLong iStep2    = dH*iStride2;
+            const Nd4jLong iStep3    = dW*iStride3;
+            const Nd4jLong gIStep2   = dH*gIStride2;
+            const Nd4jLong gIStep3   = dW*gIStride3;
+            const int      kProd     = kH*kW;
+
+            const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3;

            Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW;
            T sum, valO, *pIn, *pgI;
@ -1901,28 +1909,48 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                                if(wend > iW)
                                    wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));

-                                hstart *= iStride2;
-                                hend   *= iStride2;
-                                wstart *= iStride3;
-                                wend   *= iStride3;
-
                                sum = -DataTypeUtils::max<T>();
                                valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];

-                                // we set these to default values
-                                maxKH = hstart;
-                                maxKW = wstart;
+                                if(sameStrides) {

-                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) {
-                                        T valIn = pIn[kh + kw];
-                                        if (valIn > sum) {
-                                            sum = valIn;
-                                            maxKH = kh;
-                                            maxKW = kw;
+                                    hstart *= iStride2;
+                                    hend   *= iStride2;
+                                    wstart *= iStride3;
+                                    wend   *= iStride3;
+
+                                    // we set these to default values
+                                    maxKH = hstart;
+                                    maxKW = wstart;
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) {
+                                            T valIn = pIn[kh + kw];
+                                            if (valIn > sum) {
+                                                sum = valIn;
+                                                maxKH = kh;
+                                                maxKW = kw;
+                                            }
                                        }
-                                    }
-                                gI[pIn - in + maxKH + maxKW] += valO;
+                                    gI[pIn - in + maxKH + maxKW] += valO;
+                                }
+                                else {
+
+                                    // we set these to default values
+                                    maxKH = hstart;
+                                    maxKW = wstart;
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                            T valIn = pIn[kh * iStride2 + kw * iStride3];
+                                            if (valIn > sum) {
+                                                sum = valIn;
+                                                maxKH = kh;
+                                                maxKW = kw;
+                                            }
+                                        }
+                                    gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO;
+                                }
                            }
                        }
                    }
@ -1936,7 +1964,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                        for(int oh = 0; oh < oH; ++oh) {
                            for(int ow = 0; ow < oW; ++ow) {

-                                pgI  = gI + b * iStride0 + c * iStride1;
+                                pgI  = gI + b * gIStride0 + c * gIStride1;

                                hstart = oh * sH - pH;
                                wstart = ow * sW - pW;
@ -1952,20 +1980,20 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                                if(wend > iW)
                                    wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));

-                                hstart *= iStride2;
-                                hend   *= iStride2;
-                                wstart *= iStride3;
-                                wend   *= iStride3;
+                                hstart *= gIStride2;
+                                hend   *= gIStride2;
+                                wstart *= gIStride3;
+                                wend   *= gIStride3;

                                valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];

                                if ((int) extraParam0 == 0)         //Exclude padding
-                                    valO /= static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(iStep2))) * static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(iStep3)));   //Accounts for dilation
+                                    valO /= static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(gIStep2))) * static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(gIStep3)));   //Accounts for dilation
                                else if ((int) extraParam0 == 1)    //Include padding
                                    valO /= kProd;

-                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2)
+                                    for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3)
                                        pgI[kh + kw] += valO;
                            }
                        }
@ -1981,7 +2009,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                            for(int ow = 0; ow < oW; ++ow) {

                                pIn  = in + b * iStride0 + c * iStride1;
-                                pgI  = gI + (pIn - in);
+                                pgI  = sameStrides ? gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1;

                                hstart = oh * sH - pH;
                                wstart = ow * sW - pW;
@ -1997,24 +2025,41 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                                if(wend > iW)
                                    wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));

-
-                                hstart *= iStride2;
-                                hend   *= iStride2;
-                                wstart *= iStride3;
-                                wend   *= iStride3;
-
                                sum = static_cast<T>(0.f);
                                valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];

-                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
-                                        sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);
+                                if(sameStrides) {

-                                valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1. - extraParam0) / extraParam0);
+                                    hstart *= iStride2;
+                                    hend   *= iStride2;
+                                    wstart *= iStride3;
+                                    wend   *= iStride3;

-                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
-                                        pgI[kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kh + kw]);
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                            sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);
+
+                                    valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1. - extraParam0) / extraParam0);
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                            pgI[kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kh + kw]);
+                                }
+                                else {
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += dW)
+                                            sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh * iStride2 + kw * iStride3]), extraParam0);
+
+                                    valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1. - extraParam0) / extraParam0);
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += dH) {
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                            const auto inVal = pIn[kh * iStride2 + kw * iStride3];
+                                            pgI[kh * gIStride2 + kw * gIStride3] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(inVal);
+                                        }
+                                    }
+                                }
                            }
                        }
                    }
@ -2144,11 +2189,16 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
 #endif
            nd4j_debug("MKL-DNN is not used for pooling3d_bp!\n", 0);

-            const Nd4jLong iStride0 = gradI.stridesOf()[0];
-            const Nd4jLong iStride1 = gradI.stridesOf()[1];
-            const Nd4jLong iStride2 = gradI.stridesOf()[2];
-            const Nd4jLong iStride3 = gradI.stridesOf()[3];
-            const Nd4jLong iStride4 = gradI.stridesOf()[4];
+            const Nd4jLong iStride0  = input.stridesOf()[0];
+            const Nd4jLong iStride1  = input.stridesOf()[1];
+            const Nd4jLong iStride2  = input.stridesOf()[2];
+            const Nd4jLong iStride3  = input.stridesOf()[3];
+            const Nd4jLong iStride4  = input.stridesOf()[4];
+            const Nd4jLong gIStride0 = gradI.stridesOf()[0];
+            const Nd4jLong gIStride1 = gradI.stridesOf()[1];
+            const Nd4jLong gIStride2 = gradI.stridesOf()[2];
+            const Nd4jLong gIStride3 = gradI.stridesOf()[3];
+            const Nd4jLong gIStride4 = gradI.stridesOf()[4];
            const Nd4jLong oStride0 = gradO.stridesOf()[0];
            const Nd4jLong oStride1 = gradO.stridesOf()[1];
            const Nd4jLong oStride2 = gradO.stridesOf()[2];
@ -2157,8 +2207,13 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
            const Nd4jLong iStep2   = dD*iStride2;
            const Nd4jLong iStep3   = dH*iStride3;
            const Nd4jLong iStep4   = dW*iStride4;
+            const Nd4jLong gIStep2  = dD*gIStride2;
+            const Nd4jLong gIStep3  = dH*gIStride3;
+            const Nd4jLong gIStep4  = dW*gIStride4;
            const int      kProd    = kD*kH*kW;

+            const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3 && iStride4 == gIStride4;
+
            Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW;
            T sum, valO, *pIn, *pgI;

@ -2192,32 +2247,55 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                                    if(wend > iW)
                                        wend -= dW * ((wend-iW + dW - 1) / dW);

-                                    dstart *= iStride2;
-                                    dend   *= iStride2;
-                                    hstart *= iStride3;
-                                    hend   *= iStride3;
-                                    wstart *= iStride4;
-                                    wend   *= iStride4;
-
-                                    maxKD = dstart;
-                                    maxKH = hstart;
-                                    maxKW = wstart;
-
                                    sum = -DataTypeUtils::max<T>();
                                    valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];

-                                    for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
-                                                T valIn = pIn[kd + kh + kw];
-                                                if (valIn > sum) {
-                                                    sum = valIn;
-                                                    maxKD = kd;
-                                                    maxKH = kh;
-                                                    maxKW = kw;
+                                    if(sameStrides) {
+
+                                        dstart *= iStride2;
+                                        dend   *= iStride2;
+                                        hstart *= iStride3;
+                                        hend   *= iStride3;
+                                        wstart *= iStride4;
+                                        wend   *= iStride4;
+
+                                        maxKD = dstart;
+                                        maxKH = hstart;
+                                        maxKW = wstart;
+
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
+                                                    T valIn = pIn[kd + kh + kw];
+                                                    if (valIn > sum) {
+                                                        sum = valIn;
+                                                        maxKD = kd;
+                                                        maxKH = kh;
+                                                        maxKW = kw;
+                                                    }
                                                }
-                                            }
-                                    gI[pIn - in + maxKD + maxKH + maxKW] += valO;
+                                        gI[pIn - in + maxKD + maxKH + maxKW] += valO;
+                                    }
+                                    else {
+
+                                        // we set these to default values
+                                        maxKH = hstart;
+                                        maxKW = wstart;
+                                        maxKD = dstart;
+
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                                    T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4];
+                                                    if (valIn > sum) {
+                                                        sum = valIn;
+                                                        maxKD = kd;
+                                                        maxKH = kh;
+                                                        maxKW = kw;
+                                                    }
+                                                }
+                                        gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO;
+                                    }
                                }
                            }
                        }
@ -2233,7 +2311,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                            for(int oh = 0; oh < oH; ++oh) {
                                for(int ow = 0; ow < oW; ++ow) {

-                                    pgI  = gI + b * iStride0 + c * iStride1;
+                                    pgI  = gI + b * gIStride0 + c * gIStride1;

                                    dstart = od * sD - pD;
                                    hstart = oh * sH - pH;
@ -2255,23 +2333,23 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                                    if(wend > iW)
                                        wend -= dW * ((wend-iW + dW - 1) / dW);

-                                    dstart *= iStride2;
-                                    dend   *= iStride2;
-                                    hstart *= iStride3;
-                                    hend   *= iStride3;
-                                    wstart *= iStride4;
-                                    wend   *= iStride4;
+                                    dstart *= gIStride2;
+                                    dend   *= gIStride2;
+                                    hstart *= gIStride3;
+                                    hend   *= gIStride3;
+                                    wstart *= gIStride4;
+                                    wend   *= gIStride4;

                                    valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];

                                    if (extraParam0 == 0)         //Exclude padding
-                                        valO /= nd4j::math::nd4j_ceil<double,T>(static_cast<double>(dend-dstart) / static_cast<double>(iStep2)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(iStep3)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(iStep4));   //Accounts for dilation
+                                        valO /= nd4j::math::nd4j_ceil<double,T>(static_cast<double>(dend-dstart) / static_cast<double>(gIStep2)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(gIStep3)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(gIStep4));   //Accounts for dilation
                                    else if (extraParam0 == 1)    //Include padding
                                        valO /= kProd;

-                                    for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                    for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2)
+                                        for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3)
+                                            for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4)
                                                pgI[kd + kh + kw] += valO;
                                }
                            }
@ -2311,27 +2389,46 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                                    if(wend > iW)
                                        wend -= dW * ((wend-iW + dW - 1) / dW);

-                                    dstart *= iStride2;
-                                    dend   *= iStride2;
-                                    hstart *= iStride3;
-                                    hend   *= iStride3;
-                                    wstart *= iStride4;
-                                    wend   *= iStride4;
-
                                    sum = static_cast<T>(0.);
                                    valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];

-                                    for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                                sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0);
+                                    if(sameStrides) {

-                                    valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
+                                        dstart *= iStride2;
+                                        dend   *= iStride2;
+                                        hstart *= iStride3;
+                                        hend   *= iStride3;
+                                        wstart *= iStride4;
+                                        wend   *= iStride4;

-                                    for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                                pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                                    sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0);
+
+                                        valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
+
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                                    pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
+                                    }
+                                    else {
+
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += dW)
+                                                    sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0);
+
+                                        valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
+
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                                    const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4];
+                                                    pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(inVal);
+                                                }
+                                    }
                                }
                            }
                        }
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@ -32,98 +32,47 @@
 namespace nd4j {
    namespace ops {
        namespace helpers {
-            ///////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////
            template<typename T>
            __global__ static void concatCuda(const int numOfArrs, void* pVx,  void* pxShapeInfo, void* pVz, void* pzShapeInfo) {

                __shared__ int arrIdx, blocksPerArr;
-                __shared__ T *x, *z;
-                __shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenPerBlock, start, end;

                if (threadIdx.x == 0) {

                    blocksPerArr = (gridDim.x + numOfArrs - 1) / numOfArrs;     // ceil
                    arrIdx = blockIdx.x / blocksPerArr;
-
-                    x = reinterpret_cast<T*>(reinterpret_cast<void**>(pVx)[arrIdx]);
-                    z = reinterpret_cast<T*>(reinterpret_cast<void**>(pVz)[arrIdx]);
-                    xShapeInfo = reinterpret_cast<Nd4jLong**>(pxShapeInfo)[arrIdx];
-                    zShapeInfo = reinterpret_cast<Nd4jLong**>(pzShapeInfo)[arrIdx];
-                    arrLen = shape::length(xShapeInfo);
-
-                    arrLenPerBlock = (arrLen + blocksPerArr - 1) / blocksPerArr;  // ceil
-
-                    start = (blockIdx.x % blocksPerArr) * arrLenPerBlock;
-                    end   = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
                }

                __syncthreads();

-                for (Nd4jLong i = start + threadIdx.x; i < end; i += blockDim.x)
-                    z[shape::getIndexOffset(i, zShapeInfo, arrLen)] = x[shape::getIndexOffset(i, xShapeInfo, arrLen)];
+                for(int j = arrIdx; j < numOfArrs; j += gridDim.x) {
+
+                    const auto* x = reinterpret_cast<T*>(reinterpret_cast<void**>(pVx)[j]);
+                    auto* z = reinterpret_cast<T*>(reinterpret_cast<void**>(pVz)[j]);
+                    const auto* xShapeInfo = reinterpret_cast<Nd4jLong**>(pxShapeInfo)[j];
+                    const auto* zShapeInfo = reinterpret_cast<Nd4jLong**>(pzShapeInfo)[j];
+
+                    const auto arrLen = shape::length(xShapeInfo);
+
+                    const auto arrLenPerBlock = (arrLen + blocksPerArr - 1) / blocksPerArr;  // ceil
+
+                    const auto start = (blockIdx.x % blocksPerArr) * arrLenPerBlock;
+                    const auto end   = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
+
+                    for (Nd4jLong i = start + threadIdx.x; i < end; i += blockDim.x)
+                        z[shape::getIndexOffset(i, zShapeInfo, arrLen)] = x[shape::getIndexOffset(i, xShapeInfo, arrLen)];
+                }
            }

 ///////////////////////////////////////////////////////////////////
            template<typename T>
            __host__ static void concatCudaLauncher(const int numOfArrs, const cudaStream_t *stream,  void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo) {

-                concatCuda<T><<<512, 256, 1024, *stream>>>(numOfArrs, pVx, pxShapeInfo, pVz, pzShapeInfo);
+                concatCuda<T><<<512, 512, 512, *stream>>>(numOfArrs, pVx, pxShapeInfo, pVz, pzShapeInfo);
            }
            BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher,  (const int numOfArrs, const cudaStream_t *stream, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo), LIBND4J_TYPES);
-
-            //////////////////////////////////////////////////////////////////////////
-            void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
-
-                const int numOfArrs = inArrs.size();
-                for(int i = 0; i < numOfArrs; ++i)
-                    if(!inArrs[i]->isActualOnDeviceSide()) inArrs[i]->syncToDevice();
-
-                const int rank  = inArrs[0]->rankOf();
-                const int rank2 = 2*rank;
-                std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
-
-                // take into account indices for first array
-                indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
-
-                // loop through the rest of input arrays
-                for(int i = 1; i < numOfArrs; ++i) {
-                    indices[i][2 * axis]     = indices[i-1][2 * axis + 1];                                // index start from
-                    indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis);      // index end with (excluding)
-                }
-
-                std::vector<NDArray*> outSubArrs(numOfArrs);
-                for(int i = 0; i < numOfArrs; ++i)
-                    outSubArrs[i] = new NDArray(output(indices[i], true));
-
-                // prepare arrays of pointers on buffers and shapes
-                std::vector<void*>     hOutBuffers(numOfArrs), hInBuffers(numOfArrs);
-                std::vector<Nd4jLong*> hOutShapeInfo(numOfArrs), hInShapeInfo(numOfArrs);
-                for(int i = 0; i < numOfArrs; ++i) {
-                    hOutBuffers[i]   = outSubArrs[i]->getSpecialBuffer();
-                    hInBuffers[i]    =     inArrs[i]->getSpecialBuffer();
-                    hOutShapeInfo[i] = outSubArrs[i]->getSpecialShapeInfo();
-                    hInShapeInfo[i]  =     inArrs[i]->getSpecialShapeInfo();
-                }
-
-                // allocate and copy all buffers and shapes arrays to global memory
-                PointersManager manager(context, "helpers::concat");
-                void* dOutBuffers	= manager.replicatePointer(hOutBuffers.data(),   hOutBuffers.size() * sizeof(void*));
-                void* dInBuffers	= manager.replicatePointer(hInBuffers.data(),    hInBuffers.size() * sizeof(void*));
-                void* dInShapeInfo  = manager.replicatePointer(hInShapeInfo.data(),  hInShapeInfo.size() * sizeof(Nd4jLong*));
-                void* dOutShapeInfo = manager.replicatePointer(hOutShapeInfo.data(), hOutShapeInfo.size() * sizeof(Nd4jLong*));
-
-                BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (numOfArrs, context->getCudaStream(), dInBuffers, dInShapeInfo, dOutBuffers, dOutShapeInfo), LIBND4J_TYPES);
-
-                manager.synchronize();
-
-                for(int i = 0; i < numOfArrs; ++i)
-                    delete outSubArrs[i];
-
-                for(int i = 0; i < numOfArrs; ++i)
-                    inArrs[i]->tickReadHost();
-
-                output.tickWriteDevice();
-            }
        }
    }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
@ -32,7 +32,7 @@
 namespace nd4j {
    namespace ops {
        namespace helpers {
-            ///////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////
 // x - input, y - paddings, z - output
            template<typename X, typename Y>
            __global__ static void padCuda(const int mode,
@ -130,6 +130,26 @@ namespace nd4j {
            }
            BUILD_DOUBLE_TEMPLATE(template void padCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const int mode, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const void* vPadVal), LIBND4J_TYPES, INTEGER_TYPES);

+///////////////////////////////////////////////////////////////////
+            void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) {
+
+                PointersManager manager(context, "pad");
+
+                NDArray::prepareSpecialUse({&output}, {&input, &paddings, &padValue});
+
+                const int threadsPerBlock = MAX_NUM_THREADS / 4;
+                const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+                const int sharedMem = 8 * threadsPerBlock * output.rankOf() + 128;
+
+                const auto xType = input.dataType();
+                const auto yType = paddings.dataType();
+
+                BUILD_DOUBLE_SELECTOR(xType, yType, padCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), mode, input.getSpecialBuffer(), input.getSpecialShapeInfo(), paddings.getSpecialBuffer(), paddings.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), padValue.getSpecialBuffer()), LIBND4J_TYPES, INTEGER_TYPES);
+
+                NDArray::registerSpecialUse({&output}, {&input, &paddings, &padValue});
+                manager.synchronize();
+            }
+
 ///////////////////////////////////////////////////////////////////
            void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) {

--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp
@ -502,8 +502,8 @@ TEST_F(DeclarableOpsTests2, Test_FloorDiv_2) {
    auto x = NDArrayFactory::create<float>('c', {1, 3}, {3.0, 6.0, -3.0});
    auto y = NDArrayFactory::create<float>('c', {1, 3}, {-2.0, 2.0, -2.0});
    auto eps = NDArrayFactory::create<float>('c', {1, 3}, {1, 2, 3});
-    auto exp1 = NDArrayFactory::create<float>('c', {1, 3}, {1,  2., 3});
-    auto exp2 = NDArrayFactory::create<float>('c', {1, 3}, {-0,  -2., 3});
+    auto exp1 = NDArrayFactory::create<float>('c', {1, 3}, {0.f,  0.f, 0.f});
+    auto exp2 = NDArrayFactory::create<float>('c', {1, 3}, {0.f, 0.f, 0.f});

    nd4j::ops::floordiv_bp op;