[WIP] More of CUDA (#63)

* less spam Signed-off-by: raver119 <raver119@gmail.com> * flatten kernel Signed-off-by: raver119 <raver119@gmail.com> * adjust_hue/adjust_saturation tweaks Signed-off-by: raver119 <raver119@gmail.com> * adjust_hue cuda single Signed-off-by: raver119 <raver119@gmail.com> * adjust_hue cuda batch Signed-off-by: raver119 <raver119@gmail.com> * adjust_saturation cuda Signed-off-by: raver119 <raver119@gmail.com>
2019-07-17 14:02:01 +03:00 · 2019-07-17 14:02:01 +03:00 · 91a8fb0d90
commit 91a8fb0d90
parent cf2311859a
11 changed files with 267 additions and 48 deletions
--- a/libnd4j/blas/cuda/NativeOps.cu
+++ b/libnd4j/blas/cuda/NativeOps.cu
@ -1327,7 +1327,7 @@ void NativeOps::concat(
    // take into account indices for first array
    auto axisSize = shape::sizeAt(reinterpret_cast<Nd4jLong*>(inputShapeInfo[0]), axis);
    indices[0][2 * axis + 1] = axisSize;
-    printf("The axe size is %lld\n", axisSize);
+    //printf("The axe size is %lld\n", axisSize);
    // loop through the rest of input arrays
    for(int i = 1; i < numArrays; ++i) {
        indices[i][2 * axis]     = indices[i-1][2 * axis + 1];                                // index start from
--- a/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp
@ -42,7 +42,7 @@ namespace nd4j {
            }

            char order = (char) INT_ARG(0);
-            helpers::flatten(arrays, output, order);
+            helpers::flatten(block.launchContext(), arrays, output, order);

            return Status::OK();
        }
--- a/libnd4j/include/ops/declarable/helpers/adjust_hue.h
+++ b/libnd4j/include/ops/declarable/helpers/adjust_hue.h
@ -25,7 +25,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
    template <typename T>
-    static FORCEINLINE void rgb_to_hv(nd4j::LaunchContext * context, T r, T g, T b, T* h, T* v_min, T* v_max) {
+    static FORCEINLINE _CUDA_HD void rgb_to_hv(T r, T g, T b, T* h, T* v_min, T* v_max) {
        T v_mid;
        int h_category;
        // According to the figures in:
@ -84,7 +84,7 @@ namespace helpers {
    }

    template <typename T>
-    static FORCEINLINE void hv_to_rgb(nd4j::LaunchContext * context, T h, T v_min, T v_max, T* r, T* g, T* b) {
+    static FORCEINLINE _CUDA_HD void hv_to_rgb(T h, T v_min, T v_max, T* r, T* g, T* b) {
        int h_category = static_cast<int>(h);
        T ratio = h - (T)h_category;
        bool increase = ((h_category & 0x1) == 0);
--- a/libnd4j/include/ops/declarable/helpers/adjust_saturation.h
+++ b/libnd4j/include/ops/declarable/helpers/adjust_saturation.h
@ -26,7 +26,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
    template <typename T>
-    static FORCEINLINE void rgb_to_hsv(nd4j::LaunchContext * context, T r, T g, T b, T* h, T* s, T* v) {
+    static FORCEINLINE _CUDA_HD void rgb_to_hsv(T r, T g, T b, T* h, T* s, T* v) {
        T vv = nd4j::math::nd4j_max<T>(r, nd4j::math::nd4j_max<T>(g, b));
        T range = vv - nd4j::math::nd4j_min<T>(r, nd4j::math::nd4j_min<T>(g, b));
        if (vv > 0) {
@ -54,7 +54,7 @@ namespace helpers {
    }

    template <typename T>
-    static FORCEINLINE void hsv_to_rgb(nd4j::LaunchContext * context, T h, T s, T v, T* r, T* g, T* b) {
+    static FORCEINLINE _CUDA_HD void hsv_to_rgb(T h, T s, T v, T* r, T* g, T* b) {
        T c = s * v;
        T m = v - c;
        T dh = h * 6;
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
@ -42,7 +42,7 @@ namespace helpers {
                auto o = bOut + e * numChannels;

                T h, v_min, v_max;
-                helpers::rgb_to_hv(context, i[0], i[1], i[2], &h, &v_min, &v_max);
+                helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max);

                h += delta * kChannelRange;
                while (h < (T) 0.)
@ -51,7 +51,7 @@ namespace helpers {
                while (h >= (T) kChannelRange)
                    h -= (T) kChannelRange;

-                helpers::hv_to_rgb(context, h, v_min, v_max, o, o + 1, o + 2);
+                helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2);
            }
        } else {
            auto tadsChannelsIn  = array->allTensorsAlongDimension({0});
@ -76,7 +76,7 @@ namespace helpers {
                auto _bo = outputB + e;

                T h, v_min, v_max;
-                helpers::rgb_to_hv(context, _ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
+                helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);

                h += delta * kChannelRange;
                while (h < (T) 0)
@ -85,7 +85,7 @@ namespace helpers {
                while (h >= (T) kChannelRange)
                    h -= (T) kChannelRange;

-                helpers::hv_to_rgb(context, h, v_min, v_max, _ro, _go, _bo);
+                helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo);
            }

            delete tadsChannelsIn;
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
@ -43,10 +43,10 @@ namespace helpers {

                T h, s, v;
                // Convert the RGB color to Hue/V-range.
-                helpers::rgb_to_hsv(context, i[0], i[1], i[2], &h, &s, &v);
+                helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v);
                s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
                // Convert the hue and v-range back into RGB.
-                helpers::hsv_to_rgb(context, h, s, v, o, o + 1, o + 2);
+                helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2);
            }
        } else {
            auto tadsChannelsIn = array->allTensorsAlongDimension({0});
@ -72,10 +72,10 @@ namespace helpers {

                T h, s, v;
                // Convert the RGB color to Hue/V-range.
-                helpers::rgb_to_hsv(context, _ri[0], _gi[0], _bi[0], &h, &s, &v);
+                helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v);
                s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
                // Convert the hue and v-range back into RGB.
-                helpers::hsv_to_rgb(context, h, s, v, _ro, _go, _bo);
+                helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo);
            }

            delete tadsChannelsIn;
--- a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp
@ -58,7 +58,7 @@ namespace nd4j {
                }
            }

-            void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order) {
+            void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
                BUILD_SINGLE_SELECTOR(output->dataType(), flatten_, (inputs, output, order), LIBND4J_TYPES);
            }
        }
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
@ -19,14 +19,114 @@
 //

 #include <ops/declarable/helpers/adjust_hue.h>
+#include <helpers/ConstantTadHelper.h>

 namespace nd4j {
 namespace ops {
 namespace helpers {

    template <typename T>
-    static void _adjust_hue_single(NDArray *array, NDArray *output, float delta, bool isNHWC) {
+    static void _CUDA_G adjustHueSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo,  void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) {
+        int numChannels = 3;
+        auto tid = threadIdx.x + blockIdx.x * blockDim.x;

+        auto bIn = reinterpret_cast<T*>(xBuffer);
+        auto bOut = reinterpret_cast<T*>(zBuffer);
+        static const int kChannelRange = 6;
+
+        for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
+            auto i = bIn + e * numChannels;
+            auto o = bOut + e * numChannels;
+
+            T h, v_min, v_max;
+            helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max);
+
+            h += delta * kChannelRange;
+            while (h < (T) 0.)
+                h += (T) kChannelRange;
+
+            while (h >= (T) kChannelRange)
+                h -= (T) kChannelRange;
+
+            helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2);
+        }
+    }
+
+    template <typename T>
+    static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) {
+        int numChannels = 3;
+        auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+        static const int kChannelRange = 6;
+
+        auto bufferR = reinterpret_cast<T *>(xBuffer) + xOffsets[0];
+        auto bufferG = reinterpret_cast<T *>(xBuffer) + xOffsets[1];
+        auto bufferB = reinterpret_cast<T *>(xBuffer) + xOffsets[2];
+
+        auto outputR = reinterpret_cast<T *>(zBuffer) + zOffsets[0];
+        auto outputG = reinterpret_cast<T *>(zBuffer) + zOffsets[1];
+        auto outputB = reinterpret_cast<T *>(zBuffer) + zOffsets[2];
+
+
+        for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
+            auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+            auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+            auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+
+            auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+            auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+            auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+
+            T h, v_min, v_max;
+            helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
+
+            h += delta * kChannelRange;
+            while (h < (T) 0)
+                h += (T) kChannelRange;
+
+            while (h >= (T) kChannelRange)
+                h -= (T) kChannelRange;
+
+            helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo);
+        }
+    }
+
+    template <typename T>
+    static void _adjust_hue_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
+        // numChannels is always 3
+        auto tuples = array->lengthOf() / 3;
+        if (isNHWC) {
+            adjustHueSingleNHWCKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta);
+        } else {
+            // TODO: check this one
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2});
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2});
+
+            auto tadLength = shape::length(packX.primaryShapeInfo());
+
+            adjustHueSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
+        }
+    }
+
+
+    template <typename T>
+    static void _adjust_hue_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
+        auto xType = array->dataType();
+
+        // numChannels is always 3
+        auto tuples = array->lengthOf() / 3;
+
+        if (isNHWC) {
+            // in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values
+            BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES);
+        } else {
+            // TODO: check this one
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3});
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3});
+
+            auto tadLength = shape::length(packX.primaryShapeInfo());
+
+            adjustHueSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
+        }
    }

    void _adjust_hue(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) {
@ -34,25 +134,11 @@ namespace helpers {

        float d = delta->e<float>(0);
        if (array->rankOf() == 4) {
-            auto tadsIn = array->allTensorsAlongDimension({0});
-            auto tadsOut = output->allTensorsAlongDimension({0});
-
-            // FIXME: template selector should be moved out of loop
-PRAGMA_OMP_PARALLEL_FOR
-            for (int e = 0; e < tadsIn->size(); e++) {
-                BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
-            }
-            
-
-            delete tadsIn;
-            delete tadsOut;
        } else {
-            BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (array, output, d, isNHWC);, FLOAT_TYPES);
+            BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, d, isNHWC);, FLOAT_TYPES);
        }
    }

-    BUILD_SINGLE_TEMPLATE(template void _adjust_hue_single, (NDArray *array, NDArray *output, float delta, bool isNHWC);, FLOAT_TYPES);
-
 }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
@ -19,6 +19,7 @@
 //

 #include <ops/declarable/helpers/adjust_saturation.h>
+#include <helpers/ConstantTadHelper.h>


 namespace nd4j {
@ -26,8 +27,96 @@ namespace ops {
 namespace helpers {

    template <typename T>
-    static void _adjust_saturation_single(NDArray *array, NDArray *output, float delta, bool isNHWC) {
+    static void _CUDA_G adjustSaturationSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo,  void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) {
+        int numChannels = 3;
+        auto tid = threadIdx.x + blockIdx.x * blockDim.x;

+        auto bIn = reinterpret_cast<T*>(xBuffer);
+        auto bOut = reinterpret_cast<T*>(zBuffer);
+        static const int kChannelRange = 6;
+
+        for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
+            auto i = bIn + e * numChannels;
+            auto o = bOut + e * numChannels;
+
+            T h, s, v;
+            // Convert the RGB color to Hue/V-range.
+            helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v);
+            s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
+
+            // Convert the hue and v-range back into RGB.
+            helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2);
+        }
+    }
+
+    template <typename T>
+    static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) {
+        int numChannels = 3;
+        auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+        static const int kChannelRange = 6;
+
+        auto bufferR = reinterpret_cast<T *>(xBuffer) + xOffsets[0];
+        auto bufferG = reinterpret_cast<T *>(xBuffer) + xOffsets[1];
+        auto bufferB = reinterpret_cast<T *>(xBuffer) + xOffsets[2];
+
+        auto outputR = reinterpret_cast<T *>(zBuffer) + zOffsets[0];
+        auto outputG = reinterpret_cast<T *>(zBuffer) + zOffsets[1];
+        auto outputB = reinterpret_cast<T *>(zBuffer) + zOffsets[2];
+
+        for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
+            auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+            auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+            auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+
+            auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+            auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+            auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+
+            T h, s, v;
+            // Convert the RGB color to Hue/V-range.
+            helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v);
+            s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
+            // Convert the hue and v-range back into RGB.
+            helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo);
+        }
+    }
+
+    template <typename T>
+    static void _adjust_saturation_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
+        // numChannels is always 3
+        auto tuples = array->lengthOf() / 3;
+
+        if (isNHWC) {
+            adjustSaturationSingleNHWCKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta);
+        } else {
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2});
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2});
+
+            auto tadLength = shape::length(packX.primaryShapeInfo());
+
+            adjustSaturationSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
+        }
+    }
+
+    template <typename T>
+    static void _adjust_saturation_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
+        auto xType = array->dataType();
+
+        // numChannels is always 3
+        auto tuples = array->lengthOf() / 3;
+
+        if (isNHWC) {
+            // in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values
+            BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES);
+        } else {
+            // TODO: check this one
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3});
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3});
+
+            auto tadLength = shape::length(packX.primaryShapeInfo());
+
+            adjustSaturationSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
+        }
    }

    void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) {
@ -35,24 +124,12 @@ namespace helpers {

        float d = delta->e<float>(0);
        if (array->rankOf() == 4) {
-            auto tadsIn = array->allTensorsAlongDimension({0});
-            auto tadsOut = output->allTensorsAlongDimension({0});
-
-            // FIXME: template selector should be moved out of loop
-PRAGMA_OMP_PARALLEL_FOR
-            for (int e = 0; e < tadsIn->size(); e++) {
-                BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
-            }
-            
-
-            delete tadsIn;
-            delete tadsOut;
+            BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_batch, (context, array, output, d, isNHWC);, FLOAT_TYPES);
        } else {
-            BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (array, output, d, isNHWC);, FLOAT_TYPES);
+            BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, d, isNHWC);, FLOAT_TYPES);
        }
    }

-    BUILD_SINGLE_TEMPLATE(template void _adjust_saturation_single, (NDArray *array, NDArray *output, float delta, bool isNHWC), FLOAT_TYPES);

 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
@ -19,12 +19,68 @@
 //

 #include <ops/declarable/helpers/flatten.h>
+#include <helpers/PointersManager.h>

 namespace nd4j {
    namespace ops {
        namespace helpers {
-            void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order) {
+            template <typename T>
+            void _CUDA_G flattenKernel(void **xBuffers, Nd4jLong **xShapeInfos, Nd4jLong *offsets, Nd4jLong numInputs, void *zBuffer, Nd4jLong *zShapeInfo, char order) {

+                Nd4jLong xCoord[MAX_RANK];
+
+                for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
+                    auto z = reinterpret_cast<T*>(zBuffer) + offsets[e];
+
+                    auto xBuffer = reinterpret_cast<T*>(xBuffers[e]);
+                    auto xShapeInfo = xShapeInfos[e];
+                    auto xShape = shape::shapeOf(xShapeInfo);
+                    auto xStride = shape::stride(xShapeInfo);
+                    auto xRank = shape::rank(xShapeInfo);
+                    auto xLength = shape::length(xShapeInfo);
+
+                    for (uint i = threadIdx.x; i < xLength; i += blockDim.x) {
+                        shape::index2coords(xRank, xShape, i, xLength, xCoord, order);
+                        auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank);
+                        z[i] = xBuffer[xOffset];
+                    }
+                }
+            }
+
+            template <typename T>
+            void flatten_(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
+                PointersManager pm(context, "flatten");
+
+                std::vector<void*> hdBuffers(inputs.size());
+                std::vector<Nd4jLong> hOffsets(inputs.size());
+                std::vector<Nd4jLong *> hdShapes(inputs.size());
+                Nd4jLong cOffset = 0;
+
+                // calculating offsets in output
+                for (int e = 0; e < inputs.size(); e++) {
+                    hOffsets[e] = cOffset;
+                    cOffset += inputs[e]->lengthOf();
+
+                    hdBuffers[e] = inputs[e]->specialBuffer();
+                    hdShapes[e] = inputs[e]->specialShapeInfo();
+                }
+
+                auto dBuffers = (void **) pm.replicatePointer(hdBuffers.data(), inputs.size() * sizeof(void*));
+                auto dShapes = (Nd4jLong **)pm.replicatePointer(hdShapes.data(), inputs.size() * sizeof(Nd4jLong*));
+                auto dOffsets = (Nd4jLong *) pm.replicatePointer(hOffsets.data(), inputs.size() * sizeof(Nd4jLong));
+
+
+                flattenKernel<T><<<256, 512, 8192, *context->getCudaStream()>>>(dBuffers, dShapes, dOffsets, inputs.size(), output->getSpecialBuffer(), output->getSpecialShapeInfo(), order);
+
+                pm.synchronize();
+            }
+
+            void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
+                for (auto v:inputs)
+                    v->syncToDevice();
+
+                BUILD_SINGLE_SELECTOR(output->dataType(), flatten_, (context, inputs, output, order), LIBND4J_TYPES);
+                NDArray::registerSpecialUse({output}, {});
            }
        }
    }
--- a/libnd4j/include/ops/declarable/helpers/flatten.h
+++ b/libnd4j/include/ops/declarable/helpers/flatten.h
@ -27,7 +27,7 @@
 namespace nd4j {
    namespace ops {
        namespace helpers {
-            void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order);
+            void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order);
        }
    }
 }