diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu
index 2c292556c..b651ed6ff 100755
--- a/libnd4j/blas/cuda/NativeOps.cu
+++ b/libnd4j/blas/cuda/NativeOps.cu
@@ -1327,7 +1327,7 @@ void NativeOps::concat(
     // take into account indices for first array
     auto axisSize = shape::sizeAt(reinterpret_cast<Nd4jLong*>(inputShapeInfo[0]), axis);
     indices[0][2 * axis + 1] = axisSize;
-    printf("The axe size is %lld\n", axisSize);
+    //printf("The axe size is %lld\n", axisSize);
     // loop through the rest of input arrays
     for(int i = 1; i < numArrays; ++i) {
         indices[i][2 * axis]     = indices[i-1][2 * axis + 1];                                // index start from
diff --git a/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp b/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp
index dbd19533b..b387a0970 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp
@@ -42,7 +42,7 @@ namespace nd4j {
             }
 
             char order = (char) INT_ARG(0);
-            helpers::flatten(arrays, output, order);
+            helpers::flatten(block.launchContext(), arrays, output, order);
 
             return Status::OK();
         }
diff --git a/libnd4j/include/ops/declarable/helpers/adjust_hue.h b/libnd4j/include/ops/declarable/helpers/adjust_hue.h
index 59bee6231..dda47ad99 100644
--- a/libnd4j/include/ops/declarable/helpers/adjust_hue.h
+++ b/libnd4j/include/ops/declarable/helpers/adjust_hue.h
@@ -25,7 +25,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
     template <typename T>
-    static FORCEINLINE void rgb_to_hv(nd4j::LaunchContext * context, T r, T g, T b, T* h, T* v_min, T* v_max) {
+    static FORCEINLINE _CUDA_HD void rgb_to_hv(T r, T g, T b, T* h, T* v_min, T* v_max) {
         T v_mid;
         int h_category;
         // According to the figures in:
@@ -84,7 +84,7 @@ namespace helpers {
     }
 
     template <typename T>
-    static FORCEINLINE void hv_to_rgb(nd4j::LaunchContext * context, T h, T v_min, T v_max, T* r, T* g, T* b) {
+    static FORCEINLINE _CUDA_HD void hv_to_rgb(T h, T v_min, T v_max, T* r, T* g, T* b) {
         int h_category = static_cast<int>(h);
         T ratio = h - (T)h_category;
         bool increase = ((h_category & 0x1) == 0);
diff --git a/libnd4j/include/ops/declarable/helpers/adjust_saturation.h b/libnd4j/include/ops/declarable/helpers/adjust_saturation.h
index 7e5b6aac9..503c45626 100644
--- a/libnd4j/include/ops/declarable/helpers/adjust_saturation.h
+++ b/libnd4j/include/ops/declarable/helpers/adjust_saturation.h
@@ -26,7 +26,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
     template <typename T>
-    static FORCEINLINE void rgb_to_hsv(nd4j::LaunchContext * context, T r, T g, T b, T* h, T* s, T* v) {
+    static FORCEINLINE _CUDA_HD void rgb_to_hsv(T r, T g, T b, T* h, T* s, T* v) {
         T vv = nd4j::math::nd4j_max<T>(r, nd4j::math::nd4j_max<T>(g, b));
         T range = vv - nd4j::math::nd4j_min<T>(r, nd4j::math::nd4j_min<T>(g, b));
         if (vv > 0) {
@@ -54,7 +54,7 @@ namespace helpers {
     }
 
     template <typename T>
-    static FORCEINLINE void hsv_to_rgb(nd4j::LaunchContext * context, T h, T s, T v, T* r, T* g, T* b) {
+    static FORCEINLINE _CUDA_HD void hsv_to_rgb(T h, T s, T v, T* r, T* g, T* b) {
         T c = s * v;
         T m = v - c;
         T dh = h * 6;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
index ebdbbb5cd..e25ad1b23 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
@@ -42,7 +42,7 @@ namespace helpers {
                 auto o = bOut + e * numChannels;
 
                 T h, v_min, v_max;
-                helpers::rgb_to_hv(context, i[0], i[1], i[2], &h, &v_min, &v_max);
+                helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max);
 
                 h += delta * kChannelRange;
                 while (h < (T) 0.)
@@ -51,7 +51,7 @@ namespace helpers {
                 while (h >= (T) kChannelRange)
                     h -= (T) kChannelRange;
 
-                helpers::hv_to_rgb(context, h, v_min, v_max, o, o + 1, o + 2);
+                helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2);
             }
         } else {
             auto tadsChannelsIn  = array->allTensorsAlongDimension({0});
@@ -76,7 +76,7 @@ namespace helpers {
                 auto _bo = outputB + e;
 
                 T h, v_min, v_max;
-                helpers::rgb_to_hv(context, _ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
+                helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
 
                 h += delta * kChannelRange;
                 while (h < (T) 0)
@@ -85,7 +85,7 @@ namespace helpers {
                 while (h >= (T) kChannelRange)
                     h -= (T) kChannelRange;
 
-                helpers::hv_to_rgb(context, h, v_min, v_max, _ro, _go, _bo);
+                helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo);
             }
 
             delete tadsChannelsIn;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
index e8f875f1e..58e5c7831 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
@@ -43,10 +43,10 @@ namespace helpers {
 
                 T h, s, v;
                 // Convert the RGB color to Hue/V-range.
-                helpers::rgb_to_hsv(context, i[0], i[1], i[2], &h, &s, &v);
+                helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v);
                 s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
                 // Convert the hue and v-range back into RGB.
-                helpers::hsv_to_rgb(context, h, s, v, o, o + 1, o + 2);
+                helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2);
             }
         } else {
             auto tadsChannelsIn = array->allTensorsAlongDimension({0});
@@ -72,10 +72,10 @@ namespace helpers {
 
                 T h, s, v;
                 // Convert the RGB color to Hue/V-range.
-                helpers::rgb_to_hsv(context, _ri[0], _gi[0], _bi[0], &h, &s, &v);
+                helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v);
                 s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
                 // Convert the hue and v-range back into RGB.
-                helpers::hsv_to_rgb(context, h, s, v, _ro, _go, _bo);
+                helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo);
             }
 
             delete tadsChannelsIn;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp
index 66c674a87..6a08064fc 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp
@@ -58,7 +58,7 @@ namespace nd4j {
                 }
             }
 
-            void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order) {
+            void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
                 BUILD_SINGLE_SELECTOR(output->dataType(), flatten_, (inputs, output, order), LIBND4J_TYPES);
             }
         }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
index 7e8bb99f7..295e15c6d 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
@@ -19,14 +19,114 @@
 //
 
 #include <ops/declarable/helpers/adjust_hue.h>
+#include <helpers/ConstantTadHelper.h>
 
 namespace nd4j {
 namespace ops {
 namespace helpers {
 
     template <typename T>
-    static void _adjust_hue_single(NDArray *array, NDArray *output, float delta, bool isNHWC) {
+    static void _CUDA_G adjustHueSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo,  void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) {
+        int numChannels = 3;
+        auto tid = threadIdx.x + blockIdx.x * blockDim.x;
 
+        auto bIn = reinterpret_cast<T*>(xBuffer);
+        auto bOut = reinterpret_cast<T*>(zBuffer);
+        static const int kChannelRange = 6;
+
+        for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
+            auto i = bIn + e * numChannels;
+            auto o = bOut + e * numChannels;
+
+            T h, v_min, v_max;
+            helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max);
+
+            h += delta * kChannelRange;
+            while (h < (T) 0.)
+                h += (T) kChannelRange;
+
+            while (h >= (T) kChannelRange)
+                h -= (T) kChannelRange;
+
+            helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2);
+        }
+    }
+
+    template <typename T>
+    static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) {
+        int numChannels = 3;
+        auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+        static const int kChannelRange = 6;
+
+        auto bufferR = reinterpret_cast<T *>(xBuffer) + xOffsets[0];
+        auto bufferG = reinterpret_cast<T *>(xBuffer) + xOffsets[1];
+        auto bufferB = reinterpret_cast<T *>(xBuffer) + xOffsets[2];
+
+        auto outputR = reinterpret_cast<T *>(zBuffer) + zOffsets[0];
+        auto outputG = reinterpret_cast<T *>(zBuffer) + zOffsets[1];
+        auto outputB = reinterpret_cast<T *>(zBuffer) + zOffsets[2];
+
+
+        for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
+            auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+            auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+            auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+
+            auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+            auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+            auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+
+            T h, v_min, v_max;
+            helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
+
+            h += delta * kChannelRange;
+            while (h < (T) 0)
+                h += (T) kChannelRange;
+
+            while (h >= (T) kChannelRange)
+                h -= (T) kChannelRange;
+
+            helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo);
+        }
+    }
+
+    template <typename T>
+    static void _adjust_hue_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
+        // numChannels is always 3
+        auto tuples = array->lengthOf() / 3;
+        if (isNHWC) {
+            adjustHueSingleNHWCKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta);
+        } else {
+            // TODO: check this one
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2});
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2});
+
+            auto tadLength = shape::length(packX.primaryShapeInfo());
+
+            adjustHueSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
+        }
+    }
+
+
+    template <typename T>
+    static void _adjust_hue_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
+        auto xType = array->dataType();
+
+        // numChannels is always 3
+        auto tuples = array->lengthOf() / 3;
+
+        if (isNHWC) {
+            // in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values
+            BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES);
+        } else {
+            // TODO: check this one
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3});
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3});
+
+            auto tadLength = shape::length(packX.primaryShapeInfo());
+
+            adjustHueSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
+        }
     }
 
     void _adjust_hue(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) {
@@ -34,25 +134,11 @@ namespace helpers {
 
         float d = delta->e<float>(0);
         if (array->rankOf() == 4) {
-            auto tadsIn = array->allTensorsAlongDimension({0});
-            auto tadsOut = output->allTensorsAlongDimension({0});
-
-            // FIXME: template selector should be moved out of loop
-PRAGMA_OMP_PARALLEL_FOR
-            for (int e = 0; e < tadsIn->size(); e++) {
-                BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
-            }
-            
-
-            delete tadsIn;
-            delete tadsOut;
         } else {
-            BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (array, output, d, isNHWC);, FLOAT_TYPES);
+            BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, d, isNHWC);, FLOAT_TYPES);
         }
     }
 
-    BUILD_SINGLE_TEMPLATE(template void _adjust_hue_single, (NDArray *array, NDArray *output, float delta, bool isNHWC);, FLOAT_TYPES);
-
 }
 }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
index cdd6640c4..474b4de03 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
@@ -19,6 +19,7 @@
 //
 
 #include <ops/declarable/helpers/adjust_saturation.h>
+#include <helpers/ConstantTadHelper.h>
 
 
 namespace nd4j {
@@ -26,8 +27,96 @@ namespace ops {
 namespace helpers {
 
     template <typename T>
-    static void _adjust_saturation_single(NDArray *array, NDArray *output, float delta, bool isNHWC) {
+    static void _CUDA_G adjustSaturationSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo,  void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) {
+        int numChannels = 3;
+        auto tid = threadIdx.x + blockIdx.x * blockDim.x;
 
+        auto bIn = reinterpret_cast<T*>(xBuffer);
+        auto bOut = reinterpret_cast<T*>(zBuffer);
+        static const int kChannelRange = 6;
+
+        for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
+            auto i = bIn + e * numChannels;
+            auto o = bOut + e * numChannels;
+
+            T h, s, v;
+            // Convert the RGB color to Hue/V-range.
+            helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v);
+            s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
+
+            // Convert the hue and v-range back into RGB.
+            helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2);
+        }
+    }
+
+    template <typename T>
+    static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) {
+        int numChannels = 3;
+        auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+        static const int kChannelRange = 6;
+
+        auto bufferR = reinterpret_cast<T *>(xBuffer) + xOffsets[0];
+        auto bufferG = reinterpret_cast<T *>(xBuffer) + xOffsets[1];
+        auto bufferB = reinterpret_cast<T *>(xBuffer) + xOffsets[2];
+
+        auto outputR = reinterpret_cast<T *>(zBuffer) + zOffsets[0];
+        auto outputG = reinterpret_cast<T *>(zBuffer) + zOffsets[1];
+        auto outputB = reinterpret_cast<T *>(zBuffer) + zOffsets[2];
+
+        for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
+            auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+            auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+            auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+
+            auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+            auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+            auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
+
+            T h, s, v;
+            // Convert the RGB color to Hue/V-range.
+            helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v);
+            s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
+            // Convert the hue and v-range back into RGB.
+            helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo);
+        }
+    }
+
+    template <typename T>
+    static void _adjust_saturation_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
+        // numChannels is always 3
+        auto tuples = array->lengthOf() / 3;
+
+        if (isNHWC) {
+            adjustSaturationSingleNHWCKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta);
+        } else {
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2});
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2});
+
+            auto tadLength = shape::length(packX.primaryShapeInfo());
+
+            adjustSaturationSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
+        }
+    }
+
+    template <typename T>
+    static void _adjust_saturation_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
+        auto xType = array->dataType();
+
+        // numChannels is always 3
+        auto tuples = array->lengthOf() / 3;
+
+        if (isNHWC) {
+            // in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values
+            BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES);
+        } else {
+            // TODO: check this one
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3});
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3});
+
+            auto tadLength = shape::length(packX.primaryShapeInfo());
+
+            adjustSaturationSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
+        }
     }
 
     void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) {
@@ -35,24 +124,12 @@ namespace helpers {
 
         float d = delta->e<float>(0);
         if (array->rankOf() == 4) {
-            auto tadsIn = array->allTensorsAlongDimension({0});
-            auto tadsOut = output->allTensorsAlongDimension({0});
-
-            // FIXME: template selector should be moved out of loop
-PRAGMA_OMP_PARALLEL_FOR
-            for (int e = 0; e < tadsIn->size(); e++) {
-                BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
-            }
-            
-
-            delete tadsIn;
-            delete tadsOut;
+            BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_batch, (context, array, output, d, isNHWC);, FLOAT_TYPES);
         } else {
-            BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (array, output, d, isNHWC);, FLOAT_TYPES);
+            BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, d, isNHWC);, FLOAT_TYPES);
         }
     }
 
-    BUILD_SINGLE_TEMPLATE(template void _adjust_saturation_single, (NDArray *array, NDArray *output, float delta, bool isNHWC), FLOAT_TYPES);
 
 }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
index 913d137a0..9baab5b36 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
@@ -19,12 +19,68 @@
 //
 
 #include <ops/declarable/helpers/flatten.h>
+#include <helpers/PointersManager.h>
 
 namespace nd4j {
     namespace ops {
         namespace helpers {
-            void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order) {
+            template <typename T>
+            void _CUDA_G flattenKernel(void **xBuffers, Nd4jLong **xShapeInfos, Nd4jLong *offsets, Nd4jLong numInputs, void *zBuffer, Nd4jLong *zShapeInfo, char order) {
 
+                Nd4jLong xCoord[MAX_RANK];
+
+                for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
+                    auto z = reinterpret_cast<T*>(zBuffer) + offsets[e];
+
+                    auto xBuffer = reinterpret_cast<T*>(xBuffers[e]);
+                    auto xShapeInfo = xShapeInfos[e];
+                    auto xShape = shape::shapeOf(xShapeInfo);
+                    auto xStride = shape::stride(xShapeInfo);
+                    auto xRank = shape::rank(xShapeInfo);
+                    auto xLength = shape::length(xShapeInfo);
+
+                    for (uint i = threadIdx.x; i < xLength; i += blockDim.x) {
+                        shape::index2coords(xRank, xShape, i, xLength, xCoord, order);
+                        auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank);
+                        z[i] = xBuffer[xOffset];
+                    }
+                }
+            }
+
+            template <typename T>
+            void flatten_(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
+                PointersManager pm(context, "flatten");
+
+                std::vector<void*> hdBuffers(inputs.size());
+                std::vector<Nd4jLong> hOffsets(inputs.size());
+                std::vector<Nd4jLong *> hdShapes(inputs.size());
+                Nd4jLong cOffset = 0;
+
+                // calculating offsets in output
+                for (int e = 0; e < inputs.size(); e++) {
+                    hOffsets[e] = cOffset;
+                    cOffset += inputs[e]->lengthOf();
+
+                    hdBuffers[e] = inputs[e]->specialBuffer();
+                    hdShapes[e] = inputs[e]->specialShapeInfo();
+                }
+
+                auto dBuffers = (void **) pm.replicatePointer(hdBuffers.data(), inputs.size() * sizeof(void*));
+                auto dShapes = (Nd4jLong **)pm.replicatePointer(hdShapes.data(), inputs.size() * sizeof(Nd4jLong*));
+                auto dOffsets = (Nd4jLong *) pm.replicatePointer(hOffsets.data(), inputs.size() * sizeof(Nd4jLong));
+
+
+                flattenKernel<T><<<256, 512, 8192, *context->getCudaStream()>>>(dBuffers, dShapes, dOffsets, inputs.size(), output->getSpecialBuffer(), output->getSpecialShapeInfo(), order);
+
+                pm.synchronize();
+            }
+
+            void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
+                for (auto v:inputs)
+                    v->syncToDevice();
+
+                BUILD_SINGLE_SELECTOR(output->dataType(), flatten_, (context, inputs, output, order), LIBND4J_TYPES);
+                NDArray::registerSpecialUse({output}, {});
             }
         }
     }
diff --git a/libnd4j/include/ops/declarable/helpers/flatten.h b/libnd4j/include/ops/declarable/helpers/flatten.h
index 7037599e2..05421383f 100644
--- a/libnd4j/include/ops/declarable/helpers/flatten.h
+++ b/libnd4j/include/ops/declarable/helpers/flatten.h
@@ -27,7 +27,7 @@
 namespace nd4j {
     namespace ops {
         namespace helpers {
-            void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order);
+            void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order);
         }
     }
 }