diff --git a/arbiter/arbiter-core/pom.xml b/arbiter/arbiter-core/pom.xml
index 76251f4cd..7c65b2efb 100644
--- a/arbiter/arbiter-core/pom.xml
+++ b/arbiter/arbiter-core/pom.xml
@@ -99,7 +99,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/arbiter/arbiter-deeplearning4j/pom.xml b/arbiter/arbiter-deeplearning4j/pom.xml
index ec7e22d3c..92e3fb7aa 100644
--- a/arbiter/arbiter-deeplearning4j/pom.xml
+++ b/arbiter/arbiter-deeplearning4j/pom.xml
@@ -77,7 +77,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/arbiter/arbiter-server/pom.xml b/arbiter/arbiter-server/pom.xml
index c4306b967..26aa80b07 100644
--- a/arbiter/arbiter-server/pom.xml
+++ b/arbiter/arbiter-server/pom.xml
@@ -63,7 +63,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/arbiter/arbiter-ui/pom.xml b/arbiter/arbiter-ui/pom.xml
index 7392392db..88f39a310 100644
--- a/arbiter/arbiter-ui/pom.xml
+++ b/arbiter/arbiter-ui/pom.xml
@@ -37,7 +37,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/arbiter/pom.xml b/arbiter/pom.xml
index 93f877968..ab8d72365 100644
--- a/arbiter/pom.xml
+++ b/arbiter/pom.xml
@@ -151,7 +151,7 @@
${skipTestResourceEnforcement}
- test-nd4j-native,test-nd4j-cuda-10.2
+ test-nd4j-native,test-nd4j-cuda-11.0
false
@@ -333,11 +333,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${nd4j.version}
test
diff --git a/change-cuda-versions.sh b/change-cuda-versions.sh
index 7b354d68b..cbe75d830 100755
--- a/change-cuda-versions.sh
+++ b/change-cuda-versions.sh
@@ -20,7 +20,7 @@
set -e
-VALID_VERSIONS=( 9.2 10.0 10.1 10.2 )
+VALID_VERSIONS=( 9.2 10.0 10.1 10.2 11.0 )
usage() {
echo "Usage: $(basename $0) [-h|--help]
@@ -47,6 +47,10 @@ check_cuda_version() {
check_cuda_version "$VERSION"
case $VERSION in
+ 11.0)
+ VERSION2="8.0"
+ VERSION3="1.5.4-SNAPSHOT"
+ ;;
10.2)
VERSION2="7.6"
VERSION3="1.5.3"
diff --git a/datavec/datavec-api/pom.xml b/datavec/datavec-api/pom.xml
index 3c3eec86e..0b01863f6 100644
--- a/datavec/datavec-api/pom.xml
+++ b/datavec/datavec-api/pom.xml
@@ -126,7 +126,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-arrow/pom.xml b/datavec/datavec-arrow/pom.xml
index 60409bc53..eb61221c8 100644
--- a/datavec/datavec-arrow/pom.xml
+++ b/datavec/datavec-arrow/pom.xml
@@ -62,7 +62,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-data/datavec-data-audio/pom.xml b/datavec/datavec-data/datavec-data-audio/pom.xml
index 3b9674cd9..0c67ae396 100644
--- a/datavec/datavec-data/datavec-data-audio/pom.xml
+++ b/datavec/datavec-data/datavec-data-audio/pom.xml
@@ -79,7 +79,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-data/datavec-data-codec/pom.xml b/datavec/datavec-data/datavec-data-codec/pom.xml
index 7ef25d5a3..58eda4820 100644
--- a/datavec/datavec-data/datavec-data-codec/pom.xml
+++ b/datavec/datavec-data/datavec-data-codec/pom.xml
@@ -66,7 +66,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-data/datavec-data-image/pom.xml b/datavec/datavec-data/datavec-data-image/pom.xml
index aef66381a..c88c89208 100644
--- a/datavec/datavec-data/datavec-data-image/pom.xml
+++ b/datavec/datavec-data/datavec-data-image/pom.xml
@@ -128,7 +128,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-data/datavec-data-nlp/pom.xml b/datavec/datavec-data/datavec-data-nlp/pom.xml
index fb30b93e7..dd5860f9a 100644
--- a/datavec/datavec-data/datavec-data-nlp/pom.xml
+++ b/datavec/datavec-data/datavec-data-nlp/pom.xml
@@ -81,7 +81,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-data/datavec-geo/pom.xml b/datavec/datavec-data/datavec-geo/pom.xml
index f88bc84d1..792265434 100644
--- a/datavec/datavec-data/datavec-geo/pom.xml
+++ b/datavec/datavec-data/datavec-geo/pom.xml
@@ -56,7 +56,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-data/datavec-hadoop/pom.xml b/datavec/datavec-data/datavec-hadoop/pom.xml
index fb7eee69c..4ab64d8b5 100644
--- a/datavec/datavec-data/datavec-hadoop/pom.xml
+++ b/datavec/datavec-data/datavec-hadoop/pom.xml
@@ -74,7 +74,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-data/pom.xml b/datavec/datavec-data/pom.xml
index e40d96149..2510924b8 100644
--- a/datavec/datavec-data/pom.xml
+++ b/datavec/datavec-data/pom.xml
@@ -54,7 +54,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-excel/pom.xml b/datavec/datavec-excel/pom.xml
index 49dc26db8..dc6d3d6b7 100644
--- a/datavec/datavec-excel/pom.xml
+++ b/datavec/datavec-excel/pom.xml
@@ -65,7 +65,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-jdbc/pom.xml b/datavec/datavec-jdbc/pom.xml
index 6ef9b0441..612fb05e7 100644
--- a/datavec/datavec-jdbc/pom.xml
+++ b/datavec/datavec-jdbc/pom.xml
@@ -72,7 +72,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-local/pom.xml b/datavec/datavec-local/pom.xml
index 3adc0e011..06fd13fe2 100644
--- a/datavec/datavec-local/pom.xml
+++ b/datavec/datavec-local/pom.xml
@@ -95,7 +95,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-python/pom.xml b/datavec/datavec-python/pom.xml
index 526b8238a..dae915909 100644
--- a/datavec/datavec-python/pom.xml
+++ b/datavec/datavec-python/pom.xml
@@ -78,7 +78,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-spark-inference-parent/datavec-spark-inference-client/pom.xml b/datavec/datavec-spark-inference-parent/datavec-spark-inference-client/pom.xml
index 3b564b1b3..ff8bdb853 100644
--- a/datavec/datavec-spark-inference-parent/datavec-spark-inference-client/pom.xml
+++ b/datavec/datavec-spark-inference-parent/datavec-spark-inference-client/pom.xml
@@ -60,7 +60,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-spark-inference-parent/datavec-spark-inference-model/pom.xml b/datavec/datavec-spark-inference-parent/datavec-spark-inference-model/pom.xml
index bac20d42e..68a78450d 100644
--- a/datavec/datavec-spark-inference-parent/datavec-spark-inference-model/pom.xml
+++ b/datavec/datavec-spark-inference-parent/datavec-spark-inference-model/pom.xml
@@ -59,7 +59,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-spark-inference-parent/datavec-spark-inference-server/pom.xml b/datavec/datavec-spark-inference-parent/datavec-spark-inference-server/pom.xml
index 0c05f327b..331c58a8c 100644
--- a/datavec/datavec-spark-inference-parent/datavec-spark-inference-server/pom.xml
+++ b/datavec/datavec-spark-inference-parent/datavec-spark-inference-server/pom.xml
@@ -178,7 +178,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-spark-inference-parent/pom.xml b/datavec/datavec-spark-inference-parent/pom.xml
index cc3a6b0c1..5e98a1aad 100644
--- a/datavec/datavec-spark-inference-parent/pom.xml
+++ b/datavec/datavec-spark-inference-parent/pom.xml
@@ -38,7 +38,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/datavec-spark/pom.xml b/datavec/datavec-spark/pom.xml
index 345b774c3..2c547499c 100644
--- a/datavec/datavec-spark/pom.xml
+++ b/datavec/datavec-spark/pom.xml
@@ -144,7 +144,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/datavec/pom.xml b/datavec/pom.xml
index 1c49960d6..8e403ea1e 100644
--- a/datavec/pom.xml
+++ b/datavec/pom.xml
@@ -108,7 +108,7 @@
${skipTestResourceEnforcement}
- test-nd4j-native,test-nd4j-cuda-10.2
+ test-nd4j-native,test-nd4j-cuda-11.0
false
@@ -361,11 +361,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${nd4j.version}
test
diff --git a/deeplearning4j/deeplearning4j-common-tests/pom.xml b/deeplearning4j/deeplearning4j-common-tests/pom.xml
index 5a4ba921d..d19aa85c4 100644
--- a/deeplearning4j/deeplearning4j-common-tests/pom.xml
+++ b/deeplearning4j/deeplearning4j-common-tests/pom.xml
@@ -62,11 +62,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-common/pom.xml b/deeplearning4j/deeplearning4j-common/pom.xml
index 1928eff91..7e2e0ebb6 100644
--- a/deeplearning4j/deeplearning4j-common/pom.xml
+++ b/deeplearning4j/deeplearning4j-common/pom.xml
@@ -40,7 +40,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-core/pom.xml b/deeplearning4j/deeplearning4j-core/pom.xml
index 90c88d4c3..f8dc0123b 100644
--- a/deeplearning4j/deeplearning4j-core/pom.xml
+++ b/deeplearning4j/deeplearning4j-core/pom.xml
@@ -180,11 +180,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-cuda/pom.xml b/deeplearning4j/deeplearning4j-cuda/pom.xml
index 30373db3a..e76e905df 100644
--- a/deeplearning4j/deeplearning4j-cuda/pom.xml
+++ b/deeplearning4j/deeplearning4j-cuda/pom.xml
@@ -16,7 +16,7 @@
4.0.0
- deeplearning4j-cuda-10.2
+ deeplearning4j-cuda-11.0
deeplearning4j-cuda
org.deeplearning4j
@@ -26,9 +26,9 @@
- 10.2
- 7.6
- 1.5.3
+ 11.0
+ 8.0
+ 1.5.4-SNAPSHOT
@@ -112,7 +112,7 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-cuda/src/main/java/org/deeplearning4j/cuda/convolution/CudnnConvolutionHelper.java b/deeplearning4j/deeplearning4j-cuda/src/main/java/org/deeplearning4j/cuda/convolution/CudnnConvolutionHelper.java
index a4dddb759..91e3c4829 100644
--- a/deeplearning4j/deeplearning4j-cuda/src/main/java/org/deeplearning4j/cuda/convolution/CudnnConvolutionHelper.java
+++ b/deeplearning4j/deeplearning4j-cuda/src/main/java/org/deeplearning4j/cuda/convolution/CudnnConvolutionHelper.java
@@ -254,17 +254,34 @@ public class CudnnConvolutionHelper extends BaseCudnnHelper implements Convoluti
throw new IllegalArgumentException("Unknown BwdDataAlgo: " + bwdDataAlgo);
}
} else {
+ /*
code = cudnnGetConvolutionBackwardFilterAlgorithm(cudnnContext, cudnnContext.srcTensorDesc,
cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.filterDesc,
mode == AlgoMode.NO_WORKSPACE ? CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE
: CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
0, algo1);
+ */
+ val fa = new cudnnConvolutionBwdFilterAlgoPerf_t();
+ val counts = new int[1];
+ code = cudnnFindConvolutionBackwardFilterAlgorithm(cudnnContext, cudnnContext.srcTensorDesc,
+ cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.filterDesc, 1, counts, fa);
+ algo1[0] = fa.algo();
+
checkCudnn(false, "cudnnGetConvolutionBackwardFilterAlgorithm", code, input, weights, null, delta, kernel, strides, pad, mode, null, bwdFilterAlgo, bwdDataAlgo, convolutionMode, dilation);
+
+ /*
code = cudnnGetConvolutionBackwardDataAlgorithm(cudnnContext, cudnnContext.filterDesc,
cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.srcTensorDesc,
mode == AlgoMode.NO_WORKSPACE ? CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE
: CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
0, algo2);
+ */
+
+ val da = new cudnnConvolutionBwdDataAlgoPerf_t();
+ code = cudnnFindConvolutionBackwardDataAlgorithm(cudnnContext, cudnnContext.filterDesc,
+ cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.srcTensorDesc, 1, counts, da);
+
+ algo2[0] = da.algo();
checkCudnn(false, "cudnnGetConvolutionBackwardDataAlgorithm", code, input, weights, null, delta, kernel, strides, pad, mode, null, bwdFilterAlgo, bwdDataAlgo, convolutionMode, dilation);
}
@@ -461,11 +478,17 @@ public class CudnnConvolutionHelper extends BaseCudnnHelper implements Convoluti
throw new IllegalArgumentException("Unknown FwdAlgo: " + fwdAlgo);
}
} else {
- code = cudnnGetConvolutionForwardAlgorithm(cudnnContext, cudnnContext.srcTensorDesc,
+ /*
+ code = cudnnGetConvolutionForwardAlgorithm_v7(cudnnContext, cudnnContext.srcTensorDesc,
cudnnContext.filterDesc, cudnnContext.convDesc,
cudnnContext.dstTensorDesc, mode == AlgoMode.NO_WORKSPACE
- ? CUDNN_CONVOLUTION_FWD_NO_WORKSPACE : CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+ ? CUDNN_CONVOLUTION_FWD_NO_WORKSPACE : CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0, algo);
+ */
+
+ val cdf = new cudnnConvolutionFwdAlgoPerf_t();
+ val count = new int[1];
+ code = cudnnFindConvolutionForwardAlgorithm(cudnnContext, cudnnContext.srcTensorDesc, cudnnContext.filterDesc, cudnnContext.convDesc, cudnnContext.dstTensorDesc, 1, count, cdf);
if(code != CUDNN_STATUS_SUCCESS){
//If CuDNN can't infer algorithm - try IMPLICIT_GEMM
@@ -477,6 +500,8 @@ public class CudnnConvolutionHelper extends BaseCudnnHelper implements Convoluti
fwdAlgo = FwdAlgo.IMPLICIT_GEMM;
algo[0] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
+
+ if (code == CUDNN_STATUS_SUCCESS) algo[0] = cdf.algo();
}
if(log.isTraceEnabled()){
diff --git a/deeplearning4j/deeplearning4j-cuda/src/test/java/org/deeplearning4j/cuda/lstm/ValidateCudnnLSTM.java b/deeplearning4j/deeplearning4j-cuda/src/test/java/org/deeplearning4j/cuda/lstm/ValidateCudnnLSTM.java
index 8d5ece6ff..636071a28 100644
--- a/deeplearning4j/deeplearning4j-cuda/src/test/java/org/deeplearning4j/cuda/lstm/ValidateCudnnLSTM.java
+++ b/deeplearning4j/deeplearning4j-cuda/src/test/java/org/deeplearning4j/cuda/lstm/ValidateCudnnLSTM.java
@@ -29,6 +29,7 @@ import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.junit.Test;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.concurrency.AffinityManager;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.factory.Nd4j;
@@ -269,7 +270,7 @@ public class ValidateCudnnLSTM extends BaseDL4JTest {
assertTrue(f.get(l0) instanceof CudnnLSTMHelper);
assertTrue(f.get(l1) instanceof CudnnLSTMHelper);
- Random r = new Random(12345);
+ Random r = new Random(123456);
for (int x = 0; x < 1; x++) {
INDArray input = Nd4j.rand(new int[] {minibatch, inputSize, timeSeriesLength});
INDArray labels = Nd4j.zeros(minibatch, nOut, timeSeriesLength);
@@ -284,7 +285,6 @@ public class ValidateCudnnLSTM extends BaseDL4JTest {
mln2.fit(ds);
}
-
assertEquals(mln1.params(), mln2.params());
}
diff --git a/deeplearning4j/deeplearning4j-data/deeplearning4j-datasets/pom.xml b/deeplearning4j/deeplearning4j-data/deeplearning4j-datasets/pom.xml
index b1adbe93e..5a48cfc2d 100644
--- a/deeplearning4j/deeplearning4j-data/deeplearning4j-datasets/pom.xml
+++ b/deeplearning4j/deeplearning4j-data/deeplearning4j-datasets/pom.xml
@@ -51,7 +51,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-data/deeplearning4j-datavec-iterators/pom.xml b/deeplearning4j/deeplearning4j-data/deeplearning4j-datavec-iterators/pom.xml
index c87a94b37..2ce302370 100644
--- a/deeplearning4j/deeplearning4j-data/deeplearning4j-datavec-iterators/pom.xml
+++ b/deeplearning4j/deeplearning4j-data/deeplearning4j-datavec-iterators/pom.xml
@@ -46,7 +46,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-data/deeplearning4j-utility-iterators/pom.xml b/deeplearning4j/deeplearning4j-data/deeplearning4j-utility-iterators/pom.xml
index 7806bab88..103a1a8e7 100644
--- a/deeplearning4j/deeplearning4j-data/deeplearning4j-utility-iterators/pom.xml
+++ b/deeplearning4j/deeplearning4j-data/deeplearning4j-utility-iterators/pom.xml
@@ -43,7 +43,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-data/pom.xml b/deeplearning4j/deeplearning4j-data/pom.xml
index ca29f35b7..bcf4d3355 100644
--- a/deeplearning4j/deeplearning4j-data/pom.xml
+++ b/deeplearning4j/deeplearning4j-data/pom.xml
@@ -38,7 +38,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-dataimport-solrj/pom.xml b/deeplearning4j/deeplearning4j-dataimport-solrj/pom.xml
index ed0160ccb..15c3c0715 100644
--- a/deeplearning4j/deeplearning4j-dataimport-solrj/pom.xml
+++ b/deeplearning4j/deeplearning4j-dataimport-solrj/pom.xml
@@ -116,11 +116,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-graph/pom.xml b/deeplearning4j/deeplearning4j-graph/pom.xml
index 645b4eca2..876470377 100644
--- a/deeplearning4j/deeplearning4j-graph/pom.xml
+++ b/deeplearning4j/deeplearning4j-graph/pom.xml
@@ -64,7 +64,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-manifold/deeplearning4j-tsne/pom.xml b/deeplearning4j/deeplearning4j-manifold/deeplearning4j-tsne/pom.xml
index 7ebb82e75..86c3e23d6 100644
--- a/deeplearning4j/deeplearning4j-manifold/deeplearning4j-tsne/pom.xml
+++ b/deeplearning4j/deeplearning4j-manifold/deeplearning4j-tsne/pom.xml
@@ -69,7 +69,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-manifold/pom.xml b/deeplearning4j/deeplearning4j-manifold/pom.xml
index 921ee9653..6e1a8cd42 100644
--- a/deeplearning4j/deeplearning4j-manifold/pom.xml
+++ b/deeplearning4j/deeplearning4j-manifold/pom.xml
@@ -41,7 +41,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-modelexport-solr/pom.xml b/deeplearning4j/deeplearning4j-modelexport-solr/pom.xml
index 383eb1c8c..13b7ee45d 100644
--- a/deeplearning4j/deeplearning4j-modelexport-solr/pom.xml
+++ b/deeplearning4j/deeplearning4j-modelexport-solr/pom.xml
@@ -302,11 +302,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-modelimport/pom.xml b/deeplearning4j/deeplearning4j-modelimport/pom.xml
index 3dcdcc720..4bbb32806 100644
--- a/deeplearning4j/deeplearning4j-modelimport/pom.xml
+++ b/deeplearning4j/deeplearning4j-modelimport/pom.xml
@@ -135,11 +135,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbor-server/pom.xml b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbor-server/pom.xml
index 911432cf0..9820c29f2 100644
--- a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbor-server/pom.xml
+++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbor-server/pom.xml
@@ -125,11 +125,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-client/pom.xml b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-client/pom.xml
index 0886e8d5b..eed007f32 100644
--- a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-client/pom.xml
+++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-client/pom.xml
@@ -49,7 +49,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-model/pom.xml b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-model/pom.xml
index bfd004c41..902a67ae7 100644
--- a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-model/pom.xml
+++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-model/pom.xml
@@ -53,7 +53,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/pom.xml b/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/pom.xml
index fbe0ddccf..6987dc556 100644
--- a/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/pom.xml
+++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/pom.xml
@@ -89,11 +89,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/pom.xml b/deeplearning4j/deeplearning4j-nearestneighbors-parent/pom.xml
index 23d5d225d..70778f67d 100644
--- a/deeplearning4j/deeplearning4j-nearestneighbors-parent/pom.xml
+++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/pom.xml
@@ -44,7 +44,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml
index 23d863cdc..3b0fd8944 100644
--- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml
+++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml
@@ -72,7 +72,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-japanese/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-japanese/pom.xml
index a4fea6b07..c85e18cdd 100644
--- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-japanese/pom.xml
+++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-japanese/pom.xml
@@ -75,7 +75,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-korean/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-korean/pom.xml
index c0fcdb84a..be02f45b6 100644
--- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-korean/pom.xml
+++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-korean/pom.xml
@@ -67,7 +67,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-uima/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-uima/pom.xml
index e5ee63ea0..7ec64d395 100644
--- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-uima/pom.xml
+++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-uima/pom.xml
@@ -110,7 +110,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/pom.xml
index 668c728ae..6595a3a1e 100644
--- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/pom.xml
+++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/pom.xml
@@ -91,7 +91,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nlp-parent/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/pom.xml
index 6c4eea3fd..61627d8a9 100644
--- a/deeplearning4j/deeplearning4j-nlp-parent/pom.xml
+++ b/deeplearning4j/deeplearning4j-nlp-parent/pom.xml
@@ -42,7 +42,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-nn/pom.xml b/deeplearning4j/deeplearning4j-nn/pom.xml
index b817c0dc6..268a70cd9 100644
--- a/deeplearning4j/deeplearning4j-nn/pom.xml
+++ b/deeplearning4j/deeplearning4j-nn/pom.xml
@@ -128,7 +128,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-remote/deeplearning4j-json-server/pom.xml b/deeplearning4j/deeplearning4j-remote/deeplearning4j-json-server/pom.xml
index 338d0173f..ca4c49e9e 100644
--- a/deeplearning4j/deeplearning4j-remote/deeplearning4j-json-server/pom.xml
+++ b/deeplearning4j/deeplearning4j-remote/deeplearning4j-json-server/pom.xml
@@ -107,14 +107,14 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
false
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-remote/pom.xml b/deeplearning4j/deeplearning4j-remote/pom.xml
index 4329a1554..1816689a4 100644
--- a/deeplearning4j/deeplearning4j-remote/pom.xml
+++ b/deeplearning4j/deeplearning4j-remote/pom.xml
@@ -27,7 +27,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/pom.xml b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/pom.xml
index 2c7a94de8..18392dfc0 100644
--- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/pom.xml
+++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/pom.xml
@@ -109,11 +109,11 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${project.version}
test
diff --git a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/pom.xml b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/pom.xml
index 3c083d40d..36a77391e 100644
--- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/pom.xml
+++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/pom.xml
@@ -104,7 +104,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-scaleout/pom.xml b/deeplearning4j/deeplearning4j-scaleout/pom.xml
index 539aa3ef7..d0a199d76 100644
--- a/deeplearning4j/deeplearning4j-scaleout/pom.xml
+++ b/deeplearning4j/deeplearning4j-scaleout/pom.xml
@@ -38,7 +38,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/pom.xml b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/pom.xml
index 3eafbb9e2..693fbcf7a 100644
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/pom.xml
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/pom.xml
@@ -80,7 +80,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp/pom.xml b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp/pom.xml
index c4e8dc7ab..ffd7a4b0f 100644
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp/pom.xml
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp/pom.xml
@@ -79,7 +79,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/pom.xml b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/pom.xml
index 1198ae733..53297cb13 100644
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/pom.xml
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/pom.xml
@@ -86,7 +86,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/pom.xml b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/pom.xml
index b7b20d161..9b399fa22 100644
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/pom.xml
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/pom.xml
@@ -105,7 +105,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml b/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml
index 8a4fb02d5..8bafabc38 100644
--- a/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml
@@ -181,7 +181,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-components/pom.xml b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-components/pom.xml
index 8f83b803e..09f5bb084 100644
--- a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-components/pom.xml
+++ b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-components/pom.xml
@@ -77,7 +77,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-model/pom.xml b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-model/pom.xml
index 3c755eac8..6e9cdad17 100644
--- a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-model/pom.xml
+++ b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-model/pom.xml
@@ -113,7 +113,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-standalone/pom.xml b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-standalone/pom.xml
index 1b85a1d87..c807b7a46 100644
--- a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-standalone/pom.xml
+++ b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui-standalone/pom.xml
@@ -32,7 +32,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui/pom.xml b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui/pom.xml
index 44b868ae6..f24bf9109 100644
--- a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui/pom.xml
+++ b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-ui/pom.xml
@@ -67,7 +67,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-vertx/pom.xml b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-vertx/pom.xml
index a66b85ece..835e77fe0 100644
--- a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-vertx/pom.xml
+++ b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-vertx/pom.xml
@@ -457,7 +457,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
\ No newline at end of file
diff --git a/deeplearning4j/deeplearning4j-ui-parent/pom.xml b/deeplearning4j/deeplearning4j-ui-parent/pom.xml
index 70c32b984..947a28783 100644
--- a/deeplearning4j/deeplearning4j-ui-parent/pom.xml
+++ b/deeplearning4j/deeplearning4j-ui-parent/pom.xml
@@ -49,7 +49,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/deeplearning4j-zoo/pom.xml b/deeplearning4j/deeplearning4j-zoo/pom.xml
index bec71ec04..ea431ebd9 100644
--- a/deeplearning4j/deeplearning4j-zoo/pom.xml
+++ b/deeplearning4j/deeplearning4j-zoo/pom.xml
@@ -85,7 +85,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
diff --git a/deeplearning4j/dl4j-integration-tests/pom.xml b/deeplearning4j/dl4j-integration-tests/pom.xml
index 43e6bfa60..e8d958cf1 100644
--- a/deeplearning4j/dl4j-integration-tests/pom.xml
+++ b/deeplearning4j/dl4j-integration-tests/pom.xml
@@ -120,7 +120,7 @@
test-nd4j-native
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
\ No newline at end of file
diff --git a/deeplearning4j/pom.xml b/deeplearning4j/pom.xml
index 17c89f931..e121e305d 100644
--- a/deeplearning4j/pom.xml
+++ b/deeplearning4j/pom.xml
@@ -225,7 +225,7 @@
${skipBackendChoice}
- test-nd4j-native,test-nd4j-cuda-10.2
+ test-nd4j-native,test-nd4j-cuda-11.0
false
@@ -500,7 +500,7 @@
- test-nd4j-cuda-10.2
+ test-nd4j-cuda-11.0
false
@@ -513,7 +513,7 @@
org.nd4j
- nd4j-cuda-10.2
+ nd4j-cuda-11.0
${nd4j.version}
test
diff --git a/libnd4j/include/array/NDArray.h b/libnd4j/include/array/NDArray.h
index 7b32b7d49..67452dda2 100644
--- a/libnd4j/include/array/NDArray.h
+++ b/libnd4j/include/array/NDArray.h
@@ -628,17 +628,17 @@ namespace sd {
* keepDims - if true then put unities in place of reduced dimensions
*/
- NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
- NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
+ NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::vector<int>& dimensions, const bool keepDims = false) const;
+ NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false) const;
- NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
- NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
+ NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::vector<int>& dimensions, const bool keepDims = false) const;
+ NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false) const;
- NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
- NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
+ NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::vector<int>& dimensions, const bool keepDims = false) const;
+ NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false) const;
- NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
- NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
+ NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::vector<int>& dimensions, const bool keepDims = false) const;
+ NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false) const;
/**
* method reduces array by excluding its shapes along dimensions present in given dimensions vector
@@ -647,10 +647,10 @@ namespace sd {
* keepDims - if true then put unities in place of reduced dimensions
* extras - extra parameters
*/
- void reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const;
- void reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const;
- void reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const;
- void reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const;
+ void reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool checkTargetShape = true) const;
+ void reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool checkTargetShape = true) const;
+ void reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool checkTargetShape = true) const;
+ void reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool checkTargetShape = true) const;
/**
* return variance of array elements set
diff --git a/libnd4j/include/array/NDArray.hXX b/libnd4j/include/array/NDArray.hXX
index eefe169cf..cfd910343 100644
--- a/libnd4j/include/array/NDArray.hXX
+++ b/libnd4j/include/array/NDArray.hXX
@@ -1353,80 +1353,80 @@ void* NDArray::bufferWithOffset(Nd4jLong offset) {
//////////////////////////////////////////////////////////////////////////
// eventually method reduces array by excluding its shapes along axes present in dimensions vector
-NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
+NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::vector<int>& dimensions, const bool keepDims) const {
std::vector<int> copy(dimensions);
- auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, isR() ? dataType() : Environment::getInstance().defaultFloatDataType(), keepDims, supportOldShapes, getContext()->getWorkspace());
+ auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, isR() ? dataType() : Environment::getInstance().defaultFloatDataType(), keepDims, false, getContext()->getWorkspace());
NDArray result(newShape, true, getContext());
- this->reduceAlongDimension(op, result, copy, keepDims, supportOldShapes, false);
+ this->reduceAlongDimension(op, result, copy, keepDims, false);
return result;
}
//////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
+NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::vector<int>& dimensions, const bool keepDims) const {
std::vector<int> copy(dimensions);
- auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, keepDims, supportOldShapes, getContext()->getWorkspace());
+ auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, keepDims, false, getContext()->getWorkspace());
NDArray result(newShape, true, getContext());
- reduceAlongDimension(op, result, copy, keepDims, supportOldShapes, false);
+ reduceAlongDimension(op, result, copy, keepDims, false);
return result;
}
//////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
+NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::vector<int>& dimensions, const bool keepDims) const {
std::vector<int> copy(dimensions);
- auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::BOOL, keepDims, supportOldShapes, getContext()->getWorkspace());
+ auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::BOOL, keepDims, false, getContext()->getWorkspace());
NDArray result(newShape, true, getContext());
- reduceAlongDimension(op, result, copy, keepDims, supportOldShapes, false);
+ reduceAlongDimension(op, result, copy, keepDims, false);
return result;
}
//////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
+NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::vector<int>& dimensions, const bool keepDims) const {
std::vector<int> copy(dimensions);
- auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::INT64, keepDims, supportOldShapes, getContext()->getWorkspace());
+ auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::INT64, keepDims, false, getContext()->getWorkspace());
NDArray result(newShape, true, getContext());
- reduceAlongDimension(op, result, copy, keepDims, supportOldShapes, false);
+ reduceAlongDimension(op, result, copy, keepDims, false);
return result;
}
//////////////////////////////////////////////////////////////////////////
// method reduces array by excluding its shapes along axes present in dimensions vector
-NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
- return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims, supportOldShapes);
+NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list<int>& dimensions, const bool keepDims) const {
+ return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims);
}
//////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
- return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims, supportOldShapes);
+NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list<int>& dimensions, const bool keepDims) const {
+ return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims);
}
//////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
- return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims, supportOldShapes);
+NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list<int>& dimensions, const bool keepDims) const {
+ return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims);
}
//////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
- return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims, supportOldShapes);
+NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list<int>& dimensions, const bool keepDims) const {
+ return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims);
}
//////////////////////////////////////////////////////////////////////////
@@ -4240,7 +4240,7 @@ NDArray NDArray::applyAllReduce3(sd::reduce3::Ops op, const NDArray& other, cons
//////////////////////////////////////////////////////////////////////////
// method reduces array by excluding its shapes along axes present in dimensions vector
-void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const {
+void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool checkTargetShape) const {
if (isS())
throw std::runtime_error("NDArray::reduceAlongDimension FloatOps: you can't use this method on String array!");
@@ -4250,7 +4250,7 @@ void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, con
std::vector<int> copy(dimensions);
if(checkTargetShape) {
- auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, supportOldShapes, getContext()->getWorkspace());
+ auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, false, getContext()->getWorkspace());
if(!shape::shapeEquals(newShape, target.shapeInfo()))
throw std::runtime_error("NDArray::reduceAlongDimension FloatOps: wrong target shape!");
}
@@ -4261,8 +4261,18 @@ void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, con
NativeOpExecutioner::execReduceFloatScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(),nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
}
else {
- auto packX = sd::ConstantTadHelper::getInstance().tadForDimensions(shapeInfo(), copy);
- NativeOpExecutioner::execReduceFloat(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo(), copy.data(), copy.size(), packX.platformShapeInfo(), packX.platformOffsets());
+ const Nd4jLong* zShapeInfoH = target.shapeInfo();
+ const Nd4jLong* zShapeInfoD = target.specialShapeInfo();
+
+ if(rankOf() - dimensions.size() != target.rankOf()) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(target.shapeInfo(), copy, target.getContext()->getWorkspace());
+ zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+ zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
+ }
+
+ std::vector<int> dims = ShapeUtils::evalDimsForReduceOp(rankOf(), copy);
+ NativeOpExecutioner::execReduceFloat(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), zShapeInfoH, target.specialBuffer(), zShapeInfoD, dims.data(), dims.size());
+
}
synchronize("NDArray::reduceAlongDimension FloatOps");
@@ -4271,7 +4281,7 @@ void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, con
//////////////////////////////////////////////////////////////////////////
// method reduces array by excluding its shapes along axes present in dimensions vector
-void NDArray::reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const {
+void NDArray::reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool checkTargetShape) const {
if (isS())
throw std::runtime_error("NDArray::reduceAlongDimension SameOps: you can't use this method on String array!");
@@ -4281,7 +4291,7 @@ void NDArray::reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, cons
std::vector<int> copy(dimensions);
if(checkTargetShape) {
- auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, supportOldShapes, getContext()->getWorkspace());
+ auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, false, getContext()->getWorkspace());
if(!shape::shapeEquals(newShape, target.shapeInfo()))
throw std::runtime_error("NDArray::reduceAlongDimension SameOps: wrong target shape!");
}
@@ -4291,10 +4301,19 @@ void NDArray::reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, cons
if(rankOf() == copy.size() || copy.empty()) {
NativeOpExecutioner::execReduceSameScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
}
- else { //if (!isEmpty()) {
- auto pDims = sd::Environment::getInstance().isCPU() ? copy.data() : nullptr;
- auto packX = sd::ConstantTadHelper::getInstance().tadForDimensions(this->shapeInfo(), copy);
- NativeOpExecutioner::execReduceSame(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo(), pDims, copy.size(), packX.platformShapeInfo(), packX.platformOffsets());
+ else {
+
+ const Nd4jLong* zShapeInfoH = target.shapeInfo();
+ const Nd4jLong* zShapeInfoD = target.specialShapeInfo();
+
+ if(rankOf() - dimensions.size() != target.rankOf()) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(target.shapeInfo(), copy, target.getContext()->getWorkspace());
+ zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+ zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
+ }
+
+ std::vector<int> dims = ShapeUtils::evalDimsForReduceOp(rankOf(), copy);
+ NativeOpExecutioner::execReduceSame(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), zShapeInfoH, target.specialBuffer(), zShapeInfoD, dims.data(), dims.size());
}
synchronize("NDArray::reduceAlongDimension SameOps");
@@ -4303,7 +4322,7 @@ void NDArray::reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, cons
//////////////////////////////////////////////////////////////////////////
// method reduces array by excluding its shapes along axes present in dimensions vector
-void NDArray::reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const {
+void NDArray::reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool checkTargetShape) const {
if (isS())
throw std::runtime_error("NDArray::reduceAlongDimension LongOps: you can't use this method on String array!");
@@ -4313,7 +4332,7 @@ void NDArray::reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, cons
std::vector<int> copy(dimensions);
if(checkTargetShape) {
- auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, supportOldShapes, getContext()->getWorkspace());
+ auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, false, getContext()->getWorkspace());
if(!shape::shapeEquals(newShape, target.shapeInfo()))
throw std::runtime_error("NDArray::reduceAlongDimension LongOps: wrong target shape!");
}
@@ -4324,9 +4343,17 @@ void NDArray::reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, cons
NativeOpExecutioner::execReduceLongScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
}
else {
- auto pDims = sd::Environment::getInstance().isCPU() ? copy.data() : nullptr;
- auto packX = sd::ConstantTadHelper::getInstance().tadForDimensions(this->shapeInfo(), copy);
- NativeOpExecutioner::execReduceLong(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo(), pDims, copy.size(), packX.platformShapeInfo(), packX.platformOffsets());
+ const Nd4jLong* zShapeInfoH = target.shapeInfo();
+ const Nd4jLong* zShapeInfoD = target.specialShapeInfo();
+
+ if(rankOf() - dimensions.size() != target.rankOf()) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(target.shapeInfo(), copy, target.getContext()->getWorkspace());
+ zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+ zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
+ }
+
+ std::vector<int> dims = ShapeUtils::evalDimsForReduceOp(rankOf(), copy);
+ NativeOpExecutioner::execReduceLong(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), zShapeInfoH, target.specialBuffer(), zShapeInfoD, dims.data(), dims.size());
}
synchronize("NDArray::reduceAlongDimension LongOps");
@@ -4335,7 +4362,7 @@ void NDArray::reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, cons
//////////////////////////////////////////////////////////////////////////
// method reduces array by excluding its shapes along axes present in dimensions vector
-void NDArray::reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const {
+void NDArray::reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool checkTargetShape) const {
if (isS())
throw std::runtime_error("NDArray::reduceAlongDimension BoolOps cuda: you can't use this method on String array!");
@@ -4345,7 +4372,7 @@ void NDArray::reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, cons
std::vector<int> copy(dimensions);
if(checkTargetShape) {
- auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, supportOldShapes, getContext()->getWorkspace());
+ auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, false, getContext()->getWorkspace());
if(!shape::shapeEquals(newShape, target.shapeInfo()))
throw std::runtime_error("NDArray::reduceAlongDimension BoolOps cuda: wrong target shape!");
}
@@ -4356,9 +4383,17 @@ void NDArray::reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, cons
NativeOpExecutioner::execReduceBoolScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
}
else {
- auto pDims = sd::Environment::getInstance().isCPU() ? copy.data() : nullptr;
- auto packX = sd::ConstantTadHelper::getInstance().tadForDimensions(this->shapeInfo(), copy);
- NativeOpExecutioner::execReduceBool(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo(), pDims, copy.size(), packX.platformShapeInfo(), packX.platformOffsets());
+ const Nd4jLong* zShapeInfoH = target.shapeInfo();
+ const Nd4jLong* zShapeInfoD = target.specialShapeInfo();
+
+ if(rankOf() - dimensions.size() != target.rankOf()) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(target.shapeInfo(), copy, target.getContext()->getWorkspace());
+ zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+ zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
+ }
+
+ std::vector<int> dims = ShapeUtils::evalDimsForReduceOp(rankOf(), copy);
+ NativeOpExecutioner::execReduceBool(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), zShapeInfoH, target.specialBuffer(), zShapeInfoD, dims.data(), dims.size());
}
synchronize("NDArray::reduceAlongDimension LongOps");
diff --git a/libnd4j/include/helpers/ConstantShapeHelper.h b/libnd4j/include/helpers/ConstantShapeHelper.h
index 25440e05c..65c3bcb99 100644
--- a/libnd4j/include/helpers/ConstantShapeHelper.h
+++ b/libnd4j/include/helpers/ConstantShapeHelper.h
@@ -46,11 +46,13 @@ namespace sd {
static ConstantShapeHelper & getInstance();
- ConstantShapeBuffer& bufferForShapeInfo(sd::DataType dataType, char order, const std::vector<Nd4jLong> &shape);
- ConstantShapeBuffer& bufferForShapeInfo(const ShapeDescriptor &descriptor);
- ConstantShapeBuffer& bufferForShapeInfo(const Nd4jLong *shapeInfo);
- ConstantShapeBuffer& bufferForShapeInfo(sd::DataType dataType, char order, int rank, const Nd4jLong* shape);
- ConstantShapeBuffer& createShapeInfoWithUnitiesForBroadcast(const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, sd::memory::Workspace* workspace = nullptr, const std::vector<int> &dimensions = {});
+ ConstantShapeBuffer& bufferForShapeInfo(sd::DataType dataType, char order, const std::vector<Nd4jLong> &shape);
+ ConstantShapeBuffer& bufferForShapeInfo(const ShapeDescriptor &descriptor);
+ ConstantShapeBuffer& bufferForShapeInfo(const Nd4jLong *shapeInfo);
+ ConstantShapeBuffer& bufferForShapeInfo(sd::DataType dataType, char order, int rank, const Nd4jLong* shape);
+ ConstantShapeBuffer& createShapeInfoWithUnitiesForBroadcast(const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, sd::memory::Workspace* workspace = nullptr, const std::vector<int> &dimensions = {});
+ ConstantShapeBuffer& createShapeInfoWithNoUnitiesForReduce(const Nd4jLong* maxShapeInfo, const std::vector<int> &dimsWithUnities, sd::memory::Workspace* workspace = nullptr);
+ ConstantShapeBuffer& createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, sd::memory::Workspace* workspace = nullptr);
const Nd4jLong* emptyShapeInfo(sd::DataType dataType);
diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h
index 9bf3daede..325fa3505 100644
--- a/libnd4j/include/helpers/Loops.h
+++ b/libnd4j/include/helpers/Loops.h
@@ -41,43 +41,43 @@ namespace sd {
public:
template <typename OpType>
- static FORCEINLINE void loopReduce(const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, E* extraParams, int64_t start, int64_t stop);
+ static FORCEINLINE void loopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, E* extraParams);
};
template <typename X, typename Z>
class ReductionFloatLoops : public ReductionLoops<X, Z, Z> {
public:
- static void wrapper(int opNum, const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop);
+ static void wrapper(int opNum, sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, Z* extraParams);
template <typename OpType>
- static void innerloopReduce(const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop);
+ static void innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, Z* extraParams);
};
template <typename X, typename Z>
class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops<X, Z, X> {
public:
- static void wrapper(int opNum, const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
+ static void wrapper(int opNum, sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
template <typename OpType>
- static void innerloopReduce(const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
+ static void innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
};
template <typename X, typename Z>
class ND4J_EXPORT ReductionLongLoops : public ReductionLoops<X, Z, X> {
public:
- static void wrapper(int opNum, const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
+ static void wrapper(int opNum, sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
template <typename OpType>
- static void innerloopReduce(const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
+ static void innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
};
template <typename X>
class ND4J_EXPORT ReductionSameLoops : public ReductionLoops<X, X, X> {
public:
- static void wrapper(int opNum, const X* x, const Nd4jLong* xShapeInfo, X* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
+ static void wrapper(int opNum, sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, X* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
template <typename OpType>
- static void innerloopReduce(const X* x, const Nd4jLong* xShapeInfo, X* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
+ static void innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, X* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
};
@@ -122,372 +122,613 @@ namespace sd {
static void innerloopReduce3All(const X* x, const Nd4jLong* xShapeInfo, const X* y, const Nd4jLong* yShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* xTadShapeInfo, const Nd4jLong* xTadOffsets, const Nd4jLong* yTadShapeInfo, const Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop);
};
+//////////////////////////////////////////////////////////////////////////
+template
+static void reduceExec21(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
+
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, dims[0]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dims[0]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, 0);
+
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);
+
+ auto func = PRAGMA_THREADS_FOR {
+
+ for (auto i0 = start; i0 < stop; ++i0) {
+
+ auto x0 = x + i0 * xStrd0;
+ auto z0 = z + i0 * zStrd0;
+
+ auto s = OpType::startingValue(x0);
+
+ if(xStrd1 == 1)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ s = OpType::update(s, OpType::op(x0[i1], extraParams), extraParams);
+ else
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ s = OpType::update(s, OpType::op(x0[i1 * xStrd1], extraParams), extraParams);
+
+ *z0 = OpType::postProcess(s, static_cast(xAxis1), extraParams);
+ }
+ };
+
+ samediff::Threads::parallel_for(func, 0,xAxis0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+static void reduceExec31(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
+
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, dims[0]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dims[0]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, 0);
+
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);
+
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);
+
+ const Nd4jLong tadLen = static_cast(xAxis1 * xAxis2);
+
+ auto func = PRAGMA_THREADS_FOR {
+
+ for (auto i0 = start; i0 < stop; ++i0) {
+
+ auto x0 = x + i0 * xStrd0;
+ auto z0 = z + i0 * zStrd0;
+
+ auto s = OpType::startingValue(x0);
+
+ if(xStrd1 == 1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ s = OpType::update(s, OpType::op(x0[i1 + i2*xStrd2], extraParams), extraParams);
+ else if(xStrd2 == 1)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2], extraParams), extraParams);
+ else
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2], extraParams), extraParams);
+
+ *z0 = OpType::postProcess(s, tadLen, extraParams);
+ }
+ };
+
+ samediff::Threads::parallel_for(func, 0,xAxis0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+void reduceExec32(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
+
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 1);
+
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
+ const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 1 : 0);
+
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);
+ auto func = PRAGMA_THREADS_FOR_2D {
- /*
- //////////////////////////////////////////////////////////////////////////////
- template
- void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
- const Y* y, const Nd4jLong* yShapeInfo,
- Z* z, const Nd4jLong* zShapeInfo,
- Z* extraParams,
- std::function op) {
+ for (auto i0 = start_x; i0 < stop_x; ++i0) {
+ for (auto i1 = start_y; i1 < stop_y; ++i1) {
- const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo);
+ auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
+ auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
- const Nd4jLong* xShape = shape::shapeOf(xShapeInfo);
- const Nd4jLong* xStride = shape::stride(xShapeInfo);
- const Nd4jLong* yStride = shape::stride(yShapeInfo);
- const Nd4jLong* zStride = shape::stride(zShapeInfo);
+ auto s = OpType::startingValue(x1);
- const Nd4jLong len = shape::length(xShapeInfo);
+ if(xStrd2 == 1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ s = OpType::update(s, OpType::op(x1[i2], extraParams), extraParams);
+ else
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ s = OpType::update(s, OpType::op(x1[i2 * xStrd2], extraParams), extraParams);
- OmpLaunchHelper threadsInfo(len);
+ *z1 = OpType::postProcess(s, static_cast(xAxis2), extraParams);
+ }
+ }
+ };
- switch (kindOfLoop) {
+ samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1);
+}
- case LoopKind::EWS1: {
- PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads)
- {
- const auto threadNum = omp_get_thread_num();
- const auto threadOffset = threadsInfo.getThreadOffset(threadNum);
- const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum));
+//////////////////////////////////////////////////////////////////////////
+template
+void reduceExec41(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
- const auto xi = x + threadOffset;
- const auto yi = y + threadOffset;
- auto zi = z + threadOffset;
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, dims[0]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dims[0]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, 0);
- PRAGMA_OMP_SIMD
- for (uint i = 0; i < lenPerThread; i++)
- zi[i] = op(xi[i], yi[i], extraParams);
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);
+
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);
+
+ const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
+ const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);
+
+ const Nd4jLong tadLen = static_cast(xAxis1 * xAxis2 * xAxis3);
+
+ auto func = PRAGMA_THREADS_FOR {
+
+ for (auto i0 = start; i0 < stop; ++i0) {
+
+ auto x0 = x + i0 * xStrd0;
+ auto z0 = z + i0 * zStrd0;
+
+ auto s = OpType::startingValue(x0);
+
+ if(xStrd1 == 1)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ s = OpType::update(s, OpType::op(x0[i1 + i2*xStrd2 + i3*xStrd3], extraParams), extraParams);
+ else if(xStrd2 == 1)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2 + i3*xStrd3], extraParams), extraParams);
+ else if(xStrd3 == 1)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3], extraParams), extraParams);
+ else
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3*xStrd3], extraParams), extraParams);
+
+ *z0 = OpType::postProcess(s, tadLen, extraParams);
+ }
+ };
+
+ samediff::Threads::parallel_for(func, 0,xAxis0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+void reduceExec42(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
+
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 1);
+
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
+ const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 1 : 0);
+
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);
+
+ const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
+ const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);
+
+ const Nd4jLong tadLen = static_cast(xAxis2 * xAxis3);
+
+ auto func = PRAGMA_THREADS_FOR_2D {
+
+ for (auto i0 = start_x; i0 < stop_x; ++i0) {
+ for (auto i1 = start_y; i1 < stop_y; ++i1) {
+
+ auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
+ auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
+
+ auto s = OpType::startingValue(x1);
+
+ if(xStrd2 == 1)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ s = OpType::update(s, OpType::op(x1[i2 + i3*xStrd3], extraParams), extraParams);
+ else if(xStrd3 == 1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3], extraParams), extraParams);
+ else
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3*xStrd3], extraParams), extraParams);
+
+ *z1 = OpType::postProcess(s, tadLen, extraParams);
+ }
+ }
+ };
+
+ samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+void reduceExec43(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
+
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[2]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[2]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 2);
+
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);
+ const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, 1);
+
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[0]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[0]);
+ const Nd4jLong zStrd2 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 2 : 0);
+
+ const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
+ const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);
+
+ auto func = PRAGMA_THREADS_FOR_3D {
+
+ for (auto i0 = start_x; i0 < stop_x; ++i0) {
+ for (auto i1 = start_y; i1 < stop_y; ++i1) {
+ for (auto i2 = start_z; i2 < stop_z; ++i2) {
+
+ auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2;
+ auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;
+
+ auto s = OpType::startingValue(x2);
+
+ if(xStrd3 == 1)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x2[i3], extraParams), extraParams);
+ else
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x2[i3*xStrd3], extraParams), extraParams);
+
+ *z2 = OpType::postProcess(s, static_cast(xAxis3), extraParams);
}
}
- break;
+ }
+ };
- case LoopKind::EWSNONZERO: {
- const uint xEws = shape::elementWiseStride(xShapeInfo);
- const uint yEws = shape::elementWiseStride(yShapeInfo);
- const uint zEws = shape::elementWiseStride(zShapeInfo);
+ samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1, 0,xAxis2,1);
+}
- PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads)
- {
- const auto threadNum = omp_get_thread_num();
- const auto threadOffset = threadsInfo.getThreadOffset(threadNum);
- const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum));
- const auto xi = x + threadOffset * xEws;
- const auto yi = y + threadOffset * yEws;
- auto zi = z + threadOffset * zEws;
+//////////////////////////////////////////////////////////////////////////
+template
+void reduceExec51(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
- PRAGMA_OMP_SIMD
- for (uint i = 0; i < lenPerThread; i++)
- zi[i*zEws] = op(xi[i*xEws], yi[i*yEws], extraParams);
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, dims[0]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dims[0]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, 0);
+
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);
+
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);
+
+ const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
+ const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);
+
+ const uint xAxis4 = shape::sizeAt(xShapeInfo, dims[4]);
+ const Nd4jLong xStrd4 = shape::strideAt(xShapeInfo, dims[4]);
+
+ const Nd4jLong tadLen = static_cast(xAxis1 * xAxis2 * xAxis3 * xAxis4);
+
+ auto func = PRAGMA_THREADS_FOR {
+
+ for (auto i0 = start; i0 < stop; ++i0) {
+
+ auto x0 = x + i0 * xStrd0;
+ auto z0 = z + i0 * zStrd0;
+
+ auto s = OpType::startingValue(x0);
+
+ if(xStrd1 == 1)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ s = OpType::update(s, OpType::op(x0[i1 + i2*xStrd2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
+ else if(xStrd2 == 1)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
+ else if(xStrd3 == 1)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3 + i4*xStrd4], extraParams), extraParams);
+ else if(xStrd4 == 1)
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3*xStrd3 + i4], extraParams), extraParams);
+ else
+ for (uint i1 = 0; i1 < xAxis1; ++i1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
+
+ *z0 = OpType::postProcess(s, tadLen, extraParams);
+ }
+ };
+
+ samediff::Threads::parallel_for(func, 0,xAxis0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+void reduceExec52(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
+
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 1);
+
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
+ const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 1 : 0);
+
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);
+
+ const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
+ const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);
+
+ const uint xAxis4 = shape::sizeAt(xShapeInfo, dims[4]);
+ const Nd4jLong xStrd4 = shape::strideAt(xShapeInfo, dims[4]);
+
+ const Nd4jLong tadLen = static_cast(xAxis2 * xAxis3 * xAxis4);
+
+ auto func = PRAGMA_THREADS_FOR_2D {
+
+ for (auto i0 = start_x; i0 < stop_x; ++i0) {
+ for (auto i1 = start_y; i1 < stop_y; ++i1) {
+
+ auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
+ auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
+
+ auto s = OpType::startingValue(x1);
+
+ if(xStrd2 == 1)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ s = OpType::update(s, OpType::op(x1[i2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
+ else if(xStrd3 == 1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3 + i4*xStrd4], extraParams), extraParams);
+ else if(xStrd4 == 1)
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3*xStrd3 + i4], extraParams), extraParams);
+ else
+ for (uint i2 = 0; i2 < xAxis2; ++i2)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
+
+ *z1 = OpType::postProcess(s, tadLen, extraParams);
+ }
+ }
+ };
+
+ samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+void reduceExec53(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
+
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[2]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[2]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 2);
+
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);
+ const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, 1);
+
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[0]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[0]);
+ const Nd4jLong zStrd2 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 2 : 0);
+
+ const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
+ const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);
+
+ const uint xAxis4 = shape::sizeAt(xShapeInfo, dims[4]);
+ const Nd4jLong xStrd4 = shape::strideAt(xShapeInfo, dims[4]);
+
+ const Nd4jLong tadLen = static_cast(xAxis3 * xAxis4);
+
+ auto func = PRAGMA_THREADS_FOR_3D {
+
+ for (auto i0 = start_x; i0 < stop_x; ++i0) {
+ for (auto i1 = start_y; i1 < stop_y; ++i1) {
+ for (auto i2 = start_z; i2 < stop_z; ++i2) {
+
+ auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2;
+ auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;
+
+ auto s = OpType::startingValue(x2);
+
+ if(xStrd3 == 1)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ s = OpType::update(s, OpType::op(x2[i3 + i4*xStrd4], extraParams), extraParams);
+ else if(xStrd4 == 1)
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ s = OpType::update(s, OpType::op(x2[i3*xStrd3 + i4], extraParams), extraParams);
+ else
+ for (uint i3 = 0; i3 < xAxis3; ++i3)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ s = OpType::update(s, OpType::op(x2[i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
+
+ *z2 = OpType::postProcess(s, tadLen, extraParams);
}
}
- break;
+ }
+ };
- case LoopKind::RANK1: {
- PRAGMA_OMP_PARALLEL_FOR
- for (uint i0 = 0; i0 < len; ++i0)
- z[i0 * zStride[0]] = op(x[i0 * xStride[0]], y[i0 * yStride[0]], extraParams);
- }
- break;
+ samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1, 0,xAxis2,1);
+}
- case LoopKind::RANK2: {
- PRAGMA_OMP_PARALLEL_FOR_SIMD
- for (uint i0 = 0; i0 < xShape[0]; ++i0)
- for (uint i1 = 0; i1 < xShape[1]; ++i1)
- z[i0 * zStride[0] + i1 * zStride[1]] = op(x[i0 * xStride[0] + i1 * xStride[1]], y[i0 * yStride[0] + i1 * yStride[1]], extraParams);
- }
- break;
+//////////////////////////////////////////////////////////////////////////
+template
+void reduceExec54(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
- case LoopKind::RANK3: {
- PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2)
- for (uint i0 = 0; i0 < xShape[0]; ++i0)
- for (uint i1 = 0; i1 < xShape[1]; ++i1)
- for (uint i2 = 0; i2 < xShape[2]; ++i2)
- z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]], extraParams);
- }
- break;
+ const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[3]);
+ const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[3]);
+ const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 3);
- case LoopKind::RANK4: {
- PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(3)
- for (uint i0 = 0; i0 < xShape[0]; ++i0)
- for (uint i1 = 0; i1 < xShape[1]; ++i1)
- for (uint i2 = 0; i2 < xShape[2]; ++i2)
- for (uint i3 = 0; i3 < xShape[3]; ++i3)
- z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]], extraParams);
- }
- break;
+ const uint xAxis1 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[2]);
+ const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[2]);
+ const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 1 : 2);
- case LoopKind::RANK5: {
- PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(4)
- for (uint i0 = 0; i0 < xShape[0]; ++i0)
- for (uint i1 = 0; i1 < xShape[1]; ++i1)
- for (uint i2 = 0; i2 < xShape[2]; ++i2)
- for (uint i3 = 0; i3 < xShape[3]; ++i3)
- for (uint i4 = 0; i4 < xShape[4]; ++i4)
- z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]+i4*zStride[4]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]+i4*xStride[4]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]+i4*yStride[4]], extraParams);
- }
- break;
+ const uint xAxis2 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[1]);
+ const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[1]);
+ const Nd4jLong zStrd2 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 2 : 1);
- default: {
- uint xShapeInfoCast[MAX_RANK];
- uint yShapeInfoCast[MAX_RANK];
- uint zShapeInfoCast[MAX_RANK];
+ const uint xAxis3 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[3] : dims[0]);
+ const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[3] : dims[0]);
+ const Nd4jLong zStrd3 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 3 : 0);
- bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
- bool canCastY = DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast);
- bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast);
+ const uint xAxis4 = shape::sizeAt(xShapeInfo, dims[4]);
+ const Nd4jLong xStrd4 = shape::strideAt(xShapeInfo, dims[4]);
- PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads)
- {
- auto threadNum = omp_get_thread_num();
- auto threadOffset = threadsInfo.getThreadOffset(threadNum);
- auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum));
- PRAGMA_OMP_SIMD
- for (uint i = 0; i < lenPerThread; i++) {
- auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
- auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
- auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
- z[zOffset] = op(x[xOffset], y[yOffset], extraParams);
+ auto func = PRAGMA_THREADS_FOR_3D {
+
+ for (auto i0 = start_x; i0 < stop_x; ++i0) {
+ for (auto i1 = start_y; i1 < stop_y; ++i1) {
+ for (auto i2 = start_z; i2 < stop_z; ++i2) {
+ for (auto i3 = 0; i3 < xAxis3; ++i3) {
+
+ auto x3 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2 + i3 * xStrd3;
+ auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;
+
+ auto s = OpType::startingValue(x3);
+
+ if(xStrd4 == 1)
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ s = OpType::update(s, OpType::op(x3[i4], extraParams), extraParams);
+ else
+ for (uint i4 = 0; i4 < xAxis4; ++i4)
+ s = OpType::update(s, OpType::op(x3[i4*xStrd4], extraParams), extraParams);
+
+ *z3 = OpType::postProcess(s, static_cast(xAxis4), extraParams);
}
}
}
}
+ };
+
+ samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1, 0,xAxis2,1);
+}
+
+
+////////////////////////////////////////////////////////////////////////
+template
+void reduceDefault(sd::memory::Workspace* workspace, const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {
+
+ const int zRank = shape::rank(zShapeInfo);
+ const int tadRank = shape::rank(xShapeInfo) - zRank;
+
+ Nd4jLong* outerXTadShapeInfo = sd::ShapeBuilders::createSubArrShapeInfo(xShapeInfo, dims, zRank);
+ Nd4jLong* innerXTadShapeInfo = sd::ShapeBuilders::createSubArrShapeInfo(xShapeInfo, dims+zRank, tadRank);
+
+ const bool sameOffsets1 = shape::haveSameShapeAndStrides(zShapeInfo, outerXTadShapeInfo);
+ const bool sameOffsets2 = shape::haveSameShapeAndStrides(zShapeInfo, innerXTadShapeInfo);
+
+ const Nd4jLong zLen = shape::length(zShapeInfo);
+ const Nd4jLong tadLen = shape::length(innerXTadShapeInfo);
+
+ Nd4jLong* zOffsets = nullptr;
+ ALLOCATE(zOffsets, workspace, zLen, Nd4jLong);
+ shape::calcOffsets(zShapeInfo, zOffsets);
+
+ Nd4jLong* outerXTadOffsets = zOffsets;
+ if(!sameOffsets1) {
+ ALLOCATE(outerXTadOffsets, workspace, zLen, Nd4jLong);
+ shape::calcOffsets(outerXTadShapeInfo, outerXTadOffsets);
}
- */
-
-
- //////////////////////////////////////////////////////////////////////////////
- template
- template
- void sd::ReductionLoops::loopReduce(const X* x, const Nd4jLong* xShapeInfo,
- Z* z, const Nd4jLong* zShapeInfo,
- const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets,
- E* extraParams,
- int64_t start, int64_t stop) {
-
- const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXZ(xShapeInfo, zShapeInfo, tadShapeInfo);
-
- const Nd4jLong zLen = shape::length(zShapeInfo);
- const Nd4jLong tadLen = shape::length(tadShapeInfo);
-
- const uint tadEws = shape::elementWiseStride(tadShapeInfo);
- const uint zEws = shape::elementWiseStride(zShapeInfo);
-
- const Nd4jLong* tadShape = shape::shapeOf(tadShapeInfo);
- const Nd4jLong* tadStride = shape::stride(tadShapeInfo);
-
- int numThreads = OmpLaunchHelper::tadThreads(tadLen, zLen);
-
- switch (kindOfLoop) {
-
- //*********************************************//
- // case LoopKind::SMALLARR2DX: {
- // shape::printShapeInfoLinear(xShapeInfo);
- // shape::printShapeInfoLinear(zShapeInfo);
- // const auto xLen = zLen * tadLen;
- // for (uint i = 0; i < xLen; ++i) {
- // const auto zOffset = shape::subArrayOffset(i, xShapeInfo, zShapeInfo, dimsToExclude, dimsLen);
- // const uint tadInd = (i / tadEws) % tadLen;
- // auto startVal = tadInd ? z[zOffset] : static_cast(OpType::startingValue(x));
- // z[zOffset] = OpType::update(startVal, OpType::op(x[i], extraParams), extraParams);
- // if(tadInd == tadLen - 1)
- // z[zOffset] = OpType::postProcess(z[zOffset], tadLen, extraParams);
- // printf("%u - %lld\n", i, zOffset);
- // }
- // }
- case LoopKind::SMALLARR2DX: {
- const auto uTadLen = static_cast(tadLen);
- const auto uZLenMinusOne = static_cast(zLen - 1);
- const auto xLen = static_cast(zLen * uTadLen);
- const auto sv = static_cast(OpType::startingValue(x));
-
- for (uint i = 0; i <= uZLenMinusOne; i++)
- z[i] = OpType::startingValue(x);
-
- uint zOffset = 0;
- for (uint i = 0; i < xLen; ++i) {
- z[zOffset] = OpType::update(z[zOffset], OpType::op(x[i], extraParams), extraParams);
- zOffset = zOffset == uZLenMinusOne ? 0 : zOffset + 1;
- }
-
- for (uint i = 0; i <= uZLenMinusOne; i++)
- z[i] = OpType::postProcess(z[i], tadLen, extraParams);
- }
- break;
-
- //*********************************************//
- case LoopKind::EWS1: {
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong j = 0; j < tadLen; j++)
- s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams);
-
- z[i] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- case LoopKind::EWSNONZERO: {
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong j = 0; j < tadLen; j++)
- s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams);
-
- z[i * zEws] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- case LoopKind::RANK1: {
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong i0 = 0; i0 < tadLen; ++i0)
- s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams);
-
- z[i] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- case LoopKind::RANK2: {
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
- for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
- s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams);
-
- z[i] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- case LoopKind::RANK3: {
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
- for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
- for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
- s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams);
-
- z[i] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- case LoopKind::RANK4: {
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
- for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
- for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
- for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3)
- s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams);
-
- z[i] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- case LoopKind::RANK5: {
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
- for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
- for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
- for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3)
- for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4)
- s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams);
-
- z[i] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- case LoopKind::X_EWSNONZERO: {
- uint castZShapeInfo[MAX_RANK];
- const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo);
-
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong j = 0; j < tadLen; j++)
- s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams);
-
- auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
- z[zOffset] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- case LoopKind::Z_EWSNONZERO: {
- uint castTadShapeInfo[MAX_RANK];
- const bool canCastTad = sd::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo);
-
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong j = 0; j < tadLen; j++) {
- auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
- s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams);
- }
-
- z[i * zEws] = OpType::postProcess(s, tadLen, extraParams);
- };
- }
- break;
-
- //*********************************************//
- default: {
- auto innertadOffsets = new Nd4jLong[tadLen];
- shape::calcOffsets(tadShapeInfo, innertadOffsets);
-
- uint castZShapeInfo[MAX_RANK];
- const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo);
-
- for (auto i = start; i < stop; i++) {
- auto tad = x + tadOffsets[i];
- auto s = OpType::startingValue(tad);
-
- for (Nd4jLong j = 0; j < tadLen; j++)
- s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams);
-
- auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
- z[zOffset] = OpType::postProcess(s, tadLen, extraParams);
- };
-
- delete[] innertadOffsets;
- }
- }
+ Nd4jLong* innerXTadOffsets = zOffsets;
+ if(!sameOffsets2) {
+ ALLOCATE(innerXTadOffsets, workspace, tadLen, Nd4jLong);
+ shape::calcOffsets(innerXTadShapeInfo, innerXTadOffsets);
}
+ auto func = PRAGMA_THREADS_FOR{
+
+ for (auto i = start; i < stop; ++i) {
+
+ const auto tad = x + outerXTadOffsets[i];
+ auto s = OpType::startingValue(tad);
+
+ for (Nd4jLong j = 0; j < tadLen; j++)
+ s = OpType::update(s, OpType::op(tad[innerXTadOffsets[j]], extraParams), extraParams);
+
+ z[zOffsets[i]] = OpType::postProcess(s, tadLen, extraParams);
+ }
+ };
+
+ samediff::Threads::parallel_for(func, 0, shape::length(zShapeInfo));
+
+ RELEASE(outerXTadShapeInfo, workspace);
+ RELEASE(innerXTadShapeInfo, workspace);
+ RELEASE(zOffsets, workspace);
+ if(!sameOffsets1)
+ RELEASE(outerXTadOffsets, workspace);
+ if(!sameOffsets2)
+ RELEASE(innerXTadOffsets, workspace);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+template
+template
+void sd::ReductionLoops::loopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, E* extraParams) {
+
+ const int xRank = shape::rank(xShapeInfo);
+ const int zRank = shape::rank(zShapeInfo);
+
+ // shape::printShapeInfoLinear(xShapeInfo);
+ // shape::printShapeInfoLinear(zShapeInfo);
+ // shape::printIntArray(dims, shape::rank(xShapeInfo));
+
+ if(xRank == 2 && zRank == 1)
+ reduceExec21(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 3 && zRank == 1)
+ reduceExec31(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 3 && zRank == 2)
+ reduceExec32(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 4 && zRank == 1)
+ reduceExec41(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 4 && zRank == 2)
+ reduceExec42(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 4 && zRank == 3)
+ reduceExec43(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 5 && zRank == 1)
+ reduceExec51(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 5 && zRank == 2)
+ reduceExec52(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 5 && zRank == 3)
+ reduceExec53(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else if(xRank == 5 && zRank == 4)
+ reduceExec54(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+ else
+ reduceDefault(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+}
+
//////////////////////////////////////////////////////////////////////////////
diff --git a/libnd4j/include/helpers/ShapeBuilders.h b/libnd4j/include/helpers/ShapeBuilders.h
index e2c29a280..14726d5e6 100644
--- a/libnd4j/include/helpers/ShapeBuilders.h
+++ b/libnd4j/include/helpers/ShapeBuilders.h
@@ -52,11 +52,9 @@ namespace sd {
static Nd4jLong* copyShapeInfoAndType(const Nd4jLong* inShapeInfo, const Nd4jLong* shapeInfoToGetTypeFrom, const bool copyStrides, memory::Workspace* workspace = nullptr);
/**
- * allocates memory for new shapeInfo and copy all information from inShapeInfo to new shapeInfo except dimensions in dimsToExclude (unit dimensions) and corresponding strides
- * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2
- * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99}
+ * allocates memory for sub-array shapeInfo and copy shape and strides at axes(positions) stored in dims
*/
- static Nd4jLong* copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace = nullptr);
+ static Nd4jLong* createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, memory::Workspace* workspace = nullptr);
static Nd4jLong* emptyShapeInfo(const sd::DataType dataType, memory::Workspace* workspace = nullptr);
diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h
index cb2faa43d..bd30d9225 100644
--- a/libnd4j/include/helpers/ShapeUtils.h
+++ b/libnd4j/include/helpers/ShapeUtils.h
@@ -40,6 +40,12 @@ namespace sd {
static const Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const NDArray& arr, const bool keepDims = false, const bool supportOldShapes = false, sd::memory::Workspace* workspace = nullptr);
static const Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const Nd4jLong* shapeInfo, const bool keepDims = false, const bool supportOldShapes = false, sd::memory::Workspace* workspace = nullptr);
+
+ // for example
+ // if rank = 3 and dimsToExclude = {0,2} then output = {1,0,2}, if rank = 3 and dimsToExclude = {2} then output = {0,1,2}
+ // if rank = 3 and dimsToExclude = {0} then output = {1,2,0}, if rank = 4 and dimsToExclude = {0,3} then output = {1,2,0,3}
+ static std::vector evalDimsForReduceOp(const int rank, const std::vector& dimsToExclude);
+
/**
* evaluate output shape for reduce operation when input shape is empty
* behavior is analogous to tf
diff --git a/libnd4j/include/helpers/benchmark/ReductionBenchmark.h b/libnd4j/include/helpers/benchmark/ReductionBenchmark.h
index d87c20d3c..c48030542 100644
--- a/libnd4j/include/helpers/benchmark/ReductionBenchmark.h
+++ b/libnd4j/include/helpers/benchmark/ReductionBenchmark.h
@@ -94,9 +94,9 @@ namespace sd {
auto tadOffsets = Environment::getInstance().isCPU() ? pack.primaryOffsets() : pack.specialOffsets();
if (_opType == 0)
- NativeOpExecutioner::execReduceFloat(LaunchContext::defaultContext(), _opNum, _x->buffer(), _x->shapeInfo(), _x->specialBuffer(), _x->specialShapeInfo(), nullptr, _z->buffer(), _z->shapeInfo(), _z->specialBuffer(), _z->specialShapeInfo(), nullptr, _axis.size(), tadOnlyShapeInfo, tadOffsets);
+ NativeOpExecutioner::execReduceFloat(LaunchContext::defaultContext(), _opNum, _x->buffer(), _x->shapeInfo(), _x->specialBuffer(), _x->specialShapeInfo(), nullptr, _z->buffer(), _z->shapeInfo(), _z->specialBuffer(), _z->specialShapeInfo(), nullptr, _axis.size());
else
- NativeOpExecutioner::execReduceSame(LaunchContext::defaultContext(), _opNum, _x->buffer(), _x->shapeInfo(), _x->specialBuffer(), _x->specialShapeInfo(), nullptr, _z->buffer(), _z->shapeInfo(), _z->specialBuffer(), _z->specialShapeInfo(), nullptr, _axis.size(), tadOnlyShapeInfo, tadOffsets);
+ NativeOpExecutioner::execReduceSame(LaunchContext::defaultContext(), _opNum, _x->buffer(), _x->shapeInfo(), _x->specialBuffer(), _x->specialShapeInfo(), nullptr, _z->buffer(), _z->shapeInfo(), _z->specialBuffer(), _z->specialShapeInfo(), nullptr, _axis.size());
}
manager.synchronize();
diff --git a/libnd4j/include/helpers/cpu/ConstantShapeHelper.cpp b/libnd4j/include/helpers/cpu/ConstantShapeHelper.cpp
index 528527f36..f12616688 100644
--- a/libnd4j/include/helpers/cpu/ConstantShapeHelper.cpp
+++ b/libnd4j/include/helpers/cpu/ConstantShapeHelper.cpp
@@ -184,6 +184,43 @@ ConstantShapeBuffer& ConstantShapeHelper::createShapeInfoWithUnitiesForBroadcast
return bufferForShapeInfo(descriptor);
}
-} // namespace sd
+
+
+////////////////////////////////////////////////////////////////////////
+ConstantShapeBuffer& ConstantShapeHelper::createShapeInfoWithNoUnitiesForReduce(const Nd4jLong* inShapeInfo, const std::vector &dimsWithUnities, sd::memory::Workspace* workspace) {
+
+ Nd4jLong* newShapeInfo = nullptr;
+ ALLOCATE(newShapeInfo, workspace, shape::shapeInfoLength(shape::rank(inShapeInfo) - dimsWithUnities.size()), Nd4jLong);
+
+ int temp;
+ if(dimsWithUnities.size() == 1 && shape::isCommonVector(inShapeInfo, temp) && temp == dimsWithUnities[0]) {
+ auto dims = ShapeUtils::evalDimsToExclude(shape::rank(inShapeInfo), {temp});
+ shape::excludeUnitiesFromShapeInfo(inShapeInfo, dims.data(), dims.size(), newShapeInfo);
+ } else {
+ shape::excludeUnitiesFromShapeInfo(inShapeInfo, dimsWithUnities.data(), dimsWithUnities.size(), newShapeInfo);
+ }
+
+ ShapeDescriptor descriptor(newShapeInfo);
+
+ RELEASE(newShapeInfo, workspace);
+
+ return bufferForShapeInfo(descriptor);
+}
+
+////////////////////////////////////////////////////////////////////////
+ConstantShapeBuffer& ConstantShapeHelper::createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, sd::memory::Workspace* workspace) {
+
+ Nd4jLong* newShapeInfo = ShapeBuilders::createSubArrShapeInfo(inShapeInfo, dims, dimsSize, workspace);
+
+ ShapeDescriptor descriptor(newShapeInfo);
+
+ RELEASE(newShapeInfo, workspace);
+
+ return bufferForShapeInfo(descriptor);
+}
+
+
+
+}
#endif
\ No newline at end of file
diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp
index 437eebe1d..91467758b 100644
--- a/libnd4j/include/helpers/cpu/MmulHelper.cpp
+++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp
@@ -443,17 +443,17 @@ static void batchedGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
// calculate index of current batch
Nd4jLong batchInd;
if(cRank > 2)
- batchInd = shape::coords2index(cShapeInfo, cCoords.data(), cRank - 2, cBatchDims);
+ batchInd = shape::coords2index(cShapeInfo, cBatchDims, cRank - 2, cCoords.data());
// evaluate A coordinates
if(aRank > 2)
- shape::index2coords(batchInd, aShapeInfo, aCoords.data(), aRank - 2, aBatchDims);
+ shape::index2coords(batchInd, aShapeInfo, aBatchDims, aRank - 2, aCoords.data());
aCoords[aMaxis] = cCoords[cMaxis];
aCoords[aKaxis] = 0;
// evaluate B coordinates
if(bRank > 2)
- shape::index2coords(batchInd, bShapeInfo, bCoords.data(), bRank - 2, bBatchDims);
+ shape::index2coords(batchInd, bShapeInfo, bBatchDims, bRank - 2, bCoords.data());
bCoords[bKaxis] = 0;
bCoords[bNaxis] = cCoords[cNaxis];
diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp
index e122717fc..04e6e4eb5 100644
--- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp
+++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp
@@ -26,20 +26,19 @@ namespace sd {
template
template
- void ReductionBoolLoops::innerloopReduce(const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) {
+ void ReductionBoolLoops::innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const int* dims, X* extraParams) {
#ifndef INLINE_LOOPS
- ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
+ ReductionLoops::template loopReduce(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
#endif
}
template
- void ReductionBoolLoops::wrapper(const int opNum,
+ void ReductionBoolLoops::wrapper(const int opNum, sd::memory::Workspace* workspace,
const X *x, const Nd4jLong *xShapeInfo,
Y *z, const Nd4jLong *zShapeInfo,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets,
- X *extraParams, int64_t start, int64_t stop) {
+ const int *dims, X *extraParams) {
#ifndef INLINE_LOOPS
- DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_BOOL_OPS);
+ DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams), REDUCE_BOOL_OPS);
#endif
}
diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float.hpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float.hpp
index c7ed544b2..2ab71b34a 100644
--- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float.hpp
+++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float.hpp
@@ -28,20 +28,19 @@ namespace sd {
template
template
- void ReductionFloatLoops::innerloopReduce(const X * x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) {
+ void ReductionFloatLoops::innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const int* dims, Z* extraParams) {
#ifndef INLINE_LOOPS
- ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
+ ReductionLoops::template loopReduce(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
#endif
}
template
- void ReductionFloatLoops::wrapper(const int opNum, const X *x, const Nd4jLong *xShapeInfo,
+ void ReductionFloatLoops::wrapper(const int opNum, sd::memory::Workspace* workspace,
+ const X *x, const Nd4jLong *xShapeInfo,
Y *z, const Nd4jLong *zShapeInfo,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets,
- Y *extraParams,
- int64_t start, int64_t stop) {
+ const int *dims, Y *extraParams) {
#ifndef INLINE_LOOPS
- DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS);
+ DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams), REDUCE_FLOAT_OPS);
#endif
}
diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp
index be6cb28bd..820091f09 100644
--- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp
+++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp
@@ -33,18 +33,19 @@ namespace sd {
template
template
- void ReductionLongLoops::innerloopReduce(const X * x, const Nd4jLong* xShapeInfo, Z *z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) {
+ void ReductionLongLoops::innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong* xShapeInfo, Z *z, const Nd4jLong* zShapeInfo, const int* dims, X* extraParams) {
#ifndef INLINE_LOOPS
- ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
+ ReductionLoops::template loopReduce(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
#endif
}
template
- void ReductionLongLoops::wrapper(const int opNum, const X *x, const Nd4jLong *xShapeInfo, Y *z,
- const Nd4jLong *zShapeInfo, const Nd4jLong *tadShapeInfo,
- const Nd4jLong *tadOffsets, X *extraParams, int64_t start, int64_t stop) {
+ void ReductionLongLoops::wrapper(const int opNum, sd::memory::Workspace* workspace,
+ const X *x, const Nd4jLong *xShapeInfo,
+ Y *z, const Nd4jLong *zShapeInfo,
+ const int *dims, X *extraParams) {
#ifndef INLINE_LOOPS
- DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_LONG_OPS);
+ DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams), REDUCE_LONG_OPS);
#endif
}
diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp
index 53725de83..2544a3c03 100644
--- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp
+++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp
@@ -26,23 +26,23 @@ namespace sd {
template
template
- void ReductionSameLoops::innerloopReduce(const X* x, const Nd4jLong* xShapeInfo, X* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) {
+ void ReductionSameLoops::innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong* xShapeInfo, X* z, const Nd4jLong* zShapeInfo, const int* dims, X* extraParams) {
#ifndef INLINE_LOOPS
- ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
+ ReductionLoops::template loopReduce(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
#endif
}
template
- void ReductionSameLoops::wrapper(const int opNum, const X *vx, const Nd4jLong *xShapeInfo, X *vz,
- const Nd4jLong *zShapeInfo, const Nd4jLong *tadShapeInfo,
- const Nd4jLong *tadOffsets,
- X *vextraParams, int64_t start, int64_t stop) {
+ void ReductionSameLoops::wrapper(const int opNum, sd::memory::Workspace* workspace,
+ const X *vx, const Nd4jLong *xShapeInfo,
+ X *vz, const Nd4jLong *zShapeInfo,
+ const int *dims, X *vextraParams) {
#ifndef INLINE_LOOPS
auto x = reinterpret_cast(vx);
auto z = reinterpret_cast(vz);
auto extraParams = reinterpret_cast(vextraParams);
- DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_SAME_OPS);
+ DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams), REDUCE_SAME_OPS);
#endif
}
diff --git a/libnd4j/include/helpers/cuda/ConstantShapeHelper.cu b/libnd4j/include/helpers/cuda/ConstantShapeHelper.cu
index 35ba60ca9..fb093e7b7 100644
--- a/libnd4j/include/helpers/cuda/ConstantShapeHelper.cu
+++ b/libnd4j/include/helpers/cuda/ConstantShapeHelper.cu
@@ -24,6 +24,7 @@
#include
#include
#include
+#include
#include
#include
@@ -187,4 +188,38 @@ ConstantShapeBuffer& ConstantShapeHelper::createShapeInfoWithUnitiesForBroadcas
return bufferForShapeInfo(descriptor);
}
+////////////////////////////////////////////////////////////////////////
+ConstantShapeBuffer& ConstantShapeHelper::createShapeInfoWithNoUnitiesForReduce(const Nd4jLong* inShapeInfo, const std::vector &dimsWithUnities, sd::memory::Workspace* workspace) {
+
+ Nd4jLong* newShapeInfo = nullptr;
+ ALLOCATE(newShapeInfo, workspace, shape::shapeInfoLength(shape::rank(inShapeInfo) - dimsWithUnities.size()), Nd4jLong);
+
+ int temp;
+ if(dimsWithUnities.size() == 1 && shape::isCommonVector(inShapeInfo, temp) && temp == dimsWithUnities[0]) {
+ auto dims = ShapeUtils::evalDimsToExclude(shape::rank(inShapeInfo), {temp});
+ shape::excludeUnitiesFromShapeInfo(inShapeInfo, dims.data(), dims.size(), newShapeInfo);
+ } else {
+ shape::excludeUnitiesFromShapeInfo(inShapeInfo, dimsWithUnities.data(), dimsWithUnities.size(), newShapeInfo);
+ }
+
+ ShapeDescriptor descriptor(newShapeInfo);
+
+ RELEASE(newShapeInfo, workspace);
+
+ return bufferForShapeInfo(descriptor);
+}
+
+////////////////////////////////////////////////////////////////////////
+ConstantShapeBuffer& ConstantShapeHelper::createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, sd::memory::Workspace* workspace) {
+
+ Nd4jLong* newShapeInfo = ShapeBuilders::createSubArrShapeInfo(inShapeInfo, dims, dimsSize, workspace);
+
+ ShapeDescriptor descriptor(newShapeInfo);
+
+ RELEASE(newShapeInfo, workspace);
+
+ return bufferForShapeInfo(descriptor);
+}
+
+
}
\ No newline at end of file
diff --git a/libnd4j/include/helpers/cuda_off/MmulHelper.cu b/libnd4j/include/helpers/cuda_off/MmulHelper.cu
index d1122d794..36f48184a 100644
--- a/libnd4j/include/helpers/cuda_off/MmulHelper.cu
+++ b/libnd4j/include/helpers/cuda_off/MmulHelper.cu
@@ -571,17 +571,17 @@ static __global__ void batchedCudaGemm(const void* vA, const Nd4jLong* aShapeInf
// calculate index of current batch
Nd4jLong batchInd;
if(cBatchDims != nullptr)
- batchInd = shape::coords2index(cShapeInfo, cCoords, cRank - 2, cBatchDims);
+ batchInd = shape::coords2index(cShapeInfo, cBatchDims, cRank - 2, cCoords);
// evaluate A coordinates
if(aBatchDims != nullptr)
- shape::index2coords(batchInd, aShapeInfo, aCoords, aRank - 2, aBatchDims);
+ shape::index2coords(batchInd, aShapeInfo, aBatchDims, aRank - 2, aCoords);
aCoords[aMaxis] = cCoords[cMaxis];
aCoords[aKaxis] = 0;
// evaluate B coordinates
if(bBatchDims != nullptr)
- shape::index2coords(batchInd, bShapeInfo, bCoords, bRank - 2, bBatchDims);
+ shape::index2coords(batchInd, bShapeInfo, bBatchDims, bRank - 2, bCoords);
bCoords[bKaxis] = 0;
bCoords[bNaxis] = cCoords[cNaxis];
diff --git a/libnd4j/include/helpers/impl/ShapeBuilders.cpp b/libnd4j/include/helpers/impl/ShapeBuilders.cpp
index 7c0c7fed6..dbcf6dac0 100644
--- a/libnd4j/include/helpers/impl/ShapeBuilders.cpp
+++ b/libnd4j/include/helpers/impl/ShapeBuilders.cpp
@@ -140,14 +140,26 @@ namespace sd {
}
////////////////////////////////////////////////////////////////////////////////
-Nd4jLong* ShapeBuilders::copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace) {
+Nd4jLong* ShapeBuilders::createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, memory::Workspace* workspace) {
- Nd4jLong *outShapeInfo = nullptr;
- ALLOCATE(outShapeInfo, workspace, shape::shapeInfoLength(inShapeInfo[0] - dimsSize), Nd4jLong);
+ Nd4jLong *subArrShapeInfo = nullptr;
+ ALLOCATE(subArrShapeInfo, workspace, shape::shapeInfoLength(dimsSize), Nd4jLong);
- shape::excludeUnitiesFromShapeInfo(inShapeInfo, dimsSize, dimsToExclude, outShapeInfo);
+ subArrShapeInfo[0] = dimsSize; // rank
+ sd::ArrayOptions::copyDataType(subArrShapeInfo, inShapeInfo); // type
+ subArrShapeInfo[2*dimsSize + 3] = shape::order(inShapeInfo); // order
- return outShapeInfo;
+ Nd4jLong* shape = shape::shapeOf(subArrShapeInfo);
+ Nd4jLong* strides = shape::stride(subArrShapeInfo);
+
+ for(int i = 0; i < dimsSize; ++i) {
+ shape[i] = shape::sizeAt(inShapeInfo, dims[i]);
+ strides[i] = shape::strideAt(inShapeInfo, dims[i]);
+ }
+
+ shape::checkStridesEwsAndOrder(subArrShapeInfo);
+
+ return subArrShapeInfo;
}
}
\ No newline at end of file
diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp
index 2c189cff1..998df7728 100644
--- a/libnd4j/include/helpers/impl/ShapeUtils.cpp
+++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp
@@ -1062,6 +1062,17 @@ bool ShapeUtils::areShapesEqual(const Nd4jLong* shapeInfo, const std::vector ShapeUtils::evalDimsForReduceOp(const int rank, const std::vector& dimsToExclude) {
+
+ std::vector output = ShapeUtils::evalDimsToExclude(rank, dimsToExclude);
+
+ for(uint j = 0; j < dimsToExclude.size(); ++j)
+ output.emplace_back(dimsToExclude[j]);
+
+ return output;
+}
+
////////////////////////////////////////////////////////////////////////////////
/*
bool ShapeUtils::isSubArrayCase(const NDArray& arr1, const NDArray& arr2, std::vector& sameDims) {
diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h
index 719b086cb..ca6054482 100644
--- a/libnd4j/include/helpers/shape.h
+++ b/libnd4j/include/helpers/shape.h
@@ -901,6 +901,16 @@ namespace shape {
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0);
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0);
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0);
+ ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, const int* dims); // length of dims is equal to rank of shapeInfo
+
+ // all three arrays should have same rank
+ // all three arrays should have same dimensions or some of them are 1 (that is satisfy broadcasting principle), strides may be different
+ // shapeInfo1 - first array should have max length compared to rest of two arrays
+ ND4J_EXPORT _CUDA_HD void getOffsetBroadcast(const Nd4jLong& startInd, const Nd4jLong ind,
+ const Nd4jLong* shapeInfo1, const Nd4jLong* shapeInfo2, const Nd4jLong* shapeInfo3,
+ const bool sameOffsets12, const bool sameOffsets13,
+ int* coords,
+ Nd4jLong& offset1, Nd4jLong& offset2, Nd4jLong& offset3);
ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank);
@@ -918,11 +928,12 @@ namespace shape {
ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords);
ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords);
+ // ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, const int* dims, Nd4jLong *coords);
/**
* take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
*/
- ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims);
+ ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, const int* dims, const int dimsLen, int *coords);
/**
* Convert coordinates to the corresponding linear index (sequence number in other words)
@@ -935,7 +946,7 @@ namespace shape {
/**
* take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
*/
- ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims);
+ ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int* dims, const int dimsSize, const int *coords);
/**
* increment n-dimensional array by one iteration by changing coord appropriately
@@ -951,7 +962,7 @@ namespace shape {
ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo);
ND4J_EXPORT _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, const bool useUnsigned);
- ND4J_EXPORT _CUDA_HD void printShapeInfo(Nd4jLong *shapeInfo);
+ ND4J_EXPORT _CUDA_HD void printShapeInfo(const Nd4jLong *shapeInfo);
ND4J_EXPORT _CUDA_HD void printShapeInfoLinear(const Nd4jLong *shapeInfo);
@@ -1057,10 +1068,10 @@ namespace shape {
ND4J_EXPORT _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities);
/**
- * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2
+ * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude(points on unity dimensions) = {1,3}, dimsSize = 2
* then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99}
*/
- INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo);
+ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int* dimsToExclude, const int dimsSize, Nd4jLong* outShapeInfo);
/**
* get stride over contiguous axis (contiguous axis must have stride = 1)
@@ -1847,13 +1858,13 @@ INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape,
return index;
}
-INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims) {
+INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int* dims, const int dimsLen, const int *coords) {
Nd4jLong index, shift = 1;;
- index = coords[tadDims[dimsSize - 1]];
- for(uint i = dimsSize - 1; i >= 1; --i) {
- shift *= shapeInfo[tadDims[i]];
+ index = coords[dims[dimsLen - 1]];
+ for(uint i = dimsLen - 1; i >= 1; --i) {
+ shift *= shapeInfo[dims[i]];
index += shift * coords[i - 1];
}
@@ -3324,6 +3335,18 @@ INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong
return offset;
}
+//////////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset) {
+
+ Nd4jLong offset = baseOffset;
+
+ for(uint i = 1; i <= shapeInfo[0]; ++i)
+ if(shapeInfo[i] != 1)
+ offset += coords[i - 1] * shapeInfo[shapeInfo[0] + i];
+
+ return offset;
+}
+
//////////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset) {
@@ -3337,17 +3360,78 @@ INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coor
}
//////////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset) {
+INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, const int* dims) {
- Nd4jLong offset = baseOffset;
+ Nd4jLong offset = 0;
for(uint i = 1; i <= shapeInfo[0]; ++i)
if(shapeInfo[i] != 1)
- offset += coords[i - 1] * shapeInfo[shapeInfo[0] + i];
+ offset += coords[dims[i - 1]] * shapeInfo[shapeInfo[0] + i];
return offset;
}
+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD void getOffsetBroadcast(const Nd4jLong& startInd, const Nd4jLong ind,
+ const Nd4jLong* shapeInfo1, const Nd4jLong* shapeInfo2, const Nd4jLong* shapeInfo3,
+ const bool sameOffsets12, const bool sameOffsets13,
+ int* coords,
+ Nd4jLong& offset1, Nd4jLong& offset2, Nd4jLong& offset3) {
+
+ const Nd4jLong* shape1 = shape::shapeOf(shapeInfo1);
+ const Nd4jLong* strides1 = shape::stride(shapeInfo1);
+ const Nd4jLong* shape2 = shape::shapeOf(shapeInfo2);
+ const Nd4jLong* strides2 = shape::stride(shapeInfo2);
+ const Nd4jLong* shape3 = shape::shapeOf(shapeInfo3);
+ const Nd4jLong* strides3 = shape::stride(shapeInfo3);
+
+ if(startInd == ind) {
+
+ if(shape::rank(shapeInfo1) == 0) {
+ offset1 = offset2 = offset3 = 0;
+ return;
+ }
+
+ shape::index2coords(ind, shapeInfo1, coords);
+ offset1 = shape::getOffset(shapeInfo1, coords);
+
+ if(sameOffsets12)
+ offset2 = offset1;
+ else
+ offset2 = shape::getOffset(shapeInfo2, coords);
+
+ if(sameOffsets13)
+ offset3 = offset1;
+ else
+ offset3 = shape::getOffset(shapeInfo3, coords);
+
+ return;
+ }
+
+ int axis = shapeInfo1[0] - 1;
+ while(coords[axis] == shape1[axis] - 1) {
+ if(!sameOffsets12 && shape2[axis] != 1)
+ offset2 -= (shape2[axis] - 1) * strides2[axis];
+ if(!sameOffsets13 && shape3[axis] != 1)
+ offset3 -= (shape3[axis] - 1) * strides3[axis];
+ if(shape1[axis] != 1)
+ offset1 -= (shape1[axis] - 1) * strides1[axis];
+ coords[axis--] = 0;
+ }
+
+ ++coords[axis];
+ offset1 += strides1[axis];
+
+ if(!sameOffsets12 && shape2[axis] != 1)
+ offset2 += strides2[axis];
+ if(!sameOffsets13 && shape3[axis] != 1)
+ offset3 += strides3[axis];
+
+ if(sameOffsets12)
+ offset2 = offset1;
+ if(sameOffsets13)
+ offset3 = offset1;
+}
/**
* Returns the tensor along dimension
@@ -3443,7 +3527,7 @@ INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coo
printf("\n");
}
- INLINEDEF _CUDA_HD void printShapeInfo(Nd4jLong *shapeInfo) {
+ INLINEDEF _CUDA_HD void printShapeInfo(const Nd4jLong *shapeInfo) {
int rank = shape::rank(shapeInfo);
Nd4jLong *shape = shape::shapeOf(shapeInfo);
printf("Rank %d\n",rank);
@@ -4583,89 +4667,92 @@ INLINEDEF void calcOffsets(const Nd4jLong* shapeInfo, Nd4jLong* offsets, const c
//////////////////////////////////////////////////////////////////////
INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong* strides, Nd4jLong* offsets, const char order) {
- // if(false) { // tests showed that this code did calculation notably slower even for big N
- // Nd4jLong indexes[MAX_RANK];
- // PRAGMA_OMP_PARALLEL_FOR_ARGS(private(indexes))
- // for (Nd4jLong i = 0; i < N; ++i) {
- // shape::index2coords(rank, shape, i, indexes);
- // subArrOffsets[i] = 0;
- // for (int j = 0; j < rank; ++j)
- // if(shape[j] != 1)
- // subArrOffsets[i] += indexes[j] * strides[j];
- // }
- // return;
- // }
+ const uint64_t len = shape::prodLong(shape, rank);
// set offset for first sub-array, it is equal to zero always
offsets[0] = 0;
- Nd4jLong * idx = new Nd4jLong[rank];
- Nd4jLong* offsetPerDim = new Nd4jLong[rank];
- memset(idx, 0, sizeof(Nd4jLong) * rank);
+ uint coords[MAX_RANK];
+ memset(coords, 0, sizeof(uint) * rank);
- PRAGMA_OMP_SIMD
- for (int k = 0; k < rank; ++k)
- offsetPerDim[k] = (shape[k] - 1) * strides[k];
-
- Nd4jLong init = 0, i = 1;
- // nested loops - calculation of sub-array offsets
if(order == 'c') {
- Nd4jLong rankMinusOne = rank - 1, j = rankMinusOne;
-
- while(j >= 0) {
-
- if(shape[j] == 1) { --j; continue; } // ignore dimensions equal to unity
-
- if(j == rankMinusOne) { // last dimension
- for(int l = 1; l < shape[j]; ++l) {
- offsets[i] = offsets[i - 1] + strides[j];
- i++;
- }
- --j;
- }
- else if(idx[j] < shape[j] - 1) {
- init += strides[j];
- offsets[i++] = init;
- ++idx[j];
- j = rankMinusOne;
- }
- else {
- init -= offsetPerDim[j];
- idx[j--] = 0;
+ for (uint64_t i = 1; i < len; ++i) {
+ int axis = rank - 1;
+ offsets[i] = 0;
+ while(coords[axis] == shape[axis] - 1) {
+ offsets[i] -= (shape[axis] - 1) * strides[axis];
+ coords[axis--] = 0;
}
+ ++coords[axis];
+ offsets[i] += offsets[i-1] + strides[axis];
}
- }
- else {
+ } else {
- Nd4jLong j = 0;
-
- while(j < rank) {
-
- if(shape[j] == 1) { ++j; continue; } // ignore dimensions equal to unity
-
- if(j == 0) { // last dimension
- for(int l = 1; l < shape[j]; ++l) {
- offsets[i] = offsets[i - 1] + strides[j];
- i++;
- }
- ++j;
- }
- else if(idx[j] < shape[j] - 1) {
- init += strides[j];
- offsets[i++] = init;
- ++idx[j];
- j = 0;
- }
- else {
- init -= offsetPerDim[j];
- idx[j++] = 0;
+ for (uint64_t i = 1; i < len; ++i) {
+ int axis = 0;
+ offsets[i] = 0;
+ while(coords[axis] == shape[axis] - 1) {
+ offsets[i] -= (shape[axis] - 1) * strides[axis];
+ coords[axis++] = 0;
}
+ ++coords[axis];
+ offsets[i] += offsets[i-1] + strides[axis];
}
}
- delete []idx;
- delete []offsetPerDim;
+ // Nd4jLong init = 0, i = 1;
+ // // nested loops - calculation of sub-array offsets
+ // if(order == 'c') {
+
+ // int rankMinusOne = rank - 1, j = rankMinusOne;
+
+ // while(j >= 0) {
+
+ // if(shape[j] == 1) { --j; continue; } // ignore dimensions equal to unity
+
+ // if(j == rankMinusOne) { // last dimension
+ // for(uint l = 1; l < shape[j]; ++l)
+ // offsets[i++] = offsets[i - 1] + strides[j];
+ // --j;
+ // }
+ // else if(coords[j] < shape[j] - 1) {
+ // init += strides[j];
+ // offsets[i++] = init;
+ // ++coords[j];
+ // j = rankMinusOne;
+ // }
+ // else {
+ // init -= (shape[j] - 1) * strides[j];
+ // coords[j--] = 0;
+ // }
+ // }
+ // }
+ // else {
+
+ // int j = 0;
+
+ // while(j < rank) {
+
+ // if(shape[j] == 1) { ++j; continue; } // ignore dimensions equal to unity
+
+ // if(j == 0) { // last dimension
+ // for(uint l = 1; l < shape[j]; ++l)
+ // offsets[i++] = offsets[i - 1] + strides[j];
+ // ++j;
+ // }
+ // else if(coords[j] < shape[j] - 1) {
+ // init += strides[j];
+ // offsets[i++] = init;
+ // ++coords[j];
+ // j = 0;
+ // }
+ // else {
+ // init -= (shape[j] - 1) * strides[j];
+ // coords[j++] = 0;
+ // }
+ // }
+ // }
}
//////////////////////////////////////////////////////////////////////
@@ -4884,13 +4971,14 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jL
}
//////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims) {
+INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, const int* dims, const int dimsLen, int *coords) {
- for(uint i = dimsSize - 1; i > 0; --i) {
- coords[tadDims[i]] = index % shapeInfo[1 + tadDims[i]];
- index /= shapeInfo[1 + tadDims[i]];
+ for(uint i = dimsLen - 1; i > 0; --i) {
+ const auto ind = dims[i];
+ coords[ind] = index % shapeInfo[1 + ind];
+ index /= shapeInfo[1 + ind];
}
- coords[tadDims[0]] = index; // last iteration
+ coords[dims[0]] = index; // last iteration
}
//////////////////////////////////////////////////////////////////////
@@ -4921,6 +5009,64 @@ INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLo
}
}
+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities) {
+
+ const int rank = shape::rank(inShapeInfo);
+ const int numOfNonUnities = shape::numOfNonUnitDims(rank, shape::shapeOf(inShapeInfo));
+
+ if(numOfNonUnities == rank) { // no unities in shape, no copy procedure
+ shapeNoUnities = const_cast(inShapeInfo) + 1;
+ stridesNoUnities = const_cast(inShapeInfo) + 1 + rank;
+ return numOfNonUnities;
+ }
+
+ for(uint j = 0, i = 0; i < rank; ++i) {
+ if(shape::shapeOf(inShapeInfo)[i] != 1) {
+ shapeNoUnities[j] = shape::shapeOf(inShapeInfo)[i];
+ shapeNoUnities[numOfNonUnities + j++] = shape::stride(inShapeInfo)[i];
+ }
+ }
+
+ stridesNoUnities = shapeNoUnities + numOfNonUnities;
+
+ return numOfNonUnities;
+}
+
+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int* dimsToExclude, const int dimsSize, Nd4jLong* outShapeInfo) {
+
+ outShapeInfo[0] = inShapeInfo[0] - dimsSize;
+
+ for(uint j = 0, k = 0, i = 0; i < inShapeInfo[0]; ++i) {
+ if(j < dimsSize && i == dimsToExclude[j]) {
+ ++j;
+ continue;
+ }
+
+ shape::shapeOf(outShapeInfo)[k] = shape::shapeOf(inShapeInfo)[i];
+ shape::stride(outShapeInfo)[k++] = shape::stride(inShapeInfo)[i];
+ }
+
+ sd::ArrayOptions::copyDataType(outShapeInfo, inShapeInfo); // type
+ *shape::ews(outShapeInfo) = shape::elementWiseStride(inShapeInfo); // ews
+ outShapeInfo[2 * outShapeInfo[0] + 3] = shape::order(inShapeInfo); // order
+}
+
+//////////////////////////////////////////////////////////////////////
+// INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, const int* dims, const int dimsLen, int *coords) {
+
+// if(startIndex == index) {
+// shape::index2coords(index, shapeInfo, dims, dimsLen, coords);
+// }
+// else {
+// int i = dimsLen - 1;
+// while(coords[dims[i]] == shape::sizeAt(shapeInfo, dims[i]) - 1)
+// coords[dims[i--]] = 0;
+// ++coords[dims[i]];
+// }
+// }
+
//////////////////////////////////////////////////////////////////////
// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) {
@@ -5111,50 +5257,6 @@ INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLo
// }
// }
-//////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities) {
-
- const int rank = shape::rank(inShapeInfo);
- const int numOfNonUnities = shape::numOfNonUnitDims(rank, shape::shapeOf(inShapeInfo));
-
- if(numOfNonUnities == rank) { // no unities in shape, no copy procedure
- shapeNoUnities = const_cast(inShapeInfo) + 1;
- stridesNoUnities = const_cast(inShapeInfo) + 1 + rank;
- return numOfNonUnities;
- }
-
- for(uint j = 0, i = 0; i < rank; ++i) {
- if(shape::shapeOf(inShapeInfo)[i] != 1) {
- shapeNoUnities[j] = shape::shapeOf(inShapeInfo)[i];
- shapeNoUnities[numOfNonUnities + j++] = shape::stride(inShapeInfo)[i];
- }
- }
-
- stridesNoUnities = shapeNoUnities + numOfNonUnities;
-
- return numOfNonUnities;
-}
-
-//////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo) {
-
- outShapeInfo[0] = inShapeInfo[0] - dimsSize;
-
- for(uint j = 0, k = 0, i = 0; i < inShapeInfo[0]; ++i) {
- if(j < dimsSize && i == dimsToExclude[j]) {
- ++j;
- continue;
- }
-
- shape::shapeOf(outShapeInfo)[k] = shape::shapeOf(inShapeInfo)[i];
- shape::stride(outShapeInfo)[k++] = shape::stride(inShapeInfo)[i];
- }
-
- sd::ArrayOptions::copyDataType(outShapeInfo, inShapeInfo); // type
- *shape::ews(outShapeInfo) = shape::elementWiseStride(inShapeInfo); // ews
- outShapeInfo[2 * outShapeInfo[0] + 3] = shape::order(inShapeInfo); // order
-}
-
//////////////////////////////////////////////////////////////////////
// INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
diff --git a/libnd4j/include/legacy/NativeOpExecutioner.h b/libnd4j/include/legacy/NativeOpExecutioner.h
index 84ab886c4..20f0b6532 100644
--- a/libnd4j/include/legacy/NativeOpExecutioner.h
+++ b/libnd4j/include/legacy/NativeOpExecutioner.h
@@ -470,8 +470,7 @@ static void execTransformBool(sd::LaunchContext *lc,
void *extraParams,
void *hZ, const Nd4jLong *hZShapeInfo,
void *dZ, const Nd4jLong *dZShapeInfo,
- int *dimension, int dimensionLength,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets);
+ int *dimension, int dimensionLength);
static void execReduceSame(sd::LaunchContext *lc,
int opNum,
@@ -480,8 +479,7 @@ static void execTransformBool(sd::LaunchContext *lc,
void *extraParams,
void *hZ, const Nd4jLong *hZShapeInfo,
void *dZ, const Nd4jLong *dZShapeInfo,
- int *dimension, int dimensionLength,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets);
+ int *dimension, int dimensionLength);
static void execReduceBool(sd::LaunchContext *lc,
int opNum,
@@ -490,8 +488,7 @@ static void execTransformBool(sd::LaunchContext *lc,
void *extraParams,
void *hZ, const Nd4jLong *hZShapeInfo,
void *dZ, const Nd4jLong *dZShapeInfo,
- int *dimension, int dimensionLength,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets);
+ int *dimension, int dimensionLength);
static void execReduceLong(sd::LaunchContext *lc,
int opNum,
@@ -500,8 +497,7 @@ static void execTransformBool(sd::LaunchContext *lc,
void *extraParams,
void *hZ, const Nd4jLong *hZShapeInfo,
void *dZ, const Nd4jLong *dZShapeInfo,
- int *dimension, int dimensionLength,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets);
+ int *dimension, int dimensionLength);
/**
*
diff --git a/libnd4j/include/legacy/cpu/NativeOpExecutioner.cpp b/libnd4j/include/legacy/cpu/NativeOpExecutioner.cpp
index 6b6c51a13..be8a0fbb3 100644
--- a/libnd4j/include/legacy/cpu/NativeOpExecutioner.cpp
+++ b/libnd4j/include/legacy/cpu/NativeOpExecutioner.cpp
@@ -585,8 +585,7 @@ void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
void *extraParams,
void *hZ, const Nd4jLong *hZShapeInfo,
void *dZ, const Nd4jLong *dZShapeInfo,
- int *dimension, int dimensionLength,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
+ int *dimension, int dimensionLength) {
@@ -597,13 +596,7 @@ void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
if (shape::isEmpty(hZShapeInfo))
return;
- auto func = PRAGMA_THREADS_FOR {
- BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES);
- };
-
- const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo);
-
- samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance().maxMasterThreads());
+ BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, lc ? lc->getWorkspace() : nullptr, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension), LIBND4J_TYPES, FLOAT_TYPES);
}
////////////////////////////////////////////////////////////////////////
@@ -614,24 +607,16 @@ void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc,
void *extraParams,
void *hZ, const Nd4jLong *hZShapeInfo,
void *dZ, const Nd4jLong *dZShapeInfo,
- int *dimension, int dimensionLength,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
+ int *dimension, int dimensionLength) {
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
- auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
// nothing to do here if result is empty
if (shape::isEmpty(hZShapeInfo))
return;
- auto func = PRAGMA_THREADS_FOR {
- BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES);
- };
-
- const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo);
-
- samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance().maxMasterThreads());
+ BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, lc ? lc->getWorkspace() : nullptr, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension), LIBND4J_TYPES);
}
////////////////////////////////////////////////////////////////////////
@@ -642,8 +627,7 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
void *extraParams,
void *hZ, const Nd4jLong *hZShapeInfo,
void *dZ, const Nd4jLong *dZShapeInfo,
- int *dimension, int dimensionLength,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
+ int *dimension, int dimensionLength) {
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
@@ -653,13 +637,7 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
if (shape::isEmpty(hZShapeInfo))
return;
- auto func = PRAGMA_THREADS_FOR {
- BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, BOOL_TYPES);
- };
-
- const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo);
-
- samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance().maxMasterThreads());
+ BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, lc ? lc->getWorkspace() : nullptr, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension), LIBND4J_TYPES, BOOL_TYPES);
}
////////////////////////////////////////////////////////////////////////
@@ -670,8 +648,7 @@ void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
void *extraParams,
void *hZ, const Nd4jLong *hZShapeInfo,
void *dZ, const Nd4jLong *dZShapeInfo,
- int *dimension, int dimensionLength,
- const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
+ int *dimension, int dimensionLength) {
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
@@ -681,13 +658,7 @@ void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
if (shape::isEmpty(hZShapeInfo))
return;
- auto func = PRAGMA_THREADS_FOR {
- BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, LONG_TYPES);
- };
-
- const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo);
-
- samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance().maxMasterThreads());
+ BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, lc ? lc->getWorkspace() : nullptr, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension), LIBND4J_TYPES, LONG_TYPES);
}
////////////////////////////////////////////////////////////////////////
diff --git a/libnd4j/include/legacy/cpu/NativeOps.cpp b/libnd4j/include/legacy/cpu/NativeOps.cpp
index f9e3f669c..463adc17e 100644
--- a/libnd4j/include/legacy/cpu/NativeOps.cpp
+++ b/libnd4j/include/legacy/cpu/NativeOps.cpp
@@ -447,28 +447,26 @@ void execReduceFloat2(Nd4jPointer *extraPointers,
OpaqueDataBuffer *dbZ, const Nd4jLong *hZShapeInfo, const Nd4jLong *dZShapeInfo,
OpaqueDataBuffer *dbDimension, const Nd4jLong *hDimensionShape, const Nd4jLong *dDimensionShape) {
try {
+
auto dimension = reinterpret_cast(dbDimension->primary());
auto dimensionLength = static_cast(shape::length(hDimensionShape));
- auto tadPackX = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo, dimension, dimensionLength);
+ const auto zLen = shape::length(hZShapeInfo);
- auto hTADShapeInfo = tadPackX.primaryShapeInfo();
- auto hTADOffsets = tadPackX.primaryOffsets();
+ std::vector dimensions(dimension, dimension + dimensionLength);
+
+ const Nd4jLong* zShapeInfoH = hZShapeInfo;
+ const Nd4jLong* zShapeInfoD = dZShapeInfo;
+
+ if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+ zShapeInfoH = reinterpret_cast(zPack.primary());
+ zShapeInfoD = reinterpret_cast(zPack.special());
+ }
+
+ std::vector dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector();
+ NativeOpExecutioner::execReduceFloat(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), zShapeInfoH, dbZ->special(), zShapeInfoD, dims.data(), dims.size());
- NativeOpExecutioner::execReduceFloat(nullptr, opNum,
- dbX->primary(),
- hXShapeInfo,
- dbX->special(),
- dXShapeInfo,
- extraParams,
- dbZ->primary(),
- hZShapeInfo,
- dbZ->special(),
- dZShapeInfo,
- dimension,
- dimensionLength,
- hTADShapeInfo,
- hTADOffsets);
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
@@ -481,30 +479,27 @@ void execReduceBool2(Nd4jPointer *extraPointers,
void *extraParams,
OpaqueDataBuffer *dbZ, const Nd4jLong *hZShapeInfo, const Nd4jLong *dZShapeInfo,
OpaqueDataBuffer *dbDimension, const Nd4jLong *hDimensionShape, const Nd4jLong *dDimensionShape) {
+
try {
auto dimension = reinterpret_cast(dbDimension->primary());
auto dimensionLength = static_cast(shape::length(hDimensionShape));
- auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo, dimension,
- dimensionLength);
+ std::vector dimensions(dimension, dimension + dimensionLength);
- auto hTADShapeInfo = tadPack.primaryShapeInfo();
- auto hTADOffsets = tadPack.primaryOffsets();
+ const auto zLen = shape::length(hZShapeInfo);
+
+ const Nd4jLong* zShapeInfoH = hZShapeInfo;
+ const Nd4jLong* zShapeInfoD = dZShapeInfo;
+
+ if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo)) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+ zShapeInfoH = reinterpret_cast(zPack.primary());
+ zShapeInfoD = reinterpret_cast(zPack.special());
+ }
+
+ std::vector dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector();
+ NativeOpExecutioner::execReduceBool(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), zShapeInfoH, dbZ->special(), zShapeInfoD, dims.data(), dims.size());
- NativeOpExecutioner::execReduceBool(nullptr, opNum,
- dbX->primary(),
- hXShapeInfo,
- dbX->special(),
- dXShapeInfo,
- extraParams,
- dbZ->primary(),
- hZShapeInfo,
- dbZ->special(),
- dZShapeInfo,
- dimension,
- dimensionLength,
- hTADShapeInfo,
- hTADOffsets);
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
@@ -521,26 +516,22 @@ void execReduceSame2(Nd4jPointer *extraPointers,
auto dimension = reinterpret_cast(dbDimension->primary());
int dimensionLength = static_cast(shape::length(hDimensionShape));
- auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo, dimension,
- dimensionLength);
+ std::vector dimensions(dimension, dimension + dimensionLength);
- auto hTADShapeInfo = tadPack.primaryShapeInfo();
- auto hTADOffsets = tadPack.primaryOffsets();
+ const auto zLen = shape::length(hZShapeInfo);
+
+ const Nd4jLong* zShapeInfoH = hZShapeInfo;
+ const Nd4jLong* zShapeInfoD = dZShapeInfo;
+
+ if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+ zShapeInfoH = reinterpret_cast(zPack.primary());
+ zShapeInfoD = reinterpret_cast(zPack.special());
+ }
+
+ std::vector dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector();
+ NativeOpExecutioner::execReduceSame(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), zShapeInfoH, dbZ->special(), zShapeInfoD, dims.data(), dims.size());
- NativeOpExecutioner::execReduceSame(nullptr, opNum,
- dbX->primary(),
- hXShapeInfo,
- dbX->special(),
- dXShapeInfo,
- extraParams,
- dbZ->primary(),
- hZShapeInfo,
- dbZ->special(),
- dZShapeInfo,
- dimension,
- dimensionLength,
- hTADShapeInfo,
- hTADOffsets);
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
@@ -557,25 +548,22 @@ void execReduceLong2(Nd4jPointer *extraPointers,
auto dimension = reinterpret_cast(dbDimension->primary());
int dimensionLength = static_cast(shape::length(hDimensionShape));
- auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo, dimension, dimensionLength);
+ std::vector dimensions(dimension, dimension + dimensionLength);
- auto hTADShapeInfo = tadPack.primaryShapeInfo();
- auto hTADOffsets = tadPack.primaryOffsets();
+ const auto zLen = shape::length(hZShapeInfo);
+
+ const Nd4jLong* zShapeInfoH = hZShapeInfo;
+ const Nd4jLong* zShapeInfoD = dZShapeInfo;
+
+ if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+ zShapeInfoH = reinterpret_cast(zPack.primary());
+ zShapeInfoD = reinterpret_cast(zPack.special());
+ }
+
+ std::vector dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector();
+ NativeOpExecutioner::execReduceLong(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), zShapeInfoH, dbZ->special(), zShapeInfoD, dims.data(), dims.size());
- NativeOpExecutioner::execReduceLong(nullptr, opNum,
- dbX->primary(),
- hXShapeInfo,
- dbX->special(),
- dXShapeInfo,
- extraParams,
- dbZ->primary(),
- hZShapeInfo,
- dbZ->special(),
- dZShapeInfo,
- dimension,
- dimensionLength,
- hTADShapeInfo,
- hTADOffsets);
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
diff --git a/libnd4j/include/legacy/cuda/NativeOpExecutioner.cu b/libnd4j/include/legacy/cuda/NativeOpExecutioner.cu
index 14cbf306a..cb3c78238 100644
--- a/libnd4j/include/legacy/cuda/NativeOpExecutioner.cu
+++ b/libnd4j/include/legacy/cuda/NativeOpExecutioner.cu
@@ -210,7 +210,7 @@ void NativeOpExecutioner::execSummaryStatsScalar(sd::LaunchContext *lc,
auto stream = lc->getCudaStream();
auto reductionPointer = lc->getReductionPointer();
- dim3 launchDims = dim3(256, 256, 32768);
+ dim3 launchDims = dim3(256, CUDA_BLOCK_SIZE, 1024);
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
@@ -577,8 +577,7 @@ void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc,
void *extraParams,
void *hZ, Nd4jLong const* hZShapeInfo,
void *dZ, Nd4jLong const* dZShapeInfo,
- int *dimension, int dimensionLength,
- Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
+ int *dimension, int dimensionLength) {
auto stream = lc->getCudaStream();
auto reductionPointer = lc->getReductionPointer();
@@ -588,15 +587,14 @@ void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc,
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
- auto xRank = shape::rank(hXShapeInfo);
if (zType != xType)
throw datatype_exception::build("NativeOpExecutioner::execReduceSame requires both X & Z operands to have same type", xType, zType);
auto numBlocks = shape::length(hZShapeInfo);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 8192);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
- BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES);
+ BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceXD(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, reductionPointer, dZ, dZShapeInfo, hZShapeInfo, dimension), LIBND4J_TYPES);
// TODO: remove after the release
auto res = cudaStreamSynchronize(*stream);
@@ -612,8 +610,7 @@ void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
void *extraParams,
void *hZ, Nd4jLong const* hZShapeInfo,
void *dZ, Nd4jLong const* dZShapeInfo,
- int *dimension,int dimensionLength,
- Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
+ int *dimension,int dimensionLength) {
auto stream = lc->getCudaStream();
auto reductionPointer = lc->getReductionPointer();
@@ -627,11 +624,10 @@ void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
if (zType != sd::DataType::INT64)
throw datatype_exception::build("NativeOpExecutioner::execReduceLong wrong Z data type", sd::DataType::INT64, zType);
- auto xRank = shape::rank(hXShapeInfo);
auto numBlocks = shape::length(hZShapeInfo);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
- BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES);
+ BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceXD(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, reductionPointer, dZ, dZShapeInfo, hZShapeInfo, dimension), LIBND4J_TYPES, LONG_TYPES);
// TODO: remove after the release
auto res = cudaStreamSynchronize(*stream);
@@ -648,8 +644,7 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
void *extraParams,
void *hZ, Nd4jLong const* hZShapeInfo,
void *dZ, Nd4jLong const* dZShapeInfo,
- int *dimension, int dimensionLength,
- Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
+ int *dimension, int dimensionLength) {
auto stream = lc->getCudaStream();
auto reductionPointer = lc->getReductionPointer();
@@ -663,11 +658,10 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
if (zType != sd::DataType::BOOL)
throw std::runtime_error("NativeOpExecutioner::execReduceBool requires Z operand to have BOOL type");
- auto xRank = shape::rank(hXShapeInfo);
auto numBlocks = shape::length(hZShapeInfo);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
- BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES);
+ BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceXD(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, reductionPointer, dZ, dZShapeInfo, hZShapeInfo, dimension), LIBND4J_TYPES, BOOL_TYPES);
// TODO: remove after the release
auto res = cudaStreamSynchronize(*stream);
@@ -675,6 +669,45 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
throw cuda_exception::build("execReduceBool failed", res);
}
+////////////////////////////////////////////////////////////////////////
+/**
+ *
+ * @param opNum
+ * @param dX
+ * @param dXShapeInfo
+ * @param extraParams
+ * @param dZ
+ * @param dZShapeInfo
+ */
+void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
+ int opNum,
+ const void *hX, const Nd4jLong *hXShapeInfo,
+ const void *dX, const Nd4jLong *dXShapeInfo,
+ void *extraParams,
+ void *hZ, const Nd4jLong *hZShapeInfo,
+ void *dZ, const Nd4jLong *dZShapeInfo,
+ int *dimension, int dimensionLength) {
+
+ auto stream = lc->getCudaStream();
+ auto reductionPointer = lc->getReductionPointer();
+
+ if (sd::Environment::getInstance().isDebugAndVerbose())
+ printf("F8 opNum:[%i]\n", opNum);
+
+ auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
+ auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
+
+ auto numBlocks = shape::length(hZShapeInfo);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+
+ BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceXD(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, reductionPointer, dZ, dZShapeInfo, hZShapeInfo, dimension), LIBND4J_TYPES, FLOAT_TYPES);
+
+ // TODO: remove after the release
+ auto res = cudaStreamSynchronize(*stream);
+ if (res != 0)
+ throw cuda_exception::build("execReduceFloat failed", res);
+}
+
////////////////////////////////////////////////////////////////////////
/**
*
@@ -707,7 +740,8 @@ void NativeOpExecutioner::execIndexReduce(sd::LaunchContext *lc,
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
auto numBlocks = shape::length(hZShapeInfo);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+ auto tadLength = shape::length(hXShapeInfo) / numBlocks;
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, tadLength < CUDA_BLOCK_SIZE ? tadLength : CUDA_BLOCK_SIZE, 1024);
if (zType != sd::DataType::INT64 && zType != sd::DataType::INT32)
throw datatype_exception::build("NativeOpExecutioner::execIndexReduce requires Z operand to have INT32/INT64 type", zType);
@@ -722,46 +756,6 @@ void NativeOpExecutioner::execIndexReduce(sd::LaunchContext *lc,
throw cuda_exception::build("execIndexReduce failed", res);
}
-////////////////////////////////////////////////////////////////////////
-/**
- *
- * @param opNum
- * @param dX
- * @param dXShapeInfo
- * @param extraParams
- * @param dZ
- * @param dZShapeInfo
- */
-void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
- int opNum,
- void const* hX, Nd4jLong const* hXShapeInfo,
- void const* dX, Nd4jLong const* dXShapeInfo,
- void *extraParams,
- void *hZ, Nd4jLong const* hZShapeInfo,
- void *dZ, Nd4jLong const* dZShapeInfo,
- int *dimension,int dimensionLength,
- Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
-
- auto stream = lc->getCudaStream();
- auto reductionPointer = lc->getReductionPointer();
-
- if (sd::Environment::getInstance().isDebugAndVerbose())
- printf("F8 opNum:[%i]\n", opNum);
-
- auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
- auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
-
- auto xRank = shape::rank(hXShapeInfo);
- auto numBlocks = shape::length(hZShapeInfo);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
-
- BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES);
-
- // TODO: remove after the release
- auto res = cudaStreamSynchronize(*stream);
- if (res != 0)
- throw cuda_exception::build("execReduceFloat failed", res);
-}
/**
@@ -790,7 +784,7 @@ void NativeOpExecutioner::execIndexReduceScalar(sd::LaunchContext *lc,
auto xLength = shape::length(hXShapeInfo);
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
if (sd::Environment::getInstance().isDebugAndVerbose() && launchDims.x == 1)
printf("AF1 opNum:[%i]\n", opNum);
@@ -840,7 +834,7 @@ void NativeOpExecutioner::execReduceFloatScalar(sd::LaunchContext *lc,
auto xLength = shape::length(hXShapeInfo);
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceScalar(launchDims, stream, opNum, dX,dXShapeInfo, hXShapeInfo, extraParams, dZ,dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, FLOAT_TYPES);
@@ -870,9 +864,9 @@ void NativeOpExecutioner::execReduceBoolScalar(sd::LaunchContext *lc,
throw std::runtime_error("NativeOpExecutioner::execReduceBoolScalar requires Z operand to have BOOL type");
auto xLength = shape::length(hXShapeInfo);
- auto blockWidth = 256;
+ auto blockWidth = CUDA_BLOCK_SIZE;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, BOOL_TYPES);
@@ -901,9 +895,9 @@ void NativeOpExecutioner::execReduceSameScalar(sd::LaunchContext *lc,
throw datatype_exception::build("NativeOpExecutioner::execReduceSameScalar requires both X & Z operands to have same type", xType, zType);
auto xLength = shape::length(hXShapeInfo);
- auto blockWidth = 256;
+ auto blockWidth = CUDA_BLOCK_SIZE;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);
BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES);
@@ -932,9 +926,9 @@ void NativeOpExecutioner::execReduceLongScalar(sd::LaunchContext *lc,
throw datatype_exception::build("NativeOpExecutioner::execReduceLongScalar wrong Z data type", sd::DataType::INT64, zType);
auto xLength = shape::length(hXShapeInfo);
- auto blockWidth = 256;
+ auto blockWidth = CUDA_BLOCK_SIZE;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, LONG_TYPES);
@@ -1128,7 +1122,7 @@ void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc,
auto stream = lc->getCudaStream();
auto reductionPointer = lc->getReductionPointer();
- dim3 launchDims = dim3(256, 256, 32768);
+ dim3 launchDims = dim3(256, CUDA_BLOCK_SIZE, 1024);
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
@@ -1158,7 +1152,7 @@ void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc,
auto stream = lc->getCudaStream();
auto reductionPointer = lc->getReductionPointer();
- dim3 launchDims = dim3(256, 256, 32768);
+ dim3 launchDims = dim3(256, CUDA_BLOCK_SIZE, 1024);
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
@@ -1194,9 +1188,9 @@ void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc,
auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
- auto blockWidth = 256;
+ auto blockWidth = CUDA_BLOCK_SIZE;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(shape::length(hXShapeInfo), blockWidth);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);
if (xType != yType)
throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType);
@@ -1246,7 +1240,7 @@ void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc,
auto numBlocks = shape::length(hZShapeInfo);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum,
dX, dXShapeInfo,
@@ -1286,9 +1280,9 @@ void NativeOpExecutioner::execReduce3Scalar(sd::LaunchContext *lc,
auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
auto xLength = shape::length(hXShapeInfo);
- auto blockWidth = 256;
+ auto blockWidth = CUDA_BLOCK_SIZE;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);
if (xType != yType)
throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Y operand to have X type", xType, yType);
@@ -1652,7 +1646,7 @@ void NativeOpExecutioner::execReduce3All(sd::LaunchContext *lc,
if (sd::Environment::getInstance().isDebugAndVerbose())
printf("D119 opNum:[%i]\n", opNum);
- dim3 launchDims(shape::length(hZShapeInfo), 256, 32768);
+ dim3 launchDims(shape::length(hZShapeInfo), CUDA_BLOCK_SIZE / 2, 1024);
if (sd::Environment::getInstance().isVerbose() && launchDims.x == 1)
printf("AD119 opNum:[%i]\n", opNum);
@@ -1706,7 +1700,7 @@ void NativeOpExecutioner::execReduce3TAD(sd::LaunchContext *lc,
throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Z operand to have floating point data type", zType);
auto numBlocks = shape::length(hZShapeInfo);
- dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+ dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, dimension, dimensionLength, 1, allocationPointer, tadShapeInfo, tadOffsets, yTadShapeInfo, yTadOffsets), LIBND4J_TYPES, FLOAT_TYPES);
diff --git a/libnd4j/include/legacy/cuda/NativeOps.cu b/libnd4j/include/legacy/cuda/NativeOps.cu
index 1ccc2c7d5..186b3a9cb 100755
--- a/libnd4j/include/legacy/cuda/NativeOps.cu
+++ b/libnd4j/include/legacy/cuda/NativeOps.cu
@@ -454,17 +454,24 @@ void execReduceSame2(Nd4jPointer *extraPointers,
auto dimension = reinterpret_cast(dbDimension->primary());
int dimensionLength = static_cast(shape::length(hDimensionShape));
- auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo,
- dimension,
- shape::length(hDimensionShape));
+ const auto zLen = shape::length(hZShapeInfo);
+ std::vector dimensions(dimension, dimension + dimensionLength);
+
+ const Nd4jLong* zShapeInfoH = hZShapeInfo;
+
+ if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+ zShapeInfoH = reinterpret_cast(zPack.primary());
+ }
+
+ std::vector dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector();
LaunchContext lc(extraPointers[1], extraPointers[4], extraPointers[5], extraPointers[3]);
NativeOpExecutioner::execReduceSame(&lc, opNum,
dbX->primary(), hXShapeInfo, dbX->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hXShapeInfo).special(),
extraParams,
- dbZ->primary(), hZShapeInfo, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hZShapeInfo).special(),
- dimension, dimensionLength,
- tadPack.specialShapeInfo(), tadPack.specialOffsets());
+ dbZ->primary(), zShapeInfoH, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(zShapeInfoH).special(),
+ dims.data(), dims.size());
InteropDataBuffer::registerSpecialUse({dbZ}, {dbX});
} catch (std::exception &e) {
@@ -487,17 +494,25 @@ void execReduceLong2(Nd4jPointer *extraPointers,
auto dimension = reinterpret_cast(dbDimension->primary());
int dimensionLength = static_cast(shape::length(hDimensionShape));
- auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo,
- dimension,
- shape::length(hDimensionShape));
+ const auto zLen = shape::length(hZShapeInfo);
+
+ std::vector dimensions(dimension, dimension + dimensionLength);
+
+ const Nd4jLong* zShapeInfoH = hZShapeInfo;
+
+ if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+ auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+ zShapeInfoH = reinterpret_cast(zPack.primary());
+ }
+
+ std::vector dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector();
LaunchContext lc(extraPointers[1], extraPointers[4], extraPointers[5], extraPointers[3]);
NativeOpExecutioner::execReduceLong(&lc, opNum,
dbX->primary(), hXShapeInfo, dbX->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hXShapeInfo).special(),
extraParams,
- dbZ->primary(), hZShapeInfo, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hZShapeInfo).special(),
- dimension, dimensionLength,
- tadPack.specialShapeInfo(), tadPack.specialOffsets());
+ dbZ->primary(), zShapeInfoH, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(zShapeInfoH).special(),
+ dims.data(), dims.size());
InteropDataBuffer::registerSpecialUse({dbZ}, {dbX});
} catch (std::exception &e) {
@@ -562,17 +577,25 @@ void execReduceBool2(Nd4jPointer *extraPointers,
auto dimension = reinterpret_cast(dbDimension->primary());
int dimensionLength = static_cast