Development updates (#9053)
* RL4J: Add generic update rule (#502)
  Signed-off-by: Alexandre Boulanger <aboulang2002@yahoo.com>

* Shyrma reduce (#481)
  * start working on improving cpu legacy code for reduce ops
  * further work on improving legacy loops
  * still working on improving reduce ops
  * further work on improving reduce ops
  * testing speed run of new reduce op
  * working on improvement of default loop for reduce op
  * update signatures of code which calls reduce ops
  * make corrections in cuda reduce kernels
  * change loop for default case in broadcast legacy ops
  * comment out some shape code
  * comment out unnecessary prints in RNGtests
  * finish resolving conflicts after master has been merged
  * get rid of some compilation mistakes in cuda code
  * minor changes
  * further search for bug causing crash on java test
  * add scalar case in reduce_ ... exec code
  * minor corrections in NativeOps.cu
  * add switch to scalar case in execReduceXD functions
  * add support for vectors' old shape in ConstantShapeHelper::createShapeInfoWithNoUnitiesForReduce
  * correct cuda mirrorPad
  * add support for vectors' old shape in cuda createShapeInfoWithNoUnitiesForReduce
  Signed-off-by: Yurii <iuriish@yahoo.com>
  Co-authored-by: raver119 <raver119@gmail.com>

* Add support for CUDA 11.0 (#492)
  * Add support for CUDA 11.0
  * libnd4j tweaks for CUDA 11
  * bindings update, again?
  * Update versions of JavaCPP Presets for FFmpeg, OpenBLAS, and NumPy
  * update API to match CUDA 8
  * Update version of JavaCPP Presets for CPython
  * C++ updated for cuDNN 8.0
  * one more test
  * one more test
  * one more test
  * 128-bit alignment for workspaces
  * change seed in 1 test
  * Fix dependency duplication in python4j-parent pom
  * Fix group id in python4j-numpy
  * few tests tweaked
  * Remove macosx-x86_64-gpu from nd4j-tests-tensorflow
  * few minor tweaks for IndexReduce
  * one test removed
  Signed-off-by: raver119@gmail.com <raver119@gmail.com>
  Co-authored-by: raver119@gmail.com <raver119@gmail.com>
  Co-authored-by: Serhii Shepel <9946053+sshepel@users.noreply.github.com>

* RL4J: Add SyncTrainer and AgentLearnerBuilder for a few algorithms (#504)
  Signed-off-by: Alexandre Boulanger <aboulang2002@yahoo.com>

Co-authored-by: Alexandre Boulanger <44292157+aboulang2002@users.noreply.github.com>
Co-authored-by: Yurii Shyrma <iuriish@yahoo.com>
Co-authored-by: raver119 <raver119@gmail.com>
Co-authored-by: Serhii Shepel <9946053+sshepel@users.noreply.github.com>

Branch: master
Parent: ef665bfe49
Commit: 029b84e2b7
@@ -99,7 +99,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -77,7 +77,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -63,7 +63,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -37,7 +37,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -151,7 +151,7 @@
 <skip>${skipTestResourceEnforcement}</skip>
 <rules>
 <requireActiveProfile>
-<profiles>test-nd4j-native,test-nd4j-cuda-10.2</profiles>
+<profiles>test-nd4j-native,test-nd4j-cuda-11.0</profiles>
 <all>false</all>
 </requireActiveProfile>
 </rules>
@@ -333,11 +333,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${nd4j.version}</version>
 <scope>test</scope>
 </dependency>
@@ -20,7 +20,7 @@

 set -e

-VALID_VERSIONS=( 9.2 10.0 10.1 10.2 )
+VALID_VERSIONS=( 9.2 10.0 10.1 10.2 11.0 )

 usage() {
 echo "Usage: $(basename $0) [-h|--help] <cuda version to be used>
@@ -47,6 +47,10 @@ check_cuda_version() {
 check_cuda_version "$VERSION"

 case $VERSION in
+11.0)
+VERSION2="8.0"
+VERSION3="1.5.4-SNAPSHOT"
+;;
 10.2)
 VERSION2="7.6"
 VERSION3="1.5.3"
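The two hunks above extend the project's CUDA version-switching helper script (the file path is not preserved in this view; by its usage() text it is assumed to be the change-cuda-versions.sh helper). With 11.0 accepted as a valid version and mapped to cuDNN 8.0 and javacpp-presets 1.5.4-SNAPSHOT, switching a checkout to the new toolkit should then be a single call along the lines of: ./change-cuda-versions.sh 11.0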
@@ -126,7 +126,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -62,7 +62,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -79,7 +79,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -66,7 +66,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -128,7 +128,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -81,7 +81,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -56,7 +56,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -74,7 +74,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -54,7 +54,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -65,7 +65,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -72,7 +72,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -95,7 +95,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -78,7 +78,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -60,7 +60,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -59,7 +59,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -178,7 +178,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -38,7 +38,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -144,7 +144,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -108,7 +108,7 @@
 <skip>${skipTestResourceEnforcement}</skip>
 <rules>
 <requireActiveProfile>
-<profiles>test-nd4j-native,test-nd4j-cuda-10.2</profiles>
+<profiles>test-nd4j-native,test-nd4j-cuda-11.0</profiles>
 <all>false</all>
 </requireActiveProfile>
 </rules>
@@ -361,11 +361,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${nd4j.version}</version>
 <scope>test</scope>
 </dependency>
@@ -62,11 +62,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -40,7 +40,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -180,11 +180,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -16,7 +16,7 @@

 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 <modelVersion>4.0.0</modelVersion>
-<artifactId>deeplearning4j-cuda-10.2</artifactId>
+<artifactId>deeplearning4j-cuda-11.0</artifactId>
 <name>deeplearning4j-cuda</name>
 <parent>
 <groupId>org.deeplearning4j</groupId>
@@ -26,9 +26,9 @@

 <properties>
 <!-- CUDA version is linked with the artifact name so cannot move to parent pom.xml -->
-<cuda.version>10.2</cuda.version>
-<cudnn.version>7.6</cudnn.version>
-<javacpp-presets.cuda.version>1.5.3</javacpp-presets.cuda.version>
+<cuda.version>11.0</cuda.version>
+<cudnn.version>8.0</cudnn.version>
+<javacpp-presets.cuda.version>1.5.4-SNAPSHOT</javacpp-presets.cuda.version>
 </properties>

 <dependencyManagement>
@@ -112,7 +112,7 @@
 </build>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -254,17 +254,34 @@ public class CudnnConvolutionHelper extends BaseCudnnHelper implements Convoluti
 throw new IllegalArgumentException("Unknown BwdDataAlgo: " + bwdDataAlgo);
 }
 } else {
+/*
 code = cudnnGetConvolutionBackwardFilterAlgorithm(cudnnContext, cudnnContext.srcTensorDesc,
 cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.filterDesc,
 mode == AlgoMode.NO_WORKSPACE ? CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE
 : CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
 0, algo1);
+*/
+val fa = new cudnnConvolutionBwdFilterAlgoPerf_t();
+val counts = new int[1];
+code = cudnnFindConvolutionBackwardFilterAlgorithm(cudnnContext, cudnnContext.srcTensorDesc,
+cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.filterDesc, 1, counts, fa);
+algo1[0] = fa.algo();
+
 checkCudnn(false, "cudnnGetConvolutionBackwardFilterAlgorithm", code, input, weights, null, delta, kernel, strides, pad, mode, null, bwdFilterAlgo, bwdDataAlgo, convolutionMode, dilation);
+
+/*
 code = cudnnGetConvolutionBackwardDataAlgorithm(cudnnContext, cudnnContext.filterDesc,
 cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.srcTensorDesc,
 mode == AlgoMode.NO_WORKSPACE ? CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE
 : CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
 0, algo2);
+*/
+
+val da = new cudnnConvolutionBwdDataAlgoPerf_t();
+code = cudnnFindConvolutionBackwardDataAlgorithm(cudnnContext, cudnnContext.filterDesc,
+cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.srcTensorDesc, 1, counts, da);
+
+algo2[0] = da.algo();
 checkCudnn(false, "cudnnGetConvolutionBackwardDataAlgorithm", code, input, weights, null, delta, kernel, strides, pad, mode, null, bwdFilterAlgo, bwdDataAlgo, convolutionMode, dilation);
 }

@@ -461,11 +478,17 @@ public class CudnnConvolutionHelper extends BaseCudnnHelper implements Convoluti
 throw new IllegalArgumentException("Unknown FwdAlgo: " + fwdAlgo);
 }
 } else {
-code = cudnnGetConvolutionForwardAlgorithm(cudnnContext, cudnnContext.srcTensorDesc,
+/*
+code = cudnnGetConvolutionForwardAlgorithm_v7(cudnnContext, cudnnContext.srcTensorDesc,
 cudnnContext.filterDesc, cudnnContext.convDesc,
 cudnnContext.dstTensorDesc, mode == AlgoMode.NO_WORKSPACE
-? CUDNN_CONVOLUTION_FWD_NO_WORKSPACE : CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+? CUDNN_CONVOLUTION_FWD_ : CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
 0, algo);
+*/

+val cdf = new cudnnConvolutionFwdAlgoPerf_t();
+val count = new int[1];
+code = cudnnFindConvolutionForwardAlgorithm(cudnnContext, cudnnContext.srcTensorDesc, cudnnContext.filterDesc, cudnnContext.convDesc, cudnnContext.dstTensorDesc, 1, count, cdf);
+
 if(code != CUDNN_STATUS_SUCCESS){
 //If CuDNN can't infer algorithm - try IMPLICIT_GEMM
@@ -477,6 +500,8 @@ public class CudnnConvolutionHelper extends BaseCudnnHelper implements Convoluti
 fwdAlgo = FwdAlgo.IMPLICIT_GEMM;
 algo[0] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 }
+
+algo[0] = cdf.algo();
 }

 if(log.isTraceEnabled()){
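Background for the two CudnnConvolutionHelper hunks above: cuDNN 8.0 removed the cudnnGetConvolution*Algorithm heuristics (and their *_PREFER_FASTEST / *_NO_WORKSPACE preference enums), so the patch comments the old calls out and queries the corresponding cudnnFind* APIs instead, reading the winning algorithm out of the returned *AlgoPerf_t entry. The standalone C++ sketch below shows the same pattern against the raw cuDNN 8 API; the tensor/filter sizes and the IMPLICIT_GEMM fallback are assumptions chosen to mirror the Java helper, not something taken from the patch.

// Minimal cuDNN 8 forward-algorithm selection sketch (illustrative; error checks omitted).
#include <cudnn.h>
#include <cstdio>

int main() {
    cudnnHandle_t handle;
    cudnnCreate(&handle);

    cudnnTensorDescriptor_t xDesc, yDesc;
    cudnnFilterDescriptor_t wDesc;
    cudnnConvolutionDescriptor_t convDesc;
    cudnnCreateTensorDescriptor(&xDesc);
    cudnnCreateTensorDescriptor(&yDesc);
    cudnnCreateFilterDescriptor(&wDesc);
    cudnnCreateConvolutionDescriptor(&convDesc);

    // Made-up problem size: NCHW 1x3x32x32 input, 8 filters of 3x3x3, pad 1, stride 1.
    cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 3, 32, 32);
    cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 8, 3, 3, 3);
    cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
    cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 8, 32, 32);

    // cuDNN 8: benchmark candidates with the Find API instead of the removed Get heuristic.
    cudnnConvolutionFwdAlgoPerf_t perf;
    int returned = 0;
    cudnnStatus_t status = cudnnFindConvolutionForwardAlgorithm(
            handle, xDesc, wDesc, convDesc, yDesc, /*requestedAlgoCount=*/1, &returned, &perf);

    // Fall back to IMPLICIT_GEMM when nothing could be measured, as the helper above does.
    cudnnConvolutionFwdAlgo_t algo = (status == CUDNN_STATUS_SUCCESS && returned > 0)
            ? perf.algo : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
    printf("selected forward algo: %d\n", (int) algo);

    cudnnDestroyConvolutionDescriptor(convDesc);
    cudnnDestroyFilterDescriptor(wDesc);
    cudnnDestroyTensorDescriptor(yDesc);
    cudnnDestroyTensorDescriptor(xDesc);
    cudnnDestroy(handle);
    return 0;
}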
@@ -29,6 +29,7 @@ import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
 import org.junit.Test;
 import org.nd4j.linalg.activations.Activation;
 import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.concurrency.AffinityManager;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.dataset.DataSet;
 import org.nd4j.linalg.factory.Nd4j;
@@ -269,7 +270,7 @@ public class ValidateCudnnLSTM extends BaseDL4JTest {
 assertTrue(f.get(l0) instanceof CudnnLSTMHelper);
 assertTrue(f.get(l1) instanceof CudnnLSTMHelper);

-Random r = new Random(12345);
+Random r = new Random(123456);
 for (int x = 0; x < 1; x++) {
 INDArray input = Nd4j.rand(new int[] {minibatch, inputSize, timeSeriesLength});
 INDArray labels = Nd4j.zeros(minibatch, nOut, timeSeriesLength);
@@ -284,7 +285,6 @@ public class ValidateCudnnLSTM extends BaseDL4JTest {
 mln2.fit(ds);
 }

-
 assertEquals(mln1.params(), mln2.params());
 }

@@ -51,7 +51,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -46,7 +46,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -43,7 +43,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -38,7 +38,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -116,11 +116,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -64,7 +64,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -69,7 +69,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -41,7 +41,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -302,11 +302,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -135,11 +135,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -125,11 +125,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -49,7 +49,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -53,7 +53,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -89,11 +89,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -44,7 +44,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -72,7 +72,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -75,7 +75,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -67,7 +67,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -110,7 +110,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -91,7 +91,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -42,7 +42,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -128,7 +128,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -107,14 +107,14 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <activation>
 <activeByDefault>false</activeByDefault>
 </activation>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -27,7 +27,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -109,11 +109,11 @@
 </profile>

 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <dependencies>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${project.version}</version>
 <scope>test</scope>
 </dependency>
@@ -104,7 +104,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -38,7 +38,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -80,7 +80,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -79,7 +79,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -86,7 +86,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -105,7 +105,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -181,7 +181,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -77,7 +77,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -113,7 +113,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -32,7 +32,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -67,7 +67,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -457,7 +457,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -49,7 +49,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -85,7 +85,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>

@@ -120,7 +120,7 @@
 <id>test-nd4j-native</id>
 </profile>
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 </profile>
 </profiles>
 </project>
@@ -225,7 +225,7 @@
 <skip>${skipBackendChoice}</skip>
 <rules>
 <requireActiveProfile>
-<profiles>test-nd4j-native,test-nd4j-cuda-10.2</profiles>
+<profiles>test-nd4j-native,test-nd4j-cuda-11.0</profiles>
 <all>false</all>
 </requireActiveProfile>
 </rules>
@@ -500,7 +500,7 @@
 </profile>
 <!-- For running unit tests with nd4j-cuda-8.0: "mvn clean test -P test-nd4j-cuda-8.0" -->
 <profile>
-<id>test-nd4j-cuda-10.2</id>
+<id>test-nd4j-cuda-11.0</id>
 <activation>
 <activeByDefault>false</activeByDefault>
 </activation>
@@ -513,7 +513,7 @@
 </dependency>
 <dependency>
 <groupId>org.nd4j</groupId>
-<artifactId>nd4j-cuda-10.2</artifactId>
+<artifactId>nd4j-cuda-11.0</artifactId>
 <version>${nd4j.version}</version>
 <scope>test</scope>
 </dependency>
@@ -628,17 +628,17 @@ namespace sd {
 * keepDims - if true then put unities in place of reduced dimensions
 */

-NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
-NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
+NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::vector<int>& dimensions, const bool keepDims = false) const;
+NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false) const;

-NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
-NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
+NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::vector<int>& dimensions, const bool keepDims = false) const;
+NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false) const;

-NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
-NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
+NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::vector<int>& dimensions, const bool keepDims = false) const;
+NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false) const;

-NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
-NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const;
+NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::vector<int>& dimensions, const bool keepDims = false) const;
+NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list<int>& dimensions, const bool keepDims = false) const;

 /**
 * method reduces array by excluding its shapes along dimensions present in given dimensions vector
@@ -647,10 +647,10 @@ namespace sd {
 * keepDims - if true then put unities in place of reduced dimensions
 * extras - extra parameters
 */
-void reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const;
-void reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const;
-void reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const;
-void reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const;
+void reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool checkTargetShape = true) const;
+void reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool checkTargetShape = true) const;
+void reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool checkTargetShape = true) const;
+void reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims = false, const bool checkTargetShape = true) const;

 /**
 * return variance of array elements set
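The two header hunks above drop the supportOldShapes flag from every reduceAlongDimension overload; as the implementation hunks below show, the shape helper is now always called with false in that position. A small, hypothetical usage sketch of the trimmed signature follows; the include paths and the reduce::Sum / reduce::Mean op constants are assumptions about this source tree rather than something shown in the diff.

// Hypothetical caller of the new reduceAlongDimension(op, axes[, keepDims]) signature.
#include <array/NDArray.h>
#include <array/NDArrayFactory.h>

using namespace sd;

int main() {
    // 2x3 float array with arbitrary demo values.
    auto x = NDArrayFactory::create<float>('c', {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f});

    auto colSum  = x.reduceAlongDimension(reduce::Sum,  {0});        // shape [3]
    auto rowMean = x.reduceAlongDimension(reduce::Mean, {1}, true);  // keepDims -> shape [2, 1]

    colSum.printIndexedBuffer("column sums");
    rowMean.printIndexedBuffer("row means");
    return 0;
}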
@@ -1353,80 +1353,80 @@ void* NDArray::bufferWithOffset(Nd4jLong offset) {

 //////////////////////////////////////////////////////////////////////////
 // eventually method reduces array by excluding its shapes along axes present in dimensions vector
-NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
+NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::vector<int>& dimensions, const bool keepDims) const {

 std::vector<int> copy(dimensions);

-auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, isR() ? dataType() : Environment::getInstance().defaultFloatDataType(), keepDims, supportOldShapes, getContext()->getWorkspace());
+auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, isR() ? dataType() : Environment::getInstance().defaultFloatDataType(), keepDims, false, getContext()->getWorkspace());

 NDArray result(newShape, true, getContext());

-this->reduceAlongDimension(op, result, copy, keepDims, supportOldShapes, false);
+this->reduceAlongDimension(op, result, copy, keepDims, false);

 return result;
 }

 //////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
+NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::vector<int>& dimensions, const bool keepDims) const {

 std::vector<int> copy(dimensions);

-auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, keepDims, supportOldShapes, getContext()->getWorkspace());
+auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, keepDims, false, getContext()->getWorkspace());

 NDArray result(newShape, true, getContext());

-reduceAlongDimension(op, result, copy, keepDims, supportOldShapes, false);
+reduceAlongDimension(op, result, copy, keepDims, false);

 return result;
 }

 //////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
+NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::vector<int>& dimensions, const bool keepDims) const {

 std::vector<int> copy(dimensions);

-auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::BOOL, keepDims, supportOldShapes, getContext()->getWorkspace());
+auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::BOOL, keepDims, false, getContext()->getWorkspace());

 NDArray result(newShape, true, getContext());

-reduceAlongDimension(op, result, copy, keepDims, supportOldShapes, false);
+reduceAlongDimension(op, result, copy, keepDims, false);

 return result;
 }

 //////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
+NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::vector<int>& dimensions, const bool keepDims) const {

 std::vector<int> copy(dimensions);

-auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::INT64, keepDims, supportOldShapes, getContext()->getWorkspace());
+auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::INT64, keepDims, false, getContext()->getWorkspace());

 NDArray result(newShape, true, getContext());

-reduceAlongDimension(op, result, copy, keepDims, supportOldShapes, false);
+reduceAlongDimension(op, result, copy, keepDims, false);

 return result;
 }

 //////////////////////////////////////////////////////////////////////////
 // method reduces array by excluding its shapes along axes present in dimensions vector
-NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
-return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims, supportOldShapes);
+NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list<int>& dimensions, const bool keepDims) const {
+return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims);
 }

 //////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
-return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims, supportOldShapes);
+NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list<int>& dimensions, const bool keepDims) const {
+return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims);
 }

 //////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
-return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims, supportOldShapes);
+NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list<int>& dimensions, const bool keepDims) const {
+return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims);
 }

 //////////////////////////////////////////////////////////////////////////
-NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list<int>& dimensions, const bool keepDims, const bool supportOldShapes) const {
-return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims, supportOldShapes);
+NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list<int>& dimensions, const bool keepDims) const {
+return reduceAlongDimension(op, std::vector<int>(dimensions), keepDims);
 }

 //////////////////////////////////////////////////////////////////////////
@ -4240,7 +4240,7 @@ NDArray NDArray::applyAllReduce3(sd::reduce3::Ops op, const NDArray& other, cons
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// method reduces array by excluding its shapes along axes present in dimensions vector
|
// method reduces array by excluding its shapes along axes present in dimensions vector
|
||||||
void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const {
|
void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool checkTargetShape) const {
|
||||||
|
|
||||||
if (isS())
|
if (isS())
|
||||||
throw std::runtime_error("NDArray::reduceAlongDimension FloatOps: you can't use this method on String array!");
|
throw std::runtime_error("NDArray::reduceAlongDimension FloatOps: you can't use this method on String array!");
|
||||||
|
@ -4250,7 +4250,7 @@ void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, con
|
||||||
std::vector<int> copy(dimensions);
|
std::vector<int> copy(dimensions);
|
||||||
|
|
||||||
if(checkTargetShape) {
|
if(checkTargetShape) {
|
||||||
auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, supportOldShapes, getContext()->getWorkspace());
|
auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, false, getContext()->getWorkspace());
|
||||||
if(!shape::shapeEquals(newShape, target.shapeInfo()))
|
if(!shape::shapeEquals(newShape, target.shapeInfo()))
|
||||||
throw std::runtime_error("NDArray::reduceAlongDimension FloatOps: wrong target shape!");
|
throw std::runtime_error("NDArray::reduceAlongDimension FloatOps: wrong target shape!");
|
||||||
}
|
}
|
||||||
|
@ -4261,8 +4261,18 @@ void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, con
|
||||||
NativeOpExecutioner::execReduceFloatScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(),nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
|
NativeOpExecutioner::execReduceFloatScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(),nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto packX = sd::ConstantTadHelper::getInstance().tadForDimensions(shapeInfo(), copy);
|
const Nd4jLong* zShapeInfoH = target.shapeInfo();
|
||||||
NativeOpExecutioner::execReduceFloat(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo(), copy.data(), copy.size(), packX.platformShapeInfo(), packX.platformOffsets());
|
const Nd4jLong* zShapeInfoD = target.specialShapeInfo();
|
||||||
|
|
||||||
|
if(rankOf() - dimensions.size() != target.rankOf()) {
|
||||||
|
auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(target.shapeInfo(), copy, target.getContext()->getWorkspace());
|
||||||
|
zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
|
||||||
|
zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> dims = ShapeUtils::evalDimsForReduceOp(rankOf(), copy);
|
||||||
|
NativeOpExecutioner::execReduceFloat(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), zShapeInfoH, target.specialBuffer(), zShapeInfoD, dims.data(), dims.size());
|
||||||
|
|
||||||
}
|
}
|
||||||
synchronize("NDArray::reduceAlongDimension FloatOps");
|
synchronize("NDArray::reduceAlongDimension FloatOps");
|
||||||
|
|
||||||
|
@ -4271,7 +4281,7 @@ void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, con
|
||||||
|
|
||||||
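For reference, a minimal usage sketch of the reworked reduce path. This is an illustration, not code from the change set; it assumes the NDArrayFactory::create helper and the sd::reduce::Mean FloatOps value exist under these spellings, and the include path may differ between libnd4j versions.

#include <array/NDArrayFactory.h>

// Hypothetical example: mean of each row of a 2x3 array, written against the new
// reduceAlongDimension signature (the supportOldShapes flag is gone).
void rowMeanSketch() {
    auto x = sd::NDArrayFactory::create<float>('c', {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
    auto z = sd::NDArrayFactory::create<float>('c', {2});

    // keepDims = false, checkTargetShape = true
    x.reduceAlongDimension(sd::reduce::Mean, z, {1}, false, true);

    z.printIndexedBuffer("row means");   // expected: 2, 5
}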
//////////////////////////////////////////////////////////////////////////
// method reduces array by excluding its shapes along axes present in dimensions vector
void NDArray::reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool checkTargetShape) const {

    if (isS())
        throw std::runtime_error("NDArray::reduceAlongDimension SameOps: you can't use this method on String array!");

    std::vector<int> copy(dimensions);

    if(checkTargetShape) {
        auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, false, getContext()->getWorkspace());
        if(!shape::shapeEquals(newShape, target.shapeInfo()))
            throw std::runtime_error("NDArray::reduceAlongDimension SameOps: wrong target shape!");
    }

    if(rankOf() == copy.size() || copy.empty()) {
        NativeOpExecutioner::execReduceSameScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    }
    else {
        const Nd4jLong* zShapeInfoH = target.shapeInfo();
        const Nd4jLong* zShapeInfoD = target.specialShapeInfo();

        if(rankOf() - dimensions.size() != target.rankOf()) {
            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(target.shapeInfo(), copy, target.getContext()->getWorkspace());
            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
            zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
        }

        std::vector<int> dims = ShapeUtils::evalDimsForReduceOp(rankOf(), copy);
        NativeOpExecutioner::execReduceSame(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), zShapeInfoH, target.specialBuffer(), zShapeInfoD, dims.data(), dims.size());
    }
    synchronize("NDArray::reduceAlongDimension SameOps");
}
//////////////////////////////////////////////////////////////////////////
// method reduces array by excluding its shapes along axes present in dimensions vector
void NDArray::reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool checkTargetShape) const {

    if (isS())
        throw std::runtime_error("NDArray::reduceAlongDimension LongOps: you can't use this method on String array!");

    std::vector<int> copy(dimensions);

    if(checkTargetShape) {
        auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, false, getContext()->getWorkspace());
        if(!shape::shapeEquals(newShape, target.shapeInfo()))
            throw std::runtime_error("NDArray::reduceAlongDimension LongOps: wrong target shape!");
    }

    if(rankOf() == copy.size() || copy.empty()) {
        NativeOpExecutioner::execReduceLongScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    }
    else {
        const Nd4jLong* zShapeInfoH = target.shapeInfo();
        const Nd4jLong* zShapeInfoD = target.specialShapeInfo();

        if(rankOf() - dimensions.size() != target.rankOf()) {
            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(target.shapeInfo(), copy, target.getContext()->getWorkspace());
            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
            zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
        }

        std::vector<int> dims = ShapeUtils::evalDimsForReduceOp(rankOf(), copy);
        NativeOpExecutioner::execReduceLong(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), zShapeInfoH, target.specialBuffer(), zShapeInfoD, dims.data(), dims.size());
    }
    synchronize("NDArray::reduceAlongDimension LongOps");
}
//////////////////////////////////////////////////////////////////////////
// method reduces array by excluding its shapes along axes present in dimensions vector
void NDArray::reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector<int>& dimensions, const bool keepDims, const bool checkTargetShape) const {

    if (isS())
        throw std::runtime_error("NDArray::reduceAlongDimension BoolOps cuda: you can't use this method on String array!");

    std::vector<int> copy(dimensions);

    if(checkTargetShape) {
        auto newShape = ShapeUtils::evalReduceShapeInfo(target.ordering(), copy, *this, keepDims, false, getContext()->getWorkspace());
        if(!shape::shapeEquals(newShape, target.shapeInfo()))
            throw std::runtime_error("NDArray::reduceAlongDimension BoolOps cuda: wrong target shape!");
    }

    if(rankOf() == copy.size() || copy.empty()) {
        NativeOpExecutioner::execReduceBoolScalar(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    }
    else {
        const Nd4jLong* zShapeInfoH = target.shapeInfo();
        const Nd4jLong* zShapeInfoD = target.specialShapeInfo();

        if(rankOf() - dimensions.size() != target.rankOf()) {
            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(target.shapeInfo(), copy, target.getContext()->getWorkspace());
            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
            zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
        }

        std::vector<int> dims = ShapeUtils::evalDimsForReduceOp(rankOf(), copy);
        NativeOpExecutioner::execReduceBool(getContext(), op, buffer(), shapeInfo(), specialBuffer(), specialShapeInfo(), nullptr, target.buffer(), zShapeInfoH, target.specialBuffer(), zShapeInfoD, dims.data(), dims.size());
    }
    synchronize("NDArray::reduceAlongDimension BoolOps");
}
@@ -51,6 +51,8 @@ namespace sd {
    ConstantShapeBuffer& bufferForShapeInfo(const Nd4jLong *shapeInfo);
    ConstantShapeBuffer& bufferForShapeInfo(sd::DataType dataType, char order, int rank, const Nd4jLong* shape);
    ConstantShapeBuffer& createShapeInfoWithUnitiesForBroadcast(const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, sd::memory::Workspace* workspace = nullptr, const std::vector<int> &dimensions = {});
    ConstantShapeBuffer& createShapeInfoWithNoUnitiesForReduce(const Nd4jLong* maxShapeInfo, const std::vector<int> &dimsWithUnities, sd::memory::Workspace* workspace = nullptr);
    ConstantShapeBuffer& createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, sd::memory::Workspace* workspace = nullptr);

    const Nd4jLong* emptyShapeInfo(sd::DataType dataType);
@@ -41,43 +41,43 @@ namespace sd {
    public:

        template <typename OpType>
        static FORCEINLINE void loopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, E* extraParams);
};

template <typename X, typename Z>
class ReductionFloatLoops : public ReductionLoops<X, Z, Z> {
    public:
        static void wrapper(int opNum, sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, Z* extraParams);

        template <typename OpType>
        static void innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, Z* extraParams);
};

template <typename X, typename Z>
class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops<X, Z, X> {
    public:
        static void wrapper(int opNum, sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);

        template <typename OpType>
        static void innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
};

template <typename X, typename Z>
class ND4J_EXPORT ReductionLongLoops : public ReductionLoops<X, Z, X> {
    public:
        static void wrapper(int opNum, sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);

        template <typename OpType>
        static void innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
};

template <typename X>
class ND4J_EXPORT ReductionSameLoops : public ReductionLoops<X, X, X> {
    public:
        static void wrapper(int opNum, sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, X* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);

        template <typename OpType>
        static void innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, X* z, const Nd4jLong *zShapeInfo, const int* dims, X* extraParams);
};
@@ -122,371 +122,612 @@ namespace sd {
        static void innerloopReduce3All(const X* x, const Nd4jLong* xShapeInfo, const X* y, const Nd4jLong* yShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* xTadShapeInfo, const Nd4jLong* xTadOffsets, const Nd4jLong* yTadShapeInfo, const Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop);
};

//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
static void reduceExec21(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, dims[0]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dims[0]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, 0);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);

    auto func = PRAGMA_THREADS_FOR {

        for (auto i0 = start; i0 < stop; ++i0) {

            auto x0 = x + i0 * xStrd0;
            auto z0 = z + i0 * zStrd0;

            auto s = OpType::startingValue(x0);

            if(xStrd1 == 1)
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    s = OpType::update(s, OpType::op(x0[i1], extraParams), extraParams);
            else
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    s = OpType::update(s, OpType::op(x0[i1 * xStrd1], extraParams), extraParams);

            *z0 = OpType::postProcess(s, static_cast<Nd4jLong>(xAxis1), extraParams);
        }
    };

    samediff::Threads::parallel_for(func, 0, xAxis0);
}
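To make the contract these rank-specialized kernels rely on concrete, here is a small self-contained sketch. The SumOp struct is a toy stand-in (not one of the library's real op classes); it just mirrors the startingValue / op / update / postProcess usage seen in reduceExec21 above, on its rank-2 to rank-1 contiguous fast path.

#include <cstdio>

// Toy reduce op with the interface the reduceExec* kernels call into; illustrative only.
struct SumOp {
    static float startingValue(const float* /*x*/)                        { return 0.f; }
    static float op(float v, const float* /*extraParams*/)                { return v; }
    static float update(float acc, float v, const float* /*extraParams*/) { return acc + v; }
    static float postProcess(float acc, long /*tadLen*/, const float* /*extraParams*/) { return acc; }
};

int main() {
    const int rows = 2, cols = 4;                  // reduce a row-major 2x4 over its inner axis
    float x[rows * cols] = {1, 2, 3, 4, 5, 6, 7, 8};
    float z[rows];

    for (int i0 = 0; i0 < rows; ++i0) {            // the real code runs this loop via parallel_for
        const float* x0 = x + i0 * cols;           // xStrd0 == cols
        float s = SumOp::startingValue(x0);
        for (int i1 = 0; i1 < cols; ++i1)          // xStrd1 == 1, the contiguous fast path
            s = SumOp::update(s, SumOp::op(x0[i1], nullptr), nullptr);
        z[i0] = SumOp::postProcess(s, cols, nullptr);
    }

    std::printf("%.1f %.1f\n", z[0], z[1]);        // prints: 10.0 26.0
    return 0;
}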
//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
static void reduceExec31(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, dims[0]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dims[0]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, 0);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);

    const Nd4jLong tadLen = static_cast<Nd4jLong>(xAxis1 * xAxis2);

    auto func = PRAGMA_THREADS_FOR {

        for (auto i0 = start; i0 < stop; ++i0) {

            auto x0 = x + i0 * xStrd0;
            auto z0 = z + i0 * zStrd0;

            auto s = OpType::startingValue(x0);

            if(xStrd1 == 1)
                for (uint i2 = 0; i2 < xAxis2; ++i2)
                    for (uint i1 = 0; i1 < xAxis1; ++i1)
                        s = OpType::update(s, OpType::op(x0[i1 + i2*xStrd2], extraParams), extraParams);
            else if(xStrd2 == 1)
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2], extraParams), extraParams);
            else
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2], extraParams), extraParams);

            *z0 = OpType::postProcess(s, tadLen, extraParams);
        }
    };

    samediff::Threads::parallel_for(func, 0, xAxis0);
}
//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceExec32(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 1);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
    const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 1 : 0);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);

    auto func = PRAGMA_THREADS_FOR_2D {

        for (auto i0 = start_x; i0 < stop_x; ++i0) {
            for (auto i1 = start_y; i1 < stop_y; ++i1) {

                auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
                auto z1 = z + i0 * zStrd0 + i1 * zStrd1;

                auto s = OpType::startingValue(x1);

                if(xStrd2 == 1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        s = OpType::update(s, OpType::op(x1[i2], extraParams), extraParams);
                else
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        s = OpType::update(s, OpType::op(x1[i2 * xStrd2], extraParams), extraParams);

                *z1 = OpType::postProcess(s, static_cast<Nd4jLong>(xAxis2), extraParams);
            }
        }
    };

    samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1);
}
//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceExec41(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, dims[0]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dims[0]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, 0);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);

    const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
    const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);

    const Nd4jLong tadLen = static_cast<Nd4jLong>(xAxis1 * xAxis2 * xAxis3);

    auto func = PRAGMA_THREADS_FOR {

        for (auto i0 = start; i0 < stop; ++i0) {

            auto x0 = x + i0 * xStrd0;
            auto z0 = z + i0 * zStrd0;

            auto s = OpType::startingValue(x0);

            if(xStrd1 == 1)
                for (uint i3 = 0; i3 < xAxis3; ++i3)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i1 = 0; i1 < xAxis1; ++i1)
                            s = OpType::update(s, OpType::op(x0[i1 + i2*xStrd2 + i3*xStrd3], extraParams), extraParams);
            else if(xStrd2 == 1)
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    for (uint i3 = 0; i3 < xAxis3; ++i3)
                        for (uint i2 = 0; i2 < xAxis2; ++i2)
                            s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2 + i3*xStrd3], extraParams), extraParams);
            else if(xStrd3 == 1)
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3], extraParams), extraParams);
            else
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3*xStrd3], extraParams), extraParams);

            *z0 = OpType::postProcess(s, tadLen, extraParams);
        }
    };

    samediff::Threads::parallel_for(func, 0, xAxis0);
}
//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceExec42(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 1);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
    const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 1 : 0);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);

    const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
    const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);

    const Nd4jLong tadLen = static_cast<Nd4jLong>(xAxis2 * xAxis3);

    auto func = PRAGMA_THREADS_FOR_2D {

        for (auto i0 = start_x; i0 < stop_x; ++i0) {
            for (auto i1 = start_y; i1 < stop_y; ++i1) {

                auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
                auto z1 = z + i0 * zStrd0 + i1 * zStrd1;

                auto s = OpType::startingValue(x1);

                if(xStrd2 == 1)
                    for (uint i3 = 0; i3 < xAxis3; ++i3)
                        for (uint i2 = 0; i2 < xAxis2; ++i2)
                            s = OpType::update(s, OpType::op(x1[i2 + i3*xStrd3], extraParams), extraParams);
                else if(xStrd3 == 1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3], extraParams), extraParams);
                else
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3*xStrd3], extraParams), extraParams);

                *z1 = OpType::postProcess(s, tadLen, extraParams);
            }
        }
    };

    samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1);
}
//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceExec43(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[2]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[2]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 2);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);
    const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, 1);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[0]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[0]);
    const Nd4jLong zStrd2 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 2 : 0);

    const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
    const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);

    auto func = PRAGMA_THREADS_FOR_3D {

        for (auto i0 = start_x; i0 < stop_x; ++i0) {
            for (auto i1 = start_y; i1 < stop_y; ++i1) {
                for (auto i2 = start_z; i2 < stop_z; ++i2) {

                    auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2;
                    auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;

                    auto s = OpType::startingValue(x2);

                    if(xStrd3 == 1)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            s = OpType::update(s, OpType::op(x2[i3], extraParams), extraParams);
                    else
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            s = OpType::update(s, OpType::op(x2[i3*xStrd3], extraParams), extraParams);

                    *z2 = OpType::postProcess(s, static_cast<Nd4jLong>(xAxis3), extraParams);
                }
            }
        }
    };

    samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1, 0,xAxis2,1);
}
//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceExec51(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, dims[0]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dims[0]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, 0);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);

    const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
    const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);

    const uint xAxis4 = shape::sizeAt(xShapeInfo, dims[4]);
    const Nd4jLong xStrd4 = shape::strideAt(xShapeInfo, dims[4]);

    const Nd4jLong tadLen = static_cast<Nd4jLong>(xAxis1 * xAxis2 * xAxis3 * xAxis4);

    auto func = PRAGMA_THREADS_FOR {

        for (auto i0 = start; i0 < stop; ++i0) {

            auto x0 = x + i0 * xStrd0;
            auto z0 = z + i0 * zStrd0;

            auto s = OpType::startingValue(x0);

            if(xStrd1 == 1)
                for (uint i4 = 0; i4 < xAxis4; ++i4)
                    for (uint i3 = 0; i3 < xAxis3; ++i3)
                        for (uint i2 = 0; i2 < xAxis2; ++i2)
                            for (uint i1 = 0; i1 < xAxis1; ++i1)
                                s = OpType::update(s, OpType::op(x0[i1 + i2*xStrd2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
            else if(xStrd2 == 1)
                for (uint i4 = 0; i4 < xAxis4; ++i4)
                    for (uint i3 = 0; i3 < xAxis3; ++i3)
                        for (uint i1 = 0; i1 < xAxis1; ++i1)
                            for (uint i2 = 0; i2 < xAxis2; ++i2)
                                s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
            else if(xStrd3 == 1)
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i4 = 0; i4 < xAxis4; ++i4)
                            for (uint i3 = 0; i3 < xAxis3; ++i3)
                                s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3 + i4*xStrd4], extraParams), extraParams);
            else if(xStrd4 == 1)
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            for (uint i4 = 0; i4 < xAxis4; ++i4)
                                s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3*xStrd3 + i4], extraParams), extraParams);
            else
                for (uint i1 = 0; i1 < xAxis1; ++i1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            for (uint i4 = 0; i4 < xAxis4; ++i4)
                                s = OpType::update(s, OpType::op(x0[i1*xStrd1 + i2*xStrd2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);

            *z0 = OpType::postProcess(s, tadLen, extraParams);
        }
    };

    samediff::Threads::parallel_for(func, 0, xAxis0);
}
//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceExec52(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[1]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 1);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[0]);
    const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 1 : 0);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, dims[2]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, dims[2]);

    const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
    const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);

    const uint xAxis4 = shape::sizeAt(xShapeInfo, dims[4]);
    const Nd4jLong xStrd4 = shape::strideAt(xShapeInfo, dims[4]);

    const Nd4jLong tadLen = static_cast<Nd4jLong>(xAxis2 * xAxis3 * xAxis4);

    auto func = PRAGMA_THREADS_FOR_2D {

        for (auto i0 = start_x; i0 < stop_x; ++i0) {
            for (auto i1 = start_y; i1 < stop_y; ++i1) {

                auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
                auto z1 = z + i0 * zStrd0 + i1 * zStrd1;

                auto s = OpType::startingValue(x1);

                if(xStrd2 == 1)
                    for (uint i4 = 0; i4 < xAxis4; ++i4)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            for (uint i2 = 0; i2 < xAxis2; ++i2)
                                s = OpType::update(s, OpType::op(x1[i2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);
                else if(xStrd3 == 1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i4 = 0; i4 < xAxis4; ++i4)
                            for (uint i3 = 0; i3 < xAxis3; ++i3)
                                s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3 + i4*xStrd4], extraParams), extraParams);
                else if(xStrd4 == 1)
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            for (uint i4 = 0; i4 < xAxis4; ++i4)
                                s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3*xStrd3 + i4], extraParams), extraParams);
                else
                    for (uint i2 = 0; i2 < xAxis2; ++i2)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            for (uint i4 = 0; i4 < xAxis4; ++i4)
                                s = OpType::update(s, OpType::op(x1[i2*xStrd2 + i3*xStrd3 + i4*xStrd4], extraParams), extraParams);

                *z1 = OpType::postProcess(s, tadLen, extraParams);
            }
        }
    };

    samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1);
}
//////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceExec53(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[2]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[2]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 2);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, dims[1]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, dims[1]);
    const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, 1);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[0]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[0]);
    const Nd4jLong zStrd2 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 2 : 0);

    const uint xAxis3 = shape::sizeAt(xShapeInfo, dims[3]);
    const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, dims[3]);

    const uint xAxis4 = shape::sizeAt(xShapeInfo, dims[4]);
    const Nd4jLong xStrd4 = shape::strideAt(xShapeInfo, dims[4]);

    const Nd4jLong tadLen = static_cast<Nd4jLong>(xAxis3 * xAxis4);

    auto func = PRAGMA_THREADS_FOR_3D {

        for (auto i0 = start_x; i0 < stop_x; ++i0) {
            for (auto i1 = start_y; i1 < stop_y; ++i1) {
                for (auto i2 = start_z; i2 < stop_z; ++i2) {

                    auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2;
                    auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;

                    auto s = OpType::startingValue(x2);

                    if(xStrd3 == 1)
                        for (uint i4 = 0; i4 < xAxis4; ++i4)
                            for (uint i3 = 0; i3 < xAxis3; ++i3)
                                s = OpType::update(s, OpType::op(x2[i3 + i4*xStrd4], extraParams), extraParams);
                    else if(xStrd4 == 1)
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            for (uint i4 = 0; i4 < xAxis4; ++i4)
                                s = OpType::update(s, OpType::op(x2[i3*xStrd3 + i4], extraParams), extraParams);
                    else
                        for (uint i3 = 0; i3 < xAxis3; ++i3)
                            for (uint i4 = 0; i4 < xAxis4; ++i4)
                                s = OpType::update(s, OpType::op(x2[i3*xStrd3 + i4*xStrd4], extraParams), extraParams);

                    *z2 = OpType::postProcess(s, tadLen, extraParams);
                }
            }
        }
    };

    samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1, 0,xAxis2,1);
}
////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceExec54(const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const uint xAxis0 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[3]);
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[0] : dims[3]);
    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 0 : 3);

    const uint xAxis1 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[2]);
    const Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[1] : dims[2]);
    const Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 1 : 2);

    const uint xAxis2 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[1]);
    const Nd4jLong xStrd2 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[2] : dims[1]);
    const Nd4jLong zStrd2 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 2 : 1);

    const uint xAxis3 = shape::sizeAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[3] : dims[0]);
    const Nd4jLong xStrd3 = shape::strideAt(xShapeInfo, shape::order(zShapeInfo) == 'c' ? dims[3] : dims[0]);
    const Nd4jLong zStrd3 = shape::strideAt(zShapeInfo, shape::order(zShapeInfo) == 'c' ? 3 : 0);

    const uint xAxis4 = shape::sizeAt(xShapeInfo, dims[4]);
    const Nd4jLong xStrd4 = shape::strideAt(xShapeInfo, dims[4]);

    auto func = PRAGMA_THREADS_FOR_3D {

        for (auto i0 = start_x; i0 < stop_x; ++i0) {
            for (auto i1 = start_y; i1 < stop_y; ++i1) {
                for (auto i2 = start_z; i2 < stop_z; ++i2) {
                    for (auto i3 = 0; i3 < xAxis3; ++i3) {

                        auto x3 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2 + i3 * xStrd3;
                        auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;

                        auto s = OpType::startingValue(x3);

                        if(xStrd4 == 1)
                            for (uint i4 = 0; i4 < xAxis4; ++i4)
                                s = OpType::update(s, OpType::op(x3[i4], extraParams), extraParams);
                        else
                            for (uint i4 = 0; i4 < xAxis4; ++i4)
                                s = OpType::update(s, OpType::op(x3[i4*xStrd4], extraParams), extraParams);

                        *z3 = OpType::postProcess(s, static_cast<Nd4jLong>(xAxis4), extraParams);
                    }
                }
            }
        }
    };

    samediff::Threads::parallel_for(func, 0,xAxis0,1, 0,xAxis1,1, 0,xAxis2,1);
}
////////////////////////////////////////////////////////////////////////
template <typename X, typename Z, typename E, typename OpType>
void reduceDefault(sd::memory::Workspace* workspace, const X *x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int *dims, E* extraParams) {

    const int zRank   = shape::rank(zShapeInfo);
    const int tadRank = shape::rank(xShapeInfo) - zRank;

    Nd4jLong* outerXTadShapeInfo = sd::ShapeBuilders::createSubArrShapeInfo(xShapeInfo, dims, zRank);
    Nd4jLong* innerXTadShapeInfo = sd::ShapeBuilders::createSubArrShapeInfo(xShapeInfo, dims+zRank, tadRank);

    const bool sameOffsets1 = shape::haveSameShapeAndStrides(zShapeInfo, outerXTadShapeInfo);
    const bool sameOffsets2 = shape::haveSameShapeAndStrides(zShapeInfo, innerXTadShapeInfo);

    const Nd4jLong zLen   = shape::length(zShapeInfo);
    const Nd4jLong tadLen = shape::length(innerXTadShapeInfo);

    Nd4jLong* zOffsets = nullptr;
    ALLOCATE(zOffsets, workspace, zLen, Nd4jLong);
    shape::calcOffsets(zShapeInfo, zOffsets);

    Nd4jLong* outerXTadOffsets = zOffsets;
    if(!sameOffsets1) {
        ALLOCATE(outerXTadOffsets, workspace, zLen, Nd4jLong);
        shape::calcOffsets(outerXTadShapeInfo, outerXTadOffsets);
    }

    Nd4jLong* innerXTadOffsets = zOffsets;
    if(!sameOffsets2) {
        ALLOCATE(innerXTadOffsets, workspace, tadLen, Nd4jLong);
        shape::calcOffsets(innerXTadShapeInfo, innerXTadOffsets);
    }

    auto func = PRAGMA_THREADS_FOR{

        for (auto i = start; i < stop; ++i) {

            const auto tad = x + outerXTadOffsets[i];
            auto s = OpType::startingValue(tad);

            for (Nd4jLong j = 0; j < tadLen; j++)
                s = OpType::update(s, OpType::op(tad[innerXTadOffsets[j]], extraParams), extraParams);

            z[zOffsets[i]] = OpType::postProcess(s, tadLen, extraParams);
        }
    };

    samediff::Threads::parallel_for(func, 0, shape::length(zShapeInfo));

    RELEASE(outerXTadShapeInfo, workspace);
    RELEASE(innerXTadShapeInfo, workspace);
    RELEASE(zOffsets, workspace);
    if(!sameOffsets1)
        RELEASE(outerXTadOffsets, workspace);
    if(!sameOffsets2)
        RELEASE(innerXTadOffsets, workspace);
}
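The generic fallback above trades per-TAD shapeInfo/offsets machinery for two flat offset tables: one offset per output element (where its TAD starts in x) and one offset per element inside a TAD, so the hot loop is a plain x[outer[i] + inner[j]] gather. A standalone toy version of that idea (ordinary C++, no libnd4j types, illustration only) for a C-ordered 2x3x4 buffer sum-reduced over its last two axes:

#include <cstdio>
#include <vector>

int main() {
    const int d0 = 2, d1 = 3, d2 = 4;
    std::vector<float> x(d0 * d1 * d2);
    for (size_t i = 0; i < x.size(); ++i) x[i] = static_cast<float>(i);

    // "outer" offsets: where each TAD starts (one entry per output element along axis 0)
    std::vector<long> outer(d0);
    for (int i0 = 0; i0 < d0; ++i0) outer[i0] = static_cast<long>(i0) * d1 * d2;

    // "inner" offsets: positions of the d1*d2 elements inside a single TAD
    std::vector<long> inner(d1 * d2);
    for (int i1 = 0; i1 < d1; ++i1)
        for (int i2 = 0; i2 < d2; ++i2)
            inner[i1 * d2 + i2] = static_cast<long>(i1) * d2 + i2;

    // sum-reduce using the same x[outer[i] + inner[j]] access pattern as reduceDefault's hot loop
    for (int i = 0; i < d0; ++i) {
        float s = 0.f;
        for (size_t j = 0; j < inner.size(); ++j)
            s += x[outer[i] + inner[j]];
        std::printf("z[%d] = %.1f\n", i, s);       // z[0] = 66.0, z[1] = 210.0
    }
    return 0;
}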
 //////////////////////////////////////////////////////////////////////////////
 template<typename X, typename Z, typename E>
 template <typename OpType>
+void sd::ReductionLoops<X, Z, E>::loopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong *xShapeInfo, Z* z, const Nd4jLong *zShapeInfo, const int* dims, E* extraParams) {
+
+    const int xRank = shape::rank(xShapeInfo);
+    const int zRank = shape::rank(zShapeInfo);
+
+    // shape::printShapeInfoLinear(xShapeInfo);
+    // shape::printShapeInfoLinear(zShapeInfo);
+    // shape::printIntArray(dims, shape::rank(xShapeInfo));
+
+    if(xRank == 2 && zRank == 1)
+        reduceExec21<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 3 && zRank == 1)
+        reduceExec31<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 3 && zRank == 2)
+        reduceExec32<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 4 && zRank == 1)
+        reduceExec41<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 4 && zRank == 2)
+        reduceExec42<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 4 && zRank == 3)
+        reduceExec43<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 5 && zRank == 1)
+        reduceExec51<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 5 && zRank == 2)
+        reduceExec52<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 5 && zRank == 3)
+        reduceExec53<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else if(xRank == 5 && zRank == 4)
+        reduceExec54<X,Z,E,OpType>(x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+    else
+        reduceDefault<X,Z,E,OpType>(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+}
-
-    switch (kindOfLoop) {
-
-        //*********************************************//
-        // case LoopKind::SMALLARR2DX: {
-        //     shape::printShapeInfoLinear(xShapeInfo);
-        //     shape::printShapeInfoLinear(zShapeInfo);
-        //     const auto xLen = zLen * tadLen;
-        //     for (uint i = 0; i < xLen; ++i) {
-        //         const auto zOffset = shape::subArrayOffset(i, xShapeInfo, zShapeInfo, dimsToExclude, dimsLen);
-        //         const uint tadInd = (i / tadEws) % tadLen;
-        //         auto startVal = tadInd ? z[zOffset] : static_cast<Z>(OpType::startingValue(x));
-        //         z[zOffset] = OpType::update(startVal, OpType::op(x[i], extraParams), extraParams);
-        //         if(tadInd == tadLen - 1)
-        //             z[zOffset] = OpType::postProcess(z[zOffset], tadLen, extraParams);
-        //         printf("%u - %lld\n", i, zOffset);
-        //     }
-        // }
-        case LoopKind::SMALLARR2DX: {
-            const auto uTadLen       = static_cast<uint>(tadLen);
-            const auto uZLenMinusOne = static_cast<uint>(zLen - 1);
-            const auto xLen          = static_cast<uint>(zLen * uTadLen);
-            const auto sv            = static_cast<Z>(OpType::startingValue(x));
-
-            for (uint i = 0; i <= uZLenMinusOne; i++)
-                z[i] = OpType::startingValue(x);
-
-            uint zOffset = 0;
-            for (uint i = 0; i < xLen; ++i) {
-                z[zOffset] = OpType::update(z[zOffset], OpType::op(x[i], extraParams), extraParams);
-                zOffset = zOffset == uZLenMinusOne ? 0 : zOffset + 1;
-            }
-
-            for (uint i = 0; i <= uZLenMinusOne; i++)
-                z[i] = OpType::postProcess(z[i], tadLen, extraParams);
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::EWS1: {
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong j = 0; j < tadLen; j++)
-                    s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams);
-
-                z[i] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::EWSNONZERO: {
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong j = 0; j < tadLen; j++)
-                    s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams);
-
-                z[i * zEws] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::RANK1: {
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong i0 = 0; i0 < tadLen; ++i0)
-                    s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams);
-
-                z[i] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::RANK2: {
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
-                    for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
-                        s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams);
-
-                z[i] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::RANK3: {
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
-                    for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
-                        for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
-                            s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams);
-
-                z[i] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::RANK4: {
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
-                    for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
-                        for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
-                            for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3)
-                                s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams);
-
-                z[i] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::RANK5: {
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
-                    for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
-                        for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
-                            for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3)
-                                for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4)
-                                    s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams);
-
-                z[i] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::X_EWSNONZERO: {
-            uint castZShapeInfo[MAX_RANK];
-            const bool canCastZ = sd::DataTypeUtils::castShapeInfo<uint>(zShapeInfo, castZShapeInfo);
-
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong j = 0; j < tadLen; j++)
-                    s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams);
-
-                auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
-                z[zOffset] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        case LoopKind::Z_EWSNONZERO: {
-            uint castTadShapeInfo[MAX_RANK];
-            const bool canCastTad = sd::DataTypeUtils::castShapeInfo<uint>(tadShapeInfo, castTadShapeInfo);
-
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong j = 0; j < tadLen; j++) {
-                    auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
-                    s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams);
-                }
-
-                z[i * zEws] = OpType::postProcess(s, tadLen, extraParams);
-            };
-        }
-            break;
-
-        //*********************************************//
-        default: {
-            auto innertadOffsets = new Nd4jLong[tadLen];
-            shape::calcOffsets(tadShapeInfo, innertadOffsets);
-
-            uint castZShapeInfo[MAX_RANK];
-            const bool canCastZ = sd::DataTypeUtils::castShapeInfo<uint>(zShapeInfo, castZShapeInfo);
-
-            for (auto i = start; i < stop; i++) {
-                auto tad = x + tadOffsets[i];
-                auto s = OpType::startingValue(tad);
-
-                for (Nd4jLong j = 0; j < tadLen; j++)
-                    s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams);
-
-                auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
-                z[zOffset] = OpType::postProcess(s, tadLen, extraParams);
-            };
-
-            delete[] innertadOffsets;
-        }
-    }
-}
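The rank-specialized reduceExecXY kernels that the new loopReduce dispatches to replace the generic TAD-offset switch above with plain nested loops over known ranks. A minimal standalone sketch of that idea, assuming a dense c-order layout and a simple sum (this is illustrative only, not the actual reduceExec32):

    #include <cstdio>

    // Sum x[i][j][k] over k into z[i][j] for a dense, c-ordered 3-D array.
    static void sumLastAxis3(const float* x, float* z, int d0, int d1, int d2) {
        for (int i = 0; i < d0; ++i)
            for (int j = 0; j < d1; ++j) {
                float s = 0.f;                               // startingValue
                for (int k = 0; k < d2; ++k)
                    s += x[(i * d1 + j) * d2 + k];           // update(op(...))
                z[i * d1 + j] = s;                           // postProcess
            }
    }

    int main() {
        float x[12] = {1,2,3, 4,5,6, 7,8,9, 10,11,12}, z[4];
        sumLastAxis3(x, z, 2, 2, 3);
        printf("%g %g %g %g\n", z[0], z[1], z[2], z[3]);     // 6 15 24 33
    }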
@@ -52,11 +52,9 @@ namespace sd {
         static Nd4jLong* copyShapeInfoAndType(const Nd4jLong* inShapeInfo, const Nd4jLong* shapeInfoToGetTypeFrom, const bool copyStrides, memory::Workspace* workspace = nullptr);

         /**
-        * allocates memory for new shapeInfo and copy all information from inShapeInfo to new shapeInfo except dimensions in dimsToExclude (unit dimensions) and corresponding strides
-        * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2
-        * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99}
+        * allocates memory for sub-array shapeInfo and copy shape and strides at axes(positions) stored in dims
         */
-        static Nd4jLong* copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace = nullptr);
+        static Nd4jLong* createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, memory::Workspace* workspace = nullptr);

         static Nd4jLong* emptyShapeInfo(const sd::DataType dataType, memory::Workspace* workspace = nullptr);
@@ -40,6 +40,12 @@ namespace sd {
         static const Nd4jLong* evalReduceShapeInfo(const char order, std::vector<int>& dimensions, const NDArray& arr, const bool keepDims = false, const bool supportOldShapes = false, sd::memory::Workspace* workspace = nullptr);
         static const Nd4jLong* evalReduceShapeInfo(const char order, std::vector<int>& dimensions, const Nd4jLong* shapeInfo, const bool keepDims = false, const bool supportOldShapes = false, sd::memory::Workspace* workspace = nullptr);

+        // for example
+        // if rank = 3 and dimsToExclude = {0,2} then output = {1,0,2}, if rank = 3 and dimsToExclude = {2} then output = {0,1,2}
+        // if rank = 3 and dimsToExclude = {0} then output = {1,2,0}, if rank = 4 and dimsToExclude = {0,3} then output = {1,2,0,3}
+        static std::vector<int> evalDimsForReduceOp(const int rank, const std::vector<int>& dimsToExclude);
+
         /**
         * evaluate output shape for reduce operation when input shape is empty
         * behavior is analogous to tf
@@ -94,9 +94,9 @@ namespace sd {
         auto tadOffsets = Environment::getInstance().isCPU() ? pack.primaryOffsets() : pack.specialOffsets();

         if (_opType == 0)
-            NativeOpExecutioner::execReduceFloat(LaunchContext::defaultContext(), _opNum, _x->buffer(), _x->shapeInfo(), _x->specialBuffer(), _x->specialShapeInfo(), nullptr, _z->buffer(), _z->shapeInfo(), _z->specialBuffer(), _z->specialShapeInfo(), nullptr, _axis.size(), tadOnlyShapeInfo, tadOffsets);
+            NativeOpExecutioner::execReduceFloat(LaunchContext::defaultContext(), _opNum, _x->buffer(), _x->shapeInfo(), _x->specialBuffer(), _x->specialShapeInfo(), nullptr, _z->buffer(), _z->shapeInfo(), _z->specialBuffer(), _z->specialShapeInfo(), nullptr, _axis.size());
         else
-            NativeOpExecutioner::execReduceSame(LaunchContext::defaultContext(), _opNum, _x->buffer(), _x->shapeInfo(), _x->specialBuffer(), _x->specialShapeInfo(), nullptr, _z->buffer(), _z->shapeInfo(), _z->specialBuffer(), _z->specialShapeInfo(), nullptr, _axis.size(), tadOnlyShapeInfo, tadOffsets);
+            NativeOpExecutioner::execReduceSame(LaunchContext::defaultContext(), _opNum, _x->buffer(), _x->shapeInfo(), _x->specialBuffer(), _x->specialShapeInfo(), nullptr, _z->buffer(), _z->shapeInfo(), _z->specialBuffer(), _z->specialShapeInfo(), nullptr, _axis.size());
         }

         manager.synchronize();
@@ -184,6 +184,43 @@ ConstantShapeBuffer& ConstantShapeHelper::createShapeInfoWithUnitiesForBroadcast
     return bufferForShapeInfo(descriptor);
 }
-} // namespace sd
+
+////////////////////////////////////////////////////////////////////////
+ConstantShapeBuffer& ConstantShapeHelper::createShapeInfoWithNoUnitiesForReduce(const Nd4jLong* inShapeInfo, const std::vector<int> &dimsWithUnities, sd::memory::Workspace* workspace) {
+
+    Nd4jLong* newShapeInfo = nullptr;
+    ALLOCATE(newShapeInfo, workspace, shape::shapeInfoLength(shape::rank(inShapeInfo) - dimsWithUnities.size()), Nd4jLong);
+
+    int temp;
+    if(dimsWithUnities.size() == 1 && shape::isCommonVector(inShapeInfo, temp) && temp == dimsWithUnities[0]) {
+        auto dims = ShapeUtils::evalDimsToExclude(shape::rank(inShapeInfo), {temp});
+        shape::excludeUnitiesFromShapeInfo(inShapeInfo, dims.data(), dims.size(), newShapeInfo);
+    } else {
+        shape::excludeUnitiesFromShapeInfo(inShapeInfo, dimsWithUnities.data(), dimsWithUnities.size(), newShapeInfo);
+    }
+
+    ShapeDescriptor descriptor(newShapeInfo);
+
+    RELEASE(newShapeInfo, workspace);
+
+    return bufferForShapeInfo(descriptor);
+}
+
+////////////////////////////////////////////////////////////////////////
+ConstantShapeBuffer& ConstantShapeHelper::createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, sd::memory::Workspace* workspace) {
+
+    Nd4jLong* newShapeInfo = ShapeBuilders::createSubArrShapeInfo(inShapeInfo, dims, dimsSize, workspace);
+
+    ShapeDescriptor descriptor(newShapeInfo);
+
+    RELEASE(newShapeInfo, workspace);
+
+    return bufferForShapeInfo(descriptor);
+}
+
+}

 #endif
@@ -443,17 +443,17 @@ static void batchedGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
             // calculate index of current batch
             Nd4jLong batchInd;
             if(cRank > 2)
-                batchInd = shape::coords2index(cShapeInfo, cCoords.data(), cRank - 2, cBatchDims);
+                batchInd = shape::coords2index(cShapeInfo, cBatchDims, cRank - 2, cCoords.data());

             // evaluate A coordinates
             if(aRank > 2)
-                shape::index2coords(batchInd, aShapeInfo, aCoords.data(), aRank - 2, aBatchDims);
+                shape::index2coords(batchInd, aShapeInfo, aBatchDims, aRank - 2, aCoords.data());
             aCoords[aMaxis] = cCoords[cMaxis];
             aCoords[aKaxis] = 0;

             // evaluate B coordinates
             if(bRank > 2)
-                shape::index2coords(batchInd, bShapeInfo, bCoords.data(), bRank - 2, bBatchDims);
+                shape::index2coords(batchInd, bShapeInfo, bBatchDims, bRank - 2, bCoords.data());
             bCoords[bKaxis] = 0;
             bCoords[bNaxis] = cCoords[cNaxis];
@@ -26,20 +26,19 @@ namespace sd {

     template<typename X, typename Z>
     template <typename OpType>
-    void ReductionBoolLoops<X, Z>::innerloopReduce(const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) {
+    void ReductionBoolLoops<X, Z>::innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const int* dims, X* extraParams) {
 #ifndef INLINE_LOOPS
-        ReductionLoops<X,Z,X>::template loopReduce<OpType>(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
+        ReductionLoops<X,Z,X>::template loopReduce<OpType>(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
 #endif
     }

     template<typename X, typename Y>
-    void ReductionBoolLoops<X, Y>::wrapper(const int opNum,
+    void ReductionBoolLoops<X, Y>::wrapper(const int opNum, sd::memory::Workspace* workspace,
                                            const X *x, const Nd4jLong *xShapeInfo,
                                            Y *z, const Nd4jLong *zShapeInfo,
-                                           const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets,
-                                           X *extraParams, int64_t start, int64_t stop) {
+                                           const int *dims, X *extraParams) {
 #ifndef INLINE_LOOPS
-        DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_BOOL_OPS);
+        DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams), REDUCE_BOOL_OPS);
 #endif
     }
@@ -28,20 +28,19 @@ namespace sd {

     template<typename X, typename Z>
     template <typename OpType>
-    void ReductionFloatLoops<X, Z>::innerloopReduce(const X * x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) {
+    void ReductionFloatLoops<X, Z>::innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong* xShapeInfo, Z* z, const Nd4jLong* zShapeInfo, const int* dims, Z* extraParams) {
 #ifndef INLINE_LOOPS
-        ReductionLoops<X,Z,Z>::template loopReduce<OpType>(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
+        ReductionLoops<X,Z,Z>::template loopReduce<OpType>(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
 #endif
     }

     template<typename X, typename Y>
-    void ReductionFloatLoops<X, Y>::wrapper(const int opNum, const X *x, const Nd4jLong *xShapeInfo,
+    void ReductionFloatLoops<X, Y>::wrapper(const int opNum, sd::memory::Workspace* workspace,
+                                            const X *x, const Nd4jLong *xShapeInfo,
                                             Y *z, const Nd4jLong *zShapeInfo,
-                                            const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets,
-                                            Y *extraParams,
-                                            int64_t start, int64_t stop) {
+                                            const int *dims, Y *extraParams) {
 #ifndef INLINE_LOOPS
-        DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS);
+        DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams), REDUCE_FLOAT_OPS);
 #endif
     }
@@ -33,18 +33,19 @@ namespace sd {

     template<typename X, typename Z>
     template <typename OpType>
-    void ReductionLongLoops<X, Z>::innerloopReduce(const X * x, const Nd4jLong* xShapeInfo, Z *z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) {
+    void ReductionLongLoops<X, Z>::innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong* xShapeInfo, Z *z, const Nd4jLong* zShapeInfo, const int* dims, X* extraParams) {
 #ifndef INLINE_LOOPS
-        ReductionLoops<X,Z,X>::template loopReduce<OpType>(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
+        ReductionLoops<X,Z,X>::template loopReduce<OpType>(workspace, x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
 #endif
     }

     template<typename X, typename Y>
-    void ReductionLongLoops<X, Y>::wrapper(const int opNum, const X *x, const Nd4jLong *xShapeInfo, Y *z,
-                                           const Nd4jLong *zShapeInfo, const Nd4jLong *tadShapeInfo,
-                                           const Nd4jLong *tadOffsets, X *extraParams, int64_t start, int64_t stop) {
+    void ReductionLongLoops<X, Y>::wrapper(const int opNum, sd::memory::Workspace* workspace,
+                                           const X *x, const Nd4jLong *xShapeInfo,
+                                           Y *z, const Nd4jLong *zShapeInfo,
+                                           const int *dims, X *extraParams) {
 #ifndef INLINE_LOOPS
-        DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_LONG_OPS);
+        DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams), REDUCE_LONG_OPS);
 #endif
     }
@@ -26,23 +26,23 @@ namespace sd {

     template<typename X>
     template <typename OpType>
-    void ReductionSameLoops<X>::innerloopReduce(const X* x, const Nd4jLong* xShapeInfo, X* z, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo, const Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) {
+    void ReductionSameLoops<X>::innerloopReduce(sd::memory::Workspace* workspace, const X* x, const Nd4jLong* xShapeInfo, X* z, const Nd4jLong* zShapeInfo, const int* dims, X* extraParams) {
 #ifndef INLINE_LOOPS
-        ReductionLoops<X,X,X>::template loopReduce<OpType>(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop);
+        ReductionLoops<X,X,X>::template loopReduce<OpType>(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
 #endif
     }

     template<typename X>
-    void ReductionSameLoops<X>::wrapper(const int opNum, const X *vx, const Nd4jLong *xShapeInfo, X *vz,
-                                        const Nd4jLong *zShapeInfo, const Nd4jLong *tadShapeInfo,
-                                        const Nd4jLong *tadOffsets,
-                                        X *vextraParams, int64_t start, int64_t stop) {
+    void ReductionSameLoops<X>::wrapper(const int opNum, sd::memory::Workspace* workspace,
+                                        const X *vx, const Nd4jLong *xShapeInfo,
+                                        X *z, const Nd4jLong *zShapeInfo,
+                                        const int *dims, X *vextraParams) {
 #ifndef INLINE_LOOPS
         auto x = reinterpret_cast<X *>(vx);
         auto z = reinterpret_cast<X *>(vz);
         auto extraParams = reinterpret_cast<X *>(vextraParams);

-        DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_SAME_OPS);
+        DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams), REDUCE_SAME_OPS);
 #endif
     }
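The same signature change repeats across the bool/float/long/same reduction wrappers: callers no longer hand in per-thread [start, stop) slices and precomputed TAD shape/offsets, they pass a workspace plus the reduction dims, and the loop implementation parallelizes internally. A standalone sketch of that inversion of control, using plain std::thread rather than the samediff threading helpers:

    #include <cstdio>
    #include <thread>

    // Old-style contract: the caller decides each thread's [start, stop) slice.
    static void sumRange(const float* x, double* partial, long long start, long long stop) {
        double s = 0;
        for (long long i = start; i < stop; ++i) s += x[i];
        *partial = s;
    }

    // New-style contract: the callee owns the parallelism; the caller only passes data.
    static double sumAll(const float* x, long long n) {
        double p0 = 0, p1 = 0;
        std::thread t0(sumRange, x, &p0, 0LL, n / 2), t1(sumRange, x, &p1, n / 2, n);
        t0.join(); t1.join();
        return p0 + p1;
    }

    int main() {
        float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        printf("%g\n", sumAll(x, 8)); // 36
    }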
@@ -24,6 +24,7 @@
 #include <helpers/ShapeBuilders.h>
 #include <execution/AffinityManager.h>
 #include <helpers/ConstantHelper.h>
+#include <helpers/ShapeUtils.h>
 #include <array/PrimaryPointerDeallocator.h>
 #include <array/CudaPointerDeallocator.h>

@@ -187,4 +188,38 @@ ConstantShapeBuffer& ConstantShapeHelper::createShapeInfoWithUnitiesForBroadcas
     return bufferForShapeInfo(descriptor);
 }
+
+////////////////////////////////////////////////////////////////////////
+ConstantShapeBuffer& ConstantShapeHelper::createShapeInfoWithNoUnitiesForReduce(const Nd4jLong* inShapeInfo, const std::vector<int> &dimsWithUnities, sd::memory::Workspace* workspace) {
+
+    Nd4jLong* newShapeInfo = nullptr;
+    ALLOCATE(newShapeInfo, workspace, shape::shapeInfoLength(shape::rank(inShapeInfo) - dimsWithUnities.size()), Nd4jLong);
+
+    int temp;
+    if(dimsWithUnities.size() == 1 && shape::isCommonVector(inShapeInfo, temp) && temp == dimsWithUnities[0]) {
+        auto dims = ShapeUtils::evalDimsToExclude(shape::rank(inShapeInfo), {temp});
+        shape::excludeUnitiesFromShapeInfo(inShapeInfo, dims.data(), dims.size(), newShapeInfo);
+    } else {
+        shape::excludeUnitiesFromShapeInfo(inShapeInfo, dimsWithUnities.data(), dimsWithUnities.size(), newShapeInfo);
+    }
+
+    ShapeDescriptor descriptor(newShapeInfo);
+
+    RELEASE(newShapeInfo, workspace);
+
+    return bufferForShapeInfo(descriptor);
+}
+
+////////////////////////////////////////////////////////////////////////
+ConstantShapeBuffer& ConstantShapeHelper::createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, sd::memory::Workspace* workspace) {
+
+    Nd4jLong* newShapeInfo = ShapeBuilders::createSubArrShapeInfo(inShapeInfo, dims, dimsSize, workspace);
+
+    ShapeDescriptor descriptor(newShapeInfo);
+
+    RELEASE(newShapeInfo, workspace);
+
+    return bufferForShapeInfo(descriptor);
+}
+
 }
@@ -571,17 +571,17 @@ static __global__ void batchedCudaGemm(const void* vA, const Nd4jLong* aShapeInf
         // calculate index of current batch
         Nd4jLong batchInd;
         if(cBatchDims != nullptr)
-            batchInd = shape::coords2index(cShapeInfo, cCoords, cRank - 2, cBatchDims);
+            batchInd = shape::coords2index(cShapeInfo, cBatchDims, cRank - 2, cCoords);

         // evaluate A coordinates
         if(aBatchDims != nullptr)
-            shape::index2coords(batchInd, aShapeInfo, aCoords, aRank - 2, aBatchDims);
+            shape::index2coords(batchInd, aShapeInfo, aBatchDims, aRank - 2, aCoords);
         aCoords[aMaxis] = cCoords[cMaxis];
         aCoords[aKaxis] = 0;

         // evaluate B coordinates
         if(bBatchDims != nullptr)
-            shape::index2coords(batchInd, bShapeInfo, bCoords, bRank - 2, bBatchDims);
+            shape::index2coords(batchInd, bShapeInfo, bBatchDims, bRank - 2, bCoords);
         bCoords[bKaxis] = 0;
         bCoords[bNaxis] = cCoords[cNaxis];
@@ -140,14 +140,26 @@ namespace sd {
     }

 ////////////////////////////////////////////////////////////////////////////////
-Nd4jLong* ShapeBuilders::copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace) {
+Nd4jLong* ShapeBuilders::createSubArrShapeInfo(const Nd4jLong* inShapeInfo, const int* dims, const int dimsSize, memory::Workspace* workspace) {

-    Nd4jLong *outShapeInfo = nullptr;
-    ALLOCATE(outShapeInfo, workspace, shape::shapeInfoLength(inShapeInfo[0] - dimsSize), Nd4jLong);
+    Nd4jLong *subArrShapeInfo = nullptr;
+    ALLOCATE(subArrShapeInfo, workspace, shape::shapeInfoLength(dimsSize), Nd4jLong);

-    shape::excludeUnitiesFromShapeInfo(inShapeInfo, dimsSize, dimsToExclude, outShapeInfo);
+    subArrShapeInfo[0] = dimsSize;                                 // rank
+    sd::ArrayOptions::copyDataType(subArrShapeInfo, inShapeInfo);  // type
+    subArrShapeInfo[2*dimsSize + 3] = shape::order(inShapeInfo);   // order

-    return outShapeInfo;
+    Nd4jLong* shape   = shape::shapeOf(subArrShapeInfo);
+    Nd4jLong* strides = shape::stride(subArrShapeInfo);
+
+    for(int i = 0; i < dimsSize; ++i) {
+        shape[i]   = shape::sizeAt(inShapeInfo, dims[i]);
+        strides[i] = shape::strideAt(inShapeInfo, dims[i]);
+    }
+
+    shape::checkStridesEwsAndOrder(subArrShapeInfo);
+
+    return subArrShapeInfo;
 }

 }
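createSubArrShapeInfo picks the shape and stride entries of the chosen axes out of the flat {rank, shape..., strides..., type, ews, order} buffer. A small standalone illustration of that selection, with plain arrays and illustrative values only:

    #include <cstdio>

    int main() {
        long long shape[]   = {2, 3, 4};
        long long strides[] = {12, 4, 1};   // c-order strides of a 2x3x4 array
        int dims[] = {0, 2};                // keep axes 0 and 2

        for (int d : dims)
            printf("axis %d -> shape %lld, stride %lld\n", d, shape[d], strides[d]);
        // axis 0 -> shape 2, stride 12
        // axis 2 -> shape 4, stride 1
    }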
@@ -1062,6 +1062,17 @@ bool ShapeUtils::areShapesEqual(const Nd4jLong* shapeInfo, const std::vector<Nd4
     return true;
 }

+////////////////////////////////////////////////////////////////////////////////
+std::vector<int> ShapeUtils::evalDimsForReduceOp(const int rank, const std::vector<int>& dimsToExclude) {
+
+    std::vector<int> output = ShapeUtils::evalDimsToExclude(rank, dimsToExclude);
+
+    for(uint j = 0; j < dimsToExclude.size(); ++j)
+        output.emplace_back(dimsToExclude[j]);
+
+    return output;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /*
 bool ShapeUtils::isSubArrayCase(const NDArray& arr1, const NDArray& arr2, std::vector<int>& sameDims) {
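evalDimsForReduceOp lists the kept axes first (via evalDimsToExclude) and appends the reduced axes, which appears to be the ordering the dims argument of the new reduce path relies on. A standalone re-computation of the example from the header comment (rank 3, dimsToExclude {0,2} -> {1,0,2}):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        const int rank = 3;
        std::vector<int> dimsToExclude = {0, 2};

        std::vector<int> output;
        for (int d = 0; d < rank; ++d)      // kept axes first
            if (std::find(dimsToExclude.begin(), dimsToExclude.end(), d) == dimsToExclude.end())
                output.push_back(d);
        output.insert(output.end(), dimsToExclude.begin(), dimsToExclude.end());

        for (int d : output) printf("%d ", d); // 1 0 2
    }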
@@ -901,6 +901,16 @@ namespace shape {
     ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0);
     ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0);
     ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0);
+    ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, const int* dims); // length of dims is equal to rank of shapeInfo
+
+    // all three arrays should have same rank
+    // all three arrays should have same dimensions or some of them are 1 (that is satisfy broadcasting principle), strides may be different
+    // shapeInfo1 - first array should have max length compared to rest of two arrays
+    ND4J_EXPORT _CUDA_HD void getOffsetBroadcast(const Nd4jLong& startInd, const Nd4jLong ind,
+                                                 const Nd4jLong* shapeInfo1, const Nd4jLong* shapeInfo2, const Nd4jLong* shapeInfo3,
+                                                 const bool sameOffsets12, const bool sameOffsets13,
+                                                 int* coords,
+                                                 Nd4jLong& offset1, Nd4jLong& offset2, Nd4jLong& offset3);

     ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank);
@@ -918,11 +928,12 @@ namespace shape {

     ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords);
     ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords);
+    // ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, const int* dims, Nd4jLong *coords);

     /**
     * take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
     */
-    ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims);
+    ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, const int* dims, const int dimsLen, int *coords);

     /**
     * Convert coordinates to the corresponding linear index (sequence number in other words)
@@ -935,7 +946,7 @@ namespace shape {
     /**
     * take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
     */
-    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims);
+    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int* dims, const int dimsSize, const int *coords);

     /**
     * increment n-dimensional array by one iteration by changing coord appropriately
@@ -951,7 +962,7 @@ namespace shape {
     ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo);
     ND4J_EXPORT _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, const bool useUnsigned);

-    ND4J_EXPORT _CUDA_HD void printShapeInfo(Nd4jLong *shapeInfo);
+    ND4J_EXPORT _CUDA_HD void printShapeInfo(const Nd4jLong *shapeInfo);

     ND4J_EXPORT _CUDA_HD void printShapeInfoLinear(const Nd4jLong *shapeInfo);
@@ -1057,10 +1068,10 @@ namespace shape {
     ND4J_EXPORT _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities);

     /**
-    * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2
+    * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude(points on unity dimensions) = {1,3}, dimsSize = 2
     * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99}
     */
-    INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo);
+    INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int* dimsToExclude, const int dimsSize, Nd4jLong* outShapeInfo);

     /**
     * get stride over contiguous axis (contiguous axis must have stride = 1)
@@ -1847,13 +1858,13 @@ INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape,
     return index;
 }

-INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims) {
+INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int* dims, const int dimsLen, const int *coords) {

     Nd4jLong index, shift = 1;;

-    index = coords[tadDims[dimsSize - 1]];
-    for(uint i = dimsSize - 1; i >= 1; --i) {
-        shift *= shapeInfo[tadDims[i]];
+    index = coords[dims[dimsLen - 1]];
+    for(uint i = dimsLen - 1; i >= 1; --i) {
+        shift *= shapeInfo[dims[i]];
         index += shift * coords[i - 1];
     }

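A quick standalone check of the intended coords2index arithmetic over a dims subset (this mirrors the math rather than copying the function body): for a 2x3x4 array and dims = {1,2}, coordinates (j, k) on the kept axes map to j*4 + k.

    #include <cstdio>

    int main() {
        long long shape[] = {2, 3, 4};
        int dims[] = {1, 2};
        long long coords[] = {0, 1, 2};     // full-rank coords; axis 0 ignored here

        long long index = coords[dims[1]];  // innermost listed axis first
        long long shift = shape[dims[1]];   // its extent, 4
        index += shift * coords[dims[0]];   // + j * 4

        printf("%lld\n", index);            // 6
    }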
@@ -3324,6 +3335,18 @@ INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong
     return offset;
 }

+//////////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset) {
+
+    Nd4jLong offset = baseOffset;
+
+    for(uint i = 1; i <= shapeInfo[0]; ++i)
+        if(shapeInfo[i] != 1)
+            offset += coords[i - 1] * shapeInfo[shapeInfo[0] + i];
+
+    return offset;
+}
+
 //////////////////////////////////////////////////////////////////////////
 INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset) {
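The new uint overload is the usual stride dot-product over non-unit axes, reading shape and strides straight out of the flat shapeInfo buffer. A standalone run of the same computation (the trailing type/ews/order entries here are placeholders):

    #include <cstdio>

    int main() {
        long long shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; // {rank, shape..., strides..., type, ews, order}
        unsigned coords[] = {1, 2, 3};

        long long offset = 0;
        for (long long i = 1; i <= shapeInfo[0]; ++i)
            if (shapeInfo[i] != 1)
                offset += coords[i - 1] * shapeInfo[shapeInfo[0] + i];

        printf("%lld\n", offset); // 1*12 + 2*4 + 3*1 = 23
    }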
@@ -3337,17 +3360,78 @@ INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coor
 }

 //////////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset) {
+INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, const int* dims) {

-    Nd4jLong offset = baseOffset;
+    Nd4jLong offset = 0;

     for(uint i = 1; i <= shapeInfo[0]; ++i)
         if(shapeInfo[i] != 1)
-            offset += coords[i - 1] * shapeInfo[shapeInfo[0] + i];
+            offset += coords[dims[i - 1]] * shapeInfo[shapeInfo[0] + i];

     return offset;
 }

+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD void getOffsetBroadcast(const Nd4jLong& startInd, const Nd4jLong ind,
+                                           const Nd4jLong* shapeInfo1, const Nd4jLong* shapeInfo2, const Nd4jLong* shapeInfo3,
+                                           const bool sameOffsets12, const bool sameOffsets13,
+                                           int* coords,
+                                           Nd4jLong& offset1, Nd4jLong& offset2, Nd4jLong& offset3) {
+
+    const Nd4jLong* shape1   = shape::shapeOf(shapeInfo1);
+    const Nd4jLong* strides1 = shape::stride(shapeInfo1);
+    const Nd4jLong* shape2   = shape::shapeOf(shapeInfo2);
+    const Nd4jLong* strides2 = shape::stride(shapeInfo2);
+    const Nd4jLong* shape3   = shape::shapeOf(shapeInfo3);
+    const Nd4jLong* strides3 = shape::stride(shapeInfo3);
+
+    if(startInd == ind) {
+
+        if(shape::rank(shapeInfo1) == 0) {
+            offset1 = offset2 = offset3 = 0;
+            return;
+        }
+
+        shape::index2coords(ind, shapeInfo1, coords);
+        offset1 = shape::getOffset(shapeInfo1, coords);
+
+        if(sameOffsets12)
+            offset2 = offset1;
+        else
+            offset2 = shape::getOffset(shapeInfo2, coords);
+
+        if(sameOffsets13)
+            offset3 = offset1;
+        else
+            offset3 = shape::getOffset(shapeInfo3, coords);
+
+        return;
+    }
+
+    int axis = shapeInfo1[0] - 1;
+    while(coords[axis] == shape1[axis] - 1) {
+        if(!sameOffsets12 && shape2[axis] != 1)
+            offset2 -= (shape2[axis] - 1) * strides2[axis];
+        if(!sameOffsets13 && shape3[axis] != 1)
+            offset3 -= (shape3[axis] - 1) * strides3[axis];
+        if(shape1[axis] != 1)
+            offset1 -= (shape1[axis] - 1) * strides1[axis];
+        coords[axis--] = 0;
+    }
+
+    ++coords[axis];
+    offset1 += strides1[axis];
+
+    if(!sameOffsets12 && shape2[axis] != 1)
+        offset2 += strides2[axis];
+    if(!sameOffsets13 && shape3[axis] != 1)
+        offset3 += strides3[axis];
+
+    if(sameOffsets12)
+        offset2 = offset1;
+    if(sameOffsets13)
+        offset3 = offset1;
+}
+
 /**
 * Returns the tensor along dimension
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
INLINEDEF _CUDA_HD void printShapeInfo(Nd4jLong *shapeInfo) {
|
INLINEDEF _CUDA_HD void printShapeInfo(const Nd4jLong *shapeInfo) {
|
||||||
int rank = shape::rank(shapeInfo);
|
int rank = shape::rank(shapeInfo);
|
||||||
Nd4jLong *shape = shape::shapeOf(shapeInfo);
|
Nd4jLong *shape = shape::shapeOf(shapeInfo);
|
||||||
printf("Rank %d\n",rank);
|
printf("Rank %d\n",rank);
|
||||||
|
@@ -4583,89 +4667,92 @@ INLINEDEF void calcOffsets(const Nd4jLong* shapeInfo, Nd4jLong* offsets, const c
 //////////////////////////////////////////////////////////////////////
 INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong* strides, Nd4jLong* offsets, const char order) {

-    // if(false) { // tests showed that this code did calculation notably slower even for big N
-    //     Nd4jLong indexes[MAX_RANK];
-    //     PRAGMA_OMP_PARALLEL_FOR_ARGS(private(indexes))
-    //     for (Nd4jLong i = 0; i < N; ++i) {
-    //         shape::index2coords(rank, shape, i, indexes);
-    //         subArrOffsets[i] = 0;
-    //         for (int j = 0; j < rank; ++j)
-    //             if(shape[j] != 1)
-    //                 subArrOffsets[i] += indexes[j] * strides[j];
-    //     }
-    //     return;
-    // }
+    const uint64_t len = shape::prodLong(shape, rank);

     // set offset for first sub-array, it is equal to zero always
     offsets[0] = 0;

-    Nd4jLong * idx          = new Nd4jLong[rank];
-    Nd4jLong* offsetPerDim  = new Nd4jLong[rank];
-    memset(idx, 0, sizeof(Nd4jLong) * rank);
-
-    PRAGMA_OMP_SIMD
-    for (int k = 0; k < rank; ++k)
-        offsetPerDim[k] = (shape[k] - 1) * strides[k];
-
-    Nd4jLong init = 0, i = 1;
-    // nested loops - calculation of sub-array offsets
+    uint coords[MAX_RANK];
+    memset(coords, 0, sizeof(uint) * rank);

     if(order == 'c') {

-        Nd4jLong rankMinusOne = rank - 1, j = rankMinusOne;
-
-        while(j >= 0) {
-
-            if(shape[j] == 1) { --j; continue; } // ignore dimensions equal to unity
-
-            if(j == rankMinusOne) { // last dimension
-                for(int l = 1; l < shape[j]; ++l) {
-                    offsets[i] = offsets[i - 1] + strides[j];
-                    i++;
-                }
-                --j;
-            }
-            else if(idx[j] < shape[j] - 1) {
-                init += strides[j];
-                offsets[i++] = init;
-                ++idx[j];
-                j = rankMinusOne;
-            }
-            else {
-                init -= offsetPerDim[j];
-                idx[j--] = 0;
-            }
-        }
-    }
-    else {
-
-        Nd4jLong j = 0;
-
-        while(j < rank) {
-
-            if(shape[j] == 1) { ++j; continue; } // ignore dimensions equal to unity
-
-            if(j == 0) { // last dimension
-                for(int l = 1; l < shape[j]; ++l) {
-                    offsets[i] = offsets[i - 1] + strides[j];
-                    i++;
-                }
-                ++j;
-            }
-            else if(idx[j] < shape[j] - 1) {
-                init += strides[j];
-                offsets[i++] = init;
-                ++idx[j];
-                j = 0;
-            }
-            else {
-                init -= offsetPerDim[j];
-                idx[j++] = 0;
-            }
+        for (uint64_t i = 1; i < len; ++i) {
+            int axis = rank - 1;
+            offsets[i] = 0;
+            while(coords[axis] == shape[axis] - 1) {
+                offsets[i] -= (shape[axis] - 1) * strides[axis];
+                coords[axis--] = 0;
+            }
+            ++coords[axis];
+            offsets[i] += offsets[i-1] + strides[axis];
+        }
+    } else {
+
+        for (uint64_t i = 1; i < len; ++i) {
+            int axis = 0;
+            offsets[i] = 0;
+            while(coords[axis] == shape[axis] - 1) {
+                offsets[i] -= (shape[axis] - 1) * strides[axis];
+                coords[axis++] = 0;
+            }
+            ++coords[axis];
+            offsets[i] += offsets[i-1] + strides[axis];
         }
     }

-    delete []idx;
-    delete []offsetPerDim;
+    // Nd4jLong init = 0, i = 1;
+    // // nested loops - calculation of sub-array offsets
+    // if(order == 'c') {
+    //     int rankMinusOne = rank - 1, j = rankMinusOne;
+    //     while(j >= 0) {
+    //         if(shape[j] == 1) { --j; continue; } // ignore dimensions equal to unity
+    //         if(j == rankMinusOne) { // last dimension
+    //             for(uint l = 1; l < shape[j]; ++l)
+    //                 offsets[i++] = offsets[i - 1] + strides[j];
+    //             --j;
+    //         }
+    //         else if(coords[j] < shape[j] - 1) {
+    //             init += strides[j];
+    //             offsets[i++] = init;
+    //             ++coords[j];
+    //             j = rankMinusOne;
+    //         }
+    //         else {
+    //             init -= (shape[j] - 1) * strides[j];
+    //             coords[j--] = 0;
+    //         }
+    //     }
+    // }
+    // else {
+    //     int j = 0;
+    //     while(j < rank) {
+    //         if(shape[j] == 1) { ++j; continue; } // ignore dimensions equal to unity
+    //         if(j == 0) { // last dimension
+    //             for(uint l = 1; l < shape[j]; ++l)
+    //                 offsets[i++] = offsets[i - 1] + strides[j];
+    //             ++j;
+    //         }
+    //         else if(coords[j] < shape[j] - 1) {
+    //             init += strides[j];
+    //             offsets[i++] = init;
+    //             ++coords[j];
+    //             j = 0;
+    //         }
+    //         else {
+    //             init -= (shape[j] - 1) * strides[j];
+    //             coords[j++] = 0;
+    //         }
+    //     }
+    // }
 }

 //////////////////////////////////////////////////////////////////////
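The rewritten calcOffsets walks the coordinates once and accumulates stride deltas instead of managing per-dimension bookkeeping, so all sub-array offsets come out in a single O(len) pass. A standalone run of the same scheme for a strided 2x3 view (strides chosen for illustration):

    #include <cstdio>

    int main() {
        const int rank = 2;
        long long shape[]   = {2, 3};
        long long strides[] = {6, 2};       // e.g. every second column of a 2x6 buffer
        long long offsets[6];
        int coords[2] = {0, 0};

        offsets[0] = 0;
        for (int i = 1; i < 6; ++i) {
            int axis = rank - 1;
            offsets[i] = 0;
            while (coords[axis] == shape[axis] - 1) {
                offsets[i] -= (shape[axis] - 1) * strides[axis];
                coords[axis--] = 0;
            }
            ++coords[axis];
            offsets[i] += offsets[i - 1] + strides[axis];
        }

        for (int i = 0; i < 6; ++i) printf("%lld ", offsets[i]); // 0 2 4 6 8 10
    }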
@@ -4884,13 +4971,14 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jL
 }

 //////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims) {
+INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, const int* dims, const int dimsLen, int *coords) {

-    for(uint i = dimsSize - 1; i > 0; --i) {
-        coords[tadDims[i]] = index % shapeInfo[1 + tadDims[i]];
-        index /= shapeInfo[1 + tadDims[i]];
+    for(uint i = dimsLen - 1; i > 0; --i) {
+        const auto ind = dims[i];
+        coords[ind] = index % shapeInfo[1 + ind];
+        index /= shapeInfo[1 + ind];
     }
-    coords[tadDims[0]] = index; // last iteration
+    coords[dims[0]] = index; // last iteration
 }

 //////////////////////////////////////////////////////////////////////
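A standalone check of the dims-aware index2coords: unflattening index 6 over axes {1,2} of a 2x3x4 array should give j = 1, k = 2 and leave axis 0 untouched (only the {rank, shape...} prefix of the shapeInfo buffer is used here).

    #include <cstdio>

    int main() {
        long long shapeInfo[] = {3, 2, 3, 4};
        int dims[] = {1, 2};
        int coords[3] = {0, 0, 0};
        long long index = 6;

        for (int i = 1; i > 0; --i) {          // dimsLen - 1 == 1
            const int ind = dims[i];
            coords[ind] = (int)(index % shapeInfo[1 + ind]);
            index /= shapeInfo[1 + ind];
        }
        coords[dims[0]] = (int)index;          // last iteration

        printf("%d %d %d\n", coords[0], coords[1], coords[2]); // 0 1 2
    }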
@@ -4921,6 +5009,64 @@ INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLo
     }
 }

+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities) {
+
+    const int rank = shape::rank(inShapeInfo);
+    const int numOfNonUnities = shape::numOfNonUnitDims(rank, shape::shapeOf(inShapeInfo));
+
+    if(numOfNonUnities == rank) { // no unities in shape, no copy procedure
+        shapeNoUnities   = const_cast<Nd4jLong*>(inShapeInfo) + 1;
+        stridesNoUnities = const_cast<Nd4jLong*>(inShapeInfo) + 1 + rank;
+        return numOfNonUnities;
+    }
+
+    for(uint j = 0, i = 0; i < rank; ++i) {
+        if(shape::shapeOf(inShapeInfo)[i] != 1) {
+            shapeNoUnities[j] = shape::shapeOf(inShapeInfo)[i];
+            shapeNoUnities[numOfNonUnities + j++] = shape::stride(inShapeInfo)[i];
+        }
+    }
+
+    stridesNoUnities = shapeNoUnities + numOfNonUnities;
+
+    return numOfNonUnities;
+}
+
+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int* dimsToExclude, const int dimsSize, Nd4jLong* outShapeInfo) {
+
+    outShapeInfo[0] = inShapeInfo[0] - dimsSize;
+
+    for(uint j = 0, k = 0, i = 0; i < inShapeInfo[0]; ++i) {
+        if(j < dimsSize && i == dimsToExclude[j]) {
+            ++j;
+            continue;
+        }
+
+        shape::shapeOf(outShapeInfo)[k]  = shape::shapeOf(inShapeInfo)[i];
+        shape::stride(outShapeInfo)[k++] = shape::stride(inShapeInfo)[i];
+    }
+
+    sd::ArrayOptions::copyDataType(outShapeInfo, inShapeInfo);          // type
+    *shape::ews(outShapeInfo) = shape::elementWiseStride(inShapeInfo);  // ews
+    outShapeInfo[2 * outShapeInfo[0] + 3] = shape::order(inShapeInfo);  // order
+}
+
+//////////////////////////////////////////////////////////////////////
+// INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, const int* dims, const int dimsLen, int *coords) {
+
+//     if(startIndex == index) {
+//         shape::index2coords(index, shapeInfo, dims, dimsLen, coords);
+//     }
+//     else {
+//         int i = dimsLen - 1;
+//         while(coords[dims[i]] == shape::sizeAt(shapeInfo, dims[i]) - 1)
+//             coords[dims[i--]] = 0;
+//         ++coords[dims[i]];
+//     }
+// }
+
 //////////////////////////////////////////////////////////////////////
 // INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) {
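excludeUnitiesFromShapeInfo(inShapeInfo, dimsToExclude, dimsSize, outShapeInfo) copies only the non-excluded shape/stride entries, exactly as in the header example. A standalone re-computation of that example with plain arrays:

    #include <cstdio>

    int main() {
        long long shape[]   = {2, 1, 3, 1, 4};
        long long strides[] = {12, 12, 4, 4, 1};
        int dimsToExclude[] = {1, 3};         // the unit axes to drop

        long long outShape[3], outStrides[3];
        int j = 0, k = 0;
        for (int i = 0; i < 5; ++i) {
            if (j < 2 && i == dimsToExclude[j]) { ++j; continue; }
            outShape[k]     = shape[i];
            outStrides[k++] = strides[i];
        }

        for (int i = 0; i < 3; ++i)
            printf("%lld/%lld ", outShape[i], outStrides[i]); // 2/12 3/4 4/1
    }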
@@ -5111,50 +5257,6 @@ INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLo
 //     }
 // }

-//////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities) {
-
-    const int rank = shape::rank(inShapeInfo);
-    const int numOfNonUnities = shape::numOfNonUnitDims(rank, shape::shapeOf(inShapeInfo));
-
-    if(numOfNonUnities == rank) { // no unities in shape, no copy procedure
-        shapeNoUnities   = const_cast<Nd4jLong*>(inShapeInfo) + 1;
-        stridesNoUnities = const_cast<Nd4jLong*>(inShapeInfo) + 1 + rank;
-        return numOfNonUnities;
-    }
-
-    for(uint j = 0, i = 0; i < rank; ++i) {
-        if(shape::shapeOf(inShapeInfo)[i] != 1) {
-            shapeNoUnities[j] = shape::shapeOf(inShapeInfo)[i];
-            shapeNoUnities[numOfNonUnities + j++] = shape::stride(inShapeInfo)[i];
-        }
-    }
-
-    stridesNoUnities = shapeNoUnities + numOfNonUnities;
-
-    return numOfNonUnities;
-}
-
-//////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo) {
-
-    outShapeInfo[0] = inShapeInfo[0] - dimsSize;
-
-    for(uint j = 0, k = 0, i = 0; i < inShapeInfo[0]; ++i) {
-        if(j < dimsSize && i == dimsToExclude[j]) {
-            ++j;
-            continue;
-        }
-
-        shape::shapeOf(outShapeInfo)[k]  = shape::shapeOf(inShapeInfo)[i];
-        shape::stride(outShapeInfo)[k++] = shape::stride(inShapeInfo)[i];
-    }
-
-    sd::ArrayOptions::copyDataType(outShapeInfo, inShapeInfo);          // type
-    *shape::ews(outShapeInfo) = shape::elementWiseStride(inShapeInfo);  // ews
-    outShapeInfo[2 * outShapeInfo[0] + 3] = shape::order(inShapeInfo);  // order
-}
-
 //////////////////////////////////////////////////////////////////////
 // INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
@@ -470,8 +470,7 @@ static void execTransformBool(sd::LaunchContext *lc,
                              void *extraParams,
                              void *hZ, const Nd4jLong *hZShapeInfo,
                              void *dZ, const Nd4jLong *dZShapeInfo,
-                             int *dimension, int dimensionLength,
-                             const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets);
+                             int *dimension, int dimensionLength);

 static void execReduceSame(sd::LaunchContext *lc,
                            int opNum,
@@ -480,8 +479,7 @@ static void execTransformBool(sd::LaunchContext *lc,
                            void *extraParams,
                            void *hZ, const Nd4jLong *hZShapeInfo,
                            void *dZ, const Nd4jLong *dZShapeInfo,
-                           int *dimension, int dimensionLength,
-                           const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets);
+                           int *dimension, int dimensionLength);

 static void execReduceBool(sd::LaunchContext *lc,
                            int opNum,
@@ -490,8 +488,7 @@ static void execTransformBool(sd::LaunchContext *lc,
                            void *extraParams,
                            void *hZ, const Nd4jLong *hZShapeInfo,
                            void *dZ, const Nd4jLong *dZShapeInfo,
-                           int *dimension, int dimensionLength,
-                           const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets);
+                           int *dimension, int dimensionLength);

 static void execReduceLong(sd::LaunchContext *lc,
                            int opNum,
@@ -500,8 +497,7 @@ static void execTransformBool(sd::LaunchContext *lc,
                            void *extraParams,
                            void *hZ, const Nd4jLong *hZShapeInfo,
                            void *dZ, const Nd4jLong *dZShapeInfo,
-                           int *dimension, int dimensionLength,
-                           const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets);
+                           int *dimension, int dimensionLength);

 /**
  *
@@ -585,8 +585,7 @@ void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
                                           void *extraParams,
                                           void *hZ, const Nd4jLong *hZShapeInfo,
                                           void *dZ, const Nd4jLong *dZShapeInfo,
-                                          int *dimension, int dimensionLength,
-                                          const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
+                                          int *dimension, int dimensionLength) {
@@ -597,13 +596,7 @@ void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
     if (shape::isEmpty(hZShapeInfo))
         return;

-    auto func = PRAGMA_THREADS_FOR {
-        BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES);
-    };
-
-    const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo);
-
-    samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance().maxMasterThreads());
+    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, lc ? lc->getWorkspace() : nullptr, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension), LIBND4J_TYPES, FLOAT_TYPES);
 }

 ////////////////////////////////////////////////////////////////////////
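For context on what these reduce ops compute: every element of Z is the reduction of one tensor-along-dimension (TAD) of X. A minimal standalone sketch (plain C++, not libnd4j code) of a row-wise sum, i.e. a reduction over dimension 1 of a row-major 2-D array, with the input values assumed purely for illustration:

#include <cstdio>
#include <vector>

// One output element per TAD: z[r] = sum over the r-th row of x.
static std::vector<float> reduceSumAxis1(const std::vector<float>& x, int rows, int cols) {
    std::vector<float> z(rows, 0.0f);
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            z[r] += x[r * cols + c];
    return z;
}

int main() {
    std::vector<float> x{1, 2, 3,
                         4, 5, 6};
    auto z = reduceSumAxis1(x, 2, 3);
    printf("%.1f %.1f\n", z[0], z[1]);   // 6.0 15.0
    return 0;
}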
@@ -614,24 +607,16 @@ void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc,
                                          void *extraParams,
                                          void *hZ, const Nd4jLong *hZShapeInfo,
                                          void *dZ, const Nd4jLong *dZShapeInfo,
-                                         int *dimension, int dimensionLength,
-                                         const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
+                                         int *dimension, int dimensionLength) {

     auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
-    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

     // nothing to do here if result is empty
     if (shape::isEmpty(hZShapeInfo))
         return;

-    auto func = PRAGMA_THREADS_FOR {
-        BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES);
-    };
-
-    const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo);
-
-    samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance().maxMasterThreads());
+    BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, lc ? lc->getWorkspace() : nullptr, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension), LIBND4J_TYPES);
 }

 ////////////////////////////////////////////////////////////////////////
@@ -642,8 +627,7 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
                                          void *extraParams,
                                          void *hZ, const Nd4jLong *hZShapeInfo,
                                          void *dZ, const Nd4jLong *dZShapeInfo,
-                                         int *dimension, int dimensionLength,
-                                         const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
+                                         int *dimension, int dimensionLength) {

     auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
@@ -653,13 +637,7 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
     if (shape::isEmpty(hZShapeInfo))
         return;

-    auto func = PRAGMA_THREADS_FOR {
-        BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, BOOL_TYPES);
-    };
-
-    const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo);
-
-    samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance().maxMasterThreads());
+    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, lc ? lc->getWorkspace() : nullptr, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension), LIBND4J_TYPES, BOOL_TYPES);
 }

 ////////////////////////////////////////////////////////////////////////
@@ -670,8 +648,7 @@ void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
                                          void *extraParams,
                                          void *hZ, const Nd4jLong *hZShapeInfo,
                                          void *dZ, const Nd4jLong *dZShapeInfo,
-                                         int *dimension, int dimensionLength,
-                                         const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
+                                         int *dimension, int dimensionLength) {

     auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
@@ -681,13 +658,7 @@ void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
     if (shape::isEmpty(hZShapeInfo))
         return;

-    auto func = PRAGMA_THREADS_FOR {
-        BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, LONG_TYPES);
-    };
-
-    const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo);
-
-    samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance().maxMasterThreads());
+    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, lc ? lc->getWorkspace() : nullptr, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension), LIBND4J_TYPES, LONG_TYPES);
 }

 ////////////////////////////////////////////////////////////////////////
@@ -447,28 +447,26 @@ void execReduceFloat2(Nd4jPointer *extraPointers,
                       OpaqueDataBuffer *dbZ, const Nd4jLong *hZShapeInfo, const Nd4jLong *dZShapeInfo,
                       OpaqueDataBuffer *dbDimension, const Nd4jLong *hDimensionShape, const Nd4jLong *dDimensionShape) {
     try {
         auto dimension = reinterpret_cast<int *>(dbDimension->primary());
         auto dimensionLength = static_cast<int>(shape::length(hDimensionShape));

-        auto tadPackX = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo, dimension, dimensionLength);
-
-        auto hTADShapeInfo = tadPackX.primaryShapeInfo();
-        auto hTADOffsets = tadPackX.primaryOffsets();
-
-        NativeOpExecutioner::execReduceFloat(nullptr, opNum,
-                                             dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo,
-                                             extraParams,
-                                             dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo,
-                                             dimension, dimensionLength,
-                                             hTADShapeInfo, hTADOffsets);
+        const auto zLen = shape::length(hZShapeInfo);
+
+        std::vector<int> dimensions(dimension, dimension + dimensionLength);
+
+        const Nd4jLong* zShapeInfoH = hZShapeInfo;
+        const Nd4jLong* zShapeInfoD = dZShapeInfo;
+
+        if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+            zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
+        }
+
+        std::vector<int> dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector<int>();
+
+        NativeOpExecutioner::execReduceFloat(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), zShapeInfoH, dbZ->special(), zShapeInfoD, dims.data(), dims.size());
     } catch (std::exception &e) {
         sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
         sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
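The new wrapper logic above only switches to a squeezed result shapeInfo when the caller kept the reduced dimensions, so that Z still has X's rank. A small worked illustration of that check, with an assumed input shape of [2, 3, 4] reduced over dimension 1 and a keepDims-style result of shape [2, 1, 4]:

#include <cstdio>
#include <vector>

// Standalone illustration of the rank check used by the new execReduce*2 wrappers:
// when the result shape still carries the reduced (size-1) dimension, its rank does
// not match xRank - dims.size(), and the wrapper builds a unit-free shapeInfo instead.
int main() {
    const int xRank = 3;                 // x shape [2, 3, 4]
    const std::vector<int> dims{1};      // reduce over dimension 1
    const int zRankKeepDims = 3;         // z shape [2, 1, 4]
    const long long zLen = 2 * 1 * 4;

    const bool needsSqueeze = (xRank - (int)dims.size() != zRankKeepDims) && zLen != 1;
    printf("squeeze unit dims: %s\n", needsSqueeze ? "yes" : "no");   // yes
    return 0;
}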
@@ -481,30 +479,27 @@ void execReduceBool2(Nd4jPointer *extraPointers,
                      void *extraParams,
                      OpaqueDataBuffer *dbZ, const Nd4jLong *hZShapeInfo, const Nd4jLong *dZShapeInfo,
                      OpaqueDataBuffer *dbDimension, const Nd4jLong *hDimensionShape, const Nd4jLong *dDimensionShape) {

     try {
         auto dimension = reinterpret_cast<int *>(dbDimension->primary());
         auto dimensionLength = static_cast<int>(shape::length(hDimensionShape));

-        auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo, dimension,
-                                                                             dimensionLength);
-
-        auto hTADShapeInfo = tadPack.primaryShapeInfo();
-        auto hTADOffsets = tadPack.primaryOffsets();
-
-        NativeOpExecutioner::execReduceBool(nullptr, opNum,
-                                            dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo,
-                                            extraParams,
-                                            dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo,
-                                            dimension, dimensionLength,
-                                            hTADShapeInfo, hTADOffsets);
+        std::vector<int> dimensions(dimension, dimension + dimensionLength);
+
+        const auto zLen = shape::length(hZShapeInfo);
+
+        const Nd4jLong* zShapeInfoH = hZShapeInfo;
+        const Nd4jLong* zShapeInfoD = dZShapeInfo;
+
+        if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo)) {
+            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+            zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
+        }
+
+        std::vector<int> dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector<int>();
+
+        NativeOpExecutioner::execReduceBool(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), zShapeInfoH, dbZ->special(), zShapeInfoD, dims.data(), dims.size());
     } catch (std::exception &e) {
         sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
         sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
@@ -521,26 +516,22 @@ void execReduceSame2(Nd4jPointer *extraPointers,
         auto dimension = reinterpret_cast<int *>(dbDimension->primary());
         int dimensionLength = static_cast<int>(shape::length(hDimensionShape));

-        auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo, dimension,
-                                                                             dimensionLength);
-
-        auto hTADShapeInfo = tadPack.primaryShapeInfo();
-        auto hTADOffsets = tadPack.primaryOffsets();
-
-        NativeOpExecutioner::execReduceSame(nullptr, opNum,
-                                            dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo,
-                                            extraParams,
-                                            dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo,
-                                            dimension, dimensionLength,
-                                            hTADShapeInfo, hTADOffsets);
+        std::vector<int> dimensions(dimension, dimension + dimensionLength);
+
+        const auto zLen = shape::length(hZShapeInfo);
+
+        const Nd4jLong* zShapeInfoH = hZShapeInfo;
+        const Nd4jLong* zShapeInfoD = dZShapeInfo;
+
+        if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+            zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
+        }
+
+        std::vector<int> dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector<int>();
+
+        NativeOpExecutioner::execReduceSame(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), zShapeInfoH, dbZ->special(), zShapeInfoD, dims.data(), dims.size());
     } catch (std::exception &e) {
         sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
         sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
@@ -557,25 +548,22 @@ void execReduceLong2(Nd4jPointer *extraPointers,
         auto dimension = reinterpret_cast<int *>(dbDimension->primary());
         int dimensionLength = static_cast<int>(shape::length(hDimensionShape));

-        auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo, dimension, dimensionLength);
-
-        auto hTADShapeInfo = tadPack.primaryShapeInfo();
-        auto hTADOffsets = tadPack.primaryOffsets();
-
-        NativeOpExecutioner::execReduceLong(nullptr, opNum,
-                                            dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo,
-                                            extraParams,
-                                            dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo,
-                                            dimension, dimensionLength,
-                                            hTADShapeInfo, hTADOffsets);
+        std::vector<int> dimensions(dimension, dimension + dimensionLength);
+
+        const auto zLen = shape::length(hZShapeInfo);
+
+        const Nd4jLong* zShapeInfoH = hZShapeInfo;
+        const Nd4jLong* zShapeInfoD = dZShapeInfo;
+
+        if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+            zShapeInfoD = reinterpret_cast<Nd4jLong const*>(zPack.special());
+        }
+
+        std::vector<int> dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector<int>();
+
+        NativeOpExecutioner::execReduceLong(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), zShapeInfoH, dbZ->special(), zShapeInfoD, dims.data(), dims.size());
     } catch (std::exception &e) {
         sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
         sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
@@ -210,7 +210,7 @@ void NativeOpExecutioner::execSummaryStatsScalar(sd::LaunchContext *lc,
     auto stream = lc->getCudaStream();
     auto reductionPointer = lc->getReductionPointer();

-    dim3 launchDims = dim3(256, 256, 32768);
+    dim3 launchDims = dim3(256, CUDA_BLOCK_SIZE, 1024);

     auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
     auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
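The launch-dims changes in this and the following CUDA hunks shrink the third component from 32768 to 1024. Assuming libnd4j's usual convention that this component is the dynamic shared-memory request per block (an assumption; the launch macros themselves are not shown in this diff), the rough effect on how many blocks can coexist on one SM's shared memory is:

#include <cstdio>

int main() {
    // illustrative figure only: assume 48 KB of shared memory available per SM
    const unsigned smemPerSM = 48 * 1024;
    printf("resident blocks limited by shared memory, old request (32768 B): %u\n", smemPerSM / 32768);  // 1
    printf("resident blocks limited by shared memory, new request (1024 B):  %u\n", smemPerSM / 1024);   // 48
    return 0;
}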
@@ -577,8 +577,7 @@ void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc,
                                         void *extraParams,
                                         void *hZ, Nd4jLong const* hZShapeInfo,
                                         void *dZ, Nd4jLong const* dZShapeInfo,
-                                        int *dimension, int dimensionLength,
-                                        Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
+                                        int *dimension, int dimensionLength) {

     auto stream = lc->getCudaStream();
     auto reductionPointer = lc->getReductionPointer();
@@ -588,15 +587,14 @@ void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc,

     auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
     auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
-    auto xRank = shape::rank(hXShapeInfo);

     if (zType != xType)
         throw datatype_exception::build("NativeOpExecutioner::execReduceSame requires both X & Z operands to have same type", xType, zType);

     auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 8192);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);

-    BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES);
+    BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceXD(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, reductionPointer, dZ, dZShapeInfo, hZShapeInfo, dimension), LIBND4J_TYPES);

     // TODO: remove after the release
     auto res = cudaStreamSynchronize(*stream);
@@ -612,8 +610,7 @@ void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
                                         void *extraParams,
                                         void *hZ, Nd4jLong const* hZShapeInfo,
                                         void *dZ, Nd4jLong const* dZShapeInfo,
-                                        int *dimension,int dimensionLength,
-                                        Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
+                                        int *dimension,int dimensionLength) {

     auto stream = lc->getCudaStream();
     auto reductionPointer = lc->getReductionPointer();
@@ -627,11 +624,10 @@ void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
     if (zType != sd::DataType::INT64)
         throw datatype_exception::build("NativeOpExecutioner::execReduceLong wrong Z data type", sd::DataType::INT64, zType);

-    auto xRank = shape::rank(hXShapeInfo);
     auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);

-    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES);
+    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceXD(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, reductionPointer, dZ, dZShapeInfo, hZShapeInfo, dimension), LIBND4J_TYPES, LONG_TYPES);

     // TODO: remove after the release
     auto res = cudaStreamSynchronize(*stream);
@@ -648,8 +644,7 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
                                         void *extraParams,
                                         void *hZ, Nd4jLong const* hZShapeInfo,
                                         void *dZ, Nd4jLong const* dZShapeInfo,
-                                        int *dimension, int dimensionLength,
-                                        Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
+                                        int *dimension, int dimensionLength) {

     auto stream = lc->getCudaStream();
     auto reductionPointer = lc->getReductionPointer();
@@ -663,11 +658,10 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
     if (zType != sd::DataType::BOOL)
         throw std::runtime_error("NativeOpExecutioner::execReduceBool requires Z operand to have BOOL type");

-    auto xRank = shape::rank(hXShapeInfo);
     auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);

-    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES);
+    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceXD(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, reductionPointer, dZ, dZShapeInfo, hZShapeInfo, dimension), LIBND4J_TYPES, BOOL_TYPES);

     // TODO: remove after the release
     auto res = cudaStreamSynchronize(*stream);
@@ -675,6 +669,45 @@ void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
         throw cuda_exception::build("execReduceBool failed", res);
 }

+////////////////////////////////////////////////////////////////////////
+/**
+ *
+ * @param opNum
+ * @param dX
+ * @param dXShapeInfo
+ * @param extraParams
+ * @param dZ
+ * @param dZShapeInfo
+ */
+void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
+                                          int opNum,
+                                          const void *hX, const Nd4jLong *hXShapeInfo,
+                                          const void *dX, const Nd4jLong *dXShapeInfo,
+                                          void *extraParams,
+                                          void *hZ, const Nd4jLong *hZShapeInfo,
+                                          void *dZ, const Nd4jLong *dZShapeInfo,
+                                          int *dimension, int dimensionLength) {
+
+    auto stream = lc->getCudaStream();
+    auto reductionPointer = lc->getReductionPointer();
+
+    if (sd::Environment::getInstance().isDebugAndVerbose())
+        printf("F8 opNum:[%i]\n", opNum);
+
+    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
+    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
+
+    auto numBlocks = shape::length(hZShapeInfo);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+
+    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceXD(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, reductionPointer, dZ, dZShapeInfo, hZShapeInfo, dimension), LIBND4J_TYPES, FLOAT_TYPES);
+
+    // TODO: remove after the release
+    auto res = cudaStreamSynchronize(*stream);
+    if (res != 0)
+        throw cuda_exception::build("execReduceFloat failed", res);
+}
+
 ////////////////////////////////////////////////////////////////////////
 /**
  *
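In the re-added implementation above, the grid size is taken directly from the length of Z, so each CUDA block reduces exactly one output element (one TAD), with 256 threads per block. A worked example of that geometry (the array sizes are assumptions for illustration):

#include <cstdio>

int main() {
    // e.g. reducing a 512 x 1024 matrix over dimension 1 leaves 512 outputs
    const long long zLen = 512;
    const long long numBlocks = zLen == 0 ? 1 : zLen;   // grid.x
    const int threadsPerBlock = 256;                     // block.x
    printf("launch %lld blocks of %d threads, one block per output element\n", numBlocks, threadsPerBlock);
    return 0;
}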
@@ -707,7 +740,8 @@ void NativeOpExecutioner::execIndexReduce(sd::LaunchContext *lc,
     auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
     auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
     auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+    auto tadLength = shape::length(hXShapeInfo) / numBlocks;
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, tadLength < CUDA_BLOCK_SIZE ? tadLength : CUDA_BLOCK_SIZE, 1024);

     if (zType != sd::DataType::INT64 && zType != sd::DataType::INT32)
         throw datatype_exception::build("NativeOpExecutioner::execIndexReduce requires Z operand to have INT32/INT64 type", zType);
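The execIndexReduce change above sizes each block from the TAD length instead of a fixed 256, capped at CUDA_BLOCK_SIZE. A quick numeric illustration (the CUDA_BLOCK_SIZE value used below is an assumption; the constant is defined elsewhere in libnd4j):

#include <algorithm>
#include <cstdio>

int main() {
    const long long xLen = 1000000, zLen = 1000;          // assumed sizes
    const long long tadLength = xLen / zLen;               // 1000 elements per TAD
    const long long CUDA_BLOCK_SIZE_ASSUMED = 256;
    const long long blockWidth = std::min(tadLength, CUDA_BLOCK_SIZE_ASSUMED);
    printf("tadLength=%lld -> blockWidth=%lld\n", tadLength, blockWidth);   // 1000 -> 256
    return 0;
}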
@@ -722,46 +756,6 @@ void NativeOpExecutioner::execIndexReduce(sd::LaunchContext *lc,
         throw cuda_exception::build("execIndexReduce failed", res);
 }

-////////////////////////////////////////////////////////////////////////
-/**
- *
- * @param opNum
- * @param dX
- * @param dXShapeInfo
- * @param extraParams
- * @param dZ
- * @param dZShapeInfo
- */
-void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
-                                          int opNum,
-                                          void const* hX, Nd4jLong const* hXShapeInfo,
-                                          void const* dX, Nd4jLong const* dXShapeInfo,
-                                          void *extraParams,
-                                          void *hZ, Nd4jLong const* hZShapeInfo,
-                                          void *dZ, Nd4jLong const* dZShapeInfo,
-                                          int *dimension,int dimensionLength,
-                                          Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
-
-    auto stream = lc->getCudaStream();
-    auto reductionPointer = lc->getReductionPointer();
-
-    if (sd::Environment::getInstance().isDebugAndVerbose())
-        printf("F8 opNum:[%i]\n", opNum);
-
-    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
-    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);
-
-    auto xRank = shape::rank(hXShapeInfo);
-    auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
-
-    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES);
-
-    // TODO: remove after the release
-    auto res = cudaStreamSynchronize(*stream);
-    if (res != 0)
-        throw cuda_exception::build("execReduceFloat failed", res);
-}
-

 /**
|
@ -790,7 +784,7 @@ void NativeOpExecutioner::execIndexReduceScalar(sd::LaunchContext *lc,
|
||||||
auto xLength = shape::length(hXShapeInfo);
|
auto xLength = shape::length(hXShapeInfo);
|
||||||
auto blockWidth = 256;
|
auto blockWidth = 256;
|
||||||
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
|
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
|
||||||
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
|
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
|
||||||
|
|
||||||
if (sd::Environment::getInstance().isDebugAndVerbose() && launchDims.x == 1)
|
if (sd::Environment::getInstance().isDebugAndVerbose() && launchDims.x == 1)
|
||||||
printf("AF1 opNum:[%i]\n", opNum);
|
printf("AF1 opNum:[%i]\n", opNum);
|
||||||
|
@ -840,7 +834,7 @@ void NativeOpExecutioner::execReduceFloatScalar(sd::LaunchContext *lc,
|
||||||
auto xLength = shape::length(hXShapeInfo);
|
auto xLength = shape::length(hXShapeInfo);
|
||||||
auto blockWidth = 256;
|
auto blockWidth = 256;
|
||||||
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
|
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
|
||||||
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
|
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);
|
||||||
|
|
||||||
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceScalar(launchDims, stream, opNum, dX,dXShapeInfo, hXShapeInfo, extraParams, dZ,dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, FLOAT_TYPES);
|
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceScalar(launchDims, stream, opNum, dX,dXShapeInfo, hXShapeInfo, extraParams, dZ,dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, FLOAT_TYPES);
|
||||||
|
|
||||||
|
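CudaLaunchHelper::getReductionBlocks itself is not part of this diff; a plausible reading of what such a helper computes is a capped ceiling division of the input length by the block width, sketched below (both the cap of 512 and the shape of the formula are assumptions for illustration only):

#include <cstdio>

// Sketch only, not the libnd4j implementation.
static long long reductionBlocks(long long length, long long blockWidth, long long maxBlocks = 512) {
    long long blocks = (length + blockWidth - 1) / blockWidth;   // ceil(length / blockWidth)
    return blocks > maxBlocks ? maxBlocks : blocks;
}

int main() {
    printf("%lld\n", reductionBlocks(1 << 20, 256));   // 4096 uncapped -> 512 after the cap
    return 0;
}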
@@ -870,9 +864,9 @@ void NativeOpExecutioner::execReduceBoolScalar(sd::LaunchContext *lc,
         throw std::runtime_error("NativeOpExecutioner::execReduceBoolScalar requires Z operand to have BOOL type");

     auto xLength = shape::length(hXShapeInfo);
-    auto blockWidth = 256;
+    auto blockWidth = CUDA_BLOCK_SIZE;
     auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);

     BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, BOOL_TYPES);

@@ -901,9 +895,9 @@ void NativeOpExecutioner::execReduceSameScalar(sd::LaunchContext *lc,
         throw datatype_exception::build("NativeOpExecutioner::execReduceSameScalar requires both X & Z operands to have same type", xType, zType);

     auto xLength = shape::length(hXShapeInfo);
-    auto blockWidth = 256;
+    auto blockWidth = CUDA_BLOCK_SIZE;
     auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);

     BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES);

@@ -932,9 +926,9 @@ void NativeOpExecutioner::execReduceLongScalar(sd::LaunchContext *lc,
         throw datatype_exception::build("NativeOpExecutioner::execReduceLongScalar wrong Z data type", sd::DataType::INT64, zType);

     auto xLength = shape::length(hXShapeInfo);
-    auto blockWidth = 256;
+    auto blockWidth = CUDA_BLOCK_SIZE;
     auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);

     BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, LONG_TYPES);

@@ -1128,7 +1122,7 @@ void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc,
     auto stream = lc->getCudaStream();
     auto reductionPointer = lc->getReductionPointer();

-    dim3 launchDims = dim3(256, 256, 32768);
+    dim3 launchDims = dim3(256, CUDA_BLOCK_SIZE, 1024);

     auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
     auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

@@ -1158,7 +1152,7 @@ void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc,
     auto stream = lc->getCudaStream();
     auto reductionPointer = lc->getReductionPointer();

-    dim3 launchDims = dim3(256, 256, 32768);
+    dim3 launchDims = dim3(256, CUDA_BLOCK_SIZE, 1024);

     auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
     auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

@@ -1194,9 +1188,9 @@ void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc,
     auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
     auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

-    auto blockWidth = 256;
+    auto blockWidth = CUDA_BLOCK_SIZE;
     auto numBlocks = CudaLaunchHelper::getReductionBlocks(shape::length(hXShapeInfo), blockWidth);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);

     if (xType != yType)
         throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType);

@@ -1246,7 +1240,7 @@ void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc,

     auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);

     BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum,
                           dX, dXShapeInfo,

@@ -1286,9 +1280,9 @@ void NativeOpExecutioner::execReduce3Scalar(sd::LaunchContext *lc,
     auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

     auto xLength = shape::length(hXShapeInfo);
-    auto blockWidth = 256;
+    auto blockWidth = CUDA_BLOCK_SIZE;
     auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 1024);

     if (xType != yType)
         throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Y operand to have X type", xType, yType);

@@ -1652,7 +1646,7 @@ void NativeOpExecutioner::execReduce3All(sd::LaunchContext *lc,
     if (sd::Environment::getInstance().isDebugAndVerbose())
         printf("D119 opNum:[%i]\n", opNum);

-    dim3 launchDims(shape::length(hZShapeInfo), 256, 32768);
+    dim3 launchDims(shape::length(hZShapeInfo), CUDA_BLOCK_SIZE / 2, 1024);

     if (sd::Environment::getInstance().isVerbose() && launchDims.x == 1)
         printf("AD119 opNum:[%i]\n", opNum);

@@ -1706,7 +1700,7 @@ void NativeOpExecutioner::execReduce3TAD(sd::LaunchContext *lc,
         throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Z operand to have floating point data type", zType);

     auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, CUDA_BLOCK_SIZE, 1024);

     BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, dimension, dimensionLength, 1, allocationPointer, tadShapeInfo, tadOffsets, yTadShapeInfo, yTadOffsets), LIBND4J_TYPES, FLOAT_TYPES);
@@ -454,17 +454,24 @@ void execReduceSame2(Nd4jPointer *extraPointers,
         auto dimension = reinterpret_cast<int *>(dbDimension->primary());
         int dimensionLength = static_cast<int>(shape::length(hDimensionShape));

-        auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo,
-                                                                             dimension,
-                                                                             shape::length(hDimensionShape));
+        const auto zLen = shape::length(hZShapeInfo);
+
+        std::vector<int> dimensions(dimension, dimension + dimensionLength);
+
+        const Nd4jLong* zShapeInfoH = hZShapeInfo;
+
+        if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+        }
+
+        std::vector<int> dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector<int>();

         LaunchContext lc(extraPointers[1], extraPointers[4], extraPointers[5], extraPointers[3]);
         NativeOpExecutioner::execReduceSame(&lc, opNum,
                                             dbX->primary(), hXShapeInfo, dbX->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hXShapeInfo).special(),
                                             extraParams,
-                                            dbZ->primary(), hZShapeInfo, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hZShapeInfo).special(),
-                                            dimension, dimensionLength,
-                                            tadPack.specialShapeInfo(), tadPack.specialOffsets());
+                                            dbZ->primary(), zShapeInfoH, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(zShapeInfoH).special(),
+                                            dims.data(), dims.size());

         InteropDataBuffer::registerSpecialUse({dbZ}, {dbX});
     } catch (std::exception &e) {

@@ -487,17 +494,25 @@ void execReduceLong2(Nd4jPointer *extraPointers,
         auto dimension = reinterpret_cast<int *>(dbDimension->primary());
         int dimensionLength = static_cast<int>(shape::length(hDimensionShape));

-        auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo,
-                                                                             dimension,
-                                                                             shape::length(hDimensionShape));
+        const auto zLen = shape::length(hZShapeInfo);
+
+        std::vector<int> dimensions(dimension, dimension + dimensionLength);
+
+        const Nd4jLong* zShapeInfoH = hZShapeInfo;
+
+        if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+        }
+
+        std::vector<int> dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector<int>();

         LaunchContext lc(extraPointers[1], extraPointers[4], extraPointers[5], extraPointers[3]);
         NativeOpExecutioner::execReduceLong(&lc, opNum,
                                             dbX->primary(), hXShapeInfo, dbX->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hXShapeInfo).special(),
                                             extraParams,
-                                            dbZ->primary(), hZShapeInfo, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hZShapeInfo).special(),
-                                            dimension, dimensionLength,
-                                            tadPack.specialShapeInfo(), tadPack.specialOffsets());
+                                            dbZ->primary(), zShapeInfoH, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(zShapeInfoH).special(),
+                                            dims.data(), dims.size());

         InteropDataBuffer::registerSpecialUse({dbZ}, {dbX});
     } catch (std::exception &e) {

@@ -562,17 +577,25 @@ void execReduceBool2(Nd4jPointer *extraPointers,
         auto dimension = reinterpret_cast<int *>(dbDimension->primary());
         int dimensionLength = static_cast<int>(shape::length(hDimensionShape));

-        auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo,
-                                                                             dimension,
-                                                                             shape::length(hDimensionShape));
+        const auto zLen = shape::length(hZShapeInfo);
+
+        std::vector<int> dimensions(dimension, dimension + dimensionLength);
+
+        const Nd4jLong* zShapeInfoH = hZShapeInfo;
+
+        if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+        }
+
+        std::vector<int> dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector<int>();

         LaunchContext lc(extraPointers[1], extraPointers[4], extraPointers[5], extraPointers[3]);
         NativeOpExecutioner::execReduceBool(&lc, opNum,
                                             dbX->primary(), hXShapeInfo, dbX->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hXShapeInfo).special(),
                                             extraParams,
-                                            dbZ->primary(), hZShapeInfo, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hZShapeInfo).special(),
-                                            dimension, dimensionLength,
-                                            tadPack.specialShapeInfo(), tadPack.specialOffsets());
+                                            dbZ->primary(), zShapeInfoH, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(zShapeInfoH).special(),
+                                            dims.data(), dims.size());

         InteropDataBuffer::registerSpecialUse({dbZ}, {dbX});
     } catch (std::exception &e) {

@@ -690,17 +713,25 @@ void execReduceFloat2(Nd4jPointer *extraPointers,
         auto dimension = reinterpret_cast<int *>(dbDimension->primary());
         int dimensionLength = static_cast<int>(shape::length(hDimensionShape));

-        auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(hXShapeInfo,
-                                                                             dimension,
-                                                                             shape::length(hDimensionShape));
+        const auto zLen = shape::length(hZShapeInfo);
+
+        std::vector<int> dimensions(dimension, dimension + dimensionLength);
+
+        const Nd4jLong* zShapeInfoH = hZShapeInfo;
+
+        if(shape::rank(hXShapeInfo) - dimensionLength != shape::rank(hZShapeInfo) && zLen != 1) {
+            auto zPack = ConstantShapeHelper::getInstance().createShapeInfoWithNoUnitiesForReduce(hZShapeInfo, dimensions);
+            zShapeInfoH = reinterpret_cast<Nd4jLong const*>(zPack.primary());
+        }
+
+        std::vector<int> dims = (zLen != 1) ? ShapeUtils::evalDimsForReduceOp(shape::rank(hXShapeInfo), dimensions) : std::vector<int>();

         LaunchContext lc(extraPointers[1], extraPointers[4], extraPointers[5], extraPointers[3]);
         NativeOpExecutioner::execReduceFloat(&lc, opNum,
                                              dbX->primary(), hXShapeInfo, dbX->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hXShapeInfo).special(),
                                              extraParams,
-                                             dbZ->primary(), hZShapeInfo, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(hZShapeInfo).special(),
-                                             dimension, dimensionLength,
-                                             tadPack.specialShapeInfo(), tadPack.specialOffsets());
+                                             dbZ->primary(), zShapeInfoH, dbZ->special(), ConstantShapeHelper::getInstance().bufferForShapeInfo(zShapeInfoH).special(),
+                                             dims.data(), dims.size());

         InteropDataBuffer::registerSpecialUse({dbZ}, {dbX});
     } catch (std::exception &e) {
@@ -784,24 +784,14 @@ static void execDefault(const X *x, const Nd4jLong *xShapeInfo, const Y *y, cons
     const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
     const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);

-    const int rank = shape::rank(zShapeInfo);   // xRank = yRank = zRank
-
     auto func = PRAGMA_THREADS_FOR{

-        int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+        int coords[MAX_RANK];
+        Nd4jLong xOffset, yOffset, zOffset;

         for (auto i = start; i < stop; ++i) {

-            shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
-
-            for (uint j = 0; j < rank; ++j) {
-                xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
-                yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
-            }
-
-            const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
-            const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
-            const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
+            shape::getOffsetBroadcast(start, i, zShapeInfo, xShapeInfo, yShapeInfo, xzSameOffsets, yzSameOffsets, coords, zOffset, xOffset, yOffset);

             z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
         }
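The three execDefault hunks here and below replace the hand-rolled broadcast indexing with a single shape::getOffsetBroadcast call. The removed logic is easy to restate in isolation: convert the output index to coordinates, clamp a coordinate to 0 wherever an input has extent 1, and dot the result with that input's strides. A standalone sketch (plain C++, not the shape:: helpers; the shapes in main() are assumed for illustration):

#include <cstdio>
#include <vector>

// Offset of a broadcast input element for given output coordinates:
// size-1 dimensions of the input are pinned to index 0.
static long long offsetOf(const std::vector<long long>& zCoords,
                          const std::vector<long long>& inShape,
                          const std::vector<long long>& inStrides) {
    long long off = 0;
    for (size_t j = 0; j < zCoords.size(); ++j)
        off += (inShape[j] == 1 ? 0 : zCoords[j]) * inStrides[j];
    return off;
}

int main() {
    // z shape [2, 3] broadcast from x shape [2, 1]; c-order strides of x are [1, 1]
    std::vector<long long> zCoords{1, 2};
    std::vector<long long> xShape{2, 1}, xStrides{1, 1};
    printf("x offset for z[1][2] = %lld\n", offsetOf(zCoords, xShape, xStrides));   // 1
    return 0;
}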
@@ -665,24 +665,14 @@ static void execDefault(const X *x, const Nd4jLong *xShapeInfo, const X *y, cons
     const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
     const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
 
-    const int rank = shape::rank(zShapeInfo);   // xRank = yRank = zRank
-
     auto func = PRAGMA_THREADS_FOR{
 
-        int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+        int coords[MAX_RANK];
+        Nd4jLong xOffset, yOffset, zOffset;
 
         for (auto i = start; i < stop; ++i) {
 
-            shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
-
-            for (uint j = 0; j < rank; ++j) {
-                xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
-                yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
-            }
-
-            const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
-            const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
-            const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
+            shape::getOffsetBroadcast(start, i, zShapeInfo, xShapeInfo, yShapeInfo, xzSameOffsets, yzSameOffsets, coords, zOffset, xOffset, yOffset);
 
             z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
         }
@@ -651,24 +651,14 @@ static void execDefault(const X *x, const Nd4jLong *xShapeInfo, const X *y, cons
     const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
     const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
 
-    const int rank = shape::rank(zShapeInfo);   // xRank = yRank = zRank
-
     auto func = PRAGMA_THREADS_FOR{
 
-        int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+        int coords[MAX_RANK];
+        Nd4jLong xOffset, yOffset, zOffset;
 
         for (auto i = start; i < stop; ++i) {
 
-            shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
-
-            for (uint j = 0; j < rank; ++j) {
-                xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
-                yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
-            }
-
-            const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
-            const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
-            const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
+            shape::getOffsetBroadcast(start, i, zShapeInfo, xShapeInfo, yShapeInfo, xzSameOffsets, yzSameOffsets, coords, zOffset, xOffset, yOffset);
 
             z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
         }
@@ -114,71 +114,6 @@ namespace functions {
         DISPATCH_BY_OPNUM_TT(execScalar, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo), REDUCE_BOOL_OPS);
     }
 
-    template <typename X, typename Y>
-    void ReduceBoolFunction<X, Y>::exec(const int opNum,
-                            const void *x, const Nd4jLong *xShapeInfo,
-                            void *extraParams,
-                            void *z, const Nd4jLong *zShapeInfo,
-                            int *dimension, int dimensionLength,
-                            const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffset,
-                            int64_t start, int64_t stop) {
-        DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset, start, stop), REDUCE_BOOL_OPS);
-    }
-
-    template <typename X, typename Z>
-    template <typename OpType>
-    void _CUDA_H ReduceBoolFunction<X,Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
-                        void *vextraParams,
-                        void *vresult, const Nd4jLong *zShapeInfo,
-                        int *dimension, int dimensionLength,
-                        const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffset, int64_t start, int64_t stop) {
-
-        auto x = reinterpret_cast<const X *>(vx);
-        auto z = reinterpret_cast<Z *>(vresult);
-        auto extraParams = reinterpret_cast<X *>(vextraParams);
-
-        auto resultLength = shape::length(zShapeInfo);
-
-        if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) {
-            if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY)
-                return;
-            const auto startingVal = OpType::startingValue(x);
-
-            for (Nd4jLong i = 0; i < resultLength; i++)
-                z[i] = startingVal;
-            return;
-        }
-
-        //pre squeezed: this is for keeping the pointer to the original
-        //shape information for tad offset
-        //the squeezed information doesn't render the right strides for
-        //tad offset
-        // || tad.wholeThing
-        if (resultLength == 1 || dimension == nullptr || dimensionLength == shape::rank(xShapeInfo)) {
-            z[0] = execScalar<OpType>(x, xShapeInfo, extraParams);
-            return;
-        }
-
-        auto tadOnlyShapeInfo = tadShapeInfo;
-        auto tadOffsets = tadOffset;
-
-        if (tadOnlyShapeInfo == nullptr || tadOffsets == nullptr) {
-            if (dimensionLength < 1)
-                return;
-
-            auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(xShapeInfo, dimension, dimensionLength);
-            tadOnlyShapeInfo = tadPack.primaryShapeInfo();
-            tadOffsets = tadPack.primaryOffsets();
-        }
-
-#ifdef INLINE_LOOPS
-        sd::ReductionLoops<X,Z,X>::template loopReduce<OpType>(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop);
-#else
-        sd::ReductionBoolLoops<X,Z>::template innerloopReduce<OpType>(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop);
-#endif
-    }
-
-
     template <typename X, typename Z>
     template<typename OpType>
     void _CUDA_H ReduceBoolFunction<X,Z>::exec(const void *x, const Nd4jLong *xShapeInfo,
@@ -220,7 +155,51 @@ namespace functions {
         return OpType::postProcess(intermediate[0], length, extraParams);
     }
 
-        BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReduceBoolFunction, , LIBND4J_TYPES, BOOL_TYPES);
+    ////////////////////////////////////////////////////////////////////////
+    template <typename X, typename Z>
+    template <typename OpType>
+    void _CUDA_H ReduceBoolFunction<X,Z>::exec(sd::memory::Workspace* workspace, const void *vx, const Nd4jLong *xShapeInfo, void *vextraParams, void *vz, const Nd4jLong *zShapeInfo, const int* dims) {
+
+        const X* x = reinterpret_cast<const X*>(vx);
+        Z* z = reinterpret_cast<Z*>(vz);
+        X* extraParams = reinterpret_cast<X*>(vextraParams);
+
+        const int xRank = shape::rank(xShapeInfo);
+        const int zRank = shape::rank(zShapeInfo);
+
+        if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) {
+
+            const auto startingVal = OpType::startingValue(x);
+            const auto zLen = shape::length(zShapeInfo);
+
+            for (Nd4jLong i = 0; i < zLen; i++)
+                z[i] = startingVal;
+            return;
+        }
+
+        if (shape::length(zShapeInfo) == 1) {
+            z[0] = execScalar<OpType>(x, xShapeInfo, extraParams);
+            return;
+        }
+
+#ifdef INLINE_LOOPS
+        sd::ReductionLoops<X,Z,X>::template loopReduce<OpType>(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+#else
+        sd::ReductionBoolLoops<X,Z>::template innerloopReduce<OpType>(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    template <typename X, typename Y>
+    void ReduceBoolFunction<X,Y>::exec(const int opNum, sd::memory::Workspace* workspace, const void *vx, const Nd4jLong *xShapeInfo, void *vextraParams, void *vz, const Nd4jLong *zShapeInfo, const int *dims) {
+
+        DISPATCH_BY_OPNUM_TT(exec, PARAMS(workspace, vx, xShapeInfo, vextraParams, vz, zShapeInfo, dims), REDUCE_BOOL_OPS);
+    }
+
+    BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReduceBoolFunction, , LIBND4J_TYPES, BOOL_TYPES);
 }
 }
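Aside (not from the diff): in the new ReduceBoolFunction exec above, an empty input simply fills the whole output with the op's neutral starting value. A toy illustration of that branch follows; the op structs are simplified placeholders, not the simdOps definitions.

// Sketch only: logical AND is neutral at true, logical OR at false.
#include <cstdio>

struct AllOp { static bool startingValue() { return true;  } };   // neutral element of AND
struct AnyOp { static bool startingValue() { return false; } };   // neutral element of OR

template <typename OpType>
void fillForEmptyInput(bool* z, long zLen) {
    const bool startingVal = OpType::startingValue();
    for (long i = 0; i < zLen; ++i)
        z[i] = startingVal;            // mirrors the z[i] = startingVal loop in the hunk
}

int main() {
    bool out[3];
    fillForEmptyInput<AllOp>(out, 3);
    std::printf("All over empty input -> %d %d %d\n", out[0], out[1], out[2]);  // 1 1 1
    fillForEmptyInput<AnyOp>(out, 3);
    std::printf("Any over empty input -> %d %d %d\n", out[0], out[1], out[2]);  // 0 0 0
    return 0;
}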
@@ -26,11 +26,13 @@
 #include <helpers/OmpLaunchHelper.h>
 #include <helpers/Loops.h>
 #include <helpers/ConstantTadHelper.h>
+#include <helpers/ShapeBuilders.h>
 
 using namespace simdOps;
 
 namespace functions {
 namespace reduce {
 
     template <typename X, typename Z>
     template <typename OpType>
     void _CUDA_H ReduceFloatFunction<X,Z>::execScalar(const void *vx, const Nd4jLong *xShapeInfo,
@@ -133,86 +135,6 @@ namespace functions {
         DISPATCH_BY_OPNUM_TT(execScalar, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo), REDUCE_FLOAT_OPS);
     }
 
-    template <typename X, typename Y>
-    void ReduceFloatFunction<X, Y>::exec(const int opNum,
-                            const void *x, const Nd4jLong *xShapeInfo,
-                            void *extraParams,
-                            void *z, const Nd4jLong *zShapeInfo,
-                            int *dimension, int dimensionLength,
-                            const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffset,
-                            int64_t start, int64_t stop) {
-        DISPATCH_BY_OPNUM_TT(exec, PARAMS(x,
-                                    xShapeInfo,
-                                    extraParams,
-                                    z,
-                                    zShapeInfo,
-                                    dimension,
-                                    dimensionLength,
-                                    tadShapeInfo,
-                                    tadOffset, start, stop),
-                             REDUCE_FLOAT_OPS);
-    }
-
-    template <typename X, typename Z>
-    template <typename OpType>
-    void _CUDA_H ReduceFloatFunction<X,Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
-                        void *vextraParams,
-                        void *vresult, const Nd4jLong *zShapeInfo,
-                        int *dimension, int dimensionLength,
-                        const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffset,
-                        int64_t start, int64_t stop) {
-
-        auto x = reinterpret_cast<const X *>(vx);
-        auto z = reinterpret_cast<Z *>(vresult);
-        auto extraParams = reinterpret_cast<Z *>(vextraParams);
-
-        auto resultLength = shape::length(zShapeInfo);
-
-        if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) {
-            if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY)
-                return;
-            const auto startingVal = std::is_same<OpType, simdOps::Mean<X,Z>>::value ? sd::DataTypeUtils::nanOrZero<Z>() : static_cast<Z>(OpType::startingValue(x));
-
-            for (Nd4jLong i = 0; i < resultLength; i++)
-                z[i] = startingVal;
-            return;
-        }
-
-        //pre squeezed: this is for keeping the pointer to the original
-        //shape information for tad offset
-        //the squeezed information doesn't render the right strides for
-        //tad offset
-        // || tad.wholeThing
-        if (resultLength == 1 || dimension == nullptr || dimensionLength == shape::rank(xShapeInfo)) {
-            z[0] = execScalar<OpType>(x, xShapeInfo, extraParams);
-            return;
-        }
-
-        if (OpType::requiresSpecialAccumulation) {
-            OpType::execSpecial(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset);
-            return;
-        }
-
-        auto tadOnlyShapeInfo = tadShapeInfo;
-        auto tadOffsets = tadOffset;
-
-        if (tadOnlyShapeInfo == nullptr || tadOffsets == nullptr) {
-            if (dimensionLength < 0)
-                return;
-
-            auto tadPack = sd::ConstantTadHelper::getInstance().tadForDimensions(xShapeInfo, dimension, dimensionLength);
-            tadOnlyShapeInfo = tadPack.primaryShapeInfo();
-            tadOffsets = tadPack.primaryOffsets();
-        }
-
-#ifdef INLINE_LOOPS
-        sd::ReductionLoops<X,Z,Z>::template loopReduce<OpType>(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop);
-#else
-        sd::ReductionFloatLoops<X,Z>::template innerloopReduce<OpType>(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop);
-#endif
-    }
-
-
     template <typename X, typename Z>
     template<typename OpType>
     void _CUDA_H ReduceFloatFunction<X,Z>::exec(const void *x, const Nd4jLong *xShapeInfo,
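Aside (not from the diff): the removed float path above, like its replacement in the next hunk, routes ops that declare requiresSpecialAccumulation to OpType::execSpecial instead of the generic reduction loop. A minimal sketch of that compile-time flag dispatch follows, using C++17 if constexpr and placeholder op structs rather than the simdOps ones.

// Sketch only: per-op static flag selects the special path at compile time.
#include <cstdio>

struct PlainOp {
    static constexpr bool requiresSpecialAccumulation = false;
};
struct SpecialOp {                      // an op that declares it needs the special path
    static constexpr bool requiresSpecialAccumulation = true;
    static void execSpecial() { std::printf("special accumulation path\n"); }
};

template <typename OpType>
void execReduction() {
    if constexpr (OpType::requiresSpecialAccumulation) {   // branch discarded per op at compile time
        OpType::execSpecial();
        return;
    }
    std::printf("generic reduction loop\n");
}

int main() {
    execReduction<PlainOp>();     // generic reduction loop
    execReduction<SpecialOp>();   // special accumulation path
    return 0;
}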
@@ -255,5 +177,54 @@ namespace functions {
         // return result
         return OpType::postProcess(intermediate[0], length, extraParams);
     }
 
+    ////////////////////////////////////////////////////////////////////////
+    template <typename X, typename Z>
+    template<typename OpType>
+    void _CUDA_H ReduceFloatFunction<X, Z>::exec(sd::memory::Workspace* workspace, const void *vx, const Nd4jLong *xShapeInfo, void *vextraParams, void *vz, const Nd4jLong *zShapeInfo, const int* dims) {
+
+        const X* x = reinterpret_cast<const X*>(vx);
+        Z* z = reinterpret_cast<Z*>(vz);
+        Z* extraParams = reinterpret_cast<Z*>(vextraParams);
+
+        const int xRank = shape::rank(xShapeInfo);
+        const int zRank = shape::rank(zShapeInfo);
+
+        if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) {
+
+            const auto startingVal = std::is_same<OpType, simdOps::Mean<X,Z>>::value ? sd::DataTypeUtils::nanOrZero<Z>() : static_cast<Z>(OpType::startingValue(x));
+            const auto zLen = shape::length(zShapeInfo);
+
+            for (Nd4jLong i = 0; i < zLen; i++)
+                z[i] = startingVal;
+            return;
+        }
+
+        if (shape::length(zShapeInfo) == 1) {
+            z[0] = execScalar<OpType>(x, xShapeInfo, extraParams);
+            return;
+        }
+
+        if (OpType::requiresSpecialAccumulation) {
+            OpType::execSpecial(x, xShapeInfo, extraParams, z, zShapeInfo, const_cast<int*>(dims)+zRank, xRank-zRank, nullptr, nullptr);
+            return;
+        }
+
+#ifdef INLINE_LOOPS
+        sd::ReductionLoops<X,Z,Z>::template loopReduce<OpType>(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+#else
+        sd::ReductionFloatLoops<X,Z>::template innerloopReduce<OpType>(workspace, x, xShapeInfo, z, zShapeInfo, dims, extraParams);
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    template <typename X, typename Y>
+    void ReduceFloatFunction<X, Y>::exec(const int opNum, sd::memory::Workspace* workspace, const void *vx, const Nd4jLong *xShapeInfo, void *vextraParams, void *vz, const Nd4jLong *zShapeInfo, const int *dims) {
+
+        DISPATCH_BY_OPNUM_TT(exec, PARAMS(workspace, vx, xShapeInfo, vextraParams, vz, zShapeInfo, dims), REDUCE_FLOAT_OPS);
+    }
+
 }
 }
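Aside (not from the diff): the empty-input branch of the new float exec seeds Mean with sd::DataTypeUtils::nanOrZero<Z>() and every other op with its startingValue. Assuming nanOrZero yields NaN for floating-point Z (the mean of nothing is undefined) and a neutral value otherwise, the selection boils down to the following toy sketch; the op structs are simplified stand-ins for the simdOps and DataTypeUtils machinery.

// Sketch only: covers the floating-point case.
#include <cstdio>
#include <limits>
#include <type_traits>

template <typename X, typename Z> struct Mean { static Z startingValue(const X*) { return Z(0); } };
template <typename X, typename Z> struct Sum  { static Z startingValue(const X*) { return Z(0); } };

template <typename OpType, typename X, typename Z>
Z emptyInputValue(const X* x) {
    // Mean over an empty array is reported as NaN; other reductions use their neutral value.
    return std::is_same<OpType, Mean<X, Z>>::value
               ? std::numeric_limits<Z>::quiet_NaN()
               : static_cast<Z>(OpType::startingValue(x));
}

int main() {
    const float* noData = nullptr;   // empty input: the pointer is never dereferenced
    std::printf("Mean over empty: %f\n", emptyInputValue<Mean<float, float>, float, float>(noData));
    std::printf("Sum  over empty: %f\n", emptyInputValue<Sum<float, float>,  float, float>(noData));
    return 0;
}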