diff --git a/arbiter/arbiter-core/src/main/java/org/deeplearning4j/arbiter/util/WebUtils.java b/arbiter/arbiter-core/src/main/java/org/deeplearning4j/arbiter/util/WebUtils.java deleted file mode 100644 index 632ad7474..000000000 --- a/arbiter/arbiter-core/src/main/java/org/deeplearning4j/arbiter/util/WebUtils.java +++ /dev/null @@ -1,47 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -package org.deeplearning4j.arbiter.util; - -import org.slf4j.Logger; - -import java.awt.*; -import java.net.URI; - -/** - * Various utilities for webpages and dealing with browsers - */ -public class WebUtils { - - public static void tryOpenBrowser(String path, Logger log) { - try { - WebUtils.openBrowser(new URI(path)); - } catch (Exception e) { - log.error("Could not open browser", e); - System.out.println("Browser could not be launched automatically.\nUI path: " + path); - } - } - - public static void openBrowser(URI uri) throws Exception { - if (Desktop.isDesktopSupported()) { - Desktop.getDesktop().browse(uri); - } else { - throw new UnsupportedOperationException( - "Cannot open browser on this platform: Desktop.isDesktopSupported() == false"); - } - } - -} diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/BraninFunction.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/BraninFunction.java index f49a8051d..4d507ee7d 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/BraninFunction.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/BraninFunction.java @@ -127,7 +127,7 @@ public class BraninFunction { BraninConfig candidate = (BraninConfig) c.getValue(); double score = scoreFunction.score(candidate, null, (Map) null); - System.out.println(candidate.getX1() + "\t" + candidate.getX2() + "\t" + score); +// System.out.println(candidate.getX1() + "\t" + candidate.getX2() + "\t" + score); Thread.sleep(20); diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/TestRandomSearch.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/TestRandomSearch.java index 34916ebdc..99d2ad8d7 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/TestRandomSearch.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/TestRandomSearch.java @@ -54,7 +54,7 @@ public class TestRandomSearch extends BaseDL4JTest { runner.execute(); - System.out.println("----- Complete -----"); +// System.out.println("----- Complete -----"); } diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/TestRandomGenerator.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/TestRandomGenerator.java index 2055ede57..abeba96e8 100644 
--- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/TestRandomGenerator.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/TestRandomGenerator.java @@ -16,8 +16,8 @@ package org.deeplearning4j.arbiter.optimize.genetic; +import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.math3.random.RandomGenerator; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; public class TestRandomGenerator implements RandomGenerator { private final int[] intRandomNumbers; @@ -63,17 +63,17 @@ public class TestRandomGenerator implements RandomGenerator { @Override public long nextLong() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } @Override public boolean nextBoolean() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } @Override public float nextFloat() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } @Override @@ -83,6 +83,6 @@ public class TestRandomGenerator implements RandomGenerator { @Override public double nextGaussian() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } } diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/crossover/TwoParentsCrossoverOperatorTests.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/crossover/TwoParentsCrossoverOperatorTests.java index 9efe89620..9bde211f0 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/crossover/TwoParentsCrossoverOperatorTests.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/crossover/TwoParentsCrossoverOperatorTests.java @@ -16,6 +16,7 @@ package org.deeplearning4j.arbiter.optimize.genetic.crossover; +import org.apache.commons.lang3.NotImplementedException; import org.deeplearning4j.BaseDL4JTest; import org.deeplearning4j.arbiter.optimize.generator.genetic.crossover.CrossoverResult; import org.deeplearning4j.arbiter.optimize.generator.genetic.crossover.TwoParentsCrossoverOperator; @@ -26,7 +27,6 @@ import org.deeplearning4j.arbiter.optimize.genetic.TestParentSelection; import org.deeplearning4j.arbiter.optimize.genetic.TestPopulationInitializer; import org.junit.Assert; import org.junit.Test; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; public class TwoParentsCrossoverOperatorTests extends BaseDL4JTest { @@ -42,7 +42,7 @@ public class TwoParentsCrossoverOperatorTests extends BaseDL4JTest { @Override public CrossoverResult crossover() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } } diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/culling/RatioCullOperatorTests.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/culling/RatioCullOperatorTests.java index 093ffd486..c85022dca 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/culling/RatioCullOperatorTests.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/culling/RatioCullOperatorTests.java @@ -16,6 +16,7 @@ package org.deeplearning4j.arbiter.optimize.genetic.culling; +import org.apache.commons.lang3.NotImplementedException; import org.deeplearning4j.BaseDL4JTest; import 
org.deeplearning4j.arbiter.optimize.generator.genetic.Chromosome; import org.deeplearning4j.arbiter.optimize.generator.genetic.culling.RatioCullOperator; @@ -24,7 +25,6 @@ import org.deeplearning4j.arbiter.optimize.generator.genetic.population.Populati import org.deeplearning4j.arbiter.optimize.genetic.TestPopulationInitializer; import org.junit.Assert; import org.junit.Test; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; import java.util.List; @@ -46,7 +46,7 @@ public class RatioCullOperatorTests extends BaseDL4JTest { @Override public void cullPopulation() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } public double getCullRatio() { diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/GeneticSelectionOperatorTests.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/GeneticSelectionOperatorTests.java index 1d2b74de9..ddd0ae91e 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/GeneticSelectionOperatorTests.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/GeneticSelectionOperatorTests.java @@ -16,6 +16,7 @@ package org.deeplearning4j.arbiter.optimize.genetic.selection; +import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.math3.random.RandomGenerator; import org.deeplearning4j.BaseDL4JTest; import org.deeplearning4j.arbiter.optimize.generator.genetic.ChromosomeFactory; @@ -33,7 +34,6 @@ import org.deeplearning4j.arbiter.optimize.genetic.TestPopulationInitializer; import org.deeplearning4j.arbiter.optimize.genetic.TestRandomGenerator; import org.junit.Assert; import org.junit.Test; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; import static org.junit.Assert.assertArrayEquals; @@ -55,7 +55,7 @@ public class GeneticSelectionOperatorTests extends BaseDL4JTest { @Override public void cullPopulation() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } @Override diff --git a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/SelectionOperatorTests.java b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/SelectionOperatorTests.java index 3f64279ee..5d8a8b361 100644 --- a/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/SelectionOperatorTests.java +++ b/arbiter/arbiter-core/src/test/java/org/deeplearning4j/arbiter/optimize/genetic/selection/SelectionOperatorTests.java @@ -16,6 +16,7 @@ package org.deeplearning4j.arbiter.optimize.genetic.selection; +import org.apache.commons.lang3.NotImplementedException; import org.deeplearning4j.BaseDL4JTest; import org.deeplearning4j.arbiter.optimize.generator.genetic.ChromosomeFactory; import org.deeplearning4j.arbiter.optimize.generator.genetic.population.PopulationInitializer; @@ -24,7 +25,6 @@ import org.deeplearning4j.arbiter.optimize.generator.genetic.selection.Selection import org.deeplearning4j.arbiter.optimize.genetic.TestPopulationInitializer; import org.junit.Assert; import org.junit.Test; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; public class SelectionOperatorTests extends BaseDL4JTest { private class TestSelectionOperator extends SelectionOperator { @@ -39,7 +39,7 @@ public class SelectionOperatorTests extends BaseDL4JTest { @Override 
public double[] buildNextGenes() { - throw new NotImplementedException(); + throw new NotImplementedException("Not implemented"); } } diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestComputationGraphSpace.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestComputationGraphSpace.java index 7c4ec38f4..54d73b775 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestComputationGraphSpace.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestComputationGraphSpace.java @@ -158,7 +158,7 @@ public class TestComputationGraphSpace extends BaseDL4JTest { } } - System.out.println("ReLU vs. Tanh: " + reluCount + "\t" + tanhCount); +// System.out.println("ReLU vs. Tanh: " + reluCount + "\t" + tanhCount); assertTrue(reluCount > 0); assertTrue(tanhCount > 0); diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecution.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecution.java index 1747b45f9..391139f32 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecution.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecution.java @@ -162,7 +162,7 @@ public class TestGraphLocalExecution extends BaseDL4JTest { List results = runner.getResults(); assertTrue(results.size() > 0); - System.out.println("----- COMPLETE - " + results.size() + " results -----"); +// System.out.println("----- COMPLETE - " + results.size() + " results -----"); } } diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecutionGenetic.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecutionGenetic.java index 2b9c5696d..91daa027f 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecutionGenetic.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/computationgraph/TestGraphLocalExecutionGenetic.java @@ -165,7 +165,7 @@ public class TestGraphLocalExecutionGenetic extends BaseDL4JTest { List results = runner.getResults(); assertTrue(results.size() > 0); - System.out.println("----- COMPLETE - " + results.size() + " results -----"); +// System.out.println("----- COMPLETE - " + results.size() + " results -----"); } } diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestLayerSpace.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestLayerSpace.java index 6a5458e65..959cafc35 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestLayerSpace.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestLayerSpace.java @@ -101,7 +101,7 @@ public class TestLayerSpace extends BaseDL4JTest { double l2 = TestUtils.getL2(l); IActivation activation = l.getActivationFn(); - System.out.println(lr + "\t" + l2 + "\t" + activation); +// System.out.println(lr + "\t" + l2 + "\t" + activation); assertTrue(lr >= 0.3 && lr <= 0.4); assertTrue(l2 >= 0.01 && l2 <= 0.1); @@ -190,7 +190,7 @@ public 
class TestLayerSpace extends BaseDL4JTest { ActivationLayer al = als.getValue(d); IActivation activation = al.getActivationFn(); - System.out.println(activation); +// System.out.println(activation); assertTrue(containsActivationFunction(actFns, activation)); } @@ -228,7 +228,7 @@ public class TestLayerSpace extends BaseDL4JTest { IActivation activation = el.getActivationFn(); long nOut = el.getNOut(); - System.out.println(activation + "\t" + nOut); +// System.out.println(activation + "\t" + nOut); assertTrue(containsActivationFunction(actFns, activation)); assertTrue(nOut >= 10 && nOut <= 20); @@ -295,7 +295,7 @@ public class TestLayerSpace extends BaseDL4JTest { long nOut = el.getNOut(); double forgetGate = el.getForgetGateBiasInit(); - System.out.println(activation + "\t" + nOut + "\t" + forgetGate); +// System.out.println(activation + "\t" + nOut + "\t" + forgetGate); assertTrue(containsActivationFunction(actFns, activation)); assertTrue(nOut >= 10 && nOut <= 20); diff --git a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestMultiLayerSpace.java b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestMultiLayerSpace.java index 99dc79f42..d4dbe9a3a 100644 --- a/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestMultiLayerSpace.java +++ b/arbiter/arbiter-deeplearning4j/src/test/java/org/deeplearning4j/arbiter/multilayernetwork/TestMultiLayerSpace.java @@ -293,8 +293,8 @@ public class TestMultiLayerSpace extends BaseDL4JTest { assertTrue(nLayerCounts[i] >= 5); //Expect approx equal (50/3 each), but some variation randomly } - System.out.println("Number of layers: " + Arrays.toString(nLayerCounts)); - System.out.println("ReLU vs. Tanh: " + reluCount + "\t" + tanhCount); +// System.out.println("Number of layers: " + Arrays.toString(nLayerCounts)); +// System.out.println("ReLU vs. Tanh: " + reluCount + "\t" + tanhCount); } diff --git a/arbiter/arbiter-server/src/test/java/org/deeplearning4j/arbiter/server/ArbiterCLIRunnerTest.java b/arbiter/arbiter-server/src/test/java/org/deeplearning4j/arbiter/server/ArbiterCLIRunnerTest.java index 21e4e402a..40e8a3e41 100644 --- a/arbiter/arbiter-server/src/test/java/org/deeplearning4j/arbiter/server/ArbiterCLIRunnerTest.java +++ b/arbiter/arbiter-server/src/test/java/org/deeplearning4j/arbiter/server/ArbiterCLIRunnerTest.java @@ -98,7 +98,8 @@ public class ArbiterCLIRunnerTest extends BaseDL4JTest { assertEquals(configuration,OptimizationConfiguration.fromJson(configuration.toJson())); FileUtils.writeStringToFile(new File(configPath),configuration.toJson()); - System.out.println(configuration.toJson()); +// System.out.println(configuration.toJson()); + configuration.toJson(); log.info("Starting test"); cliRunner.runMain( diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java new file mode 100644 index 000000000..637f5860f --- /dev/null +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2015-2019 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.deeplearning4j.regressiontest; + +import org.deeplearning4j.BaseDL4JTest; +import org.deeplearning4j.TestUtils; +import org.deeplearning4j.nn.conf.BackpropType; +import org.deeplearning4j.nn.conf.ConvolutionMode; +import org.deeplearning4j.nn.conf.graph.LayerVertex; +import org.deeplearning4j.nn.conf.layers.*; +import org.deeplearning4j.nn.conf.layers.convolutional.Cropping2D; +import org.deeplearning4j.nn.conf.layers.recurrent.Bidirectional; +import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn; +import org.deeplearning4j.nn.conf.layers.variational.VariationalAutoencoder; +import org.deeplearning4j.nn.graph.ComputationGraph; +import org.deeplearning4j.nn.graph.vertex.impl.MergeVertex; +import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; +import org.deeplearning4j.nn.weights.WeightInitXavier; +import org.deeplearning4j.regressiontest.customlayer100a.CustomLayer; +import org.junit.Test; +import org.nd4j.linalg.activations.impl.*; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.learning.config.Adam; +import org.nd4j.linalg.learning.config.RmsProp; +import org.nd4j.linalg.learning.regularization.L2Regularization; +import org.nd4j.linalg.lossfunctions.impl.LossMAE; +import org.nd4j.linalg.lossfunctions.impl.LossMCXENT; +import org.nd4j.resources.Resources; + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; + +import static org.junit.Assert.*; + +public class RegressionTest100b6 extends BaseDL4JTest { + + @Override + public DataType getDataType() { + return DataType.FLOAT; + } + + @Test + public void testCustomLayer() throws Exception { + + for (DataType dtype : new DataType[]{DataType.DOUBLE, DataType.FLOAT, DataType.HALF}) { + + String dtypeName = dtype.toString().toLowerCase(); + + File f = Resources.asFile("regression_testing/100b6/CustomLayerExample_100b6_" + dtypeName + ".bin"); + MultiLayerNetwork.load(f, true); + + MultiLayerNetwork net = MultiLayerNetwork.load(f, true); +// net = net.clone(); + + DenseLayer l0 = (DenseLayer) net.getLayer(0).conf().getLayer(); + assertEquals(new ActivationTanH(), l0.getActivationFn()); + assertEquals(new L2Regularization(0.03), TestUtils.getL2Reg(l0)); + assertEquals(new RmsProp(0.95), l0.getIUpdater()); + + CustomLayer l1 = (CustomLayer) net.getLayer(1).conf().getLayer(); + assertEquals(new ActivationTanH(), l1.getActivationFn()); + assertEquals(new ActivationSigmoid(), l1.getSecondActivationFunction()); + assertEquals(new RmsProp(0.95), l1.getIUpdater()); + + INDArray outExp; + File f2 = Resources + .asFile("regression_testing/100b6/CustomLayerExample_Output_100b6_" + dtypeName + ".bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/CustomLayerExample_Input_100b6_" + dtypeName + ".bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + assertEquals(dtype, in.dataType()); + assertEquals(dtype, outExp.dataType()); + 
assertEquals(dtype, net.params().dataType()); + assertEquals(dtype, net.getFlattenedGradients().dataType()); + assertEquals(dtype, net.getUpdater().getStateViewArray().dataType()); + + //System.out.println(Arrays.toString(net.params().data().asFloat())); + + INDArray outAct = net.output(in); + assertEquals(dtype, outAct.dataType()); + + assertEquals(dtype, net.getLayerWiseConfigurations().getDataType()); + assertEquals(dtype, net.params().dataType()); + boolean eq = outExp.equalsWithEps(outAct, 0.01); + assertTrue(outExp + " vs " + outAct, eq); } + } + + + @Test + public void testLSTM() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/GravesLSTMCharModelingExample_100b6.bin"); + MultiLayerNetwork net = MultiLayerNetwork.load(f, true); + + LSTM l0 = (LSTM) net.getLayer(0).conf().getLayer(); + assertEquals(new ActivationTanH(), l0.getActivationFn()); + assertEquals(200, l0.getNOut()); + assertEquals(new WeightInitXavier(), l0.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l0)); + assertEquals(new Adam(0.005), l0.getIUpdater()); + + LSTM l1 = (LSTM) net.getLayer(1).conf().getLayer(); + assertEquals(new ActivationTanH(), l1.getActivationFn()); + assertEquals(200, l1.getNOut()); + assertEquals(new WeightInitXavier(), l1.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l1)); + assertEquals(new Adam(0.005), l1.getIUpdater()); + + RnnOutputLayer l2 = (RnnOutputLayer) net.getLayer(2).conf().getLayer(); + assertEquals(new ActivationSoftmax(), l2.getActivationFn()); + assertEquals(77, l2.getNOut()); + assertEquals(new WeightInitXavier(), l2.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l2)); + assertEquals(new Adam(0.005), l2.getIUpdater()); + + assertEquals(BackpropType.TruncatedBPTT, net.getLayerWiseConfigurations().getBackpropType()); + assertEquals(50, net.getLayerWiseConfigurations().getTbpttBackLength()); + assertEquals(50, net.getLayerWiseConfigurations().getTbpttFwdLength()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/GravesLSTMCharModelingExample_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/GravesLSTMCharModelingExample_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.output(in); + + assertEquals(outExp, outAct); + } + + @Test + public void testVae() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/VaeMNISTAnomaly_100b6.bin"); + MultiLayerNetwork net = MultiLayerNetwork.load(f, true); + + VariationalAutoencoder l0 = (VariationalAutoencoder) net.getLayer(0).conf().getLayer(); + assertEquals(new ActivationLReLU(), l0.getActivationFn()); + assertEquals(32, l0.getNOut()); + assertArrayEquals(new int[]{256, 256}, l0.getEncoderLayerSizes()); + assertArrayEquals(new int[]{256, 256}, l0.getDecoderLayerSizes()); + assertEquals(new WeightInitXavier(), l0.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l0)); + assertEquals(new Adam(1e-3), l0.getIUpdater()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/VaeMNISTAnomaly_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = 
Resources.asFile("regression_testing/100b6/VaeMNISTAnomaly_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.output(in); + + assertEquals(outExp, outAct); + } + + + @Test + public void testYoloHouseNumber() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/HouseNumberDetection_100b6.bin"); + ComputationGraph net = ComputationGraph.load(f, true); + + int nBoxes = 5; + int nClasses = 10; + + ConvolutionLayer cl = (ConvolutionLayer) ((LayerVertex) net.getConfiguration().getVertices() + .get("convolution2d_9")).getLayerConf().getLayer(); + assertEquals(nBoxes * (5 + nClasses), cl.getNOut()); + assertEquals(new ActivationIdentity(), cl.getActivationFn()); + assertEquals(ConvolutionMode.Same, cl.getConvolutionMode()); + assertEquals(new WeightInitXavier(), cl.getWeightInitFn()); + assertArrayEquals(new int[]{1, 1}, cl.getKernelSize()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/HouseNumberDetection_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/HouseNumberDetection_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.outputSingle(in); + + boolean eq = outExp.equalsWithEps(outAct.castTo(outExp.dataType()), 1e-3); + assertTrue(eq); + } + + @Test + public void testSyntheticCNN() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/SyntheticCNN_100b6.bin"); + MultiLayerNetwork net = MultiLayerNetwork.load(f, true); + + ConvolutionLayer l0 = (ConvolutionLayer) net.getLayer(0).conf().getLayer(); + assertEquals(new ActivationReLU(), l0.getActivationFn()); + assertEquals(4, l0.getNOut()); + assertEquals(new WeightInitXavier(), l0.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l0)); + assertEquals(new Adam(0.005), l0.getIUpdater()); + assertArrayEquals(new int[]{3, 3}, l0.getKernelSize()); + assertArrayEquals(new int[]{2, 1}, l0.getStride()); + assertArrayEquals(new int[]{1, 1}, l0.getDilation()); + assertArrayEquals(new int[]{0, 0}, l0.getPadding()); + + SeparableConvolution2D l1 = (SeparableConvolution2D) net.getLayer(1).conf().getLayer(); + assertEquals(new ActivationReLU(), l1.getActivationFn()); + assertEquals(8, l1.getNOut()); + assertEquals(new WeightInitXavier(), l1.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l1)); + assertEquals(new Adam(0.005), l1.getIUpdater()); + assertArrayEquals(new int[]{3, 3}, l1.getKernelSize()); + assertArrayEquals(new int[]{1, 1}, l1.getStride()); + assertArrayEquals(new int[]{1, 1}, l1.getDilation()); + assertArrayEquals(new int[]{0, 0}, l1.getPadding()); + assertEquals(ConvolutionMode.Same, l1.getConvolutionMode()); + assertEquals(1, l1.getDepthMultiplier()); + + SubsamplingLayer l2 = (SubsamplingLayer) net.getLayer(2).conf().getLayer(); + assertArrayEquals(new int[]{3, 3}, l2.getKernelSize()); + assertArrayEquals(new int[]{2, 2}, l2.getStride()); + assertArrayEquals(new int[]{1, 1}, l2.getDilation()); + assertArrayEquals(new int[]{0, 0}, l2.getPadding()); + assertEquals(PoolingType.MAX, l2.getPoolingType()); + + ZeroPaddingLayer l3 = (ZeroPaddingLayer) net.getLayer(3).conf().getLayer(); + assertArrayEquals(new int[]{4, 4, 4, 4}, l3.getPadding()); + + Upsampling2D l4 = 
(Upsampling2D) net.getLayer(4).conf().getLayer(); + assertArrayEquals(new int[]{3, 3}, l4.getSize()); + + DepthwiseConvolution2D l5 = (DepthwiseConvolution2D) net.getLayer(5).conf().getLayer(); + assertEquals(new ActivationReLU(), l5.getActivationFn()); + assertEquals(16, l5.getNOut()); + assertEquals(new WeightInitXavier(), l5.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l5)); + assertEquals(new Adam(0.005), l5.getIUpdater()); + assertArrayEquals(new int[]{3, 3}, l5.getKernelSize()); + assertArrayEquals(new int[]{1, 1}, l5.getStride()); + assertArrayEquals(new int[]{1, 1}, l5.getDilation()); + assertArrayEquals(new int[]{0, 0}, l5.getPadding()); + assertEquals(2, l5.getDepthMultiplier()); + + SubsamplingLayer l6 = (SubsamplingLayer) net.getLayer(6).conf().getLayer(); + assertArrayEquals(new int[]{2, 2}, l6.getKernelSize()); + assertArrayEquals(new int[]{2, 2}, l6.getStride()); + assertArrayEquals(new int[]{1, 1}, l6.getDilation()); + assertArrayEquals(new int[]{0, 0}, l6.getPadding()); + assertEquals(PoolingType.MAX, l6.getPoolingType()); + + Cropping2D l7 = (Cropping2D) net.getLayer(7).conf().getLayer(); + assertArrayEquals(new int[]{3, 3, 2, 2}, l7.getCropping()); + + ConvolutionLayer l8 = (ConvolutionLayer) net.getLayer(8).conf().getLayer(); + assertEquals(4, l8.getNOut()); + assertEquals(new WeightInitXavier(), l8.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l8)); + assertEquals(new Adam(0.005), l8.getIUpdater()); + assertArrayEquals(new int[]{4, 4}, l8.getKernelSize()); + assertArrayEquals(new int[]{1, 1}, l8.getStride()); + assertArrayEquals(new int[]{1, 1}, l8.getDilation()); + assertArrayEquals(new int[]{0, 0}, l8.getPadding()); + + CnnLossLayer l9 = (CnnLossLayer) net.getLayer(9).conf().getLayer(); + assertEquals(new WeightInitXavier(), l9.getWeightInitFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l9)); + assertEquals(new Adam(0.005), l9.getIUpdater()); + assertEquals(new LossMAE(), l9.getLossFn()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/SyntheticCNN_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/SyntheticCNN_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.output(in); + + //19 layers - CPU vs. 
GPU difference accumulates notably, but appears to be correct + if(Nd4j.getBackend().getClass().getName().toLowerCase().contains("native")){ + assertEquals(outExp, outAct); + } else { + boolean eq = outExp.equalsWithEps(outAct, 0.1); + assertTrue(eq); + } + } + + @Test + public void testSyntheticBidirectionalRNNGraph() throws Exception { + + File f = Resources.asFile("regression_testing/100b6/SyntheticBidirectionalRNNGraph_100b6.bin"); + ComputationGraph net = ComputationGraph.load(f, true); + + Bidirectional l0 = (Bidirectional) net.getLayer("rnn1").conf().getLayer(); + + LSTM l1 = (LSTM) l0.getFwd(); + assertEquals(16, l1.getNOut()); + assertEquals(new ActivationReLU(), l1.getActivationFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l1)); + + LSTM l2 = (LSTM) l0.getBwd(); + assertEquals(16, l2.getNOut()); + assertEquals(new ActivationReLU(), l2.getActivationFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l2)); + + Bidirectional l3 = (Bidirectional) net.getLayer("rnn2").conf().getLayer(); + + SimpleRnn l4 = (SimpleRnn) l3.getFwd(); + assertEquals(16, l4.getNOut()); + assertEquals(new ActivationReLU(), l4.getActivationFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l4)); + + SimpleRnn l5 = (SimpleRnn) l3.getBwd(); + assertEquals(16, l5.getNOut()); + assertEquals(new ActivationReLU(), l5.getActivationFn()); + assertEquals(new L2Regularization(0.0001), TestUtils.getL2Reg(l5)); + + MergeVertex mv = (MergeVertex) net.getVertex("concat"); + + GlobalPoolingLayer gpl = (GlobalPoolingLayer) net.getLayer("pooling").conf().getLayer(); + assertEquals(PoolingType.MAX, gpl.getPoolingType()); + assertArrayEquals(new int[]{2}, gpl.getPoolingDimensions()); + assertTrue(gpl.isCollapseDimensions()); + + OutputLayer outl = (OutputLayer) net.getLayer("out").conf().getLayer(); + assertEquals(3, outl.getNOut()); + assertEquals(new LossMCXENT(), outl.getLossFn()); + + INDArray outExp; + File f2 = Resources.asFile("regression_testing/100b6/SyntheticBidirectionalRNNGraph_Output_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f2))) { + outExp = Nd4j.read(dis); + } + + INDArray in; + File f3 = Resources.asFile("regression_testing/100b6/SyntheticBidirectionalRNNGraph_Input_100b6.bin"); + try (DataInputStream dis = new DataInputStream(new FileInputStream(f3))) { + in = Nd4j.read(dis); + } + + INDArray outAct = net.output(in)[0]; + + assertEquals(outExp, outAct); + } +} diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoading.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoading.java index 1a5a27918..f4c970a22 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoading.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoading.java @@ -41,7 +41,7 @@ public class TestGraphLoading extends BaseDL4JTest { IGraph graph = GraphLoader .loadUndirectedGraphEdgeListFile(cpr.getTempFileFromArchive().getAbsolutePath(), 7, ","); - System.out.println(graph); +// System.out.println(graph); assertEquals(graph.numVertices(), 7); int[][] edges = {{1, 2}, {0, 2, 4}, {0, 1, 3, 4}, {2, 4, 5}, {1, 2, 3, 5, 6}, {3, 4, 6}, {4, 5}}; @@ -66,7 +66,7 @@ public class TestGraphLoading extends BaseDL4JTest { edgeLineProcessor, vertexFactory, 10, false); - System.out.println(graph); +// System.out.println(graph); for (int i = 0; i < 10; i++) { List> 
edges = graph.getEdgesOut(i); @@ -111,7 +111,7 @@ public class TestGraphLoading extends BaseDL4JTest { Graph graph = GraphLoader.loadGraph(verticesCPR.getTempFileFromArchive().getAbsolutePath(), edgesCPR.getTempFileFromArchive().getAbsolutePath(), vertexLoader, edgeLineProcessor, false); - System.out.println(graph); +// System.out.println(graph); for (int i = 0; i < 10; i++) { List> edges = graph.getEdgesOut(i); diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoadingWeighted.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoadingWeighted.java index 94e1a20bf..77903f51e 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoadingWeighted.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/data/TestGraphLoadingWeighted.java @@ -71,7 +71,7 @@ public class TestGraphLoadingWeighted extends BaseDL4JTest { } } - System.out.println(graph); +// System.out.println(graph); } diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/graph/TestGraph.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/graph/TestGraph.java index 0dc456107..b0adf3283 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/graph/TestGraph.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/graph/TestGraph.java @@ -220,7 +220,7 @@ public class TestGraph extends BaseDL4JTest { sum += transitionProb[i][j]; for (int j = 0; j < transitionProb[i].length; j++) transitionProb[i][j] /= sum; - System.out.println(Arrays.toString(transitionProb[i])); +// System.out.println(Arrays.toString(transitionProb[i])); } //Check that transition probs are essentially correct (within bounds of random variation) diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/DeepWalkGradientCheck.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/DeepWalkGradientCheck.java index c1aedd47a..f0343bde9 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/DeepWalkGradientCheck.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/DeepWalkGradientCheck.java @@ -145,8 +145,8 @@ public class DeepWalkGradientCheck extends BaseDL4JTest { if (relError > MAX_REL_ERROR && absErr > MIN_ABS_ERROR) fail(msg); - else - System.out.println(msg); +// else +// System.out.println(msg); } } @@ -333,10 +333,10 @@ public class DeepWalkGradientCheck extends BaseDL4JTest { if (relError > MAX_REL_ERROR && absErr > MIN_ABS_ERROR) fail(msg); - else - System.out.println(msg); +// else +// System.out.println(msg); } - System.out.println(); +// System.out.println(); } } diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestDeepWalk.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestDeepWalk.java index d92c3bec1..97359cf15 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestDeepWalk.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestDeepWalk.java @@ -67,7 +67,7 @@ public class TestDeepWalk extends BaseDL4JTest { for (int i = 0; i < 7; i++) { INDArray vector = 
deepWalk.getVertexVector(i); assertArrayEquals(new long[] {vectorSize}, vector.shape()); - System.out.println(Arrays.toString(vector.dup().data().asFloat())); +// System.out.println(Arrays.toString(vector.dup().data().asFloat())); } GraphWalkIterator iter = new RandomWalkIterator<>(graph, 8); @@ -77,11 +77,11 @@ public class TestDeepWalk extends BaseDL4JTest { for (int t = 0; t < 5; t++) { iter.reset(); deepWalk.fit(iter); - System.out.println("--------------------"); +// System.out.println("--------------------"); for (int i = 0; i < 7; i++) { INDArray vector = deepWalk.getVertexVector(i); assertArrayEquals(new long[] {vectorSize}, vector.shape()); - System.out.println(Arrays.toString(vector.dup().data().asFloat())); +// System.out.println(Arrays.toString(vector.dup().data().asFloat())); } } } @@ -160,7 +160,7 @@ public class TestDeepWalk extends BaseDL4JTest { continue; double sim = deepWalk.similarity(i, nearestTo); - System.out.println(i + "\t" + nearestTo + "\t" + sim); +// System.out.println(i + "\t" + nearestTo + "\t" + sim); assertTrue(sim <= minSimNearest); } } @@ -211,7 +211,7 @@ public class TestDeepWalk extends BaseDL4JTest { Graph graph = GraphLoader .loadUndirectedGraphEdgeListFile(cpr.getTempFileFromArchive().getAbsolutePath(), 13, ","); - System.out.println(graph); +// System.out.println(graph); Nd4j.getRandom().setSeed(12345); @@ -229,11 +229,13 @@ public class TestDeepWalk extends BaseDL4JTest { //Calculate similarity(0,i) for (int i = 0; i < nVertices; i++) { - System.out.println(deepWalk.similarity(0, i)); +// System.out.println(deepWalk.similarity(0, i)); + deepWalk.similarity(0, i); } for (int i = 0; i < nVertices; i++) - System.out.println(deepWalk.getVertexVector(i)); +// System.out.println(deepWalk.getVertexVector(i)); + deepWalk.getVertexVector(i); } @Test(timeout = 60000L) diff --git a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestGraphHuffman.java b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestGraphHuffman.java index 763aae822..76b2af0b5 100644 --- a/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestGraphHuffman.java +++ b/deeplearning4j/deeplearning4j-graph/src/test/java/org/deeplearning4j/graph/models/deepwalk/TestGraphHuffman.java @@ -38,9 +38,11 @@ public class TestGraphHuffman extends BaseDL4JTest { gh.buildTree(vertexDegrees); - for (int i = 0; i < 7; i++) - System.out.println(i + "\t" + gh.getCodeLength(i) + "\t" + gh.getCodeString(i) + "\t\t" + gh.getCode(i) - + "\t\t" + Arrays.toString(gh.getPathInnerNodes(i))); + for (int i = 0; i < 7; i++) { + String s = i + "\t" + gh.getCodeLength(i) + "\t" + gh.getCodeString(i) + "\t\t" + gh.getCode(i) + + "\t\t" + Arrays.toString(gh.getPathInnerNodes(i)); +// System.out.println(s); + } int[] expectedLengths = {3, 2, 2, 5, 4, 2, 5}; for (int i = 0; i < vertexDegrees.length; i++) { diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/DL4JModelValidator.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/DL4JModelValidator.java index 712b9c12b..8567dc379 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/DL4JModelValidator.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/DL4JModelValidator.java @@ -3,6 +3,7 @@ package org.deeplearning4j.util; import lombok.NonNull; import org.apache.commons.io.IOUtils; import org.deeplearning4j.nn.api.Model; +import 
org.deeplearning4j.nn.conf.ComputationGraphConfiguration; import org.deeplearning4j.nn.conf.MultiLayerConfiguration; import org.deeplearning4j.nn.graph.ComputationGraph; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; @@ -121,7 +122,7 @@ public class DL4JModelValidator { } try{ - MultiLayerConfiguration.fromJson(config); + ComputationGraphConfiguration.fromJson(config); } catch (Throwable t){ return ValidationResult.builder() .formatType("ComputationGraph") diff --git a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/src/test/java/org/deeplearning4j/parallelism/parameterserver/ParameterServerParallelWrapperTest.java b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/src/test/java/org/deeplearning4j/parallelism/parameterserver/ParameterServerParallelWrapperTest.java index beb9af5b4..ad610739f 100644 --- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/src/test/java/org/deeplearning4j/parallelism/parameterserver/ParameterServerParallelWrapperTest.java +++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper-parameter-server/src/test/java/org/deeplearning4j/parallelism/parameterserver/ParameterServerParallelWrapperTest.java @@ -79,8 +79,9 @@ public class ParameterServerParallelWrapperTest extends BaseDL4JTest { model.init(); ParallelWrapper parameterServerParallelWrapper = - new ParallelWrapper.Builder(model).trainerFactory(new ParameterServerTrainerContext()) - .workers(Runtime.getRuntime().availableProcessors()) + new ParallelWrapper.Builder(model) + .workers(Math.min(4, Runtime.getRuntime().availableProcessors())) + .trainerFactory(new ParameterServerTrainerContext()) .reportScoreAfterAveraging(true).prefetchBuffer(3).build(); parameterServerParallelWrapper.fit(mnistTrain); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/src/test/java/org/deeplearning4j/spark/models/word2vec/SparkWord2VecTest.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/src/test/java/org/deeplearning4j/spark/models/word2vec/SparkWord2VecTest.java index 55d893d8c..f3b3f974a 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/src/test/java/org/deeplearning4j/spark/models/word2vec/SparkWord2VecTest.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-nlp-java8/src/test/java/org/deeplearning4j/spark/models/word2vec/SparkWord2VecTest.java @@ -104,7 +104,7 @@ public class SparkWord2VecTest extends BaseDL4JTest { public void call(ExportContainer v) throws Exception { assertNotNull(v.getElement()); assertNotNull(v.getArray()); - System.out.println(v.getElement() + " - " + v.getArray()); +// System.out.println(v.getElement() + " - " + v.getArray()); } } } diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSpark.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSpark.java index 0983cbd76..1515cf3cf 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSpark.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSpark.java @@ -66,7 +66,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build()) .build(); 
MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -119,7 +119,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MSE).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); EarlyStoppingModelSaver saver = new InMemoryModelSaver<>(); @@ -155,7 +155,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -198,7 +198,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -231,7 +231,7 @@ public class TestEarlyStoppingSpark extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSparkCompGraph.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSparkCompGraph.java index 2e35b629c..0c4e2b2f8 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSparkCompGraph.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/TestEarlyStoppingSparkCompGraph.java @@ -69,7 +69,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -120,7 +120,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MSE).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); EarlyStoppingModelSaver saver = new InMemoryModelSaver<>(); @@ -158,7 +158,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -203,7 +203,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - 
net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); @@ -238,7 +238,7 @@ public class TestEarlyStoppingSparkCompGraph extends BaseSparkTest { .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "in") .setOutputs("0").build(); ComputationGraph net = new ComputationGraph(conf); - net.setListeners(new ScoreIterationListener(1)); + net.setListeners(new ScoreIterationListener(5)); JavaRDD irisData = getIris(); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/data/TestShuffleExamples.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/data/TestShuffleExamples.java index 24d58bb17..c26db5642 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/data/TestShuffleExamples.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/data/TestShuffleExamples.java @@ -59,7 +59,7 @@ public class TestShuffleExamples extends BaseSparkTest { int totalExampleCount = 0; for (DataSet ds : shuffledList) { totalExampleCount += ds.getFeatures().length(); - System.out.println(Arrays.toString(ds.getFeatures().data().asFloat())); +// System.out.println(Arrays.toString(ds.getFeatures().data().asFloat())); assertEquals(ds.getFeatures(), ds.getLabels()); } diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/datavec/TestExport.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/datavec/TestExport.java index e93cfeb92..d110a3b98 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/datavec/TestExport.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/datavec/TestExport.java @@ -86,7 +86,7 @@ public class TestExport extends BaseSparkTest { for (File file : files) { if (!file.getPath().endsWith(".bin")) continue; - System.out.println(file); +// System.out.println(file); DataSet ds = new DataSet(); ds.load(file); assertEquals(minibatchSize, ds.numExamples()); @@ -144,7 +144,7 @@ public class TestExport extends BaseSparkTest { for (File file : files) { if (!file.getPath().endsWith(".bin")) continue; - System.out.println(file); +// System.out.println(file); MultiDataSet ds = new org.nd4j.linalg.dataset.MultiDataSet(); ds.load(file); assertEquals(minibatchSize, ds.getFeatures(0).size(0)); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/common/repartition/HashingBalancedPartitionerTest.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/common/repartition/HashingBalancedPartitionerTest.java index 6094ed008..4d2ed4b97 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/common/repartition/HashingBalancedPartitionerTest.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/common/repartition/HashingBalancedPartitionerTest.java @@ -92,9 +92,9 @@ public class HashingBalancedPartitionerTest extends BaseSparkTest { int[][] colorCountsByPartition = new int[3][2]; for (final Tuple2, String> val : testList) { - System.out.println(val); +// System.out.println(val); Integer partition = 
hbp.getPartition(val._1()); - System.out.println(partition); +// System.out.println(partition); if (val._2().equals("red")) colorCountsByPartition[partition][0] += 1; @@ -102,9 +102,9 @@ public class HashingBalancedPartitionerTest extends BaseSparkTest { colorCountsByPartition[partition][1] += 1; } - for (int i = 0; i < 3; i++) { - System.out.println(Arrays.toString(colorCountsByPartition[i])); - } +// for (int i = 0; i < 3; i++) { +// System.out.println(Arrays.toString(colorCountsByPartition[i])); +// } for (int i = 0; i < 3; i++) { // avg red per partition : 2.33 assertTrue(colorCountsByPartition[i][0] >= 1 && colorCountsByPartition[i][0] < 4); @@ -178,12 +178,12 @@ public class HashingBalancedPartitionerTest extends BaseSparkTest { colorCountsByPartition[partition][1] += 1; } - for (int i = 0; i < numPartitions; i++) { - System.out.println(Arrays.toString(colorCountsByPartition[i])); - } - - System.out.println("Ideal red # per partition: " + avgRed); - System.out.println("Ideal blue # per partition: " + avgBlue); +// for (int i = 0; i < numPartitions; i++) { +// System.out.println(Arrays.toString(colorCountsByPartition[i])); +// } +// +// System.out.println("Ideal red # per partition: " + avgRed); +// System.out.println("Ideal blue # per partition: " + avgBlue); for (int i = 0; i < numPartitions; i++) { // avg red per partition : 2.33 diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/graph/TestSparkComputationGraph.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/graph/TestSparkComputationGraph.java index 0e29386a1..0de7875e2 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/graph/TestSparkComputationGraph.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/graph/TestSparkComputationGraph.java @@ -115,7 +115,7 @@ public class TestSparkComputationGraph extends BaseSparkTest { TrainingMaster tm = new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0); SparkComputationGraph scg = new SparkComputationGraph(sc, cg, tm); - scg.setListeners(Collections.singleton((TrainingListener) new ScoreIterationListener(1))); + scg.setListeners(Collections.singleton((TrainingListener) new ScoreIterationListener(5))); JavaRDD rdd = sc.parallelize(list); scg.fitMultiDataSet(rdd); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java index ecf9b937b..38a15ef8d 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java @@ -31,8 +31,11 @@ import org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster; import org.junit.Test; import org.nd4j.evaluation.classification.Evaluation; import org.nd4j.linalg.activations.Activation; +import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.dataset.DataSet; import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.learning.config.Adam; import 
org.nd4j.linalg.learning.config.Nesterovs; import org.nd4j.linalg.lossfunctions.LossFunctions; @@ -45,8 +48,24 @@ import static org.junit.Assert.assertTrue; @Slf4j public class TestSparkDl4jMultiLayer extends BaseSparkTest { - @Test(timeout = 120000L) + @Override + public long getTimeoutMilliseconds() { + return 120000L; + } + + @Override + public DataType getDataType() { + return DataType.FLOAT; + } + + @Override + public DataType getDefaultFPDataType() { + return DataType.FLOAT; + } + + @Test public void testEvaluationSimple() throws Exception { + Nd4j.getRandom().setSeed(12345); for( int evalWorkers : new int[]{1, 4, 8}) { //Simple test to validate DL4J issue 4099 is fixed... @@ -75,18 +94,18 @@ public class TestSparkDl4jMultiLayer extends BaseSparkTest { //---------------------------------- //Create network configuration and conduct network training MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() + .dataType(DataType.FLOAT) .seed(12345) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) .activation(Activation.LEAKYRELU) .weightInit(WeightInit.XAVIER) - .updater(new Nesterovs(0.02, 0.9)) - .l2(1e-4) + .updater(new Adam(1e-3)) + .l2(1e-5) .list() .layer(0, new DenseLayer.Builder().nIn(28 * 28).nOut(500).build()) .layer(1, new DenseLayer.Builder().nIn(500).nOut(100).build()) .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD) .activation(Activation.SOFTMAX).nIn(100).nOut(10).build()) - .build(); //Configuration for Spark training: see https://deeplearning4j.org/docs/latest/deeplearning4j-scaleout-howto for explanation of these configuration options diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestCompareParameterAveragingSparkVsSingleMachine.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestCompareParameterAveragingSparkVsSingleMachine.java index 0188b15d9..9a6c80000 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestCompareParameterAveragingSparkVsSingleMachine.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestCompareParameterAveragingSparkVsSingleMachine.java @@ -333,15 +333,16 @@ public class TestCompareParameterAveragingSparkVsSingleMachine { sparkNet.fit(rdd); } - System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); +// System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); + sparkNet.getSparkTrainingStats().statsAsString(); INDArray finalSparkParams = sparkNet.getNetwork().params().dup(); - System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); - System.out.println("Initial (Spark) params: " - + Arrays.toString(initialSparkParams.data().asFloat())); - System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); - System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); +// System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); +// System.out.println("Initial (Spark) params: " +// + Arrays.toString(initialSparkParams.data().asFloat())); +// System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); +// System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); 
assertEquals(initialParams, initialSparkParams); assertNotEquals(initialParams, finalParams); assertEquals(finalParams, finalSparkParams); @@ -405,15 +406,16 @@ public class TestCompareParameterAveragingSparkVsSingleMachine { sparkNet.fit(rdd); } - System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); +// System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); + sparkNet.getSparkTrainingStats().statsAsString(); INDArray finalSparkParams = sparkNet.getNetwork().params().dup(); - System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); - System.out.println("Initial (Spark) params: " - + Arrays.toString(initialSparkParams.data().asFloat())); - System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); - System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); +// System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); +// System.out.println("Initial (Spark) params: " +// + Arrays.toString(initialSparkParams.data().asFloat())); +// System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); +// System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); assertArrayEquals(initialParams.data().asFloat(), initialSparkParams.data().asFloat(), 1e-8f); assertArrayEquals(finalParams.data().asFloat(), finalSparkParams.data().asFloat(), 1e-6f); @@ -478,18 +480,19 @@ public class TestCompareParameterAveragingSparkVsSingleMachine { sparkNet.fit(rdd); } - System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); +// System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); + sparkNet.getSparkTrainingStats().statsAsString(); INDArray finalSparkParams = sparkNet.getNetwork().params().dup(); // executioner.addToWatchdog(finalSparkParams, "finalSparkParams"); float[] fp = finalParams.data().asFloat(); float[] fps = finalSparkParams.data().asFloat(); - System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); - System.out.println("Initial (Spark) params: " - + Arrays.toString(initialSparkParams.data().asFloat())); - System.out.println("Final (Local) params: " + Arrays.toString(fp)); - System.out.println("Final (Spark) params: " + Arrays.toString(fps)); +// System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); +// System.out.println("Initial (Spark) params: " +// + Arrays.toString(initialSparkParams.data().asFloat())); +// System.out.println("Final (Local) params: " + Arrays.toString(fp)); +// System.out.println("Final (Spark) params: " + Arrays.toString(fps)); assertEquals(initialParams, initialSparkParams); assertNotEquals(initialParams, finalParams); @@ -551,14 +554,15 @@ public class TestCompareParameterAveragingSparkVsSingleMachine { sparkNet.fit(rdd); } - System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); +// System.out.println(sparkNet.getSparkTrainingStats().statsAsString()); + sparkNet.getSparkTrainingStats().statsAsString(); INDArray finalSparkParams = sparkNet.getNetwork().params().dup(); - System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); - System.out.println("Initial (Spark) params: " + Arrays.toString(initialSparkParams.data().asFloat())); - System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); - System.out.println("Final 
(Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); +// System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat())); +// System.out.println("Initial (Spark) params: " + Arrays.toString(initialSparkParams.data().asFloat())); +// System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat())); +// System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat())); assertArrayEquals(initialParams.data().asFloat(), initialSparkParams.data().asFloat(), 1e-8f); assertArrayEquals(finalParams.data().asFloat(), finalSparkParams.data().asFloat(), 1e-6f); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestJsonYaml.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestJsonYaml.java index c43729166..8558878b8 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestJsonYaml.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestJsonYaml.java @@ -37,7 +37,7 @@ public class TestJsonYaml { String json = tm.toJson(); String yaml = tm.toYaml(); - System.out.println(json); +// System.out.println(json); TrainingMaster fromJson = ParameterAveragingTrainingMaster.fromJson(json); TrainingMaster fromYaml = ParameterAveragingTrainingMaster.fromYaml(yaml); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java index ca7a168b2..3b328e210 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java @@ -389,7 +389,7 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { List workerFitStats = stats.getValue("ParameterAveragingWorkerFitTimesMs"); for (EventStats e : workerFitStats) { ExampleCountEventStats eces = (ExampleCountEventStats) e; - System.out.println(eces.getTotalExampleCount()); +// System.out.println(eces.getTotalExampleCount()); } for (EventStats e : workerFitStats) { @@ -457,7 +457,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { assertNotEquals(paramsBefore, paramsAfter); SparkTrainingStats stats = sparkNet.getSparkTrainingStats(); - System.out.println(stats.statsAsString()); +// System.out.println(stats.statsAsString()); + stats.statsAsString(); sparkNet.getTrainingMaster().deleteTempFiles(sc); } @@ -483,7 +484,7 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { i++; } - System.out.println("Saved to: " + tempDirF.getAbsolutePath()); +// System.out.println("Saved to: " + tempDirF.getAbsolutePath()); @@ -527,7 +528,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { SparkTrainingStats stats = sparkNet.getSparkTrainingStats(); //Expect - System.out.println(stats.statsAsString()); +// System.out.println(stats.statsAsString()); + stats.statsAsString(); assertEquals(numSplits, 
stats.getValue("ParameterAveragingMasterRepartitionTimesMs").size()); List list = stats.getValue("ParameterAveragingWorkerFitTimesMs"); @@ -566,8 +568,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { i++; } - System.out.println("Saved to: " + tempDirF.getAbsolutePath()); - System.out.println("Saved to: " + tempDirF2.getAbsolutePath()); +// System.out.println("Saved to: " + tempDirF.getAbsolutePath()); +// System.out.println("Saved to: " + tempDirF2.getAbsolutePath()); @@ -610,7 +612,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { assertNotEquals(paramsBefore, paramsAfter); SparkTrainingStats stats = sparkNet.getSparkTrainingStats(); - System.out.println(stats.statsAsString()); +// System.out.println(stats.statsAsString()); + stats.statsAsString(); //Same thing, buf for MultiDataSet objects: config = new Configuration(); @@ -631,7 +634,8 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { assertNotEquals(paramsBefore, paramsAfter); stats = sparkNet.getSparkTrainingStats(); - System.out.println(stats.statsAsString()); +// System.out.println(stats.statsAsString()); + stats.statsAsString(); } @@ -730,13 +734,13 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { .build(); for (int avgFreq : new int[] {1, 5, 10}) { - System.out.println("--- Avg freq " + avgFreq + " ---"); +// System.out.println("--- Avg freq " + avgFreq + " ---"); SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf.clone(), new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize) .batchSizePerWorker(batchSizePerExecutor).averagingFrequency(avgFreq) .repartionData(Repartition.Always).build()); - sparkNet.setListeners(new ScoreIterationListener(1)); + sparkNet.setListeners(new ScoreIterationListener(5)); @@ -778,13 +782,13 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { .setOutputs("1").build(); for (int avgFreq : new int[] {1, 5, 10}) { - System.out.println("--- Avg freq " + avgFreq + " ---"); +// System.out.println("--- Avg freq " + avgFreq + " ---"); SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf.clone(), new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize) .batchSizePerWorker(batchSizePerExecutor).averagingFrequency(avgFreq) .repartionData(Repartition.Always).build()); - sparkNet.setListeners(new ScoreIterationListener(1)); + sparkNet.setListeners(new ScoreIterationListener(5)); JavaRDD rdd = sc.parallelize(list); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/stats/TestTrainingStatsCollection.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/stats/TestTrainingStatsCollection.java index 5b49899c8..15d57b0a6 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/stats/TestTrainingStatsCollection.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/stats/TestTrainingStatsCollection.java @@ -107,7 +107,7 @@ public class TestTrainingStatsCollection extends BaseSparkTest { expectedStatNames.addAll(c); } - System.out.println(expectedStatNames); +// System.out.println(expectedStatNames); SparkTrainingStats stats = sparkNet.getSparkTrainingStats(); @@ -119,7 +119,7 @@ public class TestTrainingStatsCollection extends BaseSparkTest { } String statsAsString = stats.statsAsString(); - 
System.out.println(statsAsString); +// System.out.println(statsAsString); assertEquals(actualKeySet.size(), statsAsString.split("\n").length); //One line per stat diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/time/TestTimeSource.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/time/TestTimeSource.java index e88438766..f4b435d46 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/time/TestTimeSource.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/time/TestTimeSource.java @@ -35,7 +35,7 @@ public class TestTimeSource { long systemTime = System.currentTimeMillis(); long ntpTime = timeSource.currentTimeMillis(); long offset = ntpTime - systemTime; - System.out.println("System: " + systemTime + "\tNTPTimeSource: " + ntpTime + "\tOffset: " + offset); +// System.out.println("System: " + systemTime + "\tNTPTimeSource: " + ntpTime + "\tOffset: " + offset); Thread.sleep(500); } } @@ -49,7 +49,7 @@ public class TestTimeSource { long systemTime = System.currentTimeMillis(); long ntpTime = timeSource.currentTimeMillis(); long offset = ntpTime - systemTime; - System.out.println("System: " + systemTime + "\tSystemClockTimeSource: " + ntpTime + "\tOffset: " + offset); +// System.out.println("System: " + systemTime + "\tSystemClockTimeSource: " + ntpTime + "\tOffset: " + offset); assertEquals(systemTime, ntpTime, 2); //Should be exact, but we might randomly tick over between one ms and the next Thread.sleep(500); } diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/ui/TestListeners.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/ui/TestListeners.java index 1b3329530..a12b1e460 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/ui/TestListeners.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/ui/TestListeners.java @@ -87,7 +87,7 @@ public class TestListeners extends BaseSparkTest { net.fit(rdd); List sessions = ss.listSessionIDs(); - System.out.println("Sessions: " + sessions); +// System.out.println("Sessions: " + sessions); assertEquals(1, sessions.size()); String sid = sessions.get(0); @@ -95,15 +95,15 @@ public class TestListeners extends BaseSparkTest { List typeIDs = ss.listTypeIDsForSession(sid); List workers = ss.listWorkerIDsForSession(sid); - System.out.println(sid + "\t" + typeIDs + "\t" + workers); +// System.out.println(sid + "\t" + typeIDs + "\t" + workers); List lastUpdates = ss.getLatestUpdateAllWorkers(sid, StatsListener.TYPE_ID); - System.out.println(lastUpdates); +// System.out.println(lastUpdates); - System.out.println("Static info:"); +// System.out.println("Static info:"); for (String wid : workers) { Persistable staticInfo = ss.getStaticInfo(sid, StatsListener.TYPE_ID, wid); - System.out.println(sid + "\t" + wid); +// System.out.println(sid + "\t" + wid); } assertEquals(1, typeIDs.size()); diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/util/TestRepartitioning.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/util/TestRepartitioning.java index e0759a549..ad1622966 100644 --- 
a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/util/TestRepartitioning.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/util/TestRepartitioning.java @@ -63,7 +63,7 @@ public class TestRepartitioning extends BaseSparkTest { assertEquals(10, rdd2.partitions().size()); for (int i = 0; i < 10; i++) { List partition = rdd2.collectPartitions(new int[] {i})[0]; - System.out.println("Partition " + i + " size: " + partition.size()); +// System.out.println("Partition " + i + " size: " + partition.size()); assertEquals(100, partition.size()); //Should be exactly 100, for the util method (but NOT spark .repartition) } } @@ -170,7 +170,7 @@ public class TestRepartitioning extends BaseSparkTest { List> partitionCounts = initial.values().mapPartitionsWithIndex(new CountPartitionsFunction(), true).collect(); - System.out.println(partitionCounts); +// System.out.println(partitionCounts); List> initialExpected = Arrays.asList( new Tuple2<>(0,29), @@ -185,7 +185,7 @@ public class TestRepartitioning extends BaseSparkTest { JavaRDD afterRepartition = SparkUtils.repartitionBalanceIfRequired(initial.values(), Repartition.Always, 2, 112); List> partitionCountsAfter = afterRepartition.mapPartitionsWithIndex(new CountPartitionsFunction(), true).collect(); - System.out.println(partitionCountsAfter); +// System.out.println(partitionCountsAfter); for(Tuple2 t2 : partitionCountsAfter){ assertEquals(2, (int)t2._2()); @@ -219,8 +219,8 @@ public class TestRepartitioning extends BaseSparkTest { } } - System.out.println("min: " + min + "\t@\t" + minIdx); - System.out.println("max: " + max + "\t@\t" + maxIdx); +// System.out.println("min: " + min + "\t@\t" + minIdx); +// System.out.println("max: " + max + "\t@\t" + maxIdx); assertEquals(1, min); assertEquals(2, max); @@ -244,7 +244,7 @@ public class TestRepartitioning extends BaseSparkTest { for (int i = 0; i < 10; i++) { List partition = rdd2.collectPartitions(new int[] {i})[0]; - System.out.println("Partition " + i + " size: " + partition.size()); +// System.out.println("Partition " + i + " size: " + partition.size()); assertTrue(partition.size() >= 90 && partition.size() <= 110); } } diff --git a/libnd4j/CMakeLists.txt.mkldnn.in b/libnd4j/CMakeLists.txt.mkldnn.in index 3de36dfde..e67b3554b 100644 --- a/libnd4j/CMakeLists.txt.mkldnn.in +++ b/libnd4j/CMakeLists.txt.mkldnn.in @@ -5,7 +5,7 @@ project(mkldnn-download NONE) include(ExternalProject) ExternalProject_Add(mkldnn GIT_REPOSITORY https://github.com/intel/mkl-dnn.git - GIT_TAG v1.1.3 + GIT_TAG v1.2 SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build" CONFIGURE_COMMAND "" diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index fe5f90bc3..3a68edde1 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -999,14 +999,14 @@ namespace nd4j { * set new order and shape in case of suitable array length (in-place operation) * order - order to set * shape - shape to set - * + * copyToNewBuff - if true then old buffer will be copied to new buffer if last one will be allocated after reshaping * if there was permute applied before or there are weird strides, then new buffer is allocated for array */ - bool reshapei(const char order, const std::initializer_list& shape); - bool reshapei(const char order, const std::vector& shape); + bool reshapei(const char order, const std::initializer_list& shape, const bool copyToNewBuff = true); + bool reshapei(const 
char order, const std::vector& shape, const bool copyToNewBuff = true); - bool reshapei(const std::initializer_list& shape); - bool reshapei(const std::vector& shape); + bool reshapei(const std::initializer_list& shape, const bool copyToNewBuff = true); + bool reshapei(const std::vector& shape, const bool copyToNewBuff = true); /** * creates new array with corresponding order and shape, new array will point on _buffer of this array @@ -1015,8 +1015,8 @@ namespace nd4j { * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ - NDArray reshape(const char order, const std::vector& shape) const &; - NDArray reshape(const char order, const std::vector& shape) &&; + NDArray reshape(const char order, const std::vector& shape, const bool copyToNewBuff = true) const &; + NDArray reshape(const char order, const std::vector& shape, const bool copyToNewBuff = true) &&; /** * calculate strides and set given order @@ -1493,7 +1493,7 @@ namespace nd4j { * @return */ bool isS() const; - + template std::vector asVectorT(); diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 79137ac3a..f7c6d0684 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -42,7 +42,7 @@ ND4J_EXPORT std::u32string NDArray::e(const Nd4jLong i) const; //////////////////////////////////////////////////////////////////////// // copy constructor NDArray::NDArray(const NDArray& other) { - + _context = other._context; _offset = 0; @@ -308,7 +308,7 @@ NDArray::NDArray(const std::u16string& u16string, nd4j::DataType dtype, nd4j::La if (!unicode::isStringValidU16(u16string.data(), u16string.data() + u16string.size())) { throw std::invalid_argument("NDArray::NDArray: invalid character in input string"); } - + // one word that is why used 1 Nd4jLong headerLength = ShapeUtils::stringBufferHeaderRequirements(1); @@ -435,11 +435,11 @@ NDArray::NDArray(const std::string& str, nd4j::DataType dtype, nd4j::LaunchConte _offset = 0; setShapeInfo(ShapeDescriptor::scalarDescriptor(dtype)); - + memcpy(bufferAsT(), &offsets[0], 2 * sizeof(Nd4jLong)); - + auto data = reinterpret_cast(bufferAsT() + headerLength); - + if (dtype == DataType::UTF8) { memcpy(data, str.data(), str.size()); } @@ -456,13 +456,13 @@ NDArray::NDArray(const std::string& str, nd4j::DataType dtype, nd4j::LaunchConte ///////////////////////////////////////////////////////////////////////// // constructors for vector of strings NDArray::NDArray(const std::vector& shape, const std::vector& string, const nd4j::DataType dataType, nd4j::LaunchContext* context) { - + if (!DataTypeUtils::isS(dataType)) throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); if (shape::prodLong(shape.data(), shape.size()) != string.size()) throw std::invalid_argument("NDArray::NDArray: Number of strings should match length of array"); - + for (const auto& str : string) { if (!unicode::isStringValidU8(str, str + std::char_traits::length(str)) ) { throw std::invalid_argument("NDArray::NDArray: invalid character in input string"); @@ -497,11 +497,11 @@ NDArray::NDArray(const std::vector& shape, const std::vectorgetWorkspace() != nullptr); memcpy(bufferAsT(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); - + auto data = reinterpret_cast(bufferAsT() + headerLength); auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dataType == DataType::UTF16) { 
unicode::utf8to16(string[e], cdata, std::char_traits::length(string[e])); @@ -568,7 +568,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength); auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dataType == DataType::UTF16) { unicode::utf8to16(string[e].data(), cdata, string[e].size()); @@ -631,11 +631,11 @@ NDArray::NDArray(const std::vector& shape, const std::vectorgetWorkspace() != nullptr); memcpy(bufferAsT(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); - + auto data = reinterpret_cast(bufferAsT() + headerLength); - + auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dtype == DataType::UTF16) { memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t)); @@ -699,9 +699,9 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength); - + auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dtype == DataType::UTF16) { memcpy(cdata, string[e], std::char_traits::length(string[e]) * sizeof(uint16_t)); @@ -715,7 +715,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); - auto data = reinterpret_cast(bufferAsT() + headerLength); - + auto data = reinterpret_cast(bufferAsT() + headerLength); + auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dtype == DataType::UTF16) { unicode::utf32to16(string[e].data(), cdata, string[e].size()); @@ -781,7 +781,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); auto data = reinterpret_cast(bufferAsT() + headerLength); - + auto func = PRAGMA_THREADS_FOR{ - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cdata = data + offsets[e]; if (dtype == DataType::UTF16) { unicode::utf32to16(string[e], cdata, std::char_traits::length(string[e])); @@ -847,7 +847,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& dimensions) cons ////////////////////////////////////////////////////////////////////////// void NDArray::printShapeInfo(const char * msg) const { - //shape::printShapeInfo(_shapeInfo); - if (msg == nullptr) - shape::printShapeInfoLinear(_shapeInfo); - else { - int rank = shape::rank(_shapeInfo); - int lim = shape::shapeInfoLength(rank); - printf("%s: [", msg); - for (int i = 0; i < shape::shapeInfoLength(rank); i++) { - printf("%lld", (long long) _shapeInfo[i]); - if (i < lim - 1) - printf(", "); - } - printf("]\n"); + + int rank = shape::rank(_shapeInfo); + int lim = shape::shapeInfoLength(rank); + + if(msg != nullptr) + printf("shapeInfo %s: [", msg); + else + printf("shapeInfo: ["); + + printf("%i, ", rank); + for (int i = 1; i < shape::shapeInfoLength(rank) - 3; i++){ + if(i == rank + 1) + printf(" "); + printf("%lld,", _shapeInfo[i]); } + printf(" %lld,", shape::type(_shapeInfo)); + printf("%lld,", shape::elementWiseStride(_shapeInfo)); + printf("%lld]\n", (Nd4jLong)shape::order(_shapeInfo)); + fflush(stdout); } @@ -1624,7 +1629,7 @@ void NDArray::printBuffer(const char* msg, Nd4jLong limit, const bool 
sync) cons if (e < limit - 1) printf(", "); } - } + } else if (this->isS()) { // todo do we need this print offsets /* @@ -1773,7 +1778,7 @@ void NDArray::printIndexedBuffer(const char* msg, Nd4jLong limit) const { printf("%s\n", this->e(0)?"true":"false"); } else if (this->isS()) { - // todo do we need this + // todo do we need this // printf("\"%lld\"\n", this->getOffset(e)); printf("\"%s\"\n", this->e(0).c_str()); } @@ -1855,19 +1860,19 @@ void NDArray::updateStrides(const char order) { ////////////////////////////////////////////////////////////////////////// // set new order and shape in case of suitable array length -bool NDArray::reshapei(const char order, const std::initializer_list& shape) { +bool NDArray::reshapei(const char order, const std::initializer_list& shape, const bool copyToNewBuff) { std::vector vShape(shape); - return reshapei(order, vShape); + return reshapei(order, vShape, copyToNewBuff); } ////////////////////////////////////////////////////////////////////////// -bool NDArray::reshapei(const std::initializer_list& shape) { - return reshapei('c', shape); +bool NDArray::reshapei(const std::initializer_list& shape, const bool copyToNewBuff) { + return reshapei(ordering(), shape, copyToNewBuff); } ////////////////////////////////////////////////////////////////////////// -bool NDArray::reshapei(const std::vector& shape) { - return reshapei('c', shape); +bool NDArray::reshapei(const std::vector& shape, const bool copyToNewBuff) { + return reshapei(ordering(), shape, copyToNewBuff); } ////////////////////////////////////////////////////////////////////////// @@ -1918,18 +1923,18 @@ Nd4jLong NDArray::argMax(std::initializer_list dimensions) { ////////////////////////////////////////////////////////////////////////// // create new array with corresponding order and shape, new array will point to the same _buffer as this array -NDArray NDArray::reshape(const char order, const std::vector& shape) const & { +NDArray NDArray::reshape(const char order, const std::vector& shape, const bool copyToNewBuff) const & { NDArray newArr(getDataBuffer(), ShapeDescriptor(getShapeInfo()), getContext(), getBufferOffset()); - newArr.reshapei(order, shape); + newArr.reshapei(order, shape, copyToNewBuff); return newArr; } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reshape(const char order, const std::vector& shape) && { +NDArray NDArray::reshape(const char order, const std::vector& shape, const bool copyToNewBuff) && { - this->reshapei(order, shape); + this->reshapei(order, shape, copyToNewBuff); return std::move(*this); } @@ -1971,7 +1976,7 @@ bool NDArray::permutei(const std::initializer_list& dimensions) { ////////////////////////////////////////////////////////////////////////// bool NDArray::permutei(const std::vector& dimensions) { - return permutei(dimensions.data(), dimensions.size()); + return permutei(dimensions.data(), rankOf()); } ////////////////////////////////////////////////////////////////////////// @@ -1993,7 +1998,7 @@ bool NDArray::permutei(const std::vector& dimensions) { for (int e = 0; e < dimensions.size(); e++) ivec[e] = dimensions[e]; - return permutei(ivec.data(), ivec.size()); + return permutei(ivec.data(), rankOf()); } ////////////////////////////////////////////////////////////////////////// @@ -2029,9 +2034,8 @@ NDArray NDArray::permute(const Nd4jLong* dimensions, const int rank) && { ////////////////////////////////////////////////////////////////////////// NDArray NDArray::permute(const std::vector& dimensions) 
const &{ - auto data = dimensions.data(); - auto size = dimensions.size(); - return permute(data, size); + + return permute(dimensions.data(), rankOf()); } ////////////////////////////////////////////////////////////////////////// @@ -2043,7 +2047,8 @@ NDArray NDArray::permute(const std::vector& dimensions) && { ////////////////////////////////////////////////////////////////////////// NDArray NDArray::permute(const std::vector& dimensions) const & { - return permute(dimensions.data(), dimensions.size()); + + return permute(dimensions.data(), rankOf()); } ////////////////////////////////////////////////////////////////////////// @@ -2106,12 +2111,12 @@ void NDArray::permute(const Nd4jLong *dimensions, const int rank, NDArray& targe ////////////////////////////////////////////////////////////////////////// void NDArray::permute(const std::vector& dimensions, NDArray& target) const { - permute(dimensions.data(), dimensions.size(), target); + permute(dimensions.data(), rankOf(), target); } ////////////////////////////////////////////////////////////////////////// void NDArray::permute(const std::vector& dimensions, NDArray& target) const { - permute(dimensions.data(), dimensions.size(), target); + permute(dimensions.data(), rankOf(), target); } ////////////////////////////////////////////////////////////////////////// @@ -2280,7 +2285,7 @@ template NDArray NDArray::asT() const{ auto result = isScalar() ? NDArray('c', {}, std::vector{0.}, DataTypeUtils::fromT(), this->getContext()) : NDArray(ordering(), getShapeAsVector(), DataTypeUtils::fromT(), this->getContext()); - + NDArray::prepareSpecialUse({&result}, {this}); NativeOpExecutioner::execTransformAny(getContext(), transform::AnyOps::Assign, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), result.getBuffer(), result.getShapeInfo(), result.getSpecialBuffer(), result.getSpecialShapeInfo(), nullptr, nullptr, nullptr); NDArray::registerSpecialUse({&result}, {this}); @@ -2298,15 +2303,15 @@ NDArray NDArray::asS() const { auto dtype = DataTypeUtils::fromT(); - if (!(DataTypeUtils::isS(dtype))) + if (!(DataTypeUtils::isS(dtype))) throw std::invalid_argument("NDArray::asS: invalid DataType used"); - + if (dtype == dataType()) { - + Nd4jLong offsetsLength = ShapeUtils::stringBufferHeaderRequirements(lengthOf()); const auto nInputoffsets = bufferAsT(); std::shared_ptr pBuffer = std::make_shared(offsetsLength + nInputoffsets[lengthOf()], dtype, getContext()->getWorkspace(), true); - + NDArray res(pBuffer, ShapeDescriptor(dtype, ordering(), getShapeAsVector()), getContext()); res.setAttached(getContext()->getWorkspace() != nullptr); @@ -2319,7 +2324,7 @@ NDArray NDArray::asS() const { registerPrimaryUse({ &res }, { this }); return res; } - + Nd4jLong offsetsLength = ShapeUtils::stringBufferHeaderRequirements(lengthOf()); std::vector offsets(lengthOf() + 1); @@ -2353,7 +2358,7 @@ NDArray NDArray::asS() const { NDArray res(pBuffer, ShapeDescriptor(dtype, ordering(), getShapeAsVector()), getContext()); res.setAttached(getContext()->getWorkspace() != nullptr); - + preparePrimaryUse({ &res }, { this }); memcpy(res.bufferAsT(), offsets.data(), offsets.size() * sizeof(Nd4jLong)); @@ -2362,7 +2367,7 @@ NDArray NDArray::asS() const { const auto inData = bufferAsT() + offsetsLength; auto func = PRAGMA_THREADS_FOR{ - for (int e = start; e < stop; e += increment) { + for (int e = start; e < stop; e++) { auto cdata = outData + offsets[e]; auto end = nInputoffsets[e + 1]; auto idata = inData + nInputoffsets[e]; @@ -2403,7 +2408,7 @@ 
BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray NDArray::asS, () const, LIBND //////////////////////////////////////////////////////////////////////// NDArray NDArray::asT(DataType dtype) const { - + if (isS() && !DataTypeUtils::isS(dtype)) throw std::runtime_error("NDArray::asT: you can't use this method on String array with not string DataType!"); @@ -3221,7 +3226,7 @@ BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT std::vector, NDArray::asVectorT(), LI ////////////////////////////////////////////////////////////////////////// // set new order and shape in case of suitable array length -bool NDArray::reshapei(const char order, const std::vector& cshape) { +bool NDArray::reshapei(const char order, const std::vector& cshape, const bool copyToNewBuff) { // check firstly whether cshape is identical to shape of array, if yes then reshape is unnecessary if(order == ordering() && shape::shapeEquals(rankOf(), shapeOf(), cshape.size(), cshape.data())) @@ -3293,19 +3298,15 @@ bool NDArray::reshapei(const char order, const std::vector& cshape) { Nd4jLong *shapeInfoNew; ALLOCATE(shapeInfoNew, getContext()->getWorkspace(), shape::shapeInfoLength(rank), Nd4jLong); - bool canReshape = shape::reshapeC(rankOf(), shapeInfo(), shape.size(), shape.data(), shapeInfoNew); + bool canReshape = shape::reshapeC(shapeInfo(), order, shape.size(), shape.data(), shapeInfoNew); - // we can do this only if there was no permute applied, or there are no weird strides if (canReshape) { - if(ordering() == 'c' && order == 'f') - throw std::invalid_argument("NDArray::reshapei(order, shape): in case of reshapeC it doesn't make sense to reshape from c order to f order !"); - - shape::setEws(shapeInfoNew, arrLength); setShapeInfo(shapeInfoNew); } else { NDArray temp(order, shape, dataType(), getContext()); - this->applyTransform(transform::Assign, temp, nullptr); + if(copyToNewBuff) + this->applyTransform(transform::Assign, temp, nullptr); *this = std::move(temp); } @@ -3463,9 +3464,9 @@ NDArray NDArray::dup(const char newOrder) const { if (isS()) { if (dataType() == DataType::UTF8) { std::vector strings(lengthOf()); - + auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { strings[i] = std::move(this->e(i)); } }; @@ -3478,7 +3479,7 @@ NDArray NDArray::dup(const char newOrder) const { std::vector strings(lengthOf()); auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { strings[i] = std::move(this->e(i)); } }; @@ -3490,7 +3491,7 @@ NDArray NDArray::dup(const char newOrder) const { std::vector strings(lengthOf()); auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { strings[i] = std::move(this->e(i)); } }; @@ -3521,7 +3522,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { if (isS()) { // string is special case, we'll compare them one by one, considering both arrays are guaranteed to have the same length - + if (dataType() == DataType::UTF8) { for (int e = 0; e < this->lengthOf(); e++) { auto s1 = this->e(e); @@ -3585,7 +3586,7 @@ std::string NDArray::e(const Nd4jLong i) const { if (i == lengthOf()) throw std::runtime_error("Can't get std::string for index out of range"); - + if (this->dataType() == DataType::UTF16) { auto u16 = this->e(i); std::string s; @@ -4846,7 +4847,7 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni auto shapeOf = shape::shapeOf(newShapeInfo); auto stridesOf = 
shape::stride(newShapeInfo); - Nd4jLong offset(0), subArrLen(1); + Nd4jLong offset = 0; int n(isStrided ? 3 : 2), first, last, stride; for (int d = rank - 1; d >= 0; --d) { @@ -4863,29 +4864,31 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni if(shapeOf[d] != 1) stridesOf[d] *= stride; } + } - subArrLen *= shapeOf[d]; + Nd4jLong *newShapeInfo2 = newShapeInfo; + + if(!keepUnitiesInShape) { + + std::vector dimsWithUnities; + + for (uint d = 0; d < rank; ++d) + if(idx[n*d] != idx[n*d+1] && shapeOf[d] == 1) + dimsWithUnities.push_back(d); + + if(!dimsWithUnities.empty()) + newShapeInfo2 = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace()); } // check if there is possibility to set ews = 1 - shape::setEws(newShapeInfo, subArrLen); + shape::checkStridesEwsAndOrder(newShapeInfo2); - NDArray result(_buffer, ShapeDescriptor(newShapeInfo), getContext(), offset + getBufferOffset()); + NDArray result(_buffer, ShapeDescriptor(newShapeInfo2), getContext(), offset + getBufferOffset()); result._isView = true; - if(!keepUnitiesInShape) { - const int coeff = isStrided ? 3 : 2; - std::vector nonUnitDims; - - for (int d = 0; d < rank; ++d) - if(!(idx[coeff*d] != idx[coeff*d+1] && newShapeInfo[d+1] == 1)) - nonUnitDims.push_back(newShapeInfo[d+1]); - - if(nonUnitDims.size() != rank) - result.reshapei(nonUnitDims); - } - RELEASE(newShapeInfo, getContext()->getWorkspace()); + if(newShapeInfo != newShapeInfo2) + RELEASE(newShapeInfo2, getContext()->getWorkspace()); return result; } diff --git a/libnd4j/blas/cpu/GraphExecutioner.cpp b/libnd4j/blas/cpu/GraphExecutioner.cpp index 2190afbf1..98b3204cd 100644 --- a/libnd4j/blas/cpu/GraphExecutioner.cpp +++ b/libnd4j/blas/cpu/GraphExecutioner.cpp @@ -179,7 +179,7 @@ namespace graph { nd4j_debug("Embedded graph execution finished. %i variable(s) migrated\n", cnt); } else if (node->hasCustomOp()) { - // if we have something to execute - lets just execute it. + // now, if we have something to execute - lets just execute it. 
auto status = node->getCustomOp()->execute(&context); if (status != ND4J_STATUS_OK) return status; @@ -494,8 +494,10 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) nd4j::memory::MemoryRegistrator::getInstance()->setGraphMemoryFootprintIfGreater(h, m); } - if (tempFlow) + if (tempFlow) { delete flowPath; + __variableSpace->setFlowPath(nullptr); + } return Status::OK(); } diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp index 9bdf41a16..58d4b3c34 100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/blas/cpu/NDArray.cpp @@ -98,7 +98,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, target.getShapeInfo(), coords); const auto zOffset = shape::getOffset(target.getShapeInfo(), coords); @@ -152,7 +152,7 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) { auto y = reinterpret_cast(yBuffer); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto temp = x[i]; x[i] = y[i]; y[i] = temp; @@ -266,7 +266,7 @@ NDArray NDArray::tile(const std::vector& reps) const { if(result.ordering() == 'c') { // ews == 1 always here auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); } @@ -277,7 +277,7 @@ NDArray NDArray::tile(const std::vector& reps) const { else { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto xOffset = result.getOffset(i); auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); @@ -377,7 +377,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vectorordering() == second.ordering() && this->ordering() == third.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func(f[e], s[e], t[e]); }; @@ -31,7 +31,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std:: if (f == z) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto tOffset = this->getOffset(e); auto uOffset = second.getOffset(e); auto vOffset = third.getOffset(e); @@ -44,7 +44,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std:: } else { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto tOffset = this->getOffset(e); auto uOffset = second.getOffset(e); auto vOffset = third.getOffset(e); @@ -93,7 +93,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functionordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) 
{ auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func(f[e], s[e]); }; @@ -102,7 +102,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functiongetOffset(e); auto yOffset = other.getOffset(e); @@ -114,7 +114,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functiongetOffset(e); auto yOffset = other.getOffset(e); auto zOffset = target.getOffset(e); @@ -156,7 +156,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) { if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func(f[e]); }; @@ -165,7 +165,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) { if (f == z) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto xOffset = this->getOffset(e); f[xOffset] = func(f[xOffset]); @@ -176,7 +176,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) { } else { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto xOffset = this->getOffset(e); auto zOffset = target.getOffset(e); @@ -217,7 +217,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func(e, f[e]); }; @@ -226,7 +226,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr if (f == z) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto xOffset = this->getOffset(e); f[xOffset] = func(e, f[xOffset]); @@ -237,7 +237,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr } else { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto xOffset = this->getOffset(e); auto zOffset = target.getOffset(e); @@ -283,7 +283,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functionordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) { auto loop = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) z[e] = func((Nd4jLong) e, f[e], s[e]); }; @@ -292,7 +292,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functiongetOffset(e); auto yOffset = other.getOffset(e); @@ -304,7 +304,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functiongetOffset(e); auto yOffset = other.getOffset(e); auto zOffset = target.getOffset(e); diff --git a/libnd4j/blas/cpu/NativeOpExecutioner.cpp b/libnd4j/blas/cpu/NativeOpExecutioner.cpp index c155bd781..1fedb0241 100644 --- a/libnd4j/blas/cpu/NativeOpExecutioner.cpp +++ b/libnd4j/blas/cpu/NativeOpExecutioner.cpp @@ -163,15 +163,44 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, 
tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else + auto loopKind = nd4j::LoopKind::deduceKindOfLoopBroadcast(hXShapeInfo, hYShapeInfo, hZShapeInfo); + auto func = PRAGMA_THREADS_FOR { - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, loopKind, start, stop), LIBND4J_TYPES); }; - auto xLen = shape::length(hXShapeInfo); - auto yLen = shape::length(hYShapeInfo); - auto numTads = xLen / yLen; + Nd4jLong numTads = 0; + + switch (loopKind) { + case nd4j::LoopKind::BROADCAST_SCALAR_X: { + numTads = shape::length(hXShapeInfo); + } + break; + case nd4j::LoopKind::BROADCAST_SCALAR_Y: { + numTads = shape::length(hYShapeInfo); + } + break; + case nd4j::LoopKind::BROADCAST_3D: { + numTads = shape::sizeAt(hZShapeInfo, 0); + } + break; + case nd4j::LoopKind::BROADCAST_4D: { + numTads = shape::sizeAt(hZShapeInfo, 0) * shape::sizeAt(hZShapeInfo, 1); + } + break; + case nd4j::LoopKind::BROADCAST_5D: { + numTads = shape::sizeAt(hZShapeInfo, 0) * shape::sizeAt(hZShapeInfo, 1); + } + break; + default: { + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + numTads = xLen / yLen; + } + } samediff::Threads::parallel_tad(func, 0, numTads); + #endif } diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index b945c5bcf..e82f2224e 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -1291,7 +1291,7 @@ void pullRowsGeneric(void *vx, _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); auto func = PRAGMA_THREADS_FOR { - for (auto idx = start; idx < stop; idx += increment) { + for (auto idx = start; idx < stop; idx++) { auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; auto zTadOffsetForBlock = zTadOffsets[idx]; @@ -1356,7 +1356,7 @@ void tearGeneric(void *vx, auto numTads = shape::length(hXShapeInfo) / tadLength; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto hZ = reinterpret_cast(targets[i]); auto s = hX + tadOffsets[i]; @@ -1478,7 +1478,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS auto dZ = reinterpret_cast(dz); auto func = PRAGMA_THREADS_FOR { - for (auto f = start; f < stop; f += increment) { + for (auto f = start; f < stop; f++) { auto hX = reinterpret_cast(dX[f]); //auto hZ = reinterpret_cast(dZ[f]); diff --git a/libnd4j/include/array/DataTypeConversions.h b/libnd4j/include/array/DataTypeConversions.h index 3af77ca39..abc804f5e 100644 --- a/libnd4j/include/array/DataTypeConversions.h +++ b/libnd4j/include/array/DataTypeConversions.h @@ -52,7 +52,7 @@ namespace nd4j { TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) buffer[e] = canKeep ? 
static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); }; @@ -110,7 +110,7 @@ namespace nd4j { TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); }; @@ -138,7 +138,7 @@ namespace nd4j { #else auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); }; @@ -164,7 +164,7 @@ namespace nd4j { TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); }; diff --git a/libnd4j/include/graph/VariableProxy.h b/libnd4j/include/graph/VariableProxy.h index 1c253e9d8..c2a6e9c62 100644 --- a/libnd4j/include/graph/VariableProxy.h +++ b/libnd4j/include/graph/VariableProxy.h @@ -58,6 +58,7 @@ namespace nd4j { virtual void putVariable(int id, Variable *variable); virtual void putVariable(int id, NDArray *array); virtual void putVariable(int id, int idx, NDArray *array); + virtual void putVariable(int id, int idx, NDArray &array); virtual void putVariable(int id, int idx, Variable *array); virtual void replaceVariable(Variable *variable); diff --git a/libnd4j/include/graph/VariableSpace.h b/libnd4j/include/graph/VariableSpace.h index 9443d34b1..81abaf6e8 100644 --- a/libnd4j/include/graph/VariableSpace.h +++ b/libnd4j/include/graph/VariableSpace.h @@ -100,6 +100,7 @@ namespace nd4j { virtual void putVariable(int id, Variable *variable); virtual void putVariable(int id, NDArray *array); virtual void putVariable(int id, int idx, NDArray *array); + virtual void putVariable(int id, int idx, NDArray &array); virtual void putVariable(int id, int idx, Variable *array); virtual void dropVariable(std::pair &pair); diff --git a/libnd4j/include/graph/impl/Graph.cpp b/libnd4j/include/graph/impl/Graph.cpp index 2acedcea3..4b337dd0d 100644 --- a/libnd4j/include/graph/impl/Graph.cpp +++ b/libnd4j/include/graph/impl/Graph.cpp @@ -1088,8 +1088,23 @@ namespace nd4j { if (e < node->input()->size() - 1) nd4j_printf(", ", ""); } + + if (node->opType() == OpType_CUSTOM) { + auto ctx = node->protoContext(); + if (ctx->getIArguments()->size() > 0) { + printf("]; iArgs: ["); + + for (int e = 0; e < ctx->getIArguments()->size(); e++) { + printf("%i", ctx->getIArguments()->at(e)); + if (e < ctx->getIArguments()->size() - 1) + nd4j_printf(", ", ""); + } + } + } + nd4j_printf("]; \n", ""); + // printf("\n"); fflush(stdout); } diff --git a/libnd4j/include/graph/impl/Variable.cpp b/libnd4j/include/graph/impl/Variable.cpp index 5b8f00b25..c2c5ff61f 100644 --- a/libnd4j/include/graph/impl/Variable.cpp +++ b/libnd4j/include/graph/impl/Variable.cpp @@ -60,8 +60,11 @@ namespace nd4j { result->_name = this->_name; result->_index = this->_index; - if (this->_ndarray != nullptr) + if (this->_ndarray != nullptr) { result->_ndarray = new NDArray(this->_ndarray->dup(this->_ndarray->ordering())); + result->_readOnly = false; + result->_removable = true; + } if (this->_list != nullptr) result->_list = this->_list->clone(); diff --git a/libnd4j/include/graph/impl/VariableProxy.cpp b/libnd4j/include/graph/impl/VariableProxy.cpp index 85664f24a..e8abf1310 100644 
--- a/libnd4j/include/graph/impl/VariableProxy.cpp +++ b/libnd4j/include/graph/impl/VariableProxy.cpp @@ -191,6 +191,9 @@ namespace nd4j { _current->putVariable(id, array); } + void nd4j::graph::VariableProxy::putVariable(int id, int idx, NDArray &array) { + _current->putVariable(id, idx, array); + } void VariableProxy::putVariable(int id, int idx, NDArray *array) { _current->putVariable(id, idx, array); diff --git a/libnd4j/include/graph/impl/VariableSpace.cpp b/libnd4j/include/graph/impl/VariableSpace.cpp index 8318befb0..735f0260a 100644 --- a/libnd4j/include/graph/impl/VariableSpace.cpp +++ b/libnd4j/include/graph/impl/VariableSpace.cpp @@ -263,19 +263,19 @@ namespace nd4j { void nd4j::graph::VariableSpace::putVariable(int id, Variable *variable) { // we don't want to add variables more then once if (_variables.count(id) > 0 || _temporary.count(id) > 0) { - // nd4j_verbose("Trying to update variable for node_%i\n", id); - auto local = id < 0 ? _variables.at(id) : _temporary.at(id); if (!local->hasNDArray() && variable->hasNDArray()) { - // nd4j_verbose("Saving variable for node_%i\n", id); local->setNDArray(variable->getNDArray()); + + // we're inheriting this from Variable + local->markReadOnly(variable->isReadOnly()); + local->markRemovable(variable->isRemovable()); } + return; } - //nd4j_debug("Adding Variable to Space: id: %i; Array is null: %i;\n", id, variable->getNDArray() == nullptr); - _varmap.lock(); _handles->emplace_back(variable); @@ -314,6 +314,21 @@ namespace nd4j { } } + void nd4j::graph::VariableSpace::putVariable(int id, int idx, NDArray &array) { + auto *var = new nd4j::graph::Variable(&array, "", id, idx); + var->markRemovable(false); + var->markReadOnly(true); + + // let's see if this op needs + bool d = this->hasVariable(id, idx); + + this->putVariable(id, var); + + // if var for this nodeid already exists - we'll just delete variable + if (d) + delete var; + } + void nd4j::graph::VariableSpace::putVariable(int id, NDArray *array) { auto *var = new nd4j::graph::Variable(array); this->putVariable(id, var); diff --git a/libnd4j/include/graph/profiling/NodeProfile.h b/libnd4j/include/graph/profiling/NodeProfile.h index 51b02326d..62df0c34a 100644 --- a/libnd4j/include/graph/profiling/NodeProfile.h +++ b/libnd4j/include/graph/profiling/NodeProfile.h @@ -24,6 +24,7 @@ #include #include #include +#include namespace nd4j { namespace graph { @@ -65,6 +66,9 @@ namespace nd4j { // total amount of memory used during execution Nd4jLong _memoryTotal = 0L; + + std::vector _inputShapes; + std::vector _outputShapes; public: NodeProfile() = default; ~NodeProfile() = default; @@ -84,10 +88,15 @@ namespace nd4j { void setObjectsSize(Nd4jLong bytes); void setTotalSize(Nd4jLong bytes); - Nd4jLong getActivationsSize(); - Nd4jLong getTemporarySize(); - Nd4jLong getObjectsSize(); - Nd4jLong getTotalSize(); + void addInputShape(Nd4jLong *shapeInfo); + void addOutputShape(Nd4jLong *shapeInfo); + + Nd4jLong getActivationsSize() const; + Nd4jLong getTemporarySize() const; + Nd4jLong getObjectsSize() const; + Nd4jLong getTotalSize() const; + + Nd4jLong getExecutionTime() const; std::string& name(); diff --git a/libnd4j/include/graph/profiling/impl/GraphProfile.cpp b/libnd4j/include/graph/profiling/impl/GraphProfile.cpp index 6c7cccc01..ea8e7bc49 100644 --- a/libnd4j/include/graph/profiling/impl/GraphProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/GraphProfile.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include namespace nd4j { namespace graph { @@ -184,9 +186,26 @@ 
namespace nd4j { if (_profiles.empty()) nd4j_printf("No nodes in graph\n",""); - for (auto v: _profiles) + // printing out stuff + std::vector sorted; + for (auto v: _profiles) { v->printOut(); - + sorted.emplace_back(v); + } + + if (_profiles.size() > 1) { + // building hot spots + std::sort(sorted.begin(), sorted.end(), [](const NodeProfile *a, const NodeProfile *b) -> bool { + return a->getExecutionTime() > b->getExecutionTime(); + }); + + nd4j_printf("\nTop 30 reports by EXEC:\n", ""); + auto limit = nd4j::math::nd4j_min(30, sorted.size()); + for (int e = 0; e < limit; e++) { + sorted[e]->printOut(); + } + } + nd4j_printf("\nSpecial timers:\n", ""); if (_timings.empty()) nd4j_printf("No special timers were set\n",""); diff --git a/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp b/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp index 025cd8651..cbea09616 100644 --- a/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp +++ b/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp @@ -32,7 +32,7 @@ namespace nd4j { // graph->printOut(); // warm up - for (int e = 0; e < 1000; e++) { + for (int e = 0; e < iterations; e++) { FlowPath fp; auto _vs = varSpace->clone(); diff --git a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp index ab5d2a4c4..a6a990eb8 100644 --- a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace graph { @@ -35,9 +36,23 @@ namespace nd4j { nd4j_printf(" Memory: ACT: %lld; TMP: %lld; OBJ: %lld; TTL: %lld;\n", _memoryActivations / _merges, _memoryTemporary / _merges, _memoryObjects / _merges, _memoryTotal / _merges); nd4j_printf(" Time: PREP: %lld ns; EXEC: %lld ns; TTL: %lld ns;\n", _preparationTime / _merges, _executionTime / _merges, _totalTime / _merges); nd4j_printf(" PREP: INPUT: %lld ns; SHAPE: %lld ns; ARRAY: %lld ns;\n", _inputTime / _merges, _shapeTime / _merges, _arrayTime / _merges); + + std::string inputs; + std::string outputs; + + int cnt = 0; + for (const auto &v: _inputShapes) + inputs += v + " "; + + for (const auto &v: _outputShapes) + outputs += v + " "; + + + nd4j_printf(" Inputs: %s\n", inputs.c_str()); + nd4j_printf(" Outputs: %s\n", outputs.c_str()); }; - Nd4jLong NodeProfile::getActivationsSize() { + Nd4jLong NodeProfile::getActivationsSize() const { return _memoryActivations; } @@ -53,15 +68,15 @@ namespace nd4j { _inputTime = time; } - Nd4jLong NodeProfile::getTemporarySize() { + Nd4jLong NodeProfile::getTemporarySize() const{ return _memoryTemporary; } - Nd4jLong NodeProfile::getObjectsSize() { + Nd4jLong NodeProfile::getObjectsSize() const{ return _memoryObjects; } - Nd4jLong NodeProfile::getTotalSize() { + Nd4jLong NodeProfile::getTotalSize() const{ return _memoryTotal; } @@ -97,6 +112,18 @@ namespace nd4j { _memoryTotal = bytes; } + Nd4jLong NodeProfile::getExecutionTime() const { + return _executionTime; + } + + void NodeProfile::addInputShape(Nd4jLong *shapeInfo) { + _inputShapes.emplace_back(ShapeUtils::shapeInfoAsString(shapeInfo)); + } + + void NodeProfile::addOutputShape(Nd4jLong *shapeInfo) { + _outputShapes.emplace_back(ShapeUtils::shapeInfoAsString(shapeInfo)); + } + void NodeProfile::merge(NodeProfile *other) { _merges += other->_merges; _memoryObjects += other->_memoryObjects; @@ -110,6 +137,9 @@ namespace nd4j { _shapeTime += other->_shapeTime; _arrayTime += other->_arrayTime; _inputTime += 
other->_inputTime; + + _inputShapes = other->_inputShapes; + _outputShapes = other->_outputShapes; } std::string& NodeProfile::name() { @@ -129,6 +159,9 @@ namespace nd4j { _shapeTime = other->_shapeTime; _arrayTime = other->_arrayTime; _inputTime = other->_inputTime; + + _inputShapes = other->_inputShapes; + _outputShapes = other->_outputShapes; } } } \ No newline at end of file diff --git a/libnd4j/include/helpers/LoopKind.h b/libnd4j/include/helpers/LoopKind.h index f8f8084c8..d97f3b225 100644 --- a/libnd4j/include/helpers/LoopKind.h +++ b/libnd4j/include/helpers/LoopKind.h @@ -37,12 +37,13 @@ namespace nd4j { class ND4J_EXPORT LoopKind { public: - enum Kind {SMALLARR2DX, EWS1, EWSNONZERO, RANK1, RANK2, RANK3, RANK4, RANK5, X_EWSNONZERO, Y_EWSNONZERO, Z_EWSNONZERO, COMMON}; + enum Kind { SMALLARR2DX, EWS1, EWSNONZERO, RANK1, RANK2, RANK3, RANK4, RANK5, X_EWSNONZERO, Y_EWSNONZERO, Z_EWSNONZERO, COMMON, BROADCAST_SCALAR_X, BROADCAST_SCALAR_Y, BROADCAST_3D, BROADCAST_4D, BROADCAST_5D }; static FORCEINLINE Kind deduceKindOfLoopXZ(const Nd4jLong* xShapeInfo, const Nd4jLong* zShapeInfo); static FORCEINLINE Kind deduceKindOfLoopXYZ(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo); static FORCEINLINE Kind deduceKindOfLoopTadXZ(const Nd4jLong* xShapeInfo, const Nd4jLong* zShapeInfo, const Nd4jLong* tadShapeInfo); static FORCEINLINE Kind deduceKindOfLoopTadXYZ(const Nd4jLong* xTadShapeInfo, const Nd4jLong* yTadShapeInfo, const Nd4jLong* zShapeInfo); + static FORCEINLINE Kind deduceKindOfLoopBroadcast(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo); }; @@ -82,6 +83,57 @@ LoopKind::Kind LoopKind::deduceKindOfLoopXZ(const Nd4jLong* xShapeInfo, const Nd return COMMON; } +LoopKind::Kind LoopKind::deduceKindOfLoopBroadcast(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo) { + auto xRank = shape::rank(xShapeInfo); + auto yRank = shape::rank(yShapeInfo); + auto zRank = shape::rank(zShapeInfo); + + auto xOrder = shape::order(xShapeInfo); + auto yOrder = shape::order(yShapeInfo); + auto zOrder = shape::order(zShapeInfo); + + auto xEws = shape::elementWiseStride(xShapeInfo); + auto yEws = shape::elementWiseStride(yShapeInfo); + auto zEws = shape::elementWiseStride(zShapeInfo); + + bool bNDLoopsRanks = (xRank == zRank && yRank <= xRank && yRank >= 2); + + int countUnityDimsInY = 0, countUnityDimsInX = 0; + for (int i = 0; i < xRank; i++) { + if (i < yRank) + countUnityDimsInY += (1 == shape::sizeAt(yShapeInfo, i)) ? 1 : 0; + countUnityDimsInX += (1 == shape::sizeAt(xShapeInfo, i)) ? 1 : 0; + } + + bool bNotCommonVectorCase = (countUnityDimsInY != yRank - 1) && (countUnityDimsInX != xRank - 1); + + if (3 == xRank && bNDLoopsRanks && bNotCommonVectorCase) + return nd4j::LoopKind::BROADCAST_3D; + if (4 == xRank && bNDLoopsRanks && bNotCommonVectorCase) + return nd4j::LoopKind::BROADCAST_4D; + if (5 == xRank && bNDLoopsRanks && bNotCommonVectorCase) + return nd4j::LoopKind::BROADCAST_5D; + + + if (xRank == yRank && xRank == zRank && xOrder == 'c' && yOrder == 'c' && zOrder == 'c' && xEws == 1 && yEws == 1 && zEws == 1 && xRank >= 2) { + // we validate that shapes are equal till the last dim + for (int e = 0; e < xRank - 1; e++) { + if (xShapeInfo[e+1] != yShapeInfo[e+1]) + return COMMON; + } + + // now, if one of the shapes has 1 as last dim + auto detect = xShapeInfo[xRank] == 1 ? -1 : (yShapeInfo[xRank] == 1) ? 
1 : 0; + + if (detect == 1) + return nd4j::LoopKind::BROADCAST_SCALAR_Y; + else if (detect == -1) + return nd4j::LoopKind::BROADCAST_SCALAR_X; + } + + return nd4j::LoopKind::COMMON; +} + ////////////////////////////////////////////////////////////////////////////// LoopKind::Kind LoopKind::deduceKindOfLoopXYZ(const Nd4jLong* xShapeInfo, const Nd4jLong* yShapeInfo, const Nd4jLong* zShapeInfo) { diff --git a/libnd4j/include/helpers/ShapeBuilders.h b/libnd4j/include/helpers/ShapeBuilders.h index 49ef20e9f..2d71c7ab2 100644 --- a/libnd4j/include/helpers/ShapeBuilders.h +++ b/libnd4j/include/helpers/ShapeBuilders.h @@ -30,15 +30,15 @@ namespace nd4j { class ND4J_EXPORT ShapeBuilders { - public: + public: static Nd4jLong* createScalarShapeInfo(nd4j::DataType dataType, nd4j::memory::Workspace* workspace = nullptr); - + static Nd4jLong* createVectorShapeInfo(const nd4j::DataType dataType, const Nd4jLong length, nd4j::memory::Workspace* workspace = nullptr); /** * create shapeInfo for given order basing on shape stored in shapeOnly vector * memory allocation for shapeInfo is on given workspace - */ + */ static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, int rank, const Nd4jLong* shapeOnly, memory::Workspace* workspace = nullptr); static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, const std::vector& shapeOnly, memory::Workspace* workspace = nullptr); static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, const std::initializer_list& shapeOnly, memory::Workspace* workspace = nullptr); @@ -51,6 +51,13 @@ namespace nd4j { static Nd4jLong* copyShapeInfoAndType(const Nd4jLong* inShapeInfo, const DataType dtype, const bool copyStrides, memory::Workspace* workspace = nullptr); static Nd4jLong* copyShapeInfoAndType(const Nd4jLong* inShapeInfo, const Nd4jLong* shapeInfoToGetTypeFrom, const bool copyStrides, memory::Workspace* workspace = nullptr); + /** + * allocates memory for new shapeInfo and copy all information from inShapeInfo to new shapeInfo except dimensions in dimsToExclude (unit dimensions) and corresponding strides + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + static Nd4jLong* copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace = nullptr); + static Nd4jLong* emptyShapeInfo(const nd4j::DataType dataType, memory::Workspace* workspace = nullptr); static Nd4jLong* emptyShapeInfo(const nd4j::DataType dataType, const char order, const std::vector &shape, memory::Workspace* workspace = nullptr); diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index c99a0b0de..39ea3edaa 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -50,11 +50,13 @@ namespace nd4j { static std::vector evalRepeatShape(int axis, const std::vector& repeats, const NDArray& arr); // evaluate shapeInfo of permuted array - static Nd4jLong* evalPermShapeInfo(const int* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace); + // if setContigStrides = true, then set contiguous strides in output shapeInfo in accordance with arr order + static Nd4jLong* evalPermShapeInfo(const int* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace, const bool setContigStrides = false); static Nd4jLong* 
evalPermShapeInfo(const Nd4jLong* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace); // evaluate shapeInfo of transposed array - static Nd4jLong* evalTranspShapeInfo(const NDArray& arr, nd4j::memory::Workspace* workspace); + // if setContigStrides = true, then set contiguous strides in output shapeInfo in accordance with arr order + static Nd4jLong* evalTranspShapeInfo(const NDArray& arr, nd4j::memory::Workspace* workspace, const bool setContigStrides = false); static bool copyVectorPart(std::vector& target, std::vector& source, int rank, int offset); @@ -97,6 +99,8 @@ namespace nd4j { static std::string shapeAsString(const int rank, const Nd4jLong* shapeInfo); static std::string strideAsString(const NDArray* array); + static std::string shapeInfoAsString(const Nd4jLong* shapeInfo); + static std::vector shapeAsVector(const Nd4jLong* shapeInfo); // evaluate shapeInfo for diagonal array which is made using input arr elements as diagonal @@ -176,6 +180,17 @@ namespace nd4j { return (numStrings + 1) * sizeof(Nd4jLong); } + /** + * This method selects strides based on dimentions required for broadcasting + * @param const pointer to input (Y) shape info for strides selection + * @param rank of input (X) to broadcasting + * @param dimentions size + * @param const pointer to dimentions for broadcasting + * @param pointer to output strides have to be pre allocated by 0 + * @return + */ + static void copyCertainStridesFromShapeInfo(const Nd4jLong* inShapeInfo, const int nRank, const int dimsSize, const int* dims, Nd4jLong* outStrides); + /* * check whether arr1/arr2 is sub-array of arr2/arr1, * this method do not evaluate what array is sub-array, it returns true if arr1 is sub-array of arr2 or arr2 is sub-array of arr1 diff --git a/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp b/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp index 822b5ad0d..d48cfca61 100644 --- a/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp +++ b/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp @@ -68,7 +68,7 @@ namespace nd4j { const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(shapeInfo, dimsToExclude); const int subArrRank = (rank == dimsToExclude.size() || descriptor.areUnitiesinShape()) ? 
rank : rank - dimsToExclude.size(); - auto sPtr = new Nd4jLong[shape::shapeInfoLength(subArrRank)]; + auto sPtr = new Nd4jLong[shape::shapeInfoLength(subArrRank)]; // shape of sub-arrays (same for all for them) auto oPtr = new Nd4jLong[numOfSubArrs]; if (numOfSubArrs > 0) diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp index 1aaaaebc7..b661d02e7 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp @@ -49,7 +49,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, case nd4j::LoopKind::EWS1: { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -70,7 +70,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, case nd4j::LoopKind::EWSNONZERO: { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -91,7 +91,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, case nd4j::LoopKind::RANK1: { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -114,7 +114,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, shape::updateStrides(2, tadShape, newStride, 'c'); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -141,7 +141,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, shape::updateStrides(3, tadShape, newStride, 'c'); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -170,7 +170,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, shape::updateStrides(4, tadShape, newStride, 'c'); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -201,7 +201,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, shape::updateStrides(5, tadShape, newStride, 'c'); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -234,7 +234,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -258,7 +258,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool 
canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); @@ -284,7 +284,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto tad = const_cast(x) + tadOffsets[i]; auto indexValue = OpType::startingIndexValue(tad); diff --git a/libnd4j/include/helpers/impl/MmulHelper.cpp b/libnd4j/include/helpers/impl/MmulHelper.cpp index 716062a53..abc353132 100644 --- a/libnd4j/include/helpers/impl/MmulHelper.cpp +++ b/libnd4j/include/helpers/impl/MmulHelper.cpp @@ -43,23 +43,30 @@ nd4j::NDArray* nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::N auto outShape = ShapeUtils::evalShapeForTensorDot(a, b, axes_0, axes_1, permutAt, permutBt, shapeAt, shapeBt); - NDArray aPR = a->permute(permutAt); - NDArray bPR = b->permute(permutBt); + // check whether permutation is necessary + const NDArray* aP = permutAt.empty() ? a : new NDArray(a->permute(permutAt)); + const NDArray* bP = permutBt.empty() ? b : new NDArray(b->permute(permutBt)); // check whether reshape is necessary - if(!aPR.isSameShape(shapeAt)) - aPR.reshapei( shapeAt); - if(!bPR.isSameShape(shapeBt)) - bPR.reshapei( shapeBt); + const NDArray* aPR = aP->isSameShape(shapeAt) ? aP : new NDArray(aP->reshape(aP->ordering(), shapeAt)); + const NDArray* bPR = bP->isSameShape(shapeAt) ? bP : new NDArray(bP->reshape(bP->ordering(), shapeBt)); - NDArray* c = mmul(&aPR, &bPR, nullptr, 1.0, 0.0); + NDArray* c = mmul(aPR, bPR, nullptr, 1.0, 0.0); c->reshapei(outShape); + if(aP != aPR) + delete aPR; + if(bP != bPR) + delete bPR; + if(a != aP) + delete aP; + if(b != bP) + delete bP; + return c; } - ////////////////////////////////////////////////////////////////////////// void nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, nd4j::NDArray* c, const std::vector& axes_a, const std::vector& axes_b, const std::vector& permutForC) { @@ -67,32 +74,38 @@ void nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, std::vector shapeAt, shapeBt; ShapeUtils::evalShapeForTensorDot(a, b, axes_a, axes_b, permutAt, permutBt, shapeAt, shapeBt); - NDArray *cP(c), *cPR(c); - // check whether permutation is required - if(!permutForC.empty()) - cP = new NDArray(c->permute(permutForC)); + NDArray* cP = permutForC.empty() ? c : new NDArray(c->permute(permutForC)); - auto aPR = a->permute(permutAt); - auto bPR = b->permute(permutBt); + // check whether permutation is necessary + const NDArray* aP = permutAt.empty() ? a : new NDArray(a->permute(permutAt)); + const NDArray* bP = permutBt.empty() ? b : new NDArray(b->permute(permutBt)); // check whether reshape is necessary - if(!aPR.isSameShape(shapeAt)) - aPR.reshapei(shapeAt); - if(!bPR.isSameShape(shapeBt)) - bPR.reshapei(shapeBt); + const NDArray* aPR = aP->isSameShape(shapeAt) ? aP : new NDArray(aP->reshape(aP->ordering(), shapeAt)); + const NDArray* bPR = bP->isSameShape(shapeAt) ? 
bP : new NDArray(bP->reshape(bP->ordering(), shapeBt)); - if(!cP->isSameShape({aPR.sizeAt(0), bPR.sizeAt(1)})) - cPR = new NDArray(cP->reshape(cP->ordering(), {aPR.sizeAt(0), bPR.sizeAt(1)})); + std::vector requiredCshape = {aPR->sizeAt(0), bPR->sizeAt(1)}; - mmul(&aPR, &bPR, cPR, 1.0, 0.0); + NDArray* cPR = cP->isSameShape(requiredCshape) ? cP : new NDArray(cP->reshape(cP->ordering(), requiredCshape, false)); + + mmul(aPR, bPR, cPR, 1.0, 0.0); if(cPR->getBuffer() != cP->getBuffer() || cPR->getSpecialBuffer() != cP->getSpecialBuffer() ) // this means both permute and reshape have been performed on c, cP always points on c->getBuffer() cP->assign(cPR); - if(cPR != c) + if(aP != aPR) + delete aPR; + if(bP != bPR) + delete bPR; + if(a != aP) + delete aP; + if(b != bP) + delete bP; + + if(cP != cPR) delete cPR; - if(cP != c) + if(c != cP) delete cP; } @@ -129,7 +142,7 @@ void nd4j::MmulHelper::tensorDot(const NDArray* a, const NDArray* b, NDArray* c, if(!whatToDoWithC.empty()) { cArrs = std::vector(whatToDoWithC.size()+1, c); for(int i = 0; i < cArrs.size()-1; ++i) - cArrs[i+1] = (whatToDoWithC[i] == 'p') ? new NDArray(cArrs[i]->permute(modifC[i])) : new NDArray(cArrs[i]->reshape(c->ordering(), modifC[i])); // since we ignore first element in cArrs (that is cArrs[0]) then it is always equal to c + cArrs[i+1] = (whatToDoWithC[i] == 'p') ? new NDArray(cArrs[i]->permute(modifC[i])) : new NDArray(cArrs[i]->reshape(c->ordering(), modifC[i], false)); // since we ignore first element in cArrs (that is cArrs[0]) then it is always equal to c } mmul(aPR, bPR, cArrs[cArrs.size()-1], 1.0, 0.0); @@ -208,7 +221,7 @@ nd4j::NDArray* MmulHelper::mmul(const nd4j::NDArray* A, const nd4j::NDArray* B, // vector x matrix, A{M} x B{M,N} = C{N} -> reduce to matrix x matrix A2{1,M} x B{M,N} = C2{1,N}, since there is no corresponding blas operation sgevm if(isAVector && bRank == 2) { NDArray* A2 = new NDArray(A->reshape(A->ordering(), {1, A->lengthOf()})); // A{M} -> A2{1,M} - NDArray* C2 = C ? new NDArray(C->reshape(C->ordering(), {1, C->lengthOf()})) : nullptr; // C{N} -> C2{1,N} + NDArray* C2 = C ? 
new NDArray(C->reshape(C->ordering(), {1, C->lengthOf()}, false)) : nullptr; // C{N} -> C2{1,N} auto result = mmulMxM(A2, B, C2, alpha, beta, outOrder); // result{1,N} delete A2; delete C2; diff --git a/libnd4j/include/helpers/impl/ShapeBuilders.cpp b/libnd4j/include/helpers/impl/ShapeBuilders.cpp index 70aa934ca..d8443e180 100644 --- a/libnd4j/include/helpers/impl/ShapeBuilders.cpp +++ b/libnd4j/include/helpers/impl/ShapeBuilders.cpp @@ -139,5 +139,15 @@ namespace nd4j { return ShapeBuilders::copyShapeInfoAndType(inShapeInfo, ArrayOptions::dataType(shapeInfoToGetTypeFrom), copyStrides, workspace); } +//////////////////////////////////////////////////////////////////////////////// +Nd4jLong* ShapeBuilders::copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace) { + + Nd4jLong *outShapeInfo = nullptr; + ALLOCATE(outShapeInfo, workspace, shape::shapeInfoLength(inShapeInfo[0] - dimsSize), Nd4jLong); + + shape::excludeUnitiesFromShapeInfo(inShapeInfo, dimsSize, dimsToExclude, outShapeInfo); + + return outShapeInfo; +} } \ No newline at end of file diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 165ed5ffd..10babeae1 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -75,10 +75,23 @@ std::vector ShapeUtils::evalShapeForTensorDot(const Nd4jLong* aShapeIn permutBt = axesB; permutBt.insert(permutBt.end(), list_B.begin(), list_B.end()); + // if permut contains something like {0,1,2,..rank-1}, then there is no need to make permutation and we return empty vector in this case + uint i1, i2; + for(i1 = 0; i1 < aRank; ++i1) + if(permutAt[i1] != i1) + break; + if(i1 == aRank) + permutAt = {}; + for(i2 = 0; i2 < bRank; ++i2) + if(permutBt[i2] != i2) + break; + if(i2 == bRank) + permutBt = {}; + Nd4jLong n2 = 1; for (int i = 0; i < axeAsize; i++) n2 *= aShapeInfo[axesA[i] + 1]; - shapeAt = {-1, n2}; + shapeAt = {shape::length(aShapeInfo) / n2, n2}; std::vector oldShapeA; oldShapeA.resize(list_A.size()); @@ -89,7 +102,7 @@ std::vector ShapeUtils::evalShapeForTensorDot(const Nd4jLong* aShapeIn Nd4jLong n3 = 1; for (int i = 0; i < axeBsize; i++) n3 *= bShapeInfo[axesB[i] + 1]; - shapeBt = {n3, -1}; + shapeBt = {n3, shape::length(bShapeInfo) / n3}; std::vector oldShapeB; oldShapeB.resize(list_B.size()); @@ -300,32 +313,37 @@ std::vector ShapeUtils::evalRepeatShape(int axis, const std::vectorbufferForShapeInfo(descriptor).primaryAsT(); - } + // copy arr _shapeInfo into new array + memcpy(shapeInfoNew, arr.getShapeInfo(), shape::shapeInfoByteLength(rank)); + // perform buffer permutation + shape::doPermuteShapeInfo(shapeInfoNew, dimensions, arr.lengthOf()); + + if(setContigStrides) + shape::updateStrides(shapeInfoNew, arr.ordering()); + + ShapeDescriptor descriptor(shapeInfoNew); + + RELEASE(shapeInfoNew, workspace); + + return ConstantShapeHelper::getInstance()->bufferForShapeInfo(descriptor).primaryAsT(); +} ////////////////////////////////////////////////////////////////////////// // evaluate shapeInfo of permuted array @@ -337,14 +355,14 @@ std::vector ShapeUtils::evalRepeatShape(int axis, const std::vector dimensions(rank); for (int i = 0; i < rank; ++i) dimensions[i] = rank - 1 - i; - return evalPermShapeInfo(dimensions.data(), dimensions.size(), arr, workspace); + return evalPermShapeInfo(dimensions.data(), dimensions.size(), arr, workspace, setContigStrides); } 
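For reference, the evalShapeForTensorDot change above replaces the {-1, n2} placeholder shapes with fully computed dimensions and skips permutation whenever the permutation vector is the identity. A minimal stand-alone sketch of that identity test (the helper name is hypothetical, not a library function):

    #include <cstddef>
    #include <vector>

    // Returns true when perm == {0, 1, ..., perm.size()-1}, i.e. permuting with
    // it would be a no-op; the patch signals this case with an empty vector so
    // the NDArray::permute call can be skipped entirely.
    static bool isIdentityPermutation(const std::vector<int>& perm) {
        for (std::size_t i = 0; i < perm.size(); ++i)
            if (perm[i] != static_cast<int>(i))
                return false;
        return true;
    }

    // hypothetical usage mirroring the patch:
    //   if (isIdentityPermutation(permutAt)) permutAt = {};  // no permute needed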
////////////////////////////////////////////////////////////////////////// @@ -653,6 +671,26 @@ Nd4jLong* ShapeUtils::evalTileShapeInfo(const NDArray& arr, const std::vector ShapeUtils::tadAxesForSimpleBroadcast(const NDArray& max, const return numOfMinTads == 1 ? maxTadDims : std::vector(); } +void ShapeUtils::copyCertainStridesFromShapeInfo(const Nd4jLong* inShapeInfo, const int nRank, const int dimsSize, const int* dims, Nd4jLong* outStrides) { + + int yRank = shape::rank(inShapeInfo); + auto yOrigStride = shape::stride(inShapeInfo); + + if (yRank == nRank) { + for (int i = 0; i < yRank; ++i) { + // x[2,3,4] * y[2,1,4] = z[2,3,4] + outStrides[i] = (1 == shape::sizeAt(inShapeInfo, i)) ? 0 : yOrigStride[i]; + } + } + else { + + auto dimEx = nd4j::ShapeUtils::evalDimsToExclude(nRank, dimsSize, dims); + + for (int i = 0, it = 0; i < nRank; ++i) { + auto nCount = std::count(dimEx.cbegin(), dimEx.cend(), i); + outStrides[i] = (0 == nCount) ? yOrigStride[it++] : 0; + if (it == yRank) + break; + } + } +} //////////////////////////////////////////////////////////////////////////////// /* bool ShapeUtils::isSubArrayCase(const NDArray& arr1, const NDArray& arr2, std::vector& sameDims) { diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index 12162d77c..d4e95c65f 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -131,7 +131,11 @@ namespace shape { ND4J_EXPORT _CUDA_HD bool canReshape(const int oldRank, Nd4jLong* oldShape, const int newRank, Nd4jLong* newShape, bool isFOrder); - ND4J_EXPORT _CUDA_HD bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo); + ND4J_EXPORT _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, const char newOrder, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo); + /** + * newShapeInfo contains rank, shape and order only, no strides/ews/type + */ + ND4J_EXPORT _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShapeInfo); /** * Get the shape info buffer @@ -365,6 +369,13 @@ namespace shape { ND4J_EXPORT _CUDA_HD bool isRowVector(const Nd4jLong *shapeInfo); ND4J_EXPORT _CUDA_HD bool isColumnVector(Nd4jLong *shapeInfo); + + /** + * shape - input inShape is shape only, not shapeInfo + * returns number of non-unity dimensions in inShape + */ + ND4J_EXPORT _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape); + /** * Returns whether the * given shape is a vector or not @@ -379,7 +390,8 @@ namespace shape { * Returns the shape portion of an information * buffer */ - ND4J_EXPORT _CUDA_HD Nd4jLong *shapeOf(Nd4jLong *buffer); + ND4J_EXPORT _CUDA_HD Nd4jLong *shapeOf(Nd4jLong *shapeInfo); + ND4J_EXPORT _CUDA_HD Nd4jLong *shapeOf(const Nd4jLong *shapeInfo); /** * Return a copy of a buffer. 
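For reference, copyCertainStridesFromShapeInfo above builds on the usual stride-zero broadcasting trick: any axis where the smaller operand has extent 1 is given stride 0, so its single element is reused along that axis while the larger operand advances normally. A minimal stand-alone sketch using plain vectors instead of the library's shapeInfo layout (function name is hypothetical):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // An axis of extent 1 gets stride 0 (its single element is reused along
    // that axis); all other axes keep their real strides.
    std::vector<long long> broadcastStrides(const std::vector<long long>& shape,
                                            const std::vector<long long>& strides) {
        std::vector<long long> out(shape.size());
        for (std::size_t i = 0; i < shape.size(); ++i)
            out[i] = (shape[i] == 1) ? 0 : strides[i];
        return out;
    }

    int main() {
        // y[2,1,4] with c-order strides [4,4,1] broadcast against x[2,3,4]:
        // the unit axis gets stride 0, the other strides are left untouched.
        auto s = broadcastStrides({2, 1, 4}, {4, 4, 1});
        std::printf("%lld %lld %lld\n", s[0], s[1], s[2]);   // prints: 4 0 1
        return 0;
    }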
@@ -888,9 +900,9 @@ namespace shape { * @return the double at the specified index */ - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0); - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *indices, Nd4jLong baseOffset = 0); - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *indices, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0); ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank); @@ -994,21 +1006,16 @@ namespace shape { // rank is equal to size of shape ND4J_EXPORT void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong* strides, Nd4jLong* offsets, const char order = 'c'); ND4J_EXPORT void calcOffsets(const Nd4jLong* shapeInfo, Nd4jLong* offsets, const char order = 'c'); - ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); - ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); ND4J_EXPORT _CUDA_HD void shapeOldScalar(nd4j::DataType dtype, Nd4jLong* const buffer, const char order); - // deduce element-wise stride - // if array is scalar or unit length vector then ews = 1 - // if array is common vector then ews = stride of non-unity dimension - // if strides are normal set ews = 1, otherwise ews = 0 - ND4J_EXPORT _CUDA_HD void setEws(Nd4jLong* shapeInfo, Nd4jLong len); - // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - ND4J_EXPORT _CUDA_HD void setOrderAndEws(Nd4jLong* shapeInfo, Nd4jLong len = -1); + ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities); + ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo); /** * processes whole set of sub-arrays @@ -1018,12 +1025,26 @@ namespace shape { * numOfSubArrs - number of sub-arrays, size of subArrOffsets is equal to numOfSubArrs * dimsSize - size of dimsToExclude, if dimsSize = array rank or dimsSize = 0 it means sub-array is whole array, copy of wholeShapeInfo and one zero offset will be returned * dimsToExclude - MUST BE SORTED, dimensions to evaluate sub-array along, i.e. 
when shape is [2,3,4,5] and dimsToExclude={0,2}, then there will be 8 sub-arrays with shape [3,5] - * subArrShapeInfo - output argument, contains shapeInfo common for all sub-arrays + * subArrShapeInfo - output argument, contains shapeInfo (same for all sub-arrays) * subArrOffsets - output argument, contains successive sub-arrays offsets from original this-buffer * keepUnitiesInShape - if false then eliminate unities from sub-array shapeInfo, for example {1,a,1,b} -> {a,b} */ ND4J_EXPORT _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo, const Nd4jLong numOfSubArrs, const int dimsSize, const int* dimsToExclude, Nd4jLong* subArrShapeInfo, Nd4jLong* subArrOffsets, bool keepUnitiesInShape = false); + /** + * for example inShapeInfo is {3, 2,1,4, 4,4,1, 16384,1,99} + * then output shapeNoUnities will contain {2,4, 4,1} - that is only shape and strides, no rank/type/ews/order + * stridesNoUnities will point on strides in shapeNoUnities that is on {4,1} + * returns number of non-unity dimensions in inShapeInfo + * if there is no unities in inShapeInfo, then no copy procedure will be performed and shapeNoUnities/stridesNoUnities will point on corresponding places in inShapeInfo + */ + ND4J_EXPORT _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities); + + /** + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo); @@ -2050,7 +2071,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn shapeInfo[i + 1 + rank] = temp[rearrange[i] + 1 + rank]; } - shape::setOrderAndEws(shapeInfo, len); + shape::checkStridesEwsAndOrder(shapeInfo); delete[] temp; } @@ -2227,7 +2248,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn INLINEDEF _CUDA_HD bool isCommonVector(const Nd4jLong *shapeInfo, int& posOfNonUnityDim) { if(rank(shapeInfo) > 0 && length(shapeInfo) == 1) { - posOfNonUnityDim = 0; + posOfNonUnityDim = -1; return true; } @@ -2272,6 +2293,18 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn return isVector && !shapeFirstOne; } +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape) { + + int num = 0; + + for(uint i = 0; i < rank; ++i) + if(inShape[i] != 1) + ++num; + + return num; +} + INLINEDEF _CUDA_HD int oneDimEqualToLength(Nd4jLong *shape, int rank) { for(int i = 0; i < rank; i++) { if(shape[i] == shape::prodLong(shape,rank)) @@ -2310,8 +2343,14 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn * Returns the shape portion of an information * buffer */ - INLINEDEF _CUDA_HD Nd4jLong *shapeOf(Nd4jLong *buffer) { - return buffer + 1; + INLINEDEF _CUDA_HD Nd4jLong *shapeOf(Nd4jLong *shapeInfo) { + + return shapeInfo + 1; + } + + INLINEDEF _CUDA_HD Nd4jLong *shapeOf(const Nd4jLong *shapeInfo) { + + return shape::shapeOf(const_cast(shapeInfo)); } /** @@ -2444,7 +2483,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn newShapeBuffer[2 * newRank + 3] = shape::order(shapeBuffer); // correct order and ews if necessary - shape::setOrderAndEws(newShapeBuffer); + 
shape::checkStridesEwsAndOrder(newShapeBuffer); delete[] indices; @@ -3918,121 +3957,151 @@ INLINEDEF _CUDA_HD bool areStridesDefault(const Nd4jLong* shapeInfo) { // return true; // } -// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, const bool isFOrder, Nd4jLong* newShapeInfo) { +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { // // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements // // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo -// const int newOrder = isFOrder ? 102 : 99; -// const int oldOrder = oldShapeInfo[2 * oldRank + 3]; - // newShapeInfo[0] = newRank; // memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); -// Nd4jLong* newStrides = shape::stride(newShapeInfo); -// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); +// Nd4jLong* newStrides = shape::stride(newShapeInfo); +// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); // const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); -// int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; - +// Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; // while (newStart < newRank && oldStart < oldRank) { // newDim = newShape[newStart]; // oldDim = oldShape[oldStart]; -// while (newDim != oldDim) +// while (newDim != oldDim && newDim > 0 && oldDim > 0) // if (newDim < oldDim) newDim *= newShape[newStop++]; // else oldDim *= oldShape[oldStop++]; // // ------ Check whether the original axes can be combined ------ // -// for (int i = oldStart; i < oldStop - 1; i++) { - -// if(oldShape[i] == 1) { // ignore strides like {...,1,1,...} -// if(oldOrder == 102) ++oldStart; +// for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { +// if(oldShape[i] == 1) // skip unity-dimension and its stride // continue; -// } - -// if(oldOrder == 102 && oldStrides[i + 1] != oldShape[i] * oldStrides[i]) -// return false; // not contiguous enough -// if(oldOrder == 99 && oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) -// return false; // not contiguous enough +// while((i + step) < oldRank && oldShape[i + step] == 1) +// ++step; // skip following unity-dimensions and its strides if such are present +// if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) +// return false; // not contiguous enough // } -// // ------ Calculate new strides for all axes currently worked with ------ // -// if(isFOrder) { -// newStrides[newStart] = oldStrides[oldStart]; -// for (int i = newStart + 1; i < newStop; ++i) -// newStrides[i] = newStrides[i - 1] * newShape[i - 1]; -// } -// else { -// newStrides[newStop - 1] = oldStrides[oldStop - 1]; -// for (int i = newStop - 1; i > newStart; --i) -// newStrides[i - 1] = newStrides[i] * newShape[i]; -// } +// newStrides[newStop - 1] = oldStrides[oldStop - 1]; +// for (int i = newStop - 1; i > newStart; --i) +// newStrides[i - 1] = newStrides[i] * newShape[i]; // newStart = newStop++; // oldStart = oldStop++; // } -// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order -// newShapeInfo[2 * newRank + 2] = 
shape::elementWiseStride(oldShapeInfo); // ews -// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type +// // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) +// for (int i = newStart; i < newRank; ++i) +// newStrides[i] = 1; + +// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order +// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews +// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type // return true; // } ////////////////////////////////////////////////////////////////////// -INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { +INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, const char newOrder, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { - // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements - // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo + // copy shape from newShape into newShapeInfo + newShapeInfo[0] = newRank; + memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); - newShapeInfo[0] = newRank; - memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); + // copy order + newShapeInfo[2 * newRank + 3] = newOrder; - Nd4jLong* newStrides = shape::stride(newShapeInfo); - const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); - const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); - Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; + return shape::reshapeC(oldShapeInfo, newShapeInfo); +} - while (newStart < newRank && oldStart < oldRank) { +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShapeInfo) { - newDim = newShape[newStart]; - oldDim = oldShape[oldStart]; + // newShapeInfo contains rank, shape and order; but no strides, type and ews - while (newDim != oldDim && newDim > 0 && oldDim > 0) - if (newDim < oldDim) newDim *= newShape[newStop++]; - else oldDim *= oldShape[oldStop++]; - - // ------ Check whether the original axes can be combined ------ // - for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { - if(oldShape[i] == 1) // skip unity-dimension and its stride - continue; - while((i + step) < oldRank && oldShape[i + step] == 1) - ++step; // skip following unity-dimensions and its strides if such are present - if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) - return false; // not contiguous enough - } - - newStrides[newStop - 1] = oldStrides[oldStop - 1]; - for (int i = newStop - 1; i > newStart; --i) - newStrides[i - 1] = newStrides[i] * newShape[i]; - - newStart = newStop++; - oldStart = oldStop++; - } - - // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) - for (int i = newStart; i < newRank; ++i) - newStrides[i] = 1; - - newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order - newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews - newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type + const int newRank = shape::rank(newShapeInfo); + // if oldShapeInfo is scalar or vector with 
length=1 + if(shape::length(oldShapeInfo) == 1) { + for (uint i = 0; i < newRank; ++i) + shape::stride(newShapeInfo)[i] = 1; + newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); + *shape::ews(newShapeInfo) = 1; return true; } + const auto oldOrder = shape::order(oldShapeInfo); + const auto newOrder = shape::order(newShapeInfo); + const auto oldEws = shape::elementWiseStride(const_cast(oldShapeInfo)); + + if(oldEws > 0 && oldOrder != newOrder) + return false; + + // *** FIRST STAGE - exclude unity dimensions from oldShapeInfo and newShapeInfo (if such are present of course), since they don't affect on strides evaluation, however they complicate code + + // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities) + Nd4jLong tempBuffer[4*MAX_RANK]; + Nd4jLong *oldShape = tempBuffer, *newShape = tempBuffer + 2*MAX_RANK, *oldStrides, *newStrides; + + // exclude unities from oldShapeInfo + const int oldNumOfNonUnities = shape::excludeUnitiesFromShapeInfo(oldShapeInfo, oldShape, oldStrides); + const int newNumOfNonUnities = shape::excludeUnitiesFromShapeInfo(newShapeInfo, newShape, newStrides); + + // *** SECOND STAGE - strides evaluation + + int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; + + while (newStart < newNumOfNonUnities && oldStart < oldNumOfNonUnities) { + + newDim = newShape[newStart]; + oldDim = oldShape[oldStart]; + + while (newDim != oldDim && newDim > 0 && oldDim > 0) { + + if (newDim < oldDim) + newDim *= newShape[newStop++]; + else + oldDim *= oldShape[oldStop++]; + } + + // check c-contiguous of old axes range + for(uint i = oldStart; i < oldStop - 1; ++i) // do not check value of last stride, it doesn't matter + if(oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) + return false; // not contiguous + + // fill newStrides in c manner + newStrides[newStop - 1] = oldStrides[oldStop - 1]; // copy last stride + for (int i = newStop - 2; i >= newStart; --i) + newStrides[i] = newStrides[i + 1] * newShape[i + 1]; + + newStart = newStop++; + oldStart = oldStop++; + } + + // fill new calculated strides into newShapeInfo, take into account possible unities in shape + for (int j = 0, i = 0; i < newRank; ++i) + shape::stride(newShapeInfo)[i] = (shape::shapeOf(newShapeInfo)[i] == 1) ? 
1 : newStrides[j++]; + + // set ews + if(oldEws == 0) + shape::checkStridesEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order + else { + newShapeInfo[2 * newRank + 3] = oldOrder; // order + *shape::ews(newShapeInfo) = oldEws; // ews + } + + newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type + + return true; +} INLINEDEF _CUDA_H bool canReshape(const int oldRank, Nd4jLong* oldShape, const int newRank, Nd4jLong* newShapeOf, bool isFOrder) { @@ -4573,129 +4642,101 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD setEws(Nd4jLong* shapeInfo, Nd4jLong len) { +INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo) { + // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities) + Nd4jLong tempBuffer[2*MAX_RANK]; + Nd4jLong *shape = tempBuffer, *strides; - const int rank = shape::rank(shapeInfo); - const Nd4jLong* shape = shape::shapeOf(shapeInfo); - const Nd4jLong* strides = shape::stride(shapeInfo); - const char order = shape::order(shapeInfo); - Nd4jLong* ews = shape::ews(shapeInfo); + // exclude unities from shapeInfo + const int numOfNonUnities = shape::excludeUnitiesFromShapeInfo(shapeInfo, shape, strides); - if(len == -1) // calculate array length if it is not given - len = shape::length(shapeInfo); - - if(len <= 1) { // empty, scalar or unity-vector case - *ews = 1; - return; - } - - int nonUnityDim(0); - if(shape::isCommonVector(shapeInfo, nonUnityDim)) { - *ews = strides[nonUnityDim]; - return; - } - - // check last(c)/first(f) dimension, it should be equal to 1 - if((order == 'c' && shape[rank - 1] != 1 && strides[rank - 1] != 1) || (order == 'f' && shape[0] != 1 && strides[0] != 1)) { - *ews = 0; - return; - } - - Nd4jLong correctStride = 1; - if(order == 'c') { - for (int i = rank - 2; i >= 0 ; i--) { - correctStride *= shape[i + 1]; - if(shape[i] == 1) - continue; - if(correctStride != strides[i]) { - *ews = 0; - return; - } - } - } - else { - for (int i = 1; i < rank; ++i) { - correctStride *= shape[i - 1]; - if(shape[i] == 1) - continue; - if(correctStride != strides[i]) { - *ews = 0; - return; - } - } - } - - *ews = 1; + shape::checkStridesEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides); } ////////////////////////////////////////////////////////////////////// -INLINEDEF _CUDA_HD void setOrderAndEws(Nd4jLong* shapeInfo, Nd4jLong len) { +INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) { - const int rank = shape::rank(shapeInfo); - const Nd4jLong* shape = shape::shapeOf(shapeInfo); - const Nd4jLong* strides = shape::stride(shapeInfo); - const char order = shape::order(shapeInfo); - Nd4jLong* ews = shape::ews(shapeInfo); + const int rank = shape::rank(shapeInfo); - if(len == -1) // calculate array length if it is not given - len = shape::length(shapeInfo); - - if(len <= 1) { // empty, scalar or unity-vector case - *ews = 1; + if(shape::length(shapeInfo) == 1) { + *shape::ews(shapeInfo) = 1; + shapeInfo[rank * 2 + 3] = (int)proposedOrder; return; } - int nonUnityDim(0); - if(shape::isCommonVector(shapeInfo, nonUnityDim)) { // in this case we don't change order - *ews = strides[nonUnityDim]; + if(numOfNonUnities == 1) { // 
case of common vector + *shape::ews(shapeInfo) = *stridesNoUnities; + shapeInfo[rank * 2 + 3] = (int)proposedOrder; return; } - // check if strides are contiguous in respect to c-order - // firstly check last stride, it should be equal to 1 - if (strides[rank - 1] == 1 || shape[rank - 1] == 1) { // last dimension is ok, go on through the rest dimensions in reverse order - Nd4jLong correctStride = 1; - bool cContiguous = true; - for (int i = rank - 2; i >= 0 ; i--) { - correctStride *= shape[i + 1]; - if(shape[i] == 1) - continue; - if(correctStride != strides[i]) { - cContiguous = false; - break; - } - } - if(cContiguous) { - *ews = 1; - shapeInfo[shape::shapeInfoLength(rank) - 1] = 99; - return; + bool contiguous = true; + + //*** check whether strides are in c contiguous order ***// + for (uint i = 0; i < numOfNonUnities - 1; ++i) { + if(stridesNoUnities[i] != shapeNoUnities[i + 1] * stridesNoUnities[i + 1]) { + contiguous = false; + break; } } - // now check if strides are contiguous in respect to f-order - // firstly check first stride, it should be equal to 1 - if(strides[0] == 1 || shape[0] == 1) { // first dimension is ok, go on through the rest dimensions - Nd4jLong correctStride = 1; - bool fContiguous = true; - for (int i = 1; i < rank; ++i) { - correctStride *= shape[i - 1]; - if(shape[i] == 1) - continue; - if(correctStride != strides[i]) { - fContiguous = false; - break; - } + if(contiguous) { + + // for example we have shapeInfo = {3, 5,1,1, 4,4,1, ...} then we should change it to shapeInfo = {3, 5,1,1, 4,4,4, ...ews=4} + if(numOfNonUnities < rank) { // unities are present in shape + + int indNonUnit = rank - 1; + + while(shape::shapeOf(shapeInfo)[indNonUnit--] == 1) + + for(int j = indNonUnit + 2; j < rank; ++j) + shape::stride(shapeInfo)[j] = stridesNoUnities[numOfNonUnities - 1]; + + for(int j = indNonUnit; j >= 0; --j) + if(shape::shapeOf(shapeInfo)[j] == 1) + shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j + 1] * shape::stride(shapeInfo)[j + 1]; } - if(fContiguous) { - *ews = 1; - shapeInfo[shape::shapeInfoLength(rank) - 1] = 102; - return; + + *shape::ews(shapeInfo) = stridesNoUnities[numOfNonUnities - 1]; + shapeInfo[rank * 2 + 3] = 99; + return; + } + + contiguous = true; + + //*** check whether strides are in f contiguous order ***// + for (uint i = 1; i < numOfNonUnities; ++i) { + if(stridesNoUnities[i] != shapeNoUnities[i - 1] * stridesNoUnities[i - 1]) { + contiguous = false; + break; } } - *ews = 0; - // if both cContiguous and fContiguous are false then order is preserved + if(contiguous) { + + // for example we have shapeInfo = {3, 1,1,5, 1,4,4, ...} then we should change it to shapeInfo = {3, 1,1,5, 4,4,4, ...ews=4} + if(numOfNonUnities < rank) { // unities are present in shape + + int indNonUnit = 0; + + while(shape::shapeOf(shapeInfo)[indNonUnit++] == 1) + + for(int j = 0; j < indNonUnit - 1; ++j) + shape::stride(shapeInfo)[j] = stridesNoUnities[0]; + + for(int j = indNonUnit; j < rank; ++j) + if(shape::shapeOf(shapeInfo)[j] == 1) + shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j - 1] * shape::stride(shapeInfo)[j - 1]; + } + + *shape::ews(shapeInfo) = stridesNoUnities[0]; + shapeInfo[rank * 2 + 3] = 102; + return; + } + + *shape::ews(shapeInfo) = 0; + shapeInfo[rank * 2 + 3] = (int)proposedOrder; } ////////////////////////////////////////////////////////////////////// @@ -4709,49 +4750,42 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo return; } - Nd4jLong *outShapeInfo = new 
Nd4jLong[shape::shapeInfoLength(wholeShapeInfo)]; - memcpy(outShapeInfo, wholeShapeInfo, shape::shapeInfoByteLength(wholeShapeInfo)); + const int subArrRank = keepUnitiesInShape ? rank : rank - dimsSize; + + subArrShapeInfo[0] = subArrRank; // rank + subArrShapeInfo[2 * subArrRank + 1] = shape::type(wholeShapeInfo); // type + subArrShapeInfo[2 * subArrRank + 3] = shape::order(wholeShapeInfo); // order Nd4jLong* shape = new Nd4jLong[dimsSize]; Nd4jLong* strides = new Nd4jLong[dimsSize]; - const int subArrRank = keepUnitiesInShape ? rank : rank - dimsSize; - Nd4jLong* shapeNoUnities = nullptr; - if(!keepUnitiesInShape) - shapeNoUnities = new Nd4jLong[subArrRank]; - - Nd4jLong subArrLen = 1; - for(int k = subArrRank - 1, j = dimsSize - 1, i = rank - 1; i >= 0; --i) { + if(j >= 0 && i == dimsToExclude[j]) { - strides[j] = shape::stride(outShapeInfo)[i]; - shape[j--] = shape::shapeOf(outShapeInfo)[i]; - shape::shapeOf(outShapeInfo)[i] = 1; + + strides[j] = shape::stride(wholeShapeInfo)[i]; + shape[j--] = shape::shapeOf(wholeShapeInfo)[i]; + + if(keepUnitiesInShape) { + shape::shapeOf(subArrShapeInfo)[k] = 1; + shape::stride(subArrShapeInfo)[k--] = shape::stride(wholeShapeInfo)[i]; + } } else { - subArrLen *= shape::shapeOf(outShapeInfo)[i]; - if(!keepUnitiesInShape) - shapeNoUnities[k--] = shape::shapeOf(outShapeInfo)[i]; + shape::shapeOf(subArrShapeInfo)[k] = shape::shapeOf(wholeShapeInfo)[i]; + shape::stride(subArrShapeInfo)[k--] = shape::stride(wholeShapeInfo)[i]; } - } - // evaluate ews - shape::setEws(outShapeInfo, subArrLen); + } // calculation of sub-array offsets (subArrOffsets) shape::calcOffsets(dimsSize, shape, strides, subArrOffsets); - // remove unities from outShapeInfo if required - if(!keepUnitiesInShape) { - shape::reshapeC(rank, outShapeInfo, subArrRank, shapeNoUnities, subArrShapeInfo); - delete []shapeNoUnities; - } - else - memcpy(subArrShapeInfo, outShapeInfo, shape::shapeInfoLength(subArrRank) * sizeof(Nd4jLong)); + // evaluate ews + shape::checkStridesEwsAndOrder(subArrShapeInfo); delete []strides; delete []shape; - delete []outShapeInfo; } ////////////////////////////////////////////////////////////////////// @@ -4815,195 +4849,238 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, } ////////////////////////////////////////////////////////////////////// -INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { - // we assume all array have same length - const Nd4jLong len = shape::length(xShapeInfo); +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); - const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); - const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); - const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); +// const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); - const char xOrder = shape::order(xShapeInfo); - const char yOrder = shape::order(yShapeInfo); - const char zOrder = shape::order(zShapeInfo); +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = 
shape::order(yShapeInfo); +// const char zOrder = shape::order(zShapeInfo); - const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); - if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { - xOffsets = yOffsets = zOffsets = nullptr; +// if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = zOffsets = nullptr; +// } +// else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { +// xOffsets = yOffsets = nullptr; +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { +// xOffsets = zOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { +// yOffsets = zOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// } +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, yOrder); +// } +// } +// } +// else if(zEws == 1) { +// zOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, zOrder); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, zOrder); +// } +// } +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = zOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// yOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// zOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; 
+// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); + +// if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = nullptr; +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities) { + + const int rank = shape::rank(inShapeInfo); + const int numOfNonUnities = shape::numOfNonUnitDims(rank, shape::shapeOf(inShapeInfo)); + + if(numOfNonUnities == rank) { // no unities in shape, no copy procedure + shapeNoUnities = const_cast(inShapeInfo) + 1; + stridesNoUnities = const_cast(inShapeInfo) + 1 + rank; + return numOfNonUnities; } - else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { - xOffsets = yOffsets = nullptr; - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets, xOrder); - } - else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { - xOffsets = zOffsets = nullptr; - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets, xOrder); - } - else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { - yOffsets = zOffsets = nullptr; - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets, yOrder); - } - else if(xEws == 1) { - xOffsets = nullptr; - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets, xOrder); - } - PRAGMA_OMP_SECTION - { - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets, xOrder); - } - } - } - else if(yEws == 1) { - yOffsets = nullptr; - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets, yOrder); - } - PRAGMA_OMP_SECTION - { - zOffsets = new 
Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets, yOrder); - } - } - } - else if(zEws == 1) { - zOffsets = nullptr; - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets, zOrder); - } - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets, zOrder); - } - } - } - else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - yOffsets = zOffsets = xOffsets; - } - else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - } - PRAGMA_OMP_SECTION - { - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets); - } - } - yOffsets = xOffsets; - } - else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - } - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets); - } - } - zOffsets = xOffsets; - } - else { - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - } - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets); - } - PRAGMA_OMP_SECTION - { - zOffsets = new Nd4jLong[len]; - shape::calcOffsets(zShapeInfo, zOffsets); - } + + for(uint j = 0, i = 0; i < rank; ++i) { + if(shape::shapeOf(inShapeInfo)[i] != 1) { + shapeNoUnities[j] = shape::shapeOf(inShapeInfo)[i]; + shapeNoUnities[numOfNonUnities + j++] = shape::stride(inShapeInfo)[i]; } } + + stridesNoUnities = shapeNoUnities + numOfNonUnities; + + return numOfNonUnities; } ////////////////////////////////////////////////////////////////////// -INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { +INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo) { - // we assume all array have same length - const Nd4jLong len = shape::length(xShapeInfo); + outShapeInfo[0] = inShapeInfo[0] - dimsSize; - const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); - const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); - - const char xOrder = shape::order(xShapeInfo); - const char yOrder = shape::order(yShapeInfo); - - const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); - - if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { - xOffsets = yOffsets = nullptr; - } - else if(xEws == 1) { - xOffsets = nullptr; - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, yOffsets, xOrder); - } - else if(yEws == 1) { - yOffsets = nullptr; - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets, yOrder); - } - else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - yOffsets = xOffsets; - } - else { - PRAGMA_OMP_PARALLEL_SECTIONS - { - PRAGMA_OMP_SECTION - { - xOffsets = new Nd4jLong[len]; - shape::calcOffsets(xShapeInfo, xOffsets); - } - PRAGMA_OMP_SECTION - { - yOffsets = new Nd4jLong[len]; - shape::calcOffsets(yShapeInfo, 
yOffsets); - } + for(uint j = 0, k = 0, i = 0; i < inShapeInfo[0]; ++i) { + if(j < dimsSize && i == dimsToExclude[j]) { + ++j; + continue; } - } -} + shape::shapeOf(outShapeInfo)[k] = shape::shapeOf(inShapeInfo)[i]; + shape::stride(outShapeInfo)[k++] = shape::stride(inShapeInfo)[i]; + } + + outShapeInfo[2 * outShapeInfo[0] + 1] = shape::type(inShapeInfo); // type + *shape::ews(outShapeInfo) = shape::elementWiseStride(inShapeInfo); // ews + outShapeInfo[2 * outShapeInfo[0] + 3] = shape::order(inShapeInfo); // order +} } diff --git a/libnd4j/include/loops/broadcasting.h b/libnd4j/include/loops/broadcasting.h index a38e79c3f..ebf702004 100755 --- a/libnd4j/include/loops/broadcasting.h +++ b/libnd4j/include/loops/broadcasting.h @@ -40,6 +40,7 @@ #endif #include +#include #include "legacy_ops.h" @@ -122,6 +123,7 @@ namespace functions { Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetZ, + nd4j::LoopKind::Kind loopKind, uint64_t start, uint64_t stop); @@ -149,6 +151,7 @@ namespace functions { Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetZ, + nd4j::LoopKind::Kind loopKind, uint64_t start, uint64_t stop); diff --git a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp index 6005c3647..f047d1136 100644 --- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp +++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp @@ -14,9 +14,9 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -// -// @author Yurii Shyrma (iuriish@yahoo.com) -// + // + // @author Yurii Shyrma (iuriish@yahoo.com) + // #include #include @@ -24,226 +24,268 @@ using namespace simdOps; -namespace nd4j { -namespace helpers { +namespace nd4j { + namespace helpers { -//////////////////////////////////////////////////////////////////////// -template -template -void TrueBroadcastHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + //////////////////////////////////////////////////////////////////////// + template + template + void TrueBroadcastHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - const X* x = reinterpret_cast(xArr.getBuffer()); - const Y* y = reinterpret_cast(yArr.getBuffer()); - Z* z = reinterpret_cast(zArr.getBuffer()); + const X* x = reinterpret_cast(xArr.getBuffer()); + const Y* y = reinterpret_cast(yArr.getBuffer()); + Z* z = reinterpret_cast(zArr.getBuffer()); - const auto xShapeInfo = xArr.getShapeInfo(); - const auto yShapeInfo = yArr.getShapeInfo(); - const auto zShapeInfo = zArr.getShapeInfo(); + const auto xShapeInfo = xArr.getShapeInfo(); + const auto yShapeInfo = yArr.getShapeInfo(); + const auto zShapeInfo = zArr.getShapeInfo(); - const int xRank = xArr.rankOf(); - const int yRank = yArr.rankOf(); - const int zRank = zArr.rankOf(); + const int xRank = xArr.rankOf(); + const int yRank = yArr.rankOf(); + const int zRank = zArr.rankOf(); - bool bSpecialCase = (1 == xArr.ews() && 'c' == xArr.ordering() && 1 == yRank && - 1 == yArr.ews() && 'c' == yArr.ordering() && - 1 == zArr.ews() && 'c' == zArr.ordering()); + bool bSpecialCase = (1 == xArr.ews() && 'c' == xArr.ordering() && + 1 == yArr.ews() && 'c' == yArr.ordering() && + 1 == zArr.ews() && 'c' == zArr.ordering()); - if (bSpecialCase) { - auto yLen = (uint32_t)yArr.lengthOf(); - auto func = PRAGMA_THREADS_FOR{ - for (uint32_t i = start; i < stop; i++) { - auto rZ = z + (i * yLen); - auto v = x[i]; - for (uint32_t j = 0; j < yLen; j++) { - rZ[j] = OpType::op(v, y[j]); 
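//////////////////////////////////////////////////////////////////////
// The special case guarded just above (x contributing a single value per output row, y a
// contiguous vector) degenerates into "one x scalar combined with the whole of y" per row.
// A plain-C++ sketch with addition standing in for OpType::op; buffers are assumed dense and
// c-ordered, and z must already hold x.size() * y.size() elements:
#include <vector>

static void broadcastScalarRows(const std::vector<float>& x,      // one value per output row
                                const std::vector<float>& y,      // shared row vector
                                std::vector<float>& z) {          // x.size() * y.size() elements
    const size_t yLen = y.size();
    for (size_t i = 0; i < x.size(); ++i) {
        const float v = x[i];
        float* rZ = z.data() + i * yLen;
        for (size_t j = 0; j < yLen; ++j)
            rZ[j] = v + y[j];                      // OpType::op(v, y[j]) in the real kernel
    }
}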
- } - } - }; - samediff::Threads::parallel_tad(func, 0, xArr.lengthOf()); - return; + if (bSpecialCase && yArr.isColumnVector() && 1 == xArr.sizeAt(-1) ) { + auto yLen = (uint32_t)yArr.lengthOf(); + auto func = PRAGMA_THREADS_FOR{ + for (uint32_t i = start; i < stop; i++) { + auto rZ = z + (i * yLen); + auto v = x[i]; + for (uint32_t j = 0; j < yLen; j++) { + rZ[j] = OpType::op(v, y[j]); + } + } + }; + samediff::Threads::parallel_tad(func, 0, xArr.lengthOf()); + return; + } + + + auto yShapeInt = yArr.getShapeAsVectorInt(); + auto xShapeInt = xArr.getShapeAsVectorInt(); + auto nCountY = std::count_if(yShapeInt.cbegin(), yShapeInt.cend(), [](int i) { return i == 1; }); + auto nCountX = std::count_if(xShapeInt.cbegin(), xShapeInt.cend(), [](int i) { return i == 1; }); + + bool bSpecialCase2 = (xRank == zRank && yRank == zRank && 1 == xArr.sizeAt(-1) && 1 == yArr.sizeAt(-2) && 1 == nCountY && 1 == nCountX); + + if (bSpecialCase && bSpecialCase2) { + + int zDim1 = zArr.sizeAt(-2); + int zDim2 = zArr.sizeAt(-1); + + int nLen = zArr.lengthOf() / yArr.sizeAt(-1); + + auto func = PRAGMA_THREADS_FOR{ + for (uint32_t total = start; total < stop; total++) { + + uint32_t i = total / zDim1; + uint32_t j = total % zDim1; + + uint32_t index = (i * zDim1) + j; + auto rZ = z + (index * zDim2); + auto rY = y + (i * zDim2); + auto rX = x[index]; + + for (uint32_t n = 0; n < zDim2; n++) { + rZ[n] = OpType::op(rX, rY[n]); + } + } + }; + samediff::Threads::parallel_tad(func, 0, nLen, 1); + return; + } + + + const Nd4jLong zLen = zArr.lengthOf(); + auto func = PRAGMA_THREADS_FOR{ + std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); + + for (auto i = start; i < stop; ++i) { + + shape::index2coords(i, zShapeInfo, zCoords.data()); + + for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { + + if (ix >= 0) { + if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { + xCoords[ix--] = zCoords[iz]; + } + else { + xCoords[ix--] = 0; + } + } + + if (iy >= 0) { + if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { + yCoords[iy--] = zCoords[iz]; + } + else { + yCoords[iy--] = 0; + } + } + } + + const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); + const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); + const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); + + z[zOffset] = OpType::op(x[xOffset], y[yOffset]); + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); + } + + template + void TrueBroadcastHelper::exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + DISPATCH_BY_OPNUM_TTT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_OPS); + } + + //////////////////////////////////////////////////////////////////////// + template + template + void TrueBroadcastBoolHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + + const X* x = reinterpret_cast(xArr.getBuffer()); + const X* y = reinterpret_cast(yArr.getBuffer()); + Z* z = reinterpret_cast(zArr.getBuffer()); + + const auto xShapeInfo = xArr.getShapeInfo(); + const auto yShapeInfo = yArr.getShapeInfo(); + const auto zShapeInfo = zArr.getShapeInfo(); + + const int xRank = xArr.rankOf(); + const int yRank = yArr.rankOf(); + const int zRank = zArr.rankOf(); + + const Nd4jLong zLen = zArr.lengthOf(); + + auto func = PRAGMA_THREADS_FOR{ + std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); + + for (auto i = start; i < stop; ++i) { + + shape::index2coords(i, zShapeInfo, zCoords.data()); + + for (int ix = 
xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { + + if (ix >= 0) { + if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { + xCoords[ix--] = zCoords[iz]; + } + else { + xCoords[ix--] = 0; + } + } + + if (iy >= 0) { + if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { + yCoords[iy--] = zCoords[iz]; + } + else { + yCoords[iy--] = 0; + } + } + } + + const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); + const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); + const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); + + z[zOffset] = OpType::op(x[xOffset], y[yOffset], nullptr); + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); + } + + template + void TrueBroadcastBoolHelper::exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_BOOL_OPS); + } + + //////////////////////////////////////////////////////////////////////// + template + template + void TrueBroadcastIntHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + + const X* x = reinterpret_cast(xArr.getBuffer()); + const X* y = reinterpret_cast(yArr.getBuffer()); + X* z = reinterpret_cast(zArr.getBuffer()); + + const auto xShapeInfo = xArr.getShapeInfo(); + const auto yShapeInfo = yArr.getShapeInfo(); + const auto zShapeInfo = zArr.getShapeInfo(); + + const int xRank = xArr.rankOf(); + const int yRank = yArr.rankOf(); + const int zRank = zArr.rankOf(); + + const Nd4jLong zLen = zArr.lengthOf(); + + auto func = PRAGMA_THREADS_FOR{ + std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); + + for (auto i = start; i < stop; ++i) { + + shape::index2coords(i, zShapeInfo, zCoords.data()); + + for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { + + if (ix >= 0) { + if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { + xCoords[ix--] = zCoords[iz]; + } + else { + xCoords[ix--] = 0; + } + } + + if (iy >= 0) { + if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { + yCoords[iy--] = zCoords[iz]; + } + else { + yCoords[iy--] = 0; + } + } + } + + const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); + const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); + const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); + + z[zOffset] = OpType::op(x[xOffset], y[yOffset]); + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); + } + + template + void TrueBroadcastIntHelper::exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + DISPATCH_BY_OPNUM_T(exec, PARAMS(xArr, yArr, zArr), BROADCAST_INT_OPS); + } + + /* + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_0); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_1); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_2); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_3); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_4); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_5); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_6); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_7); + BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_8); + 
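//////////////////////////////////////////////////////////////////////
// All three helpers above (generic, bool, int) share the same index mapping: decompose the
// flat output index into z-coordinates, then reuse a z-coordinate for x and y wherever the
// dimensions agree and pin it to 0 where the input has extent 1. A rank-2, dense, c-ordered
// sketch of that mapping with float addition in place of OpType::op:
#include <array>
#include <cstdint>

static void trueBroadcastAdd2D(const float* x, std::array<int64_t, 2> xShape,
                               const float* y, std::array<int64_t, 2> yShape,
                               float* z, std::array<int64_t, 2> zShape) {
    for (int64_t i0 = 0; i0 < zShape[0]; ++i0) {
        for (int64_t i1 = 0; i1 < zShape[1]; ++i1) {
            // clamp coordinates where the input dimension is broadcast (extent 1)
            const int64_t x0 = xShape[0] == zShape[0] ? i0 : 0;
            const int64_t x1 = xShape[1] == zShape[1] ? i1 : 0;
            const int64_t y0 = yShape[0] == zShape[0] ? i0 : 0;
            const int64_t y1 = yShape[1] == zShape[1] ? i1 : 0;

            z[i0 * zShape[1] + i1] = x[x0 * xShape[1] + x1] + y[y0 * yShape[1] + y1];
        }
    }
}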
BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_9); + + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastBoolHelper, , LIBND4J_TYPES, BOOL_TYPES); + + BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastIntHelper, , INTEGER_TYPES); + */ } - - const Nd4jLong zLen = zArr.lengthOf(); - auto func = PRAGMA_THREADS_FOR { - std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); - - for (auto i = start; i < stop; ++i) { - - shape::index2coords(i, zShapeInfo, zCoords.data()); - - for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { - - if (ix >= 0) { - if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { - xCoords[ix--] = zCoords[iz]; - } else { - xCoords[ix--] = 0; - } - } - - if (iy >= 0) { - if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { - yCoords[iy--] = zCoords[iz]; - } else { - yCoords[iy--] = 0; - } - } - } - - const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); - const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); - - z[zOffset] = OpType::op(x[xOffset], y[yOffset]); - } - }; - - samediff::Threads::parallel_for(func, 0, zLen); } - -template -void TrueBroadcastHelper::exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - DISPATCH_BY_OPNUM_TTT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_OPS); -} - -//////////////////////////////////////////////////////////////////////// -template -template -void TrueBroadcastBoolHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - - const X* x = reinterpret_cast(xArr.getBuffer()); - const X* y = reinterpret_cast(yArr.getBuffer()); - Z* z = reinterpret_cast(zArr.getBuffer()); - - const auto xShapeInfo = xArr.getShapeInfo(); - const auto yShapeInfo = yArr.getShapeInfo(); - const auto zShapeInfo = zArr.getShapeInfo(); - - const int xRank = xArr.rankOf(); - const int yRank = yArr.rankOf(); - const int zRank = zArr.rankOf(); - - const Nd4jLong zLen = zArr.lengthOf(); - - auto func = PRAGMA_THREADS_FOR { - std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); - - for (auto i = start; i < stop; ++i) { - - shape::index2coords(i, zShapeInfo, zCoords.data()); - - for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { - - if (ix >= 0) { - if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { - xCoords[ix--] = zCoords[iz]; - } else { - xCoords[ix--] = 0; - } - } - - if (iy >= 0) { - if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { - yCoords[iy--] = zCoords[iz]; - } else { - yCoords[iy--] = 0; - } - } - } - - const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); - const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); - - z[zOffset] = OpType::op(x[xOffset], y[yOffset], nullptr); - } - }; - - samediff::Threads::parallel_for(func, 0, zLen); -} - -template -void TrueBroadcastBoolHelper::exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_BOOL_OPS); -} - -//////////////////////////////////////////////////////////////////////// -template -template -void TrueBroadcastIntHelper::exec(const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - - const X* x = reinterpret_cast(xArr.getBuffer()); - const X* y = reinterpret_cast(yArr.getBuffer()); - X* z = 
reinterpret_cast(zArr.getBuffer()); - - const auto xShapeInfo = xArr.getShapeInfo(); - const auto yShapeInfo = yArr.getShapeInfo(); - const auto zShapeInfo = zArr.getShapeInfo(); - - const int xRank = xArr.rankOf(); - const int yRank = yArr.rankOf(); - const int zRank = zArr.rankOf(); - - const Nd4jLong zLen = zArr.lengthOf(); - - auto func = PRAGMA_THREADS_FOR { - std::vector xCoords(xArr.rankOf()), yCoords(yArr.rankOf()), zCoords(zArr.rankOf()); - - for (auto i = start; i < stop; ++i) { - - shape::index2coords(i, zShapeInfo, zCoords.data()); - - for (int ix = xRank - 1, iy = yRank - 1, iz = zRank - 1; iz >= 0; --iz) { - - if (ix >= 0) { - if (xShapeInfo[ix + 1] == zShapeInfo[iz + 1]) { - xCoords[ix--] = zCoords[iz]; - } else { - xCoords[ix--] = 0; - } - } - - if (iy >= 0) { - if (yShapeInfo[iy + 1] == zShapeInfo[iz + 1]) { - yCoords[iy--] = zCoords[iz]; - } else { - yCoords[iy--] = 0; - } - } - } - - const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); - const auto zOffset = shape::getOffset(zShapeInfo, zCoords.data()); - - z[zOffset] = OpType::op(x[xOffset], y[yOffset]); - } - }; - - samediff::Threads::parallel_for(func, 0, zLen); -} - -template -void TrueBroadcastIntHelper::exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(xArr, yArr, zArr), BROADCAST_INT_OPS); -} - -/* -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_0); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_1); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_2); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_3); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_4); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_5); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_6); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_7); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_8); -BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_9); - -BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastBoolHelper, , LIBND4J_TYPES, BOOL_TYPES); - -BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastIntHelper, , INTEGER_TYPES); -*/ -} -} \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 37dbf833f..62058bd20 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -25,6 +25,7 @@ #include #include #include +#include using namespace simdOps; @@ -75,6 +76,7 @@ namespace functions { Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffset, + nd4j::LoopKind::Kind loopKind, uint64_t start, uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, @@ -88,7 +90,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset, start, stop), BROADCAST_OPS); + zTadOffset, loopKind, start, stop), BROADCAST_OPS); } template @@ -105,6 +107,7 @@ namespace functions { Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffset, + nd4j::LoopKind::Kind loopKind, uint64_t start, uint64_t stop) { @@ -142,7 +145,14 
@@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); + + const nd4j::LoopKind::Kind kindOfLoop = + (loopKind == nd4j::LoopKind::BROADCAST_SCALAR_X || + loopKind == nd4j::LoopKind::BROADCAST_SCALAR_Y || + loopKind == nd4j::LoopKind::BROADCAST_3D || + loopKind == nd4j::LoopKind::BROADCAST_4D || + loopKind == nd4j::LoopKind::BROADCAST_5D) + ? loopKind : nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { for (auto i = start; i < stop; i++) { @@ -163,6 +173,131 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); } + } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_X){ + // this loop effectively turns broadcast into series of scalar ops + auto loopLength = yShapeInfo[shape::rank(yShapeInfo)]; + + for (auto i = start; i < stop; i++) { + auto oY = y + (i * loopLength); + auto oZ = z + (i * loopLength); + + const auto oX = x[i]; + + PRAGMA_OMP_SIMD + for (unsigned int f = 0; f < loopLength; f++) + oZ[f] = OpType::op(oX, oY[f]); + } + } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){ + // this loop effectively turns broadcast into series of scalar ops + auto loopLength = xShapeInfo[shape::rank(xShapeInfo)]; + + for (auto i = start; i < stop; i++) { + auto oX = x + (i * loopLength); + auto oZ = z + (i * loopLength); + + const auto oY = y[i]; + + PRAGMA_OMP_SIMD + for (unsigned int f = 0; f < loopLength; f++) + oZ[f] = OpType::op(oX[f], oY); + } + } + else if (kindOfLoop == nd4j::LoopKind::BROADCAST_3D) { + + int xRank = shape::rank(xShapeInfo); + int yRank = shape::rank(yShapeInfo); + + auto xStrides = shape::stride(xShapeInfo); + auto zStrides = shape::stride(zShapeInfo); + + Nd4jLong yStrides[3] = { 0,0,0 }; + nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + + uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); + + for (uint32_t index0 = start; index0 < stop; index0++) { + + PRAGMA_OMP_SIMD + for (uint32_t index1 = 0; index1 < nSize1; index1++) { + for (uint32_t index2 = 0; index2 < nSize2; index2++) { + auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2); + auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2); + auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2); + *rZ = OpType::op(*rX, *rY); + } + } + + } + + } + else if (kindOfLoop == nd4j::LoopKind::BROADCAST_4D) { + + int xRank = shape::rank(xShapeInfo); + int yRank = shape::rank(yShapeInfo); + + auto xStrides = shape::stride(xShapeInfo); + auto zStrides = shape::stride(zShapeInfo); + + Nd4jLong yStrides[4] = { 0,0,0,0 }; + nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + + uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); + uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); + + for (uint32_t i = start; i < stop; i++) { + + uint32_t index0 = i / nSize1; + uint32_t index1 = i % nSize1; + + PRAGMA_OMP_SIMD + for (uint32_t index2 = 0; index2 < nSize2; index2++) { + for (uint32_t index3 = 0; index3 < nSize3; index3++) { + auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + 
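//////////////////////////////////////////////////////////////////////
// The BROADCAST_3D/4D/5D branches here are the same nested-stride walk at different ranks:
// each output coordinate is turned into an element address by a dot product with the array's
// strides, and y carries a stride of 0 on every axis it does not share with z, which is
// roughly what ShapeUtils::copyCertainStridesFromShapeInfo prepares. Rank-3 sketch with plain
// arrays and addition in place of OpType::op:
#include <cstdint>

static void broadcast3D(const float* x, const int64_t xStrides[3],
                        const float* y, const int64_t yStrides[3],   // 0 on broadcast axes
                        float* z, const int64_t zStrides[3],
                        const int64_t zShape[3]) {
    for (int64_t i0 = 0; i0 < zShape[0]; ++i0)
        for (int64_t i1 = 0; i1 < zShape[1]; ++i1)
            for (int64_t i2 = 0; i2 < zShape[2]; ++i2) {
                const float* rX = x + xStrides[0] * i0 + xStrides[1] * i1 + xStrides[2] * i2;
                const float* rY = y + yStrides[0] * i0 + yStrides[1] * i1 + yStrides[2] * i2;
                float*       rZ = z + zStrides[0] * i0 + zStrides[1] * i1 + zStrides[2] * i2;
                *rZ = *rX + *rY;                   // OpType::op(*rX, *rY) in the templated kernel
            }
}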
xStrides[2] * index2 + xStrides[3] * index3); + auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3); + auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3); + *rZ = OpType::op(*rX, *rY); + } + } + } + + } + else if (kindOfLoop == nd4j::LoopKind::BROADCAST_5D) { + + int xRank = shape::rank(xShapeInfo); + int yRank = shape::rank(yShapeInfo); + + auto xStrides = shape::stride(xShapeInfo); + auto zStrides = shape::stride(zShapeInfo); + + Nd4jLong yStrides[5] = { 0,0,0,0,0 }; + nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + + uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); + uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); + uint32_t nSize4 = shape::sizeAt(zShapeInfo, 4); + + for (uint32_t i = start; i < stop; i++) { + + uint32_t index0 = i / nSize1; + uint32_t index1 = i % nSize1; + + PRAGMA_OMP_SIMD + for (uint32_t index2 = 0; index2 < nSize2; index2++) { + for (uint32_t index3 = 0; index3 < nSize3; index3++) { + for (uint32_t index4 = 0; index4 < nSize4; index4++) { + auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3 + xStrides[4] * index4); + auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3 + yStrides[4] * index4); + auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3 + zStrides[4] * index4); + + *rZ = OpType::op(*rX, *rY); + } + } + } + } + } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; diff --git a/libnd4j/include/loops/cpu/indexreduce.hpp b/libnd4j/include/loops/cpu/indexreduce.hpp index 829f60a18..8d3af7eb4 100644 --- a/libnd4j/include/loops/cpu/indexreduce.hpp +++ b/libnd4j/include/loops/cpu/indexreduce.hpp @@ -73,7 +73,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex auto func = PRAGMA_THREADS_FOR { intermediatery[thread_id] = OpType::startingIndexValue(x); - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { IndexValue curr(x[i], i); intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); } @@ -88,7 +88,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex auto func = PRAGMA_THREADS_FOR { intermediatery[thread_id] = OpType::startingIndexValue(x); - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); IndexValue curr(x[offset], i); intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); diff --git a/libnd4j/include/loops/cpu/random.hpp b/libnd4j/include/loops/cpu/random.hpp index 35674de36..ab9793694 100644 --- a/libnd4j/include/loops/cpu/random.hpp +++ b/libnd4j/include/loops/cpu/random.hpp @@ -75,7 +75,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } @@ -93,7 +93,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { 
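//////////////////////////////////////////////////////////////////////
// The i += increment  ->  i++ rewrites in the surrounding hunks assume that each worker is
// handed one contiguous [start, stop) slice of the index range, so a unit step visits every
// element exactly once. A minimal stand-in for that partitioning contract (not the
// samediff::Threads implementation, just the shape of the interface the loops rely on):
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

static void parallelFor(int64_t len, int numThreads,
                        const std::function<void(int64_t, int64_t)>& func) {
    std::vector<std::thread> pool;
    const int64_t chunk = (len + numThreads - 1) / numThreads;
    for (int t = 0; t < numThreads; ++t) {
        const int64_t start = t * chunk;
        const int64_t stop  = std::min<int64_t>(len, start + chunk);
        if (start >= stop) break;
        pool.emplace_back(func, start, stop);      // each thread gets a contiguous slice
    }
    for (auto& th : pool) th.join();
}

// usage sketch: stepping by 1 inside the slice covers every index once
// parallelFor(n, 4, [&](int64_t start, int64_t stop) {
//     for (int64_t i = start; i < stop; i++) z[i] = x[i] + y[i];
// });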
PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); @@ -111,7 +111,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); @@ -129,7 +129,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); @@ -149,7 +149,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); @@ -197,7 +197,7 @@ namespace functions { else{ auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } @@ -213,7 +213,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); @@ -255,7 +255,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i += increment) { + for (uint64_t i = start; i < stop; i++) { auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[offset] = OpClass::op(i, length, rng, extraArguments); } diff --git a/libnd4j/include/loops/cpu/reduce3.hpp b/libnd4j/include/loops/cpu/reduce3.hpp index 8d50aedbc..c24a3d474 100644 --- a/libnd4j/include/loops/cpu/reduce3.hpp +++ b/libnd4j/include/loops/cpu/reduce3.hpp @@ -88,7 +88,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, if (kindOfLoop == nd4j::LoopKind::EWS1) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); } }; @@ -98,7 +98,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < 
stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); } @@ -110,7 +110,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index a8f766f6a..2e36b8085 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -158,7 +158,7 @@ namespace functions { const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast); auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; auto tx = x + tadOffsetForBlock; diff --git a/libnd4j/include/loops/cuda/transform/transform_any.cu b/libnd4j/include/loops/cuda/transform/transform_any.cu index 5ca6f0067..37a0ac804 100644 --- a/libnd4j/include/loops/cuda/transform/transform_any.cu +++ b/libnd4j/include/loops/cuda/transform/transform_any.cu @@ -84,7 +84,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (int i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/cuda/transform/transform_bool.cu b/libnd4j/include/loops/cuda/transform/transform_bool.cu index 0f56020b0..d64328494 100644 --- a/libnd4j/include/loops/cuda/transform/transform_bool.cu +++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu @@ -89,7 +89,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (int i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/cuda/transform/transform_float.cu b/libnd4j/include/loops/cuda/transform/transform_float.cu index 49d6ab26f..2e82efdb3 100644 --- a/libnd4j/include/loops/cuda/transform/transform_float.cu +++ b/libnd4j/include/loops/cuda/transform/transform_float.cu @@ -97,7 +97,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (Nd4jLong i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/cuda/transform/transform_same.cu b/libnd4j/include/loops/cuda/transform/transform_same.cu index 4c587111b..0a66590a5 
100644 --- a/libnd4j/include/loops/cuda/transform/transform_same.cu +++ b/libnd4j/include/loops/cuda/transform/transform_same.cu @@ -87,7 +87,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (int i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/cuda/transform/transform_strict.cu b/libnd4j/include/loops/cuda/transform/transform_strict.cu index 1136ef695..35ab0b1dc 100644 --- a/libnd4j/include/loops/cuda/transform/transform_strict.cu +++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu @@ -89,7 +89,7 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') { for (int i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp index b12ff5796..36c95e731 100644 --- a/libnd4j/include/loops/impl/type_conversions.cpp +++ b/libnd4j/include/loops/impl/type_conversions.cpp @@ -81,7 +81,7 @@ namespace nd4j { // now we actually apply quantization auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { rz[e] = static_cast(nd4j::math::nd4j_round( 1.0f * static_cast(x[e]) / nd4j::math::nd4j_max(amax, amin) * max_byte)); } }; @@ -177,7 +177,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write) int flimit = limit + 4; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int el = x[e]; int ael = nd4j::math::nd4j_abs(el) - 1; z[ael] += el > 0 ? 
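//////////////////////////////////////////////////////////////////////
// The quantization hunk in type_conversions.cpp above scales every value by the largest
// observed magnitude and rounds it into the signed 8-bit range. A compact sketch of that
// step; here the magnitude is recomputed locally, whereas the library code receives
// amax/amin from an earlier reduction, and 127 stands in for max_byte:
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

static std::vector<int8_t> quantizeLinear(const std::vector<float>& x) {
    const float max_byte = 127.0f;
    float amax = 0.0f;
    for (float v : x) amax = std::max(amax, std::fabs(v));   // largest magnitude in the buffer
    if (amax == 0.0f) amax = 1.0f;                           // avoid division by zero

    std::vector<int8_t> out(x.size());
    for (size_t e = 0; e < x.size(); ++e)
        out[e] = (int8_t) std::lround(x[e] / amax * max_byte);
    return out;
}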
static_cast(threshold) : static_cast(-threshold); @@ -202,7 +202,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write) auto z = reinterpret_cast(dz); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { z[i] = static_cast(static_cast(x[i])); } }; diff --git a/libnd4j/include/ops/declarable/OpDescriptor.h b/libnd4j/include/ops/declarable/OpDescriptor.h index 2c857f3c0..302559ad8 100644 --- a/libnd4j/include/ops/declarable/OpDescriptor.h +++ b/libnd4j/include/ops/declarable/OpDescriptor.h @@ -147,6 +147,9 @@ namespace nd4j { // returns TRUE if this op allows in-place execution bool allowsInplace(); + // this method allows you to enable/disable inplace call for a given op + void allowInplace(bool reallyAllow); + // this method returns opNum (applicable for legacy XYZ ops only) int getOpNum(); diff --git a/libnd4j/include/ops/declarable/generic/activations/identity.cpp b/libnd4j/include/ops/declarable/generic/activations/identity.cpp index 5ae5b0690..e424772fc 100644 --- a/libnd4j/include/ops/declarable/generic/activations/identity.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/identity.cpp @@ -27,12 +27,10 @@ namespace nd4j { namespace ops { OP_IMPL(identity, 1, 1, true) { auto first = INPUT_VARIABLE(0); - auto z = this->getZ(block); + auto z = OUTPUT_VARIABLE(0); - // just for lulz - first->applyTransform(nd4j::transform::Identity, *z); - - STORE_RESULT(*z); + if (!block.isInplace()) + first->applyTransform(nd4j::transform::Identity, *z); return Status::OK(); } @@ -60,8 +58,8 @@ namespace nd4j { DECLARE_TYPES(identity_bp) { getOpDescriptor() ->setAllowedInputTypes(0, DataType::ANY) - ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}); } } } diff --git a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp index 3dd64a113..a673b1988 100644 --- a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp @@ -20,7 +20,7 @@ // @author Yurii Shyrma (iuriish@yahoo.com), fully rewritten // -#include +#include #if NOT_EXCLUDED(OP_matmul) #include @@ -29,142 +29,128 @@ namespace nd4j { namespace ops { - CUSTOM_OP_IMPL(matmul, 2, 1, false, 0, -2) { - auto x = INPUT_VARIABLE(0); - auto y = INPUT_VARIABLE(1); - auto z = OUTPUT_VARIABLE(0); +////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(matmul, 2, 1, false, 0, -2) { - const int iSize = (int) block.getIArguments()->size(); - int transX = iSize > 0 ? INT_ARG(0) : 0; - int transY = iSize > 1 ? INT_ARG(1) : 0; - const int transZ = iSize > 2 ? INT_ARG(2) : 0; + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + auto z = OUTPUT_VARIABLE(0); - const int xRank = x->rankOf(); - const int yRank = y->rankOf(); - const int zRank = z->rankOf(); + const int iSize = (int) block.getIArguments()->size(); + int transX = iSize > 0 ? INT_ARG(0) : 0; + int transY = iSize > 1 ? INT_ARG(1) : 0; + const int transZ = iSize > 2 ? INT_ARG(2) : 0; - if (transZ) { - x = INPUT_VARIABLE(1); - y = INPUT_VARIABLE(0); - bool temp = transX; - transX = !transY; - transY = !temp; - } + const int xRank = x->rankOf(); + const int yRank = y->rankOf(); + const int zRank = z->rankOf(); - const int xLastDim = transX ? -2 : -1; - const int yLastDim = transY ? 
-2 : -1; - const int xLastButOneDim = transX ? -1 : -2; - const int yLastButOneDim = transY ? -1 : -2; + if (transZ) { + x = INPUT_VARIABLE(1); + y = INPUT_VARIABLE(0); + bool temp = transX; + transX = !transY; + transY = !temp; + } - // ******* input validation ******* // - REQUIRE_TRUE(xRank > 0 && yRank > 0, 0, - "MATMUL OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", - xRank, yRank); + const int xLastDim = transX ? -2 : -1; + const int yLastDim = transY ? -2 : -1; + const int xLastButOneDim = transX ? -1 : -2; + const int yLastButOneDim = transY ? -1 : -2; - if (xRank == 1 && yRank == 1) { // dot case, output is scalar (or vector with length = 1) - REQUIRE_TRUE(x->lengthOf() == y->lengthOf(), 0, - "MATMUL OP: since input arrays are vectors they must have the same length, but got x length = %i, y length = %i !", - x->lengthOf(), y->lengthOf()); - } else if (xRank == 1 && yRank == 2) { // vector x matrix, i.e. [4] x [4,5] = [5], output is vector - REQUIRE_TRUE(x->lengthOf() == y->sizeAt(yLastButOneDim), 0, - "MATMUL OP: input arrays have inconsistent shapes for vector-matrix product: x %s, y %s !", - ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); - } else if (xRank == 2 && yRank == 1) { // matrix x vector , i.e. [4,5] x [5] = [4], output is vector - REQUIRE_TRUE(x->sizeAt(xLastDim) == y->lengthOf(), 0, - "MATMUL OP: input arrays have inconsistent shapes for matrix-vector product: x %s, y %s !", - ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); - } else { - REQUIRE_TRUE(xRank == yRank && yRank == zRank, 0, - "MATMUL OP: input and output arrays must have the same rank, but got instead: x rank = %i, y rank = %i, z rank = %i !", - xRank, yRank, zRank); - REQUIRE_TRUE(x->sizeAt(xLastDim) == y->sizeAt(yLastButOneDim) && - x->sizeAt(xLastButOneDim) == z->sizeAt(-2) && y->sizeAt(yLastDim) == z->sizeAt(-1), 0, - "MATMUL OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", - ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), - ShapeUtils::shapeAsString(z).c_str()); + // ******* input validation ******* // + REQUIRE_TRUE(xRank > 0 && yRank > 0, 0, "MATMUL OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", xRank, yRank); - if (xRank > 2) // outer dims must be the same - for (int i = 0; i < xRank - 2; ++i) - REQUIRE_TRUE(x->sizeAt(i) == y->sizeAt(i) && y->sizeAt(i) == z->sizeAt(i), 0, - "MATMUL OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", - ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), - ShapeUtils::shapeAsString(z).c_str()); - } - // ******* end of input validation ******* // + if (xRank == 1 && yRank == 1) { // dot case, output is scalar (or vector with length = 1) + REQUIRE_TRUE(x->lengthOf() == y->lengthOf(), 0, "MATMUL OP: since input arrays are vectors they must have the same length, but got x length = %i, y length = %i !", x->lengthOf(), y->lengthOf()); + } else if (xRank == 1 && yRank == 2) { // vector x matrix, i.e. [4] x [4,5] = [5], output is vector + REQUIRE_TRUE(x->lengthOf() == y->sizeAt(yLastButOneDim), 0, "MATMUL OP: input arrays have inconsistent shapes for vector-matrix product: x %s, y %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); + } else if (xRank == 2 && yRank == 1) { // matrix x vector , i.e. 
[4,5] x [5] = [4], output is vector + REQUIRE_TRUE(x->sizeAt(xLastDim) == y->lengthOf(), 0, "MATMUL OP: input arrays have inconsistent shapes for matrix-vector product: x %s, y %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); + } else { + REQUIRE_TRUE(xRank == yRank && yRank == zRank, 0, "MATMUL OP: input and output arrays must have the same rank, but got instead: x rank = %i, y rank = %i, z rank = %i !", xRank, yRank, zRank); + REQUIRE_TRUE(x->sizeAt(xLastDim) == y->sizeAt(yLastButOneDim) && x->sizeAt(xLastButOneDim) == z->sizeAt(-2) && y->sizeAt(yLastDim) == z->sizeAt(-1), 0, "MATMUL OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), ShapeUtils::shapeAsString(z).c_str()); - MmulHelper::matmul(x, y, z, transX, transY); + if (xRank > 2) // outer dims must be the same + for (int i = 0; i < xRank - 2; ++i) + REQUIRE_TRUE(x->sizeAt(i) == y->sizeAt(i) && y->sizeAt(i) == z->sizeAt(i), 0, "MATMUL OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), ShapeUtils::shapeAsString(z).c_str()); + } + // ******* end of input validation ******* // - return Status::OK(); - } + MmulHelper::matmul(x, y, z, transX, transY); - DECLARE_SYN(mMul, matmul); + return Status::OK(); +} - DECLARE_SYN(mmul, matmul); +DECLARE_SYN(mMul, matmul); - DECLARE_SYN(gemm, matmul); +DECLARE_SYN(mmul, matmul); - DECLARE_SYN(gemv, matmul); +DECLARE_SYN(gemm, matmul); - DECLARE_SYN(dot, matmul); +DECLARE_SYN(gemv, matmul); +DECLARE_SYN(dot, matmul); - DECLARE_SHAPE_FN(matmul) { +////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(matmul) { - auto xShapeInfo = inputShape->at(0); - auto yShapeInfo = inputShape->at(1); + auto xShapeInfo = inputShape->at(0); + auto yShapeInfo = inputShape->at(1); - const int iSize = (int) block.getIArguments()->size(); - int transX = iSize > 0 ? INT_ARG(0) : 0; - int transY = iSize > 1 ? INT_ARG(1) : 0; - const int transZ = iSize > 2 ? INT_ARG(2) : 0; + const int iSize = (int) block.getIArguments()->size(); + int transX = iSize > 0 ? INT_ARG(0) : 0; + int transY = iSize > 1 ? INT_ARG(1) : 0; + const int transZ = iSize > 2 ? 
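//////////////////////////////////////////////////////////////////////
// The transZ handling in matmul rests on the identity (A·B)ᵀ = Bᵀ·Aᵀ: a transposed result is
// produced by swapping the operands and negating both transpose flags. A tiny numeric check
// of that identity with dense row-major buffers (sizes and values are arbitrary):
#include <cassert>
#include <cstddef>
#include <vector>

static std::vector<float> mmulRowMajor(const std::vector<float>& a, size_t m, size_t k,
                                       const std::vector<float>& b, size_t n) {   // a: m x k, b: k x n
    std::vector<float> c(m * n, 0.0f);
    for (size_t i = 0; i < m; ++i)
        for (size_t j = 0; j < n; ++j)
            for (size_t p = 0; p < k; ++p)
                c[i * n + j] += a[i * k + p] * b[p * n + j];
    return c;
}

static std::vector<float> transposeRowMajor(const std::vector<float>& a, size_t rows, size_t cols) {
    std::vector<float> t(a.size());
    for (size_t i = 0; i < rows; ++i)
        for (size_t j = 0; j < cols; ++j)
            t[j * rows + i] = a[i * cols + j];
    return t;
}

int main() {
    const std::vector<float> A = {1, 2, 3, 4, 5, 6};      // 2 x 3
    const std::vector<float> B = {7, 8, 9, 10, 11, 12};   // 3 x 2

    const auto AB_T  = transposeRowMajor(mmulRowMajor(A, 2, 3, B, 2), 2, 2);   // (A·B)ᵀ
    const auto BT_AT = mmulRowMajor(transposeRowMajor(B, 3, 2), 2, 3,
                                    transposeRowMajor(A, 2, 3), 2);            // Bᵀ·Aᵀ

    assert(AB_T == BT_AT);   // the two 2x2 results coincide
    return 0;
}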
INT_ARG(2) : 0; - REQUIRE_TRUE(xShapeInfo[0] > 0 && yShapeInfo[0] > 0, 0, - "MATMUL OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", - xShapeInfo[0], yShapeInfo[0]); + REQUIRE_TRUE(xShapeInfo[0] > 0 && yShapeInfo[0] > 0, 0, + "MATMUL OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", + xShapeInfo[0], yShapeInfo[0]); - if (transZ) { - xShapeInfo = inputShape->at(1); - yShapeInfo = inputShape->at(0); - bool temp = transX; - transX = !transY; - transY = !temp; - } + if (transZ) { + xShapeInfo = inputShape->at(1); + yShapeInfo = inputShape->at(0); + bool temp = transX; + transX = !transY; + transY = !temp; + } - auto zShapeOnly = ShapeUtils::evalShapeForMatmul(xShapeInfo, yShapeInfo, transX, transY); + auto zShapeOnly = ShapeUtils::evalShapeForMatmul(xShapeInfo, yShapeInfo, transX, transY); - auto dtypeX = ArrayOptions::dataType(xShapeInfo); - auto dtypeY = ArrayOptions::dataType(yShapeInfo); + auto dtypeX = ArrayOptions::dataType(xShapeInfo); + auto dtypeY = ArrayOptions::dataType(yShapeInfo); - auto xOrder = shape::order(xShapeInfo); - auto yOrder = shape::order(yShapeInfo); - auto zOrder = xOrder == 'c' && yOrder == 'c' ? 'c' : 'f'; + auto xOrder = shape::order(xShapeInfo); + auto yOrder = shape::order(yShapeInfo); + auto zOrder = xOrder == 'c' && yOrder == 'c' ? 'c' : 'f'; - // we just pick the higher data type out of X and Y - auto dtypeZ = dtypeX > dtypeY ? dtypeX : dtypeY; + // we just pick the higher data type out of X and Y + auto dtypeZ = dtypeX > dtypeY ? dtypeX : dtypeY; - auto newShape = ConstantShapeHelper::getInstance()->createShapeInfo(dtypeZ, zOrder, zShapeOnly); - return SHAPELIST(newShape); - } + auto newShape = ConstantShapeHelper::getInstance()->createShapeInfo(dtypeZ, zOrder, zShapeOnly); + return SHAPELIST(newShape); +} - DECLARE_TYPES(matmul) { - getOpDescriptor() - ->setAllowedInputTypes(0, {ALL_FLOATS}) - ->setAllowedInputTypes(1, {ALL_FLOATS}) - ->setAllowedOutputTypes(0, {ALL_FLOATS}); - } +////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(matmul) { + getOpDescriptor() + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}); +} +////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(matmul_bp, 3, 2, false, 0, -2) { + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + auto eps = INPUT_VARIABLE(2); + auto dldx = OUTPUT_VARIABLE(0); + auto dldy = OUTPUT_VARIABLE(1); - CUSTOM_OP_IMPL(matmul_bp, 3, 2, false, 0, -2) { - auto x = INPUT_VARIABLE(0); - auto y = INPUT_VARIABLE(1); - auto eps = INPUT_VARIABLE(2); - auto dldx = OUTPUT_VARIABLE(0); - auto dldy = OUTPUT_VARIABLE(1); - - const int iSize = (int) block.getIArguments()->size(); - int transX = iSize > 0 ? INT_ARG(0) : 0; - int transY = iSize > 1 ? INT_ARG(1) : 0; - const int transZ = iSize > 2 ? INT_ARG(2) : 0; + const int iSize = (int) block.getIArguments()->size(); + int transX = iSize > 0 ? INT_ARG(0) : 0; + int transY = iSize > 1 ? INT_ARG(1) : 0; + const int transZ = iSize > 2 ? 
INT_ARG(2) : 0; /* In: x=[a,b], y=[b,c] @@ -177,34 +163,35 @@ F F T [a,b] [b,c] [c,a] [c,a] */ - nd4j::ops::matmul op; - op.execute({eps, y}, {dldx}, {}, {transZ, !transY, transX}, {}); - op.execute({x, eps}, {dldy}, {}, {!transX, transZ, transY}, {}); + nd4j::ops::matmul op; + op.execute({eps, y}, {dldx}, {}, {transZ, !transY, transX}, {}); + op.execute({x, eps}, {dldy}, {}, {!transX, transZ, transY}, {}); - return Status::OK(); - } + return Status::OK(); +} +////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(matmul_bp) { + Nd4jLong *xShapeInfo; + Nd4jLong *yShapeInfo; - DECLARE_SHAPE_FN(matmul_bp) { - Nd4jLong *xShapeInfo; - Nd4jLong *yShapeInfo; + COPY_SHAPE(inputShape->at(0), xShapeInfo); + COPY_SHAPE(inputShape->at(1), yShapeInfo); - COPY_SHAPE(inputShape->at(0), xShapeInfo); - COPY_SHAPE(inputShape->at(1), yShapeInfo); + return SHAPELIST(CONSTANT(xShapeInfo), CONSTANT(yShapeInfo)); +} - return SHAPELIST(CONSTANT(xShapeInfo), CONSTANT(yShapeInfo)); - } +////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(matmul_bp) { + getOpDescriptor() + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedInputTypes(2, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}) + ->setAllowedOutputTypes(1, {ALL_FLOATS}); +} - DECLARE_TYPES(matmul_bp) { - getOpDescriptor() - ->setAllowedInputTypes(0, {ALL_FLOATS}) - ->setAllowedInputTypes(1, {ALL_FLOATS}) - ->setAllowedInputTypes(2, {ALL_FLOATS}) - ->setAllowedOutputTypes(0, {ALL_FLOATS}) - ->setAllowedOutputTypes(1, {ALL_FLOATS}); - } - - } +} } diff --git a/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp b/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp index 2c362b23d..3db3b6097 100644 --- a/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp @@ -21,70 +21,174 @@ #include #if NOT_EXCLUDED(OP_tensormmul) +#include #include #include #include + namespace nd4j { - namespace ops { - CUSTOM_OP_IMPL(tensormmul, 2, 1, false, 0, -1) { - auto a = INPUT_VARIABLE(0); - auto b = INPUT_VARIABLE(1); +namespace ops { - auto c = OUTPUT_VARIABLE(0); // +//////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(tensormmul, 2, 1, false, 0, -1) { - REQUIRE_TRUE(a->dataType() == b->dataType(), 0, "tensormmul: A, B and C data types must be the same"); + auto a = INPUT_VARIABLE(0); + auto b = INPUT_VARIABLE(1); - // building axes - int axe0_size = INT_ARG(0); - int axe1_size = INT_ARG(axe0_size+1); - std::vector axes_0(axe0_size), axes_1(axe1_size); - for (int e = 0; e < axe0_size; e++) - axes_0[e] = (int) INT_ARG(e+1); + auto c = OUTPUT_VARIABLE(0); - for (int e = 0; e < axe1_size; e++) - axes_1[e] = (int) INT_ARG(e + axe0_size + 2); + REQUIRE_TRUE(a->dataType() == b->dataType(), 0, "tensormmul: A, B and C data types must be the same"); - nd4j_verbose("axe0: %i; axe1: %i;\n", axes_0.size(), axes_1.size()); + // building axes + int axe0_size = INT_ARG(0); + int axe1_size = INT_ARG(axe0_size+1); + std::vector axes_0(axe0_size), axes_1(axe1_size); + for (int e = 0; e < axe0_size; e++) + axes_0[e] = (int)INT_ARG(e + 1); - MmulHelper::tensorDot(a, b, c, axes_0, axes_1); - return Status::OK(); - } - DECLARE_SYN(tensordot, tensormmul); + for (int e = 0; e < axe1_size; e++) + axes_1[e] = (int)INT_ARG(e + axe0_size + 2); + nd4j_verbose("axe0: %i; axe1: %i;\n", axes_0.size(), axes_1.size()); - DECLARE_SHAPE_FN(tensormmul) { - - auto aShapeInfo = 
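//////////////////////////////////////////////////////////////////////
// For the plain (no transpose) case, the two execute() calls in matmul_bp implement the
// standard matrix-product gradients: with C = X·Y and upstream gradient E = dL/dC,
//   dL/dX = E·Yᵀ   (same shape as X)   and   dL/dY = Xᵀ·E   (same shape as Y).
// A minimal dense check of those shapes; Mat and mmul are local to this sketch only:
#include <cstddef>
#include <vector>

struct Mat { size_t rows, cols; std::vector<float> v; };   // row-major

static Mat mmul(const Mat& a, const Mat& b, bool ta, bool tb) {
    const size_t m = ta ? a.cols : a.rows, k = ta ? a.rows : a.cols, n = tb ? b.rows : b.cols;
    Mat c{m, n, std::vector<float>(m * n, 0.0f)};
    for (size_t i = 0; i < m; ++i)
        for (size_t j = 0; j < n; ++j)
            for (size_t p = 0; p < k; ++p) {
                const float av = ta ? a.v[p * a.cols + i] : a.v[i * a.cols + p];
                const float bv = tb ? b.v[j * b.cols + p] : b.v[p * b.cols + j];
                c.v[i * n + j] += av * bv;
            }
    return c;
}

int main() {
    const Mat X{2, 3, {1, 2, 3, 4, 5, 6}};
    const Mat Y{3, 2, {1, 0, 0, 1, 1, 1}};
    const Mat E{2, 2, {1, 1, 1, 1}};          // upstream gradient dL/dC

    const Mat dLdX = mmul(E, Y, false, true); // E · Yᵀ  -> 2 x 3, matches X
    const Mat dLdY = mmul(X, E, true, false); // Xᵀ · E  -> 3 x 2, matches Y
    return (dLdX.rows == X.rows && dLdX.cols == X.cols &&
            dLdY.rows == Y.rows && dLdY.cols == Y.cols) ? 0 : 1;
}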
inputShape->at(0); - auto bShapeInfo = inputShape->at(1); + MmulHelper::tensorDot(a, b, c, axes_0, axes_1); + return Status::OK(); +} +DECLARE_SYN(tensordot, tensormmul); - REQUIRE_TRUE(ArrayOptions::dataType(aShapeInfo) == ArrayOptions::dataType(bShapeInfo), 0, "tensormmul: A and B data types must be the same"); +//////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(tensormmul) { - // building axes - int axe0_size = INT_ARG(0); - int axe1_size = INT_ARG(axe0_size+1); - std::vector axes_0(axe0_size), axes_1(axe1_size); - for (int e = 0; e < axe0_size; e++) - axes_0[e] = (int) INT_ARG(e+1); + auto aShapeInfo = inputShape->at(0); + auto bShapeInfo = inputShape->at(1); - for (int e = 0; e < axe1_size; e++) - axes_1[e] = (int) INT_ARG(e + axe0_size + 2); + REQUIRE_TRUE(ArrayOptions::dataType(aShapeInfo) == ArrayOptions::dataType(bShapeInfo), 0, "tensormmul: A and B data types must be the same"); - // evaluate shapes - std::vector permutAt, permutBt; - std::vector shapeAt, shapeBt; - auto outShape = nd4j::ShapeUtils::evalShapeForTensorDot(aShapeInfo, bShapeInfo, axes_0, axes_1, permutAt, permutBt, shapeAt, shapeBt); + // building axes + int axe0_size = INT_ARG(0); + int axe1_size = INT_ARG(axe0_size+1); + std::vector axes_0(axe0_size), axes_1(axe1_size); + for (int e = 0; e < axe0_size; e++) + axes_0[e] = (int) INT_ARG(e+1); - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(aShapeInfo), 'c', outShape))); - } + for (int e = 0; e < axe1_size; e++) + axes_1[e] = (int) INT_ARG(e + axe0_size + 2); - DECLARE_TYPES(tensormmul) { - getOpDescriptor() - ->setAllowedInputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); - } + // evaluate shapes + std::vector permutAt, permutBt; + std::vector shapeAt, shapeBt; + auto outShape = nd4j::ShapeUtils::evalShapeForTensorDot(aShapeInfo, bShapeInfo, axes_0, axes_1, permutAt, permutBt, shapeAt, shapeBt); + + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(aShapeInfo), 'c', outShape))); +} + +//////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(tensormmul) { + getOpDescriptor() + ->setAllowedInputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) + ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) + ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); +} + +//////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(tensormmul_bp, 3, 2, false, 0, -1) { + + auto A = INPUT_VARIABLE(0); + auto B = INPUT_VARIABLE(1); + + auto dLdC = INPUT_VARIABLE(2); + + auto dLdA = OUTPUT_VARIABLE(0); + auto dLdB = OUTPUT_VARIABLE(1); + + REQUIRE_TRUE( (A->dataType() == B->dataType() && (dLdC->dataType() == A->dataType())), 0, "tensormmul_bp: A, B and dLdC data types must be the same"); + + int axe0Size = INT_ARG(0); + int axe1Size = INT_ARG(axe0Size + 1); + + auto Arank = A->rankOf(); + auto Brank = B->rankOf(); + auto dLdCrank = dLdC->rankOf(); + + REQUIRE_TRUE((Arank >= axe0Size), 0, "tensormmul_bp: A rank must be the higher or same as input axes 0"); + + REQUIRE_TRUE((Brank >= axe1Size), 0, "tensormmul_bp: B rank must be the higher or same as input axes 1"); + + // building axes + std::vector 
axes0(axe0Size), axes1(axe1Size); + for (uint e = 0; e < axe0Size; e++) + axes0[e] = (int)INT_ARG(e + 1); + for (uint e = 0; e < axe1Size; e++) + axes1[e] = (int)INT_ARG(e + axe0Size + 2); + + std::vector permutAt, permutBt; + std::vector shapeAt, shapeBt; + + ShapeUtils::evalShapeForTensorDot(A, B, axes0, axes1, permutAt, permutBt, shapeAt, shapeBt); + + // special case for scalar value + if (dLdC->isScalar()) { + + dLdA->assign((*dLdC) * *B); + dLdB->assign((*dLdC) * *A); + + return Status::OK(); } + + std::vector axesA = ShapeUtils::evalDimsToExclude(Arank, axes0); + std::vector axesB = ShapeUtils::evalDimsToExclude(Brank, axes1); + + // rank always have to be divided by 2 + std::vector axesAdLdC, axesBdLdC; + if (dLdCrank > 1) { + axesAdLdC.resize(dLdCrank / 2); + std::iota(axesAdLdC.begin(), axesAdLdC.end(), 0); + axesBdLdC = ShapeUtils::evalDimsToExclude(dLdCrank, axesAdLdC); + } + else { + axesAdLdC.push_back(0); + axesBdLdC.push_back(0); + } + + // calculate dLdA + MmulHelper::tensorDot(dLdC, B, dLdA, axesBdLdC, axesB, permutAt); + + // calculate dLdB + MmulHelper::tensorDot(A, dLdC, dLdB, axesA, axesAdLdC, permutBt); + + return Status::OK(); +} + +//////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(tensormmul_bp) { + + auto aShapeInfo = inputShape->at(0); + auto bShapeInfo = inputShape->at(1); + auto dLShapeInfo = inputShape->at(2); + + REQUIRE_TRUE((ArrayOptions::dataType(aShapeInfo) == ArrayOptions::dataType(bShapeInfo) && + (ArrayOptions::dataType(dLShapeInfo) == ArrayOptions::dataType(aShapeInfo))), 0, "tensormmul_bp: A, B and dLdC data types must be the same"); + + Nd4jLong* dLdAShapeInfo = nullptr; + Nd4jLong* dLdBShapeInfo = nullptr; + + COPY_SHAPE(aShapeInfo, dLdAShapeInfo); + COPY_SHAPE(bShapeInfo, dLdBShapeInfo); + + return SHAPELIST(CONSTANT(dLdAShapeInfo), CONSTANT(dLdBShapeInfo)); +} + +//////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(tensormmul_bp) { + getOpDescriptor() + ->setAllowedInputTypes(0, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }) // maybe better ALL_FLOATS + ->setAllowedInputTypes(1, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }) + ->setAllowedInputTypes(2, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }) + ->setAllowedOutputTypes(0, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }) + ->setAllowedOutputTypes(1, { DataType::FLOAT32, DataType::DOUBLE, DataType::HALF }); +} +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp index 9cd3285f3..c5e26c73e 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp @@ -79,7 +79,7 @@ CUSTOM_OP_IMPL(conv1d, 2, 1, false, 0, 5) { } auto inputReshaped = input ->reshape(input->ordering(), reshapeForInput); - auto outputReshaped = output ->reshape(output->ordering(), reshapeForOutput); + auto outputReshaped = output ->reshape(output->ordering(), reshapeForOutput, false); auto weightsReshaped = weights->reshape(weights->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] nd4j::ops::conv2d conv2d; @@ -216,10 +216,10 @@ CUSTOM_OP_IMPL(conv1d_bp, 3, 2, false, 0, 5) { } auto inputReshaped = input ->reshape(input->ordering(), reshapeForInput); - auto gradIReshaped = gradI ->reshape(gradI->ordering(), reshapeForInput); + auto gradIReshaped = gradI 
->reshape(gradI->ordering(), reshapeForInput, false); auto gradOReshaped = gradO ->reshape(gradO->ordering(), reshapeForGradO); - auto weightsReshaped = weights->reshape(weights->ordering(),{1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] - auto gradWReshaped = gradW ->reshape(gradW->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] + auto weightsReshaped = weights->reshape(weights->ordering(),{1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] + auto gradWReshaped = gradW ->reshape(gradW->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}, false);// [kW, iC, oC] -> [1, kW, iC, oC] nd4j::ops::conv2d_bp conv2dBP; auto status = conv2dBP.execute({&inputReshaped, &weightsReshaped, bias, &gradOReshaped}, {&gradIReshaped, &gradWReshaped, gradB}, {}, {1,kW, 1,sW, 0,pW, 1,dW, paddingMode, !isNCW}, {}); diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp index 0652f1840..7ce42756d 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp @@ -239,7 +239,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) { //----- calculation of gradO -----// if(gradB) { if(gradB->rankOf() == 2) - gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); + gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}, false)); gradO->reduceAlongDimension(reduce::Sum, *gradB, gradOaxesForDot); // sum over bS oD oH oW if(gradB != OUTPUT_VARIABLE(2)) delete gradB; diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp index 4a5bbd845..e3632f36a 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp @@ -233,7 +233,7 @@ CUSTOM_OP_IMPL(deconv2d_bp, 3, 2, false, 0, 9) { // ----- calculation of gradB ----- // if(gradB) { if(gradB->rankOf() == 2) - gradB = new NDArray(gradB->reshape(gradB->ordering(), {gradB->lengthOf()})); + gradB = new NDArray(gradB->reshape(gradB->ordering(), {gradB->lengthOf()}, false)); gradO->reduceAlongDimension(reduce::Sum, *gradB, {0, 2, 3}); // sum over bS, oH, oW if(gradB != OUTPUT_VARIABLE(2)) delete gradB; diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp index 1b832ea68..78d275c69 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp @@ -243,7 +243,7 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) { // ----- calculation of gradB ----- // if(gradB) { if(gradB->rankOf() == 2) - gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); + gradB = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}, false)); gradO->reduceAlongDimension(reduce::Sum, *gradB, {0, 2, 3, 4}); // sum over bS, oD, oH, oW if(gradB != OUTPUT_VARIABLE(2)) delete gradB; diff --git a/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp b/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp index cfc080117..22c7a9137 100644 --- a/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp @@ 
-31,22 +31,17 @@ namespace nd4j { REQUIRE_TRUE(w->isMatrix(), 0, "relu_layer: weights argument should be a 2D tensor, but got rank %i instead!", w->rankOf()); REQUIRE_TRUE(b->isVector(), 0, "relu_layer: biases argument should be a 1D tensor, but got rank %i instead!", b->rankOf()); REQUIRE_TRUE(b->lengthOf() == w->sizeAt(1), 0, "relu_layer: biases array length should match to columns of weights matrix, however got length = %i and columns = %i!", b->lengthOf(), w->sizeAt(1)); - REQUIRE_TRUE(x->sizeAt(1) == w->sizeAt(0), 0, "relu_layer: number of x columns should match to row number of weights matrix, but got x_columns = %i and weights_rows = %i!", - x->sizeAt(1), w->sizeAt(0)); - + REQUIRE_TRUE(x->sizeAt(1) == w->sizeAt(0), 0, "relu_layer: number of x columns should match to row number of weights matrix, but got x_columns = %i and weights_rows = %i!", x->sizeAt(1), w->sizeAt(0)); auto output = OUTPUT_VARIABLE(0); - //T bound = (T)0.f; - //nd4j_printf("Matrix x(%ix%i), Matrix w(%ix%i), b(1x%i)\n", x->sizeAt(0), x->sizeAt(1), w->sizeAt(0), w->sizeAt(1), b->lengthOf()); nd4j::ops::xw_plus_b op; - std::unique_ptr result(op.evaluate({x, w, b})); - REQUIRE_TRUE(Status::OK() == result->status(), 0, "relu_layer: xw_plus_b op failed on input data."); + auto status = op.execute({x, w, b}, {output}); + REQUIRE_TRUE(Status::OK() == status, 0, "relu_layer: xw_plus_b op failed on input data."); auto scalar = block.numT() > 0 ? block.getTArguments()->at(0) : 0.0; - auto xw = result->at(0); - xw->applyScalar(nd4j::scalar::RELU, scalar, *output); + output->applyScalar(nd4j::scalar::RELU, scalar, *output); return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp index 7afb24bd7..cdce8a95a 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp @@ -23,7 +23,8 @@ //#include #include -#include +#include + namespace nd4j { namespace ops { CUSTOM_OP_IMPL(crop_and_resize, 4, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp index 984672ad2..dc304e4a9 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp @@ -61,13 +61,13 @@ namespace nd4j { } auto source = inRank == 4?image->reshape(image->ordering(), {image->sizeAt(0), image->sizeAt(1), image->sizeAt(2), image->sizeAt(3)}):image->reshape(image->ordering(), {1, image->sizeAt(0), image->sizeAt(1), image->sizeAt(2)}); - auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}):output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}); + auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}, false) : output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}, false); return helpers::resizeAreaFunctor(block.launchContext(), &source, width, height, alignCorners, &target); } DECLARE_SHAPE_FN(resize_area) { - auto shapeList = SHAPELIST(); + auto shapeList = SHAPELIST(); auto in = inputShape->at(0); Nd4jLong* outputShape; @@ -90,7 +90,7 @@ namespace nd4j { } REQUIRE_TRUE(inRank == 4 || inRank == 3, 0, 
"resize_area: Source tensor should have rank 4, but %i given.", inRank); - + ALLOCATE(outputShape, block.getWorkspace(), shape::shapeInfoLength(inRank), Nd4jLong); outputShape[0] = inRank; if (inRank == 4) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp index 26ca7eec9..63da432c7 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp @@ -62,13 +62,13 @@ namespace nd4j { REQUIRE_TRUE(!halfPixelAlign || (halfPixelAlign && !alignCorners), 0, "resize_bicubic: `half_pixel_centers' should be false or true only when `align_corners' is false"); auto source = inRank == 4?image->reshape(image->ordering(), {image->sizeAt(0), image->sizeAt(1), image->sizeAt(2), image->sizeAt(3)}):image->reshape(image->ordering(), {1, image->sizeAt(0), image->sizeAt(1), image->sizeAt(2)}); - auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}):output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}); + auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}, false) : output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}, false); return helpers::resizeBicubicFunctorA(block.launchContext(), &source, width, height, alignCorners, halfPixelAlign, &target); } DECLARE_SHAPE_FN(resize_bicubic) { - auto shapeList = SHAPELIST(); + auto shapeList = SHAPELIST(); auto in = inputShape->at(0); Nd4jLong* outputShape; @@ -82,7 +82,7 @@ namespace nd4j { height = newImageSize->e(1); REQUIRE_TRUE(inRank == 4 || inRank == 3, 0, "resize_bicubic: Source tensor should have rank 4, but %i given.", inRank); - + ALLOCATE(outputShape, block.getWorkspace(), shape::shapeInfoLength(inRank), Nd4jLong); outputShape[0] = inRank; if (inRank == 4) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp index 652b78cf1..fa7054c29 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp @@ -43,7 +43,7 @@ namespace nd4j { REQUIRE_TRUE(inRank == output->rankOf(), 0, "resize_bilinear: Input and output ranks should be equals, but %i and %i occured.", inRank, output->rankOf()); auto source = inRank == 4?image->reshape(image->ordering(), {image->sizeAt(0), image->sizeAt(1), image->sizeAt(2), image->sizeAt(3)}):image->reshape(image->ordering(), {1, image->sizeAt(0), image->sizeAt(1), image->sizeAt(2)}); - auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}):output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}); + auto target = inRank == 4?output->reshape(output->ordering(), {output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3)}, false) : output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}, false); if (block.width() > 1) { auto newImageSize = INPUT_VARIABLE(1); @@ -71,7 +71,7 @@ namespace nd4j { } DECLARE_SHAPE_FN(resize_bilinear) { - auto shapeList = SHAPELIST(); + auto shapeList = SHAPELIST(); auto in = inputShape->at(0); 
Nd4jLong* outputShape; @@ -94,7 +94,7 @@ namespace nd4j { width = INT_ARG(0); height = INT_ARG(1); } - + ALLOCATE(outputShape, block.getWorkspace(), shape::shapeInfoLength(inRank), Nd4jLong); outputShape[0] = inRank; if (inRank == 4) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp index db477f569..9d6ac8a81 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp @@ -63,13 +63,13 @@ namespace nd4j { REQUIRE_TRUE(((alignCorners && height > 2) || (height > 0)) && ((alignCorners && width > 1) || (width > 0)), 0, "resize_nearest_neighbor: Wrong input or output size to resize (width = %d, height = %d)", width, height); auto source = inRank == 4?*image:image->reshape(image->ordering(), {1, image->sizeAt(0), image->sizeAt(1), image->sizeAt(2)}); - auto target = inRank == 4?*output:output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}); + auto target = inRank == 4 ? *output : output->reshape(output->ordering(), {1, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2)}, false); return helpers::resizeNeighborFunctor(block.launchContext(), inRank==4?image:&source, width, height, alignCorners, halfPixelCenter, inRank == 4 ? output : &target); } DECLARE_SHAPE_FN(resize_nearest_neighbor) { - auto shapeList = SHAPELIST(); + auto shapeList = SHAPELIST(); auto in = inputShape->at(0); auto inRank = shape::rank(in); Nd4jLong* outputShape; diff --git a/libnd4j/include/ops/declarable/generic/shape/expand_dims.cpp b/libnd4j/include/ops/declarable/generic/shape/expand_dims.cpp index bbc1f6a1c..efa723c20 100644 --- a/libnd4j/include/ops/declarable/generic/shape/expand_dims.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/expand_dims.cpp @@ -47,11 +47,12 @@ namespace nd4j { shape.insert(shape.begin() + axis, 1); - auto tmp = input->reshape(input->ordering(), shape); - output->assign(tmp); - - STORE_RESULT(output); - + if (input->ews() == 1 && output->ews() == 1 && input->ordering() == output->ordering()) { + output->dataBuffer()->copyBufferFrom(*input->dataBuffer().get(), output->lengthOf() * DataTypeUtils::sizeOfElement(output->dataType()), 0, input->bufferOffset()); + } else { + auto tmp = input->reshape(input->ordering(), shape); + output->assign(tmp); + } return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/shape/permute.cpp b/libnd4j/include/ops/declarable/generic/shape/permute.cpp index 7e5efaa85..63c20e888 100644 --- a/libnd4j/include/ops/declarable/generic/shape/permute.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/permute.cpp @@ -15,7 +15,8 @@ ******************************************************************************/ // -// Created by raver119 on 29/10/17. +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -29,80 +30,52 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// // here iArgs is int vector of ordered set of dimensions to be permuted - CUSTOM_OP_IMPL(permute, 1, 1, true, 0, -2) { - auto x = INPUT_VARIABLE(0); +CUSTOM_OP_IMPL(permute, 1, 1, true, 0, -2) { - bool replace = false; + auto x = INPUT_VARIABLE(0); + auto z = OUTPUT_VARIABLE(0); - auto origArgs = block.width() > 1 ? 
INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); - std::vector arguments({}); - if(origArgs.size() > 0){ - for (int e = 0; e < origArgs.size(); e++) { - int ax = origArgs[e]; - if (ax < 0) - ax += x->rankOf(); - - arguments.emplace_back(ax); - } - - replace = true; - } else { - for (int e = x->rankOf() - 1; e >= 0; e--) - arguments.emplace_back(e); - } - - // 0D edge case - if (x->rankOf() == 0) { - REQUIRE_TRUE(arguments.size() == 1, 0, "Permute: only one axis is allowed for scalar"); - auto output = OUTPUT_VARIABLE(0); - if (!block.isInplace()) - output->assign(x); - - return Status::OK(); - } - - if(block.isInplace()) { // in-place - x->permutei(arguments); - STORE_RESULT(x); - } else { - auto output = OUTPUT_VARIABLE(0); - auto result = x->permute(arguments); - output->assign(result); - STORE_RESULT(output); - } - - return Status::OK(); - } - - DECLARE_TYPES(permute) { - getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) - ->setAllowedInputTypes(1, {ALL_INTS}) - ->setSameMode(true); - } - - DECLARE_SHAPE_FN(permute) { - auto shapeList = SHAPELIST(); - auto arguments = block.width() > 1 ? INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); - - if (shape::rank(inputShape->at(0)) == 0) { - shapeList->push_back(ConstantShapeHelper::getInstance()->scalarShapeInfo(ArrayOptions::dataType(inputShape->at(0)))); - } else if (inputShape->size() == 1 && !arguments.empty()) { - shapeList->push_back(ShapeUtils::evalPermShapeInfo(arguments.data(), arguments.size(), *INPUT_VARIABLE(0), block.workspace())); - } else { - if(arguments.size() == 0){ - //Reverse dimensions - int rank = shape::rank(inputShape->at(0)); - for (int e = rank - 1; e >= 0; e--) - arguments.emplace_back(e); - } - - shapeList->push_back(ShapeUtils::evalPermShapeInfo(arguments.data(), arguments.size(), *INPUT_VARIABLE(0), block.workspace())); - } - - return shapeList; - } + if (x->isEmpty()) { + REQUIRE_TRUE(z->isEmpty(), 0, "PERMUTE OP: when input is empty, output must also be empty"); + return Status::OK(); //No op } + + if (block.width() == 1 && block.getIArguments()->size() == 0) { + z->assign(x->transpose()); + return Status::OK(); + } + + std::vector permutationVector = block.width() > 1 ? INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); + + z->assign(x->permute(permutationVector)); + + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +DECLARE_TYPES(permute) { + getOpDescriptor() + ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(1, {ALL_INTS}) + ->setSameMode(true); +} + +////////////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(permute) { + + auto x = INPUT_VARIABLE(0); + + if (block.width() == 1 && block.getIArguments()->size() == 0) + return SHAPELIST(ShapeUtils::evalTranspShapeInfo(*x, block.workspace(), true)); + + std::vector permutationVector = block.width() > 1 ? 
INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); + + auto outputShapeInfo = ShapeUtils::evalPermShapeInfo(permutationVector.data(), x->rankOf(), *x, block.workspace(), true); + + return SHAPELIST(outputShapeInfo); +} + +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp index 1d76138f2..4a06455eb 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp @@ -24,254 +24,240 @@ #include namespace nd4j { - namespace ops { - ////////////////////////////////////////////////////////////////////////// - // here iArgs is a vector with (optional) negative of order as first element: - // ({-order, dim1, dim2, dim3, ...}) - CUSTOM_OP_IMPL(reshape, 1, 1, true, 0, -2) { - auto x = INPUT_VARIABLE(0); +namespace ops { - if (block.width() == 1) { - auto arguments = block.getIArguments(); - int argsSize = arguments->size(); - - //Special case: empty.reshape() -> return empty - if (x->isEmpty()) { - REQUIRE_TRUE(OUTPUT_VARIABLE(0)->isEmpty(), 0, "Reshape: when input is empty, output must also be empty"); - return ND4J_STATUS_OK; //No op +////////////////////////////////////////////////////////////////////////// +// here iArgs is a vector with (optional) negative of order as first element: +// ({-order, dim1, dim2, dim3, ...}) +CUSTOM_OP_IMPL(reshape, 1, 1, false, 0, -2) { + + auto x = INPUT_VARIABLE(0); + auto z = OUTPUT_VARIABLE(0); + + //Special case: empty.reshape() -> return empty + if (x->isEmpty()) { + REQUIRE_TRUE(z->isEmpty(), 0, "Reshape: when input is empty, output must also be empty"); + return Status::OK(); //No op + } + + if (block.width() == 1) { + + auto arguments = block.getIArguments(); + int argsSize = arguments->size(); + + + + int e = 1; + char order = (char) -(*arguments)[0]; + if (order != 'c' && order != 'f') { + order = 'c'; //x->ordering(); + e = 0; + } + + REQUIRE_TRUE(argsSize - e >= 1, 0, "Reshape arguments should have at least 1 dimension"); + + std::vector shapeNew; + int e2 = e; + for (; e < (int) arguments->size(); e++) { + if (arguments->at(e) == -1){ + Nd4jLong shapeLength = 1; + for(; e2 < e; e2++){ + shapeLength *= arguments->at(e2); } - - int e = 1; - char order = (char) -(*arguments)[0]; - if (order != 'c' && order != 'f') { - order = 'c'; //x->ordering(); - e = 0; - } - - REQUIRE_TRUE(argsSize - e >= 1, 0, "Reshape arguments should have at least 1 dimension"); - - std::vector shapeNew; - int e2 = e; - for (; e < (int) arguments->size(); e++) { - if (arguments->at(e) == -1){ - Nd4jLong shapeLength = 1; - for(; e2 < e; e2++){ - shapeLength *= arguments->at(e2); - } - for(e2 = e + 1; e2 < arguments->size(); e2++){ - shapeLength *= arguments->at(e2); - } - Nd4jLong realShape = x->lengthOf() / shapeLength; - shapeNew.push_back(realShape); - } - else{ - shapeNew.push_back(arguments->at(e)); - } - - } - - auto len = shape::prodLong(shapeNew.data(), shapeNew.size()); - REQUIRE_TRUE(len == x->lengthOf(), 0, "Reshape: lengths before and after reshape should match, but got %i vs %i", x->lengthOf(), len); - - if (Environment::getInstance()->isDebugAndVerbose()) { - nd4j_printv("Reshape: new shape", shapeNew); - } - - if (block.isInplace()) { - if (x->reshapei(order, shapeNew)) { - STORE_RESULT(*x); - return ND4J_STATUS_OK; - } - } else { - auto ret = OUTPUT_VARIABLE(0); - auto xr = x->reshape(order, shapeNew); - ret->assign(xr); - STORE_RESULT(*ret); - - return Status::OK(); - } - } 
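// The -1 handling in these reshape branches infers the single unknown
// dimension from the array length: shapeLength is the product of all
// explicitly given dims, and the missing one becomes x->lengthOf() / shapeLength.
// Worked example with illustrative numbers only:
//
//   // x holds 24 elements and the requested shape is {2, -1, 4}
//   // shapeLength = 2 * 4 = 8, so the inferred dimension is 24 / 8 = 3
//   auto xr = x->reshape(order, {2, 3, 4});   // same result as requesting {2, -1, 4}
//   z->assign(xr);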
else if (block.width() == 2) { - auto s = INPUT_VARIABLE(1); - - //Special case: empty.reshape(-1) -> return empty - if (x->isEmpty()) { - //REQUIRE_TRUE(s->lengthOf() == 1 && s->e(0) == -1, 0, "Reshape: when input is empty, shape must be [-1]"); - REQUIRE_TRUE(OUTPUT_VARIABLE(0)->isEmpty(), 0, "Reshape: when input is empty, output must also be empty"); - return Status::OK(); //No op - } - - char order = 'c'; - if (block.numI() > 0) - order = (char) -INT_ARG(0); - - std::vector shapeNew(s->lengthOf()); - - for (int e = 0; e < (int) s->lengthOf(); e++) { - auto dim = s->e(e); - if (dim == -1){ - Nd4jLong shapeLength = 1; - for(int e2 = 0; e2 < e; e2++){ - shapeLength *= s->e(e2); - } - for(int e2 = e + 1; e2 < (int) s->lengthOf(); e2++){ - REQUIRE_TRUE(s->e(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); - shapeLength *= s->e(e2); - } - Nd4jLong realShape = x->lengthOf() / shapeLength; - shapeNew[e] = realShape; - } - else{ - shapeNew[e] = dim; - } - } - - if (Environment::getInstance()->isDebugAndVerbose()) { - nd4j_printv("Reshape: new shape", shapeNew); - } - - if (block.isInplace()) { - if (x->reshapei(order, shapeNew)) { - STORE_RESULT(*x); - return Status::OK(); - } - } else { - auto ret = OUTPUT_VARIABLE(0); - if (s->isEmpty()) { - // just a scalar - ret->assign(x); - } else { - auto xr = x->reshape(order, shapeNew); - ret->assign(xr); - } - - return Status::OK(); + for(e2 = e + 1; e2 < arguments->size(); e2++){ + shapeLength *= arguments->at(e2); } + Nd4jLong realShape = x->lengthOf() / shapeLength; + shapeNew.push_back(realShape); + } + else{ + shapeNew.push_back(arguments->at(e)); } - return ND4J_STATUS_BAD_INPUT; } + auto len = shape::prodLong(shapeNew.data(), shapeNew.size()); + REQUIRE_TRUE(len == x->lengthOf(), 0, "Reshape: lengths before and after reshape should match, but got %i vs %i", x->lengthOf(), len); - DECLARE_TYPES(reshape) { - getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) - ->setAllowedInputTypes(1, {ALL_INTS}) - ->setSameMode(true); + if (Environment::getInstance()->isDebugAndVerbose()) { + nd4j_printv("Reshape: new shape", shapeNew); } - DECLARE_SHAPE_FN(reshape) { - auto inp = inputShape->at(0); + auto xr = x->reshape(order, shapeNew); + z->assign(xr); + STORE_RESULT(*z); - // we can launch op using Int arguments - if (inputShape->size() == 1) { - REQUIRE_TRUE(block.numI() > 0, 0, "Reshape: new shape should be provided as NDArray or int arguments, but nothing was defined"); - std::vector *arguments = block.getIArguments(); + return Status::OK(); - int e = 1; - char order = (char) -(*arguments)[0]; - if (order != 'c' && order != 'f') { - order = shape::order(inp); - e = 0; + } else if (block.width() == 2) { + + auto s = INPUT_VARIABLE(1); + + char order = 'c'; + if (block.numI() > 0) + order = (char) -INT_ARG(0); + + std::vector shapeNew(s->lengthOf()); + + for (int e = 0; e < (int) s->lengthOf(); e++) { + auto dim = s->e(e); + if (dim == -1){ + Nd4jLong shapeLength = 1; + for(int e2 = 0; e2 < e; e2++){ + shapeLength *= s->e(e2); } - - std::vector shapeNew; - - int e2 = e; - for (; e < (int) arguments->size(); e++) { - if ((int) arguments->at(e) == -1){ - - Nd4jLong shapeLength = 1; - for(; e2 < e; e2 ++){ - shapeLength *= arguments->at(e2); - } - for(e2 = e + 1; e2 < arguments->size(); e2++){ - REQUIRE_TRUE(arguments->at(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); - shapeLength *= arguments->at(e2); - } - - if(shapeLength == 0){ - //Edge case for empty: - shapeNew.push_back(0); - } else { - 
//Standard case - Nd4jLong realShape = shape::length(inp) / shapeLength; - shapeNew.push_back(realShape); - } - } - else{ - shapeNew.push_back(arguments->at(e)); - } + for(int e2 = e + 1; e2 < (int) s->lengthOf(); e2++){ + REQUIRE_TRUE(s->e(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); + shapeLength *= s->e(e2); } - - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(inp), order, shapeNew))); - } else { - // or, with second input "as shape" - auto x = INPUT_VARIABLE(0); - auto y = INPUT_VARIABLE(1); - - // special case here - if (y->isEmpty()) { - REQUIRE_TRUE(x->lengthOf() == 1, 0, "Reshape: new length doesn't match existing array"); - return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(ArrayOptions::dataType(inp))); - } - //Special case: empty.reshape(-1) -> return empty - if (x->isEmpty()) { - //REQUIRE_TRUE(y->lengthOf() == 1 && y->e(0) == -1, 0, "Reshape: when input is empty, shape must be [-1]"); - auto shapeOf = y->getBufferAsVector(); - Nd4jLong prod = 1; - bool hasNegs = false; - for (auto v:shapeOf) { - if (v < 0) { - hasNegs = true; - v = 0; - } - - prod *= v; - } - - REQUIRE_TRUE(prod == 0, 0, "Reshape: in case of empty arrays reshape must return empty array as well"); - - // if there are -1s - we turn them into zeros - if (hasNegs) { - for (int e = 0; e < shapeOf.size(); e++) - if (shapeOf[e] < 0) - shapeOf[e] = 0; - } - - auto newShape = ShapeBuilders::createShapeInfo(ArrayOptions::dataType(inp), shape::order(inp), y->lengthOf(), shapeOf.data()); - return SHAPELIST(CONSTANT(newShape)); - } - - std::vector shapeNew(y->lengthOf()); - - for (int e = 0; e < (int) y->lengthOf(); e++) { - auto dim = y->e(e); - if (dim == -1){ - Nd4jLong shapeLength = 1; - for(int e2 = 0; e2 < e; e2++){ - shapeLength *= y->e(e2); - } - for(int e2 = e + 1; e2 < (int)y->lengthOf(); e2++){ - REQUIRE_TRUE(y->e(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); - shapeLength *= y->e(e2); - } - - if(shapeLength == 0){ - //Edge case for empty: - shapeNew[e] = 0; - } else { - Nd4jLong realShape = shape::length(inp) / shapeLength; - shapeNew[e] = realShape; - } - }else { - shapeNew[e] = dim; - } - } - - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inp), 'c', shapeNew)); + Nd4jLong realShape = x->lengthOf() / shapeLength; + shapeNew[e] = realShape; + } + else{ + shapeNew[e] = dim; } } + + if (Environment::getInstance()->isDebugAndVerbose()) { + nd4j_printv("Reshape: new shape", shapeNew); + } + + if (s->isEmpty()) { + // just a scalar + z->assign(x); + } else { + auto xr = x->reshape(order, shapeNew); + z->assign(xr); + } + + return Status::OK(); + } + + return ND4J_STATUS_BAD_INPUT; +} + + +DECLARE_TYPES(reshape) { + getOpDescriptor() + ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(1, {ALL_INTS}) + ->setSameMode(true); +} + +DECLARE_SHAPE_FN(reshape) { + auto inp = inputShape->at(0); + + // we can launch op using Int arguments + if (inputShape->size() == 1) { + REQUIRE_TRUE(block.numI() > 0, 0, "Reshape: new shape should be provided as NDArray or int arguments, but nothing was defined"); + std::vector *arguments = block.getIArguments(); + + int e = 1; + char order = (char) -(*arguments)[0]; + if (order != 'c' && order != 'f') { + order = shape::order(inp); + e = 0; + } + + std::vector shapeNew; + + int e2 = e; + for (; e < (int) arguments->size(); e++) { + if ((int) arguments->at(e) == -1){ + + Nd4jLong shapeLength = 
1; + for(; e2 < e; e2 ++){ + shapeLength *= arguments->at(e2); + } + for(e2 = e + 1; e2 < arguments->size(); e2++){ + REQUIRE_TRUE(arguments->at(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); + shapeLength *= arguments->at(e2); + } + + if(shapeLength == 0){ + //Edge case for empty: + shapeNew.push_back(0); + } else { + //Standard case + Nd4jLong realShape = shape::length(inp) / shapeLength; + shapeNew.push_back(realShape); + } + } + else{ + shapeNew.push_back(arguments->at(e)); + } + } + + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(inp), order, shapeNew))); + } else { + // or, with second input "as shape" + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + + // special case here + if (y->isEmpty()) { + REQUIRE_TRUE(x->lengthOf() == 1, 0, "Reshape: new length doesn't match existing array"); + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(ArrayOptions::dataType(inp))); + } + //Special case: empty.reshape(-1) -> return empty + if (x->isEmpty()) { + //REQUIRE_TRUE(y->lengthOf() == 1 && y->e(0) == -1, 0, "Reshape: when input is empty, shape must be [-1]"); + auto shapeOf = y->getBufferAsVector(); + Nd4jLong prod = 1; + bool hasNegs = false; + for (auto v:shapeOf) { + if (v < 0) { + hasNegs = true; + v = 0; + } + + prod *= v; + } + + REQUIRE_TRUE(prod == 0, 0, "Reshape: in case of empty arrays reshape must return empty array as well"); + + // if there are -1s - we turn them into zeros + if (hasNegs) { + for (int e = 0; e < shapeOf.size(); e++) + if (shapeOf[e] < 0) + shapeOf[e] = 0; + } + + auto newShape = ShapeBuilders::createShapeInfo(ArrayOptions::dataType(inp), shape::order(inp), y->lengthOf(), shapeOf.data()); + return SHAPELIST(CONSTANT(newShape)); + } + + std::vector shapeNew(y->lengthOf()); + + for (int e = 0; e < (int) y->lengthOf(); e++) { + auto dim = y->e(e); + if (dim == -1){ + Nd4jLong shapeLength = 1; + for(int e2 = 0; e2 < e; e2++){ + shapeLength *= y->e(e2); + } + for(int e2 = e + 1; e2 < (int)y->lengthOf(); e2++){ + REQUIRE_TRUE(y->e(e2) != -1, 0, "Reshape : Only one unknown dimension (-1) is allowed."); + shapeLength *= y->e(e2); + } + + if(shapeLength == 0){ + //Edge case for empty: + shapeNew[e] = 0; + } else { + Nd4jLong realShape = shape::length(inp) / shapeLength; + shapeNew[e] = realShape; + } + }else { + shapeNew[e] = dim; + } + } + + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inp), 'c', shapeNew)); + } +} +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp index 92dc2a146..3035f104b 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp @@ -28,35 +28,27 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// - CUSTOM_OP_IMPL(reshapeas, 2, 1, true, 0, 0) { - + CUSTOM_OP_IMPL(reshapeas, 2, 1, false, 0, 0) { + auto x = INPUT_VARIABLE(0); auto y = INPUT_VARIABLE(1); auto z = OUTPUT_VARIABLE(0); - std::vector shapeNew(y->shapeOf(), y->shapeOf() + y->rankOf()); - char order = y->ordering(); - if (x->reshapei(order, shapeNew)) { - *z = *x; - STORE_RESULT(*z); + if (x->reshapei(y->ordering(), y->getShapeAsVector())) { + + z->assign(x); return Status::OK(); } return ND4J_STATUS_BAD_INPUT; } DECLARE_SYN(reshape_as, reshapeas); - - DECLARE_SHAPE_FN(reshapeas) { - - auto 
inputShapeInfo = inputShape->at(1); - int shapeInfoLength = inputShapeInfo[0]*2 + 4; - Nd4jLong* outputShapeInfo(nullptr); - COPY_SHAPE(inputShapeInfo, outputShapeInfo); - - return SHAPELIST(CONSTANT(outputShapeInfo)); -} + DECLARE_SHAPE_FN(reshapeas) { + + return SHAPELIST(ShapeBuilders::copyShapeInfo(INPUT_VARIABLE(1)->getShapeInfo(), false, block.workspace())); + } DECLARE_TYPES(reshapeas) { getOpDescriptor() diff --git a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp index 3b158ff3a..22e229643 100644 --- a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp @@ -25,7 +25,7 @@ namespace nd4j { namespace ops { - CUSTOM_OP_IMPL(squeeze, 1, 1, true, 0, -2) { + CUSTOM_OP_IMPL(squeeze, 1, 1, false, 0, -2) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); @@ -36,14 +36,14 @@ namespace nd4j { int _a = INT_ARG(e); if (_a < 0) _a += input->rankOf(); - + axis.emplace_back(_a); } else if (block.width() > 1) { auto a = INPUT_VARIABLE(1); for (Nd4jLong e = 0; e < a->lengthOf(); e++) { int _a = a->e(e); - + if (_a < 0) _a += input->rankOf(); @@ -71,10 +71,14 @@ namespace nd4j { } if (block.isInplace()) { - output->reshapei(input->ordering(), shape); + output->reshapei(input->ordering(), shape, false); } else { - auto tmp = input->reshape(input->ordering(), shape); - output->assign(tmp); + if (input->ews() == 1 && output->ews() == 1 && input->ordering() == output->ordering()) { + output->dataBuffer()->copyBufferFrom(*input->dataBuffer().get(), output->lengthOf() * DataTypeUtils::sizeOfElement(output->dataType()), 0, input->bufferOffset()); + } else { + auto tmp = input->reshape(input->ordering(), shape); + output->assign(tmp); + } } return Status::OK(); @@ -106,20 +110,20 @@ namespace nd4j { int _a = INT_ARG(e); if (_a < 0) _a += rank; - + axis.emplace_back(_a); } else if (block.width() > 1) { auto a = INPUT_VARIABLE(1); for (int e = 0; e < a->lengthOf(); e++) { int _a = a->e(e); - + if (_a < 0) _a += rank; axis.emplace_back(_a); } - + } auto order = shape::order(in); diff --git a/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp b/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp index cc88fb46c..d71fbddd5 100644 --- a/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp @@ -25,7 +25,7 @@ namespace nd4j { namespace ops { - CUSTOM_OP_IMPL(tile_to_shape, 1, 1, true, 0, -1) { + CUSTOM_OP_IMPL(tile_to_shape, 1, 1, false, 0, -1) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp index 5d01b8bbf..4ec586370 100644 --- a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp @@ -15,7 +15,8 @@ ******************************************************************************/ // -// Created by raver119 on 29/10/17. 
+// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -25,113 +26,52 @@ #include namespace nd4j { -namespace ops { +namespace ops { - ////////////////////////////////////////////////////////////////////////// - CUSTOM_OP_IMPL(transpose, 1, 1, true, 0, 0) { - auto x = INPUT_VARIABLE(0); - if (block.width() == 1) { - if (block.isInplace()) { - x->transposei(); - STORE_RESULT(*x); - } else { - auto output = OUTPUT_VARIABLE(0); - auto t = x->transpose(); - output->assign(t); - STORE_RESULT(*output); - } - } else { - // this is tf-mode transpose, that's nd4j permute - bool replace = false; - std::vector arguments(*block.getIArguments()); +////////////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(transpose, 1, 1, false, 0, 0) { - auto w = block.width(); - auto a = arguments.size(); + auto x = INPUT_VARIABLE(0); + auto z = OUTPUT_VARIABLE(0); - if (w == 2 && a == 0) { - auto axis = INPUT_VARIABLE(1); - for (int e = 0; e < axis->lengthOf(); e++) { - auto ax = axis->e(e); - if (ax < 0) - ax += x->rankOf(); + //Special case: empty.reshape() -> return empty + if (x->isEmpty()) { + REQUIRE_TRUE(z->isEmpty(), 0, "TRANSPOSE OP: when input is empty, output must also be empty"); + return Status::OK(); //No op + } - arguments.emplace_back(ax); - } - - replace = true; - } else if (a == 0) { - for (int e = x->rankOf() - 1; e >= 0; e--) - arguments.emplace_back(e); - } - - // 0D edge case - if (x->rankOf() == 0) { - REQUIRE_TRUE(arguments.size() == 1, 0, "Permute: only one axis is allowed for scalar"); - auto output = OUTPUT_VARIABLE(0); - if (!block.isInplace()) - output->assign(x); - - return Status::OK(); - } - - if(block.isInplace()) { // in-place - x->permutei(arguments); - STORE_RESULT(x); - } else { - auto input = x->permute(arguments); - - auto output = OUTPUT_VARIABLE(0); - output->assign(input); - } - } + if (block.width() == 1 && block.getIArguments()->size() == 0) { + z->assign(x->transpose()); return Status::OK(); } - DECLARE_TYPES(transpose) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setSameMode(true); - } + std::vector permutationVector = block.width() > 1 ? INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); - DECLARE_SHAPE_FN(transpose) { - if (block.width() == 1) { - auto outputShapeInfo = ShapeUtils::evalTranspShapeInfo(*INPUT_VARIABLE(0), block.workspace()); - return SHAPELIST(outputShapeInfo); - } else { - // this is basically permute mode - auto shapeList = SHAPELIST(); - auto arguments = block.getIArguments(); - if (shape::rank(inputShape->at(0)) == 0) { - Nd4jLong *newshape; - ALLOCATE(newshape, block.getWorkspace(), shape::shapeInfoLength(inputShape->at(0)), Nd4jLong); - newshape[0] = 0; - newshape[1] = 0; - newshape[2] = 1; - newshape[3] = 99; - ArrayOptions::copyDataType(newshape, inputShape->at(0)); - shapeList->push_back(newshape); - } else if (arguments->size() > 0 || inputShape->size() > 1) { - auto axis = arguments->size() > 0 ? 
*arguments : (INPUT_VARIABLE(1))->template asVectorT(); - auto outputShapeInfo = ShapeUtils::evalPermShapeInfo(axis.data(), axis.size(), *INPUT_VARIABLE(0), block.workspace()); - shapeList->push_back(outputShapeInfo); - } else if (inputShape->size() == 2) { - // dead end - auto axis = INPUT_VARIABLE(1); - auto axisV = axis->template asVectorT(); - auto newshape = ShapeUtils::evalPermShapeInfo(axisV.data(), axisV.size(), *INPUT_VARIABLE(0), block.workspace()); - shapeList->push_back(newshape); - } else { - int rank = shape::rank(inputShape->at(0)); - for (int e = rank - 1; e >= 0; e--) - arguments->emplace_back(e); + z->assign(x->permute(permutationVector)); - auto outputShapeInfo = ShapeUtils::evalPermShapeInfo(arguments->data(), arguments->size(), *INPUT_VARIABLE(0), block.workspace()); - shapeList->push_back(outputShapeInfo); - } + return Status::OK(); +} + +DECLARE_TYPES(transpose) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setSameMode(true); +} + +DECLARE_SHAPE_FN(transpose) { + + auto x = INPUT_VARIABLE(0); + + if (block.width() == 1 && block.getIArguments()->size() == 0) + return SHAPELIST(ShapeUtils::evalTranspShapeInfo(*x, block.workspace(), true)); + + std::vector permutationVector = block.width() > 1 ? INPUT_VARIABLE(1)->asVectorT() : *block.getIArguments(); + + auto outputShapeInfo = ShapeUtils::evalPermShapeInfo(permutationVector.data(), x->rankOf(), *x, block.workspace(), true); + + return SHAPELIST(outputShapeInfo); +} - return shapeList; - } - } } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp index 2003eef3f..faa59fa6c 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp @@ -42,8 +42,8 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { std::vector arrsToDelete; int index = 0; bool allOfSameType = true; - auto theFirstRank = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0; - auto theFirstDatatype = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType(); + auto rankOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0; + auto typeOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType(); for(int i = 0; i < numOfInArrs; ++i) { auto input = INPUT_VARIABLE(i); @@ -51,10 +51,10 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { // TODO: follow two lines are in accordance to current tf.concat spec. 
Commented for compatibility with legacy // REQUIRE_TRUE(currentRank > 0, 0, "Rank of input variable %i must be greater 0, but is %lld instead.", i, currentRank); -// REQUIRE_TRUE(theFirstRank == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, theFirstRank); +// REQUIRE_TRUE(rankOfFirstArr == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, rankOfFirstArr); if(!input->isEmpty()) { - allOfSameType &= (theFirstDatatype == input->dataType()); + allOfSameType &= (typeOfFirstArr == input->dataType()); if(input->rankOf() == 0) { auto vec = new NDArray('c', {1}, input->dataType(), block.launchContext()); diff --git a/libnd4j/include/ops/declarable/headers/blas.h b/libnd4j/include/ops/declarable/headers/blas.h index 08f8f79a7..d94d365dd 100644 --- a/libnd4j/include/ops/declarable/headers/blas.h +++ b/libnd4j/include/ops/declarable/headers/blas.h @@ -57,7 +57,8 @@ namespace nd4j { * IArgs[1]... axes values for second array */ #if NOT_EXCLUDED(OP_tensormmul) - DECLARE_CUSTOM_OP(tensormmul, 2, 1, false, 0, -1); + DECLARE_CUSTOM_OP(tensormmul, 2, 1, false, 0, -1); + DECLARE_CUSTOM_OP(tensormmul_bp, 3, 2, false, 0, -1); #endif /** diff --git a/libnd4j/include/ops/declarable/headers/shape.h b/libnd4j/include/ops/declarable/headers/shape.h index 3d47c24bf..c21cdb84d 100644 --- a/libnd4j/include/ops/declarable/headers/shape.h +++ b/libnd4j/include/ops/declarable/headers/shape.h @@ -26,15 +26,15 @@ namespace nd4j { namespace ops { #if NOT_EXCLUDED(OP_permute) - DECLARE_CUSTOM_OP(permute, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(permute, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_reshapeas) - DECLARE_CUSTOM_OP(reshapeas, 2, 1, true, 0, 0); + DECLARE_CUSTOM_OP(reshapeas, 2, 1, false, 0, 0); #endif #if NOT_EXCLUDED(OP_transpose) - DECLARE_CUSTOM_OP(transpose, 1, 1, true, 0, 0); + DECLARE_CUSTOM_OP(transpose, 1, 1, false, 0, 0); #endif #if NOT_EXCLUDED(OP_shape_of) @@ -46,7 +46,7 @@ namespace nd4j { #endif #if NOT_EXCLUDED(OP_squeeze) - DECLARE_CUSTOM_OP(squeeze, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(squeeze, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_expand_dims) @@ -54,11 +54,11 @@ namespace nd4j { #endif #if NOT_EXCLUDED(OP_reshape) - DECLARE_CUSTOM_OP(reshape, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(reshape, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_size_at) - DECLARE_CUSTOM_OP(size_at, 1, 1, true, 0, 1); + DECLARE_CUSTOM_OP(size_at, 1, 1, false, 0, 1); #endif /** @@ -80,8 +80,8 @@ namespace nd4j { * @tparam T */ #if NOT_EXCLUDED(OP_tile_to_shape) - DECLARE_CUSTOM_OP(tile_to_shape, 1, 1, true, 0, -1); - DECLARE_CUSTOM_OP(tile_to_shape_bp, 2, 1, true, 0, -1); + DECLARE_CUSTOM_OP(tile_to_shape, 1, 1, false, 0, -1); + DECLARE_CUSTOM_OP(tile_to_shape_bp, 2, 1, false, 0, -1); #endif /** diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp index f8704d7b0..baf19de10 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp @@ -153,7 +153,7 @@ namespace helpers { auto rowSize = sizeof(T) * colCount; auto func = PRAGMA_THREADS_FOR { - for (auto n = start; n < stop; n += increment) { + for (auto n = start; n < stop; n++) { int s = rowP->e(n); int end = rowP->e(n + 1); int shift = n * colCount; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp 
b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index 9a11baf37..2e63c9d5e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -191,6 +191,70 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr BUILD_SINGLE_SELECTOR(xType, logSoftMaxForVector_, (input.getBuffer(), input.getShapeInfo(), output.buffer(), output.shapeInfo()), FLOAT_TYPES); } + template + void softmax_loop(T *input, T *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen); + + template <> + FORCEINLINE void softmax_loop(float *input, float *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + auto inBuff = input + offsets[i]; + auto outBuff = output + offsets[i]; + + float max = -DataTypeUtils::max(); + float sum = 0.f; + + #pragma omp simd reduction(max:max) + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); + + #pragma omp simd reduction(+:sum) + for (uint j = 0; j < tadLen; ++j) { + float temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + #pragma omp simd + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; + } + }; + + samediff::Threads::parallel_tad(func,0, numOfSubArrs); + } + + + template + FORCEINLINE void softmax_loop(T *input, T *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + auto inBuff = input + offsets[i]; + auto outBuff = output + offsets[i]; + + T max = -DataTypeUtils::max(); + T sum(0.f); + + #pragma omp simd reduction(maxT:max) + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); + + #pragma omp simd reduction(sumT:sum) + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + #pragma omp simd + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; + } + }; + + samediff::Threads::parallel_tad(func,0, numOfSubArrs); + } + ////////////////////////////////////////////////////////////////////////// template static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { @@ -213,31 +277,10 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra const uint tadLen = shape::length(tadShapeInfo); if(shape::elementWiseStride(tadShapeInfo) == 1){ + T *inBuff = input.bufferAsT(); + T *outBuff = output.bufferAsT(); - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - - T *inBuff = input.bufferAsT() + tadOffsets[i]; - T *outBuff = output.bufferAsT() + tadOffsets[i]; - - T max = -DataTypeUtils::max(); - T sum = 0; - - for (uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); - - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; - } - }; - - samediff::Threads::parallel_tad(func,0, numOfSubArrs); + softmax_loop(inBuff, outBuff, tadOffsets, numOfSubArrs, tadLen); } else { @@ -248,7 +291,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra shape::calcOffsets(tadShapeInfo, offsets); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inBuff = 
input.bufferAsT() + tadOffsets[i]; auto outBuff = output.bufferAsT() + tadOffsets[i]; @@ -298,7 +341,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { // FIXME: double! double x = input.e(i); if (x < 0.0) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index e5242a5be..39e51f6d7 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -374,6 +374,28 @@ namespace nd4j { template static void addBias_(const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { + /* + if (input.rankOf() == 2 && bias.rankOf() == 1 && input.sizeAt(1) == bias.sizeAt(0) && input.ordering() == 'c') { + int rows = input.sizeAt(0); + int biasLen = bias.lengthOf(); + + auto inB = input.bufferAsT(); + auto bB = bias.bufferAsT(); + auto outB = output.bufferAsT(); + + for (int e = 0; e < rows; e++) { + auto row = inB + (e * biasLen); + auto out = outB + (e * biasLen); + + for (int t = 0; t < biasLen; t++) { + out[t] = row[t] + bB[t]; + } + } + + return; + } + */ + Nd4jLong* x_shapeInfo = input.getShapeInfo(); Nd4jLong* z_shapeInfo = output.getShapeInfo(); X* x = input.bufferAsT(); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp index 978c037fa..5a22b02eb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp @@ -67,7 +67,7 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr const Nd4jLong zDimCstride = output->stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T *xTad = x + packX.platformOffsets()[i]; T *zTad = z + packZ.platformOffsets()[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp index d4b0de398..594280ebe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp @@ -66,7 +66,7 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA const Nd4jLong zDimCstride = output->stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T *xTad = x + packX.platformOffsets()[i]; T *zTad = z + packZ.platformOffsets()[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp index b408da720..c63dc3c1c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp @@ -94,7 +94,7 @@ void bgemm_(const std::vector& vA, const std::vector& vB, st int vaSize = vA.size(); auto func = PRAGMA_THREADS_FOR { - for (auto p = start; p < stop; p += increment) { + for (auto p = start; p < stop; p++) { auto A = reinterpret_cast(vA.at(p)->buffer()); auto B = reinterpret_cast(vB.at(p)->buffer()); auto C = reinterpret_cast(vC.at(p)->buffer()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp 
b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index ad2e29a97..aa9624600 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -141,7 +141,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, input->getShapeInfo(), coords); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp index 83cc966ba..5e80d12fb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp @@ -117,7 +117,7 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con int xLen = x.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp index 5aad38da8..26f82bdd9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp @@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp auto func = PRAGMA_THREADS_FOR { T *col, *im; - for (uint b = start; b < stop; b += increment) { + for (uint b = start; b < stop; b++) { T *im0 = imBuff + b * imStride0; T *col4 = colBuff + b * colStride0; for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp new file mode 100644 index 000000000..3177cca34 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_0, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp new file mode 100644 index 000000000..cd9c00dc5 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_1, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp new file mode 100644 index 000000000..3b126d288 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_2, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp new file mode 100644 index 000000000..cca97a1ac --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_3, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp new file mode 100644 index 000000000..568492c08 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_4, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp new file mode 100644 index 000000000..1491c9e1d --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_5, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp new file mode 100644 index 000000000..8517a39e9 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
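crop_and_resize_0.cpp through crop_and_resize_9.cpp are ten near-identical translation units: each pulls in crop_and_resize.hpp and explicitly instantiates cropAndResizeFunctor_ for one slice of the numeric-type list (NUMERIC_TYPES_0 … NUMERIC_TYPES_9). Splitting a large cross-product of explicit instantiations across files keeps the compile time and memory footprint of any single object file bounded. A generic sketch of the same pattern follows, with hypothetical names (work.hpp, work_0.cpp, work_1.cpp); it illustrates the file layout only, not the libnd4j macros.

// work.hpp -- template definition shared by every compilation unit
#include <vector>

template <typename T>
void work(const std::vector<T>& in, std::vector<T>& out) {
    out.assign(in.begin(), in.end());   // placeholder body
}

// work_0.cpp -- explicitly instantiates the "first bucket" of types
// (would #include "work.hpp")
template void work<float>(const std::vector<float>&, std::vector<float>&);
template void work<double>(const std::vector<double>&, std::vector<double>&);

// work_1.cpp -- explicitly instantiates the "second bucket" of types
// (would #include "work.hpp")
template void work<int>(const std::vector<int>&, std::vector<int>&);
template void work<long long>(const std::vector<long long>&, std::vector<long long>&);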
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_6, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp new file mode 100644 index 000000000..e12190170 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_7, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp new file mode 100644 index 000000000..c4ddd7066 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
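The functor's `method` flag, per the doc comment that appears further down, selects between two sampling rules: 0 performs bilinear interpolation (two horizontal lerps followed by a vertical lerp), 1 takes the nearest pixel by rounding both coordinates. A self-contained toy sample over a 2x2 image; the names and data here are illustrative only.

#include <cmath>
#include <cstdio>

// A 2x2 single-channel "image": img[y][x]
static const float img[2][2] = { { 0.f, 10.f },
                                 { 20.f, 30.f } };

// Bilinear sample at fractional coordinates, mirroring the lerp in the helper:
//   top    = topLeft + (topRight - topLeft) * xLerp
//   bottom = botLeft + (botRight - botLeft) * xLerp
//   result = top + (bottom - top) * yLerp
float sampleBilinear(float inY, float inX) {
    const int top = (int) std::floor(inY), bottom = (int) std::ceil(inY);
    const int left = (int) std::floor(inX), right = (int) std::ceil(inX);
    const float yLerp = inY - top, xLerp = inX - left;
    const float topVal = img[top][left] + (img[top][right] - img[top][left]) * xLerp;
    const float botVal = img[bottom][left] + (img[bottom][right] - img[bottom][left]) * xLerp;
    return topVal + (botVal - topVal) * yLerp;
}

// Nearest-neighbour sample: round both coordinates, as the method == 1 branch does.
float sampleNearest(float inY, float inX) {
    return img[(int) std::round(inY)][(int) std::round(inX)];
}

int main() {
    std::printf("bilinear(0.5, 0.5) = %.1f\n", sampleBilinear(0.5f, 0.5f)); // 15.0
    std::printf("nearest (0.5, 0.5) = %.1f\n", sampleNearest(0.5f, 0.5f));  // rounds up -> 30.0
    return 0;
}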
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_8, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp new file mode 100644 index 000000000..38cf05787 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include "../crop_and_resize.hpp" + +namespace nd4j { + namespace ops { + namespace helpers { + BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_9, FLOAT_TYPES, INTEGER_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp new file mode 100644 index 000000000..1bdf0a6ad --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018 +// + + +#include +#include + +namespace nd4j { + namespace ops { + namespace helpers { + ////////////////////////////////////////////////////////////////////////// + template + static void concat_(const std::vector& inArrs, NDArray& output, const int axis) { + nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis); + } + + void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { + BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES); + } + + BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector& inArrs, NDArray& output, const int axis), LIBND4J_TYPES); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp index 4f8989caf..39449c7f8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp @@ -32,7 +32,7 @@ namespace helpers { int lLen = labels->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (int j = start; j < stop; j += increment) { + for (int j = start; j < stop; j++) { auto label = labels->e(j); auto pred = predictions->e(j); T value = (weights == nullptr ? (T) 1.0f : weights->e(j)); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index db09f0d3c..51ddc0369 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -432,7 +432,7 @@ namespace nd4j { ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext()); - NDArray outputReshaped = output->reshape(output->ordering(), outReShape); + NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false); helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext())); // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC] @@ -505,7 +505,7 @@ namespace nd4j { if(gradB) { NDArray* gradBR = gradB; if(gradB->rankOf() == 2) - gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); + gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}, false)); gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1}); // sum over bS, oH, oW if(gradBR != gradB) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp new file mode 100644 index 000000000..233699163 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp @@ -0,0 +1,63 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019-2020 Konduit K.K. 
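The new concat.cpp shown above is a thin wrapper: concat_ forwards to SpecialMethods::concatCpuGeneric, and the public concat() uses BUILD_SINGLE_SELECTOR to pick the instantiation matching the output's runtime data type. Conceptually such a selector is a switch over a data-type tag; the sketch below hand-writes that dispatch idea with invented names (DType, concatSketch, concatDispatch) and is not libnd4j's actual macro expansion.

#include <cstdio>
#include <stdexcept>
#include <vector>

// Stand-in for the runtime type tag carried by an array.
enum class DType { FLOAT32, DOUBLE, INT32 };

template <typename T>
void concatSketch(const std::vector<std::vector<T>>& inArrs, std::vector<T>& out) {
    out.clear();
    for (const auto& a : inArrs)
        out.insert(out.end(), a.begin(), a.end());
}

// What a "single selector" does conceptually: map the runtime tag to one
// concrete template instantiation.
void concatDispatch(DType dtype) {
    switch (dtype) {
        case DType::FLOAT32: { std::vector<float>  out; concatSketch<float>({{1.f}, {2.f}}, out);  break; }
        case DType::DOUBLE:  { std::vector<double> out; concatSketch<double>({{1.0}, {2.0}}, out); break; }
        case DType::INT32:   { std::vector<int>    out; concatSketch<int>({{1}, {2}}, out);        break; }
        default: throw std::runtime_error("unsupported data type");
    }
}

int main() {
    concatDispatch(DType::FLOAT32);
    std::printf("dispatched float32 concat\n");
    return 0;
}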
+ * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// +// @author sgazeos@gmail.com +// + +#include +#include + +namespace nd4j { + namespace ops { + namespace helpers { + +// ------------------------------------------------------------------------------------------------------------------ // +// ------------------------------------------------------------------------------------------------------------------ // +// crop and resize helper functor: +// \@param context - launch context for operation +// \@param images - batch of images (4D tensor) with shape {batch, width, height, channels} with given type +// \@param boxes - float boxes for crop +// \@param indices - integer boxes indices for crop +// \@param cropSize - integer size (newWidth, newHeight) +// \@param method - one of bilinear (0) or nearest neighbour (1) interpolation algorithm +// \@param extrapolationVal - radix to increase/decrease image +// \@param crops - output image batch (4D with given type) +// + void + cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes, + NDArray const *indices, NDArray const *cropSize, + int method, double extrapolationVal, NDArray *crops) { + BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_, (images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp new file mode 100644 index 000000000..1f55378c0 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp @@ -0,0 +1,123 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
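Box coordinates in `boxes` are normalized to [0, 1] (the file follows TensorFlow's crop_and_resize, per the copyright notice above), so the functor first converts them to source-pixel coordinates. From the implementation that follows: heightScale = (y2 - y1) * (imageHeight - 1) / (cropHeight - 1) when cropHeight > 1 (0 otherwise), and crop row y samples the source at inY = y1 * (imageHeight - 1) + y * heightScale; widths are handled the same way. A small worked example as a checkable program; the concrete numbers are illustrative.

#include <cstdio>

int main() {
    // Crop the vertical band [0.25, 0.75] of a 101-pixel-high image into 11 output rows.
    const float y1 = 0.25f, y2 = 0.75f;
    const int imageHeight = 101, cropHeight = 11;

    // heightScale = (y2 - y1) * (imageHeight - 1) / (cropHeight - 1)
    //             = 0.5 * 100 / 10 = 5 source rows per output row
    const float heightScale = (cropHeight > 1)
            ? (y2 - y1) * (imageHeight - 1) / (cropHeight - 1)
            : 0.f;

    for (int y = 0; y < cropHeight; ++y) {
        // inY = y1 * (imageHeight - 1) + y * heightScale
        const float inY = (cropHeight > 1)
                ? y1 * (imageHeight - 1) + y * heightScale
                : 0.5f * (y1 + y2) * (imageHeight - 1);
        // rows 0..10 map to source rows 25, 30, ..., 75 -- the requested band, inclusive at both ends
        std::printf("crop row %2d samples source row %.1f\n", y, inY);
    }
    return 0;
}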
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// + +#include +#include + +namespace nd4j { + namespace ops { + namespace helpers { + template + void cropAndResizeFunctor_(NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { + const int batchSize = images->sizeAt(0); + const int imageHeight = images->sizeAt(1); + const int imageWidth = images->sizeAt(2); + + const int numBoxes = crops->sizeAt(0); + const int cropHeight = crops->sizeAt(1); + const int cropWidth = crops->sizeAt(2); + const int depth = crops->sizeAt(3); + + for (auto b = 0; b < numBoxes; ++b) { + T y1 = boxes->t(b, 0); + T x1 = boxes->t(b, 1); + T y2 = boxes->t(b, 2); + T x2 = boxes->t(b, 3); + + int bIn = indices->e(b); + if (bIn >= batchSize) { + continue; + } + + T heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / (cropHeight - 1) : T(0); + T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); + + auto func = PRAGMA_THREADS_FOR { + for (auto y = start; y < stop; y++) { + const float inY = (cropHeight > 1) + ? y1 * (imageHeight - 1) + y * heightScale + : 0.5 * (y1 + y2) * (imageHeight - 1); + + if (inY < 0 || inY > imageHeight - 1) { + for (auto x = 0; x < cropWidth; ++x) { + for (auto d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + } + continue; + } + if (method == 0 /* bilinear */) { + const int topYIndex = nd4j::math::p_floor(inY); + const int bottomYIndex = nd4j::math::p_ceil(inY); + const float y_lerp = inY - topYIndex; + + for (auto x = 0; x < cropWidth; ++x) { + const float in_x = (cropWidth > 1) + ? x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (in_x < 0 || in_x > imageWidth - 1) { + for (auto d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + int left_x_index = math::p_floor(in_x); + int right_x_index = math::p_ceil(in_x); + T x_lerp = in_x - left_x_index; + + for (auto d = 0; d < depth; ++d) { + const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); + const float topRight(images->e(bIn, topYIndex, right_x_index, d)); + const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); + const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); + const float top = topLeft + (topRight - topLeft) * x_lerp; + const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; + crops->p(b, y, x, d, top + (bottom - top) * y_lerp); + } + } + } else { // method is "nearest neighbor" + for (auto x = 0; x < cropWidth; ++x) { + const float inX = (cropWidth > 1) + ? 
x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (inX < 0 || inX > imageWidth - 1) { + for (auto d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + const int closestXIndex = roundf(inX); + const int closestYIndex = roundf(inY); + for (auto d = 0; d < depth; ++d) { + crops->p(b, y, x, d, images->e(bIn, closestYIndex, closestXIndex, d)); + } + } + } + } + }; + + samediff::Threads::parallel_for(func, 0, cropHeight); + } + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp index 0adb0e249..6a8523925 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp @@ -30,7 +30,7 @@ namespace helpers { void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { auto _a = a->reshape(a->ordering(), {-1, 3}); auto _b = b->reshape(b->ordering(), {-1, 3}); - auto _o = o->reshape(o->ordering(), {-1, 3}); + auto _o = o->reshape(o->ordering(), {-1, 3}, false); auto tadsA = _a.allTensorsAlongDimension({1}); auto tadsB = _b.allTensorsAlongDimension({1}); @@ -39,7 +39,7 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray int tads = tadsA.size(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto a_ = tadsA.at(e); auto b_ = tadsB.at(e); auto o_ = tadsO.at(e); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp index f041452ab..d3e524ff4 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp @@ -46,7 +46,7 @@ namespace helpers { if (isNHWC) { const int total_count = batch_size * output_height * output_width * output_depth; auto func = PRAGMA_THREADS_FOR { - for (auto out_idx = start; out_idx < stop; out_idx += increment) { + for (auto out_idx = start; out_idx < stop; out_idx++) { const int d = out_idx % output_depth; const int out_idx2 = out_idx / output_depth; const int w = out_idx2 % output_width; @@ -70,7 +70,7 @@ namespace helpers { const int total_count = batch_size * input_depth_by_input_area; auto func = PRAGMA_THREADS_FOR { - for (int input_idx = start; input_idx < stop; input_idx += increment) { + for (int input_idx = start; input_idx < stop; input_idx++) { const int n_bY_bX_oC_iY = input_idx / input_width; const int iX = input_idx - n_bY_bX_oC_iY * input_width; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp index 8035f8216..2a51b92a6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp @@ -32,7 +32,7 @@ template static void diGamma_(const NDArray& x, NDArray& z) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) z.p(i, diGammaScalar(x.e(i))); }; samediff::Threads::parallel_for(func, 0, x.lengthOf()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index 9db974b36..a470f140a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -35,7 +35,7 @@ namespace helpers { int inLen = input->lengthOf(); auto func = 
PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { float val = nodeRng.relativeT(e, T(0.f), T(1.f)); if (val < probValue) @@ -130,7 +130,7 @@ namespace helpers { nd4j::graph::RandomGenerator nodeRng(3019L, seed); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); float xVal = input->e(e); output->p(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp index 281e6c809..0673a6f2b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp @@ -62,7 +62,7 @@ namespace nd4j { unsigned int outSize = outputList.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { outputs[i].first = outputList[i]; outputs[i].second = 0; for (int e = 0; e < indices->lengthOf(); ++e) @@ -168,7 +168,7 @@ namespace nd4j { unsigned int gradsSize = inputGradientList.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { outputs[i].first = inputGradientList[i]; outputs[i].second = 0; for (int e = 0; e < indices->lengthOf(); ++e) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp index 0a46c995e..b2707ea5c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp @@ -50,7 +50,7 @@ namespace helpers { colCast = 0; auto func = PRAGMA_THREADS_FOR { - for (auto batch = 0; batch < stop; batch += increment) { + for (auto batch = 0; batch < stop; batch++) { auto patch = listOfMatricies.at(batch); auto outMatrix = listOfOutputs.at(batch); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index 3fb7c290d..ed844e84f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include namespace nd4j { namespace ops { @@ -36,7 +38,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* const int numOfIntArgs = intArgs.size(); - if (indices != nullptr) { + if (indices != nullptr) { // first case: indices consist of only one scalar if(indices->isScalar()) { @@ -46,7 +48,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* auto idx = indices->e(0); auto scalarNDArray = input->e(idx); output->assign(scalarNDArray); - } + } else { NDArray inSubArr = (*input)(indices->e(0), {axis}); output->assign(inSubArr); @@ -54,41 +56,122 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* } else { - std::vector dimsOut(indices->rankOf()); - std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... 
axis+indices->rankOf()-1 - const Nd4jLong numOfSubArrs = indices->lengthOf(); + if(input->rankOf() == 1 && output->rankOf() == 1) { - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - NDArray subArrOut = (*output)(i, dimsOut); - NDArray subArrIn = (*input)(indices->e(i), {axis}); - subArrOut.assign(subArrIn); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + output->p(i, input->e(indices->e(i))); + }; + + samediff::Threads::parallel_for(func, 0, output->lengthOf()); + + } + else { + + std::vector dimsOut; + for (int i = 0; i < axis; ++i) + dimsOut.push_back(i); + for (int i = axis+indices->rankOf(); i < output->rankOf(); ++i) + dimsOut.push_back(i); + + std::vector dimsIn = ShapeUtils::evalDimsToExclude(input->rankOf(), {axis}); + + const Nd4jLong numOfSubArrs = indices->lengthOf(); + + auto inTadPack = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimsIn); + auto outTadPack = ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimsOut); + + Nd4jLong* inTadShapeInfo = inTadPack.primaryShapeInfo(); + Nd4jLong* outTadShapeInfo = outTadPack.primaryShapeInfo(); + + if (shape::order(inTadShapeInfo) == shape::order(outTadShapeInfo) && shape::order(inTadShapeInfo) == 'c' && input->dataType() == output->dataType() && shape::elementWiseStride(inTadShapeInfo) == 1 && shape::elementWiseStride(outTadShapeInfo) == 1) { + + auto func = PRAGMA_THREADS_FOR { + + for (auto i = start; i < stop; i++) { + + void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]); + void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); + + memcpy(outBuff, inBuff, shape::length(inTadShapeInfo) * input->sizeOfT()); + } + }; + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } - }; + else { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { - samediff::Threads::parallel_tad(func, 0, numOfSubArrs); + void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]); + void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); + + NativeOpExecutioner::execTransformAny(input->getContext(), transform::Assign, + inBuff, inTadShapeInfo, nullptr/*input specialBuffer*/, nullptr/*input specialShapeInfo*/, + outBuff, outTadShapeInfo, nullptr/*output specialBuffer*/, nullptr/*output specialShapeInfo*/, + nullptr, nullptr, nullptr, false/*allowParallelism*/); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); + } + } } - } + } else { - + // we only allow scalar/vector case here if (numOfIntArgs == 2) { // scalar case + output->assign((*input)(intArgs[1], {axis})); } else { // vector case + const Nd4jLong numOfSubArrs = intArgs.size() - 1; - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - NDArray subArrOut = (*output)(i, {axis}); - NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); - subArrOut.assign(subArrIn); - } - }; + std::vector dims = ShapeUtils::evalDimsToExclude(input->rankOf(), {axis}); + + auto inTadPack = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dims); + auto outTadPack = ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dims); + + Nd4jLong* inTadShapeInfo = inTadPack.primaryShapeInfo(); + Nd4jLong* outTadShapeInfo = outTadPack.primaryShapeInfo(); + + if (shape::order(inTadShapeInfo) == shape::order(outTadShapeInfo) && shape::order(inTadShapeInfo) == 'c' && input->dataType() == output->dataType() && 
shape::elementWiseStride(inTadShapeInfo) == 1 && shape::elementWiseStride(outTadShapeInfo) == 1) { + + auto func = PRAGMA_THREADS_FOR { + + for (auto i = start; i < stop; i++) { + + void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); + void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); + + std::memcpy(outBuff, inBuff, shape::length(inTadShapeInfo) * input->sizeOfT()); + } + }; + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); + + } + else { + + auto func = PRAGMA_THREADS_FOR { + + for (auto i = start; i < stop; i++) { + + void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); + void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); + + NativeOpExecutioner::execTransformAny(input->getContext(), transform::Assign, + inBuff, inTadShapeInfo, nullptr/*input specialBuffer*/, nullptr/*input specialShapeInfo*/, + outBuff, outTadShapeInfo, nullptr/*output specialBuffer*/, nullptr/*output specialShapeInfo*/, + nullptr, nullptr, nullptr, false/*allowParallelism*/); + + } + }; + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); + } - samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } - } + } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp index 9e3bdf885..fc6fc768b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp @@ -56,7 +56,7 @@ namespace nd4j { if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto _x = static_cast(xBuffer[e]); auto _y = static_cast(yBuffer[e]); @@ -67,7 +67,7 @@ namespace nd4j { maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto _x = static_cast(xBuffer[e * xEws]); auto _y = static_cast(yBuffer[e * yEws]); @@ -78,7 +78,7 @@ namespace nd4j { maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto _x = static_cast(x.e(e)); auto _y = static_cast(y.e(e)); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp index 04df86c36..beb48e382 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp @@ -42,7 +42,7 @@ namespace nd4j { // we divide array into 32 element chunks, and store intermediate results once auto func = PRAGMA_THREADS_FOR { - for (auto b = 0; b < stop; b += increment) { + for (auto b = 0; b < stop; b++) { auto blockBuffer = buffer + b * numBlocks; Nd4jLong r = 1; @@ -64,7 +64,7 @@ namespace nd4j { auto func2 = PRAGMA_THREADS_FOR { - for (auto b = start; b < stop; b += increment) { + for (auto b = start; b < stop; b++) { auto blockBuffer = tempBuffer + b * numBlocks; Nd4jLong r = 1; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index d4089359f..23acab375 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -280,7 +280,7 @@ namespace helpers { int xsSize = xs.size(); // Scale x interpolation weights to avoid a multiplication during iteration. auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { xs[i]._bottomIndex *= channels; xs[i]._topIndex *= channels; } @@ -366,102 +366,6 @@ namespace helpers { BUILD_SINGLE_SELECTOR(images->dataType(), return resizeNeighborFunctor_, (images, width, height, alignCorners, halfPixelCenter, output), LIBND4J_TYPES); } - - template - static void cropAndResizeFunctor_(NDArray const *images, NDArray const *boxes, NDArray const *indices, - NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { - const int batchSize = images->sizeAt(0); - const int imageHeight = images->sizeAt(1); - const int imageWidth = images->sizeAt(2); - - const int numBoxes = crops->sizeAt(0); - const int cropHeight = crops->sizeAt(1); - const int cropWidth = crops->sizeAt(2); - const int depth = crops->sizeAt(3); - - for (auto b = 0; b < numBoxes; ++b) { - T y1 = boxes->t(b, 0); - T x1 = boxes->t(b, 1); - T y2 = boxes->t(b, 2); - T x2 = boxes->t(b, 3); - - int bIn = indices->e(b); - if (bIn >= batchSize) { - continue; - } - - T heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / (cropHeight - 1) : T(0); - T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); - - auto func = PRAGMA_THREADS_FOR { - for (auto y = start; y < stop; y += increment) { - const float inY = (cropHeight > 1) - ? y1 * (imageHeight - 1) + y * heightScale - : 0.5 * (y1 + y2) * (imageHeight - 1); - - if (inY < 0 || inY > imageHeight - 1) { - for (auto x = 0; x < cropWidth; ++x) { - for (auto d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - } - continue; - } - if (method == 0 /* bilinear */) { - const int topYIndex = nd4j::math::p_floor(inY); - const int bottomYIndex = nd4j::math::p_ceil(inY); - const float y_lerp = inY - topYIndex; - - for (auto x = 0; x < cropWidth; ++x) { - const float in_x = (cropWidth > 1) - ? x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - - if (in_x < 0 || in_x > imageWidth - 1) { - for (auto d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - continue; - } - int left_x_index = math::p_floor(in_x); - int right_x_index = math::p_ceil(in_x); - T x_lerp = in_x - left_x_index; - - for (auto d = 0; d < depth; ++d) { - const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); - const float topRight(images->e(bIn, topYIndex, right_x_index, d)); - const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); - const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); - const float top = topLeft + (topRight - topLeft) * x_lerp; - const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; - crops->p(b, y, x, d, top + (bottom - top) * y_lerp); - } - } - } else { // method is "nearest neighbor" - for (auto x = 0; x < cropWidth; ++x) { - const float inX = (cropWidth > 1) - ? 
x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - - if (inX < 0 || inX > imageWidth - 1) { - for (auto d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - continue; - } - const int closestXIndex = roundf(inX); - const int closestYIndex = roundf(inY); - for (auto d = 0; d < depth; ++d) { - crops->p(b, y, x, d, images->e(bIn, closestYIndex, closestXIndex, d)); - } - } - } - } - }; - - samediff::Threads::parallel_for(func, 0, cropHeight); - } - } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // ------------------------------------------------------------------------------------------------------------------ // // Bicubic interpolation @@ -1002,7 +906,7 @@ namespace helpers { auto outputPtr = output->bufferAsT(); // output is always float. TO DO: provide another float types also with template declaration auto batchProcess = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { for (auto y = 0; y < st.outHeight; ++y) { const float inY = y * st.heightScale; const float inY1 = (y + 1) * st.heightScale; @@ -1057,7 +961,7 @@ namespace helpers { if (Status::OK() == res) { std::vector xCached(st.outWidth); auto cachingProcedure = PRAGMA_THREADS_FOR { - for (auto x = start; x < stop; x += increment) { + for (auto x = start; x < stop; x++) { auto &xCache = xCached[x]; const float inX = x * st.widthScale; const float inX1 = (x + 1) * st.widthScale; @@ -1105,25 +1009,7 @@ namespace helpers { return ND4J_STATUS_OK; } -// ------------------------------------------------------------------------------------------------------------------ // -// ------------------------------------------------------------------------------------------------------------------ // -// crop and resize helper functor: -// \@param context - launch context for operation -// \@param images - batch of images (4D tensor) with shape {batch, width, height, channels} with given type -// \@param boxes - float boxes for crop -// \@param indices - integer boxes indices for crop -// \@param cropSize - integer size (newWidth, newHeight) -// \@param method - one of bilinear (0) or nearest neighbour (1) interpolation algorithm -// \@param extrapolationVal - radix to increase/decrease image -// \@param crops - output image batch (4D with given type) -// - void - cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes, - NDArray const *indices, NDArray const *cropSize, - int method, double extrapolationVal, NDArray *crops) { - BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_, - (images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES); - } + } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp index e065174d5..b98e7f026 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp @@ -39,7 +39,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) { 'c' == output.ordering() && 1 == output.ews()){ auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const auto xStep = i*3; z[i] = 0.2989f*x[xStep] + 
0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2]; } @@ -52,7 +52,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) { auto func = PRAGMA_THREADS_FOR{ Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords); @@ -99,7 +99,7 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con const Nd4jLong zDimCstride = output.stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T* xTad = x + packX.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i]; op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); @@ -157,7 +157,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, const Nd4jLong zDimCstride = output->stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T* xTad = x + packX.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i]; op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); @@ -207,7 +207,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, const Nd4jLong zDimCstride = output->stridesOf()[dimC]; auto func = PRAGMA_THREADS_FOR{ - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T* xTad = x + packX.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i]; //simple M*v //tr.T*v diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp index 4bc9d3304..1fea8e4fe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp @@ -146,7 +146,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector int span = (tads / num_threads) + 8; auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { auto rX = const_cast(input)->bufferAsT() + tadOffsets[r]; auto rZ = output->bufferAsT() + zOfsets[r]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index c9b833cf5..aeb9e38b0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const T *x = inBuff + inTadOffsets[i]; T *y = outBuff + outTadOffsets[i]; @@ -179,7 +179,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; @@ -247,7 +247,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c else { auto func = PRAGMA_THREADS_FOR { - for 
(uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 683a82392..634d875d2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto h_ = h->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (uint e = start; e < stop; e += increment) { + for (uint e = start; e < stop; e++) { c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); h_[e] = nd4j::math::nd4j_tanh(c_[e]); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp index 2856e73b9..7d2eb5051 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp @@ -45,7 +45,7 @@ namespace helpers { auto n = shape::sizeAt(matrixShape, -1); auto loop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong theFirstPos[] = {theFirst, i}; Nd4jLong theSecondPos[] = {theSecond, i}; auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0); @@ -203,7 +203,7 @@ namespace helpers { auto result = -1; //auto loop = PRAGMA_THREADS_FOR { auto start = column, stop = rowNum, increment = 1; - for (auto rowCounter = start; rowCounter < stop; rowCounter += increment) { + for (auto rowCounter = start; rowCounter < stop; rowCounter++) { Nd4jLong xPos[] = {rowCounter, column}; auto xIndex = shape::getOffset(compoundShape, xPos, 0); if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { @@ -221,7 +221,7 @@ namespace helpers { Nd4jLong xDiag[] = {currentRow, currentRow}; auto diagIndex = shape::getOffset(compoundShape, xDiag, 0); auto loop = PRAGMA_THREADS_FOR { - for (int j = start; j < stop; j += increment) { + for (auto j = start; j < stop; j++) { Nd4jLong xRow[] = {j, currentRow}; auto rowIndex = shape::getOffset(compoundShape, xRow, 0); compoundBuf[rowIndex] /= compoundBuf[diagIndex]; //output->t(i, i); @@ -310,7 +310,7 @@ namespace helpers { permutations = permutationVectors->allTensorsAlongDimension({-1}); auto loop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { luNN_(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n); } }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp index cc43c1866..8a2048263 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp @@ -46,7 +46,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { int lO = listOut.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) for (int j = 0; j < lastDimension; ++j) listOut.at(i)->p(j, listDiag.at(i)->e(j, j)); }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp index dcca5075e..20d8bd34f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp @@ -55,7 +55,7 @@ namespace helpers { Nd4jLong 
oL = output->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto row = rows.at(e); output->p(e, row->e(n)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index 3e18d6d14..71beed7f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -49,7 +49,7 @@ namespace nd4j { if (tadEws >= 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = 0; e < stop; e += increment) { + for (auto e = 0; e < stop; e++) { auto cO = output + tadPack.primaryOffsets()[e]; auto idx = static_cast(indices[e]); @@ -70,7 +70,7 @@ namespace nd4j { samediff::Threads::parallel_tad(func, 0, numTads); } else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cO = output + tadPack.primaryOffsets()[e]; auto idx = static_cast(indices[e]); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp index fc572677e..df80636ee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp @@ -70,7 +70,7 @@ template static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T order = n.e(i); if(order != static_cast(order)) // if order has fractional part then do not perform calculations and return NAN output.p(i, std::numeric_limits::quiet_NaN()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp index 90b69ca6f..9e1980e54 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp @@ -113,7 +113,7 @@ namespace helpers { ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); ResultSet listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); auto batching = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { //qr here qrSingle(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp index bb0e7e24e..a14fb89f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp @@ -39,7 +39,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto auto d = delta.e(0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) buff[i] = s + i * d; }; samediff::Threads::parallel_for(func, 0, len); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 9ee906bd5..4c80e3bf2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -54,7 +54,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inArr == outArr) { if (inEWS == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = 
start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto idx = sLength - e; swap(inArr, e, idx); } @@ -63,7 +63,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * } else if (inEWS > 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto idx1 = (sLength - e) * inEWS; Nd4jLong idx2 = e * inEWS; swap(inArr, idx1, idx2); @@ -75,7 +75,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); swap(outArr, inOffset, outOffset); @@ -93,14 +93,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { auto func = PRAGMA_THREADS_FOR { - for (Nd4jLong e = start; e < stop; e += increment) + for (Nd4jLong e = start; e < stop; e++) outArr[sLength - e] = inArr[e]; }; samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[e] = inArr[e]; }; samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); @@ -109,14 +109,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; }; samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[e * outEWS] = inArr[e * inEWS]; }; samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); @@ -125,7 +125,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; @@ -136,7 +136,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 5422d04c1..09a628b84 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -114,7 +114,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& // loop through input array auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), 
coords); @@ -244,14 +244,14 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC] - NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), output.sizeAt(3)}); + NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), output.sizeAt(3)}, false); outputRearranged0.permutei({2, 3,0, 4,1, 5}); if(input.lengthOf() == output.lengthOf()) { outputRearranged0.assign(input); } else { - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, output.sizeAt(3)}); + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, output.sizeAt(3)}, false); BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatch_, (input, outputRearranged1, padBottom, padTop, padLeft, padRight), LIBND4J_TYPES); if(output.getBuffer() != outputRearranged1.getBuffer()) @@ -300,7 +300,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra // loop through output array auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -352,7 +352,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND for(int j = 1; j < rank; ++i, ++j) temp[i] = output.sizeAt(j); - NDArray outputRearranged0 = output.reshape(output.ordering(), temp); + NDArray outputRearranged0 = output.reshape(output.ordering(), temp, false); //*** construct permuting std::vector for permutation of output array ***// @@ -382,7 +382,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND for(i = 1; i < rank; ++i) temp[i] = (i <= numOfSpatialDims) ? 
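A worked shape example may help with the spaceToBatch rearrangement above (numbers chosen for illustration; the extra "false" passed to reshape here and in several later hunks is presumably a copy-the-buffer flag, and skipping the copy is safe because the reshaped view is written immediately afterwards):

    // spaceToBatch shape bookkeeping, blockSize = 2, zero padding:
    // input                    : [bS=2, iH=4, iW=6, iC=3]
    // output                   : [bS*2*2, iH/2, iW/2, iC]      = [8, 2, 3, 3]
    // reshape to               : {2, 2, bS, oH, oW, iC}        = {2, 2, 2, 2, 3, 3}
    // permutei({2,3,0,4,1,5})  : [bS, oH, 2, oW, 2, iC]        = [2, 2, 2, 3, 2, 3]
    // i.e. the output is viewed as the original spatial grid interleaved by block.
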
output.sizeAt(i) * blockShape.e(i - 1) : output.sizeAt(i); - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp); + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp, false); BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatchND_, (input, padding, outputRearranged1, numOfSpatialDims), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp index fd285ed9c..557d63fd3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp @@ -48,7 +48,7 @@ namespace helpers { const int total_count = batch_size * input_height * input_width * input_depth; auto func = PRAGMA_THREADS_FOR { - for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + for (auto inp_idx = start; inp_idx < stop; inp_idx++) { // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) const int d = inp_idx % input_depth; const int inp_idx2 = inp_idx / input_depth; @@ -74,7 +74,7 @@ namespace helpers { const int total_count = batch_size * output_depth_by_output_area; auto func = PRAGMA_THREADS_FOR { - for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + for (auto inp_idx = start; inp_idx < stop; inp_idx++) { const int n_iC_oY_bY_oX = inp_idx / block_size; const int bX = inp_idx - n_iC_oY_bY_oX * block_size; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index a3f0c01be..2de2b2d22 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -45,7 +45,7 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int Nd4jLong xCoords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, xShapeInfo, xCoords); @@ -79,7 +79,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind if(outRank == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong idx = indices.e(i); NDArray out = output({idx, idx + 1}); @@ -99,7 +99,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray outSubArr = output(indices.e(i), std::vector({0})); NDArray updSubArr = updates(i, dimsToExcludeUpd); @@ -121,7 +121,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i if(outRank == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong idx = indices.e(i); NDArray out = output({idx, idx + 1}); @@ -139,7 +139,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i auto func = PRAGMA_THREADS_FOR { std::vector idxRangeOut(2*outRank, 0); - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray indSubArr = indices(i, dimsToExcludeInd); for (Nd4jLong j = 0; j < indLastDim; ++j) { @@ -170,7 +170,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr if(!calcGrad) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto 
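The generic loops in these hunks (spaceToBatchND, s_t_d, scatter, and later pad_ and gatherND_) all turn a flat element index into per-dimension coordinates via shape::index2coords before computing offsets. A simplified, self-contained sketch of that mapping for a c-ordered shape, to make the loop bodies easier to follow:

    #include <cstdint>
    #include <vector>

    // Mirrors what shape::index2coords does in the loops above (c order, no strides).
    std::vector<int64_t> index2coords(int64_t index, const std::vector<int64_t>& shape) {
        std::vector<int64_t> coords(shape.size());
        for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
            coords[i] = index % shape[i];
            index /= shape[i];
        }
        return coords;
    }

    // Example: index 7 in shape {2, 3, 4} -> coords {0, 1, 3}
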
i = start; i < stop; i++) { auto subArr = updates(i, dimsToExclude); output.p(i, subArr.e(indices.e(i))); } @@ -179,7 +179,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr samediff::Threads::parallel_for(func, 0, indicesLen); } else { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto subArr = updates(i, dimsToExclude); auto ind = indices.e(i); subArr.p(ind, subArr.e(ind) - 1.); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index e20145735..08aafc98c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -169,7 +169,7 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { meanV.p(e, meanV.e(e) + listOfTensors.at(i)->e(e)); } }; @@ -223,7 +223,7 @@ namespace helpers { for (int i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { sumT->p(e, sumT->e(e) + listOfTensors.at(i)->e(e)); } }; @@ -272,7 +272,7 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { sumT->p(e, sumT->e(e) * listOfTensors.at(i)->e(e)); } }; @@ -625,7 +625,7 @@ namespace helpers { Nd4jLong loop_size = input->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) <= T(1.e-6)) output->p(e, gradOut->e(classNum)); @@ -645,7 +645,7 @@ namespace helpers { //std::vector> outputs(numOfClasses); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -675,7 +675,7 @@ namespace helpers { segmentMinFunctor(context, input, indices, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) < 1.e-5) output->p(e, gradOut->e(classNum)); @@ -697,7 +697,7 @@ namespace helpers { int pos = 0; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -887,7 +887,7 @@ namespace helpers { if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.t(classNum) - input->t(e)) < 1.e-6) output->t(e) = gradOut->t(classNum); @@ -1004,7 +1004,7 @@ namespace helpers { unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto 
e = start; e < stop; e++) { auto classNum = indices->e(e); output->p(e, gradOut->e(classNum) * tempRes.e(classNum) / input->e(e)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index 59c257c28..05353bf5e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -364,7 +364,7 @@ namespace nd4j { auto func = PRAGMA_THREADS_FOR { T sneu1e[600]; - for (auto t = start; t < stop; t += increment) { + for (auto t = start; t < stop; t++) { T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; memset(neu1e, 0, vectorLength * sizeof(T)); @@ -457,7 +457,7 @@ namespace nd4j { T sneu1[600]; T sneu1e[600]; - for (int e = start; e < stop; e += increment) { + for (int e = start; e < stop; e++) { T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp index 48f7f0d9a..c8774f028 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp @@ -40,7 +40,7 @@ namespace helpers { output->assign(input); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { for (auto r = 0; r < rows; r++) { for (auto c = 0; c < r; c++) { math::nd4j_swap(outputPart[batch]->t(r, c) , outputPart[batch]->t(c, r)); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp index 642dd37da..d2dd3bf30 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp @@ -143,7 +143,7 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray T* pCt = ct->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto col = start; col < stop; col += increment) { + for (auto col = start; col < stop; col++) { const auto colNum = col % d2; bool flip = colNum >= K; T maskVal = mask ? 
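In the sg_cb (skip-gram / CBOW) hunks above, each thread keeps a fixed 600-element scratch buffer on the stack and only falls back to the heap for longer vectors; the loop change to "t++" leaves that pattern untouched. A self-contained sketch of the same idea (buffer size and element type chosen for illustration):

    #include <cstddef>
    #include <cstring>

    template <typename T>
    void processRows(std::size_t numRows, std::size_t vectorLength) {
        T scratchStack[600];                                    // per-thread stack scratch
        for (std::size_t t = 0; t < numRows; t++) {
            T* scratch = vectorLength <= 600 ? scratchStack : new T[vectorLength];
            std::memset(scratch, 0, vectorLength * sizeof(T));

            // ... accumulate gradients for row t into scratch ...

            if (scratch != scratchStack)                        // release the heap fallback
                delete[] scratch;
        }
    }
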
*(pMask + col) : T(1); @@ -236,7 +236,7 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr T* pGradInit = gradC0->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto col = start; col < stop; col += increment) { + for (auto col = start; col < stop; col++) { T gbF = 0.f; T gbR = 0.f; const auto colNum = col % d2; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp index db9b6afff..a3d27702d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp @@ -37,7 +37,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c int inSize = inArrs.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) outArr->p(i, inArrs[i]->t(0)); }; @@ -50,7 +50,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c int listSize = list.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) list.at(i)->assign(inArrs[i]); }; samediff::Threads::parallel_tad(func, 0, listSize); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index e38232928..c4b45b398 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -150,7 +150,7 @@ namespace helpers { result->assign(0); if (status == ND4J_STATUS_OK) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { bool found = false; for (int j = 0; j < k; j++) { if (target->e(e) == indices->e(e * k + j)) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index ea5e90cd8..1f630e8e0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -43,7 +43,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N int dLen = dOdI.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { if (dOdI.t(i) != static_cast(0.f)) dOdI.t(i) = static_cast(1.f); } @@ -65,7 +65,7 @@ static void trace_(const NDArray& input, NDArray& output) { auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) output.p(i, setOfSubArrs.at(i)->getTrace()); }; samediff::Threads::parallel_for(func, 0, setOfSubArrs.size()); @@ -189,7 +189,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -220,7 +220,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -566,7 +566,7 
@@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK * 3]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong *zCoordStart, *xCoordStart; if (yLastDim == xRank) { @@ -650,7 +650,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con else if (input->rankOf() == 1 && indices->isVector()) { // special case auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) output->p(e, input->e(indices->e(e))); }; @@ -663,7 +663,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray subArrOut = (*output)(i, dimsOut); NDArray subArrIn = (*input)(indices->e(i), {axis}); subArrOut.assign(subArrIn); @@ -687,7 +687,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray subArrOut = (*output)(i, {axis}); NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); subArrOut.assign(subArrIn); @@ -710,7 +710,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) arrs.at(i)->setIdentity(); }; @@ -737,7 +737,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat indices.push_back((*intArgs)[e]); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inSubArr = input(indices[i], dimsToExclude, true); auto updSubArr = updates(i, dimsToExclude, true); @@ -786,7 +786,7 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input case 6: { // copy auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inSubArr = input(i, dimensions); inSubArr.p(indices.t(i), updates.e(i)); } @@ -809,7 +809,7 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); Nd4jLong idx = 0; @@ -839,7 +839,7 @@ static void mergeMax_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); for (int i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); @@ -865,7 +865,7 @@ static void mergeAvg_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T sum = 0.; for (int i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); @@ -891,7 +891,7 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = 
start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T sum = (T) 0.f; for (int i = 0; i < numArgs; i++) sum += inArrs[i]->e(e); @@ -928,7 +928,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T iNormActual = norm2.e(i); if (iNormActual > normClip) *listOfInSubArrs.at(i) *= normClip / iNormActual; @@ -952,7 +952,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inputSubArr = listOfInSubArrs.at(i); auto outputSubArr = listOfOutSubArrs.at(i); outputSubArr->assign(inputSubArr); @@ -1058,7 +1058,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g auto cn = clipNorm.e(0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { T N = norm2.e(i); auto gradOSubArr = gradOSubArrs.at(i); @@ -1190,7 +1190,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o auto func = PRAGMA_THREADS_FOR { Nd4jLong inIdx[MAX_RANK]; Nd4jLong outIdx[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), outIdx); for (int j = 0; j < rank; ++j) { @@ -1225,17 +1225,6 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES); -////////////////////////////////////////////////////////////////////////// -template -static void concat_(const std::vector& inArrs, NDArray& output, const int axis) { - nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis); -} - - void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { - BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES); - } - - BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector& inArrs, NDArray& output, const int axis), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp index ceb228439..c825a8fee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp @@ -90,7 +90,7 @@ namespace helpers { auto outputPart = output->allTensorsAlongDimension({-2, -1}); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { if (lower) { lowerTriangularSolve(context, leftPart[i], rightPart[i], adjoint, outputPart[i]); } else { @@ -112,7 +112,7 @@ namespace helpers { auto rows = input->sizeAt(-2); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { if (!lower) { for (auto r = 0; r < rows; r++) { for (auto c = 0; c <= r; c++) { diff --git 
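For the clipByNorm hunks above, the per-sub-array rule is: leave the tensor alone while its L2 norm is within the clip value, otherwise rescale it by clipNorm / norm. A standalone sketch of that rule on a flat vector (matching the "if (iNormActual > normClip) *subArr *= normClip / iNormActual" branch):

    #include <cmath>
    #include <vector>

    void clipByNormInPlace(std::vector<float>& v, float clipNorm) {
        float sq = 0.f;
        for (float x : v) sq += x * x;
        const float norm = std::sqrt(sq);
        if (norm > clipNorm) {
            const float scale = clipNorm / norm;
            for (float& x : v) x *= scale;
        }
    }
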
a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp index 5d4ed9f2e..90ef634c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp @@ -64,7 +64,7 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray int xLen = x.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) z.p(i, zetaScalar(x.e(i), q.e(i))); }; diff --git a/libnd4j/include/ops/declarable/helpers/crop_and_resize.h b/libnd4j/include/ops/declarable/helpers/crop_and_resize.h new file mode 100644 index 000000000..3926dbfb0 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/crop_and_resize.h @@ -0,0 +1,40 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + + +// +// @author sgazeos@gmail.com +// + +#ifndef SD_CROP_AND_RESIZE_H +#define SD_CROP_AND_RESIZE_H + +#include +#include + + +namespace nd4j { + namespace ops { + namespace helpers { + template + void cropAndResizeFunctor_(NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops); + + void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const* images, NDArray const* boxes, NDArray const* indices, NDArray const* cropSize, int method, double extrapolationVal, NDArray* crops); + } + } +} + +#endif //SD_CROP_AND_RESIZE_H diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 31b386e7e..3ea80966b 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -59,7 +59,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND void FORCEINLINE _crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { auto a_ = a->reshape(a->ordering(), {-1, 3}); auto b_ = b->reshape(b->ordering(), {-1, 3}); - auto o_ = o->reshape(o->ordering(), {-1, 3}); + auto o_ = o->reshape(o->ordering(), {-1, 3}, false); auto tadsA = a_.allTensorsAlongDimension({1}); auto tadsB = b_.allTensorsAlongDimension({1}); @@ -68,7 +68,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND int tads = tadsA.size(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto a_ = tadsA.at(e); auto b_ = tadsB.at(e); auto o_ = tadsO.at(e); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu index 43c0e4af9..b455ff659 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu @@ 
-85,38 +85,106 @@ BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int blocksPerGrid ////////////////////////////////////////////////////////////////////////// void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { - const int threadsPerBlock = 256; - const int blocksPerGrid = 512; - const int sharedMem = 512; + const int numOfInArrs = inArrs.size(); + const auto sizeofT = output.sizeOfT(); - const int numOfArrs = inArrs.size(); - - for(int i = 0; i < numOfArrs; ++i) + for(int i = 0; i < numOfInArrs; ++i) inArrs[i]->syncToDevice(); - output.syncToDevice(); - // prepare arrays of pointers on buffers and shapes - std::vector hInBuffers(numOfArrs); - std::vector hInShapeInfo(numOfArrs); + bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1; - for(int i = 0; i < numOfArrs; ++i) { - hInBuffers[i] = inArrs[i]->getSpecialBuffer(); - hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo(); + if(luckCase1) { + for (uint i = 0; i < numOfInArrs; ++i) { + luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1; + if(!luckCase1) + break; + } } - PointersManager manager(context, "helpers::concat"); + if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f - void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*)); - void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*)); + void* z = static_cast(output.getSpecialBuffer()); - BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES); + for (uint i = 0; i < numOfInArrs; ++i) { + const auto memAmountToCopy = inArrs[i]->lengthOf() * sizeofT; + cudaMemcpyAsync(z, static_cast(inArrs[i]->getSpecialBuffer()), memAmountToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream()); + z = static_cast(z) + memAmountToCopy; + } - manager.synchronize(); + if(cudaStreamSynchronize(*context->getCudaStream()) != 0) + throw std::runtime_error("concat cuda: luckCase1 failed!"); - for(int i = 0; i < numOfArrs; ++i) + for(int i = 0; i < numOfInArrs; ++i) + inArrs[i]->tickReadDevice(); + output.tickWriteDevice(); + + return; + } + + const bool isZcontin = output.strideAt(axis) == 1; + bool areInputsContin = true; + bool allSameOrder = true; + + if(isZcontin) { + for (uint i = 0; i < inArrs.size(); ++i) { + areInputsContin &= inArrs[i]->strideAt(axis) == 1; + allSameOrder &= output.ordering() == inArrs[i]->ordering(); + if(!areInputsContin || !allSameOrder) + break; + } + } + + const bool luckCase2 = isZcontin && areInputsContin && allSameOrder; + + if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array + + const uint zDim = output.sizeAt(axis); + + for (uint i = 0; i < output.lengthOf() / zDim; ++i) { + + const auto iShift = i * sizeofT; + void* z = static_cast(output.getSpecialBuffer()) + zDim * iShift; + + for (uint j = 0; j < numOfInArrs; ++j) { + const auto xDim = inArrs[j]->sizeAt(axis); + void* x = static_cast(inArrs[j]->getSpecialBuffer()) + xDim * iShift; + const auto memSizeToCopy = xDim * sizeofT; + cudaMemcpyAsync(z, x, memSizeToCopy, cudaMemcpyDeviceToDevice, 
*context->getCudaStream()); + z = static_cast(z) + memSizeToCopy; + } + } + + if(cudaStreamSynchronize(*context->getCudaStream()) != 0) + throw std::runtime_error("concat cuda: luckCase2 failed!"); + } + else { // general (slower) case + + const int threadsPerBlock = 256; + const int blocksPerGrid = 512; + const int sharedMem = 512; + + // prepare arrays of pointers on buffers and shapes + std::vector hInBuffers(numOfInArrs); + std::vector hInShapeInfo(numOfInArrs); + + for(int i = 0; i < numOfInArrs; ++i) { + hInBuffers[i] = inArrs[i]->getSpecialBuffer(); + hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo(); + } + + PointersManager manager(context, "helpers::concat"); + + void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*)); + void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*)); + + BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES); + + manager.synchronize(); + } + + for(int i = 0; i < numOfInArrs; ++i) inArrs[i]->tickReadDevice(); - output.tickWriteDevice(); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu index 4f77b2e7c..39732b024 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu @@ -322,7 +322,7 @@ static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext()); - NDArray outputReshaped = output->reshape(output->ordering(), outReShape); + NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false); helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext())); // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC] @@ -1228,7 +1228,7 @@ static void conv2dBP_(nd4j::graph::Context& block, const NDArray* input, const N NDArray* gradBR = gradB; if(gradB->rankOf() == 2) gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); - gradO->reduceAlongDimension(reduce::Sum, *gradBR, gradOaxesForDot); // sum over bS, oH, oW + gradO->reduceAlongDimension(reduce::Sum, *gradBR, gradOaxesForDot, false); // sum over bS, oH, oW if(gradBR != gradB) delete gradBR; } @@ -1310,7 +1310,7 @@ static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, con NDArray* gradBR = gradB; if(gradB->rankOf() == 2) gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})); - gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1}); // sum over bS, oH, oW + gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1}, false); // sum over bS, oH, oW if(gradBR != gradB) delete gradBR; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu index c028daff3..6096f3a85 100644 --- 
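The rewritten CUDA concat above adds two fast paths before the generic kernel: when the concat axis is the outermost dimension in memory for every array and ews == 1, the inputs occupy consecutive byte ranges of the output and can each be copied with a single cudaMemcpyAsync (luckCase1); when all arrays merely have stride 1 along the concat axis and share ordering, it degenerates to one device-to-device copy per output slice (luckCase2). A trimmed host-side sketch of the first path; outputDeviceBuffer, inputDeviceBuffers, inputLengths, sizeofT and stream are placeholders, not names from the patch:

    // luckCase1: e.g. {1,10} + {2,10} + {3,10} = {6,10} in 'c' order,
    // so inputs are copied back to back into the output buffer.
    char* z = static_cast<char*>(outputDeviceBuffer);
    for (size_t i = 0; i < inputDeviceBuffers.size(); ++i) {
        const size_t bytes = inputLengths[i] * sizeofT;        // elements * element size
        cudaMemcpyAsync(z, inputDeviceBuffers[i], bytes, cudaMemcpyDeviceToDevice, stream);
        z += bytes;
    }
    cudaStreamSynchronize(stream);                             // error handling omitted
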
a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu @@ -1326,7 +1326,7 @@ namespace helpers { // crops - output (4D tensor - [batch, outWidth, outHeight, pixels]) // template - static void cropAndResizeFunctor_(nd4j::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices, + void cropAndResizeFunctor_(nd4j::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { const int batchSize = images->sizeAt(0); const int imageHeight = images->sizeAt(1); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu index 82f421fdd..f3bee349b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu @@ -313,7 +313,7 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC] - NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), input.sizeAt(3)}); + NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), input.sizeAt(3)}, false); outputRearranged0.permutei({2, 3,0, 4,1, 5}); if(input.lengthOf() == output.lengthOf()) { @@ -322,7 +322,7 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o } else { - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, input.sizeAt(3)}); + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, input.sizeAt(3)}, false); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; @@ -439,7 +439,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND for(int j = 1; j < rank; ++i, ++j) temp[i] = output.sizeAt(j); - NDArray outputRearranged0 = output.reshape(output.ordering(), temp); + NDArray outputRearranged0 = output.reshape(output.ordering(), temp, false); //*** construct permuting std::vector for permutation of output array ***// @@ -469,7 +469,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND for(i = 1; i < rank; ++i) temp[i] = (i <= numOfSpatialDims) ? 
output.sizeAt(i) * blockShape.e(i - 1) : output.sizeAt(i); - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp); + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp, false); const int threadsPerBlock = MAX_NUM_THREADS / 4; const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; diff --git a/libnd4j/include/ops/declarable/helpers/image_resize.h b/libnd4j/include/ops/declarable/helpers/image_resize.h index 047b2cf70..decac3db9 100644 --- a/libnd4j/include/ops/declarable/helpers/image_resize.h +++ b/libnd4j/include/ops/declarable/helpers/image_resize.h @@ -50,9 +50,6 @@ namespace helpers { int resizeFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, ImageResizeMethods method, bool preserveAspectRatio, bool antialias, NDArray* output); - - void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const* images, NDArray const* boxes, - NDArray const* indices, NDArray const* cropSize, int method, double extrapolationVal, NDArray* crops); } } } diff --git a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp index 8ef63101e..3bcdea865 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp @@ -69,7 +69,7 @@ namespace helpers { } auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { values->p(e, static_cast(valuesVector[e])); if (counts != nullptr) counts->p(e, countsMap[valuesVector[e]]); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp index 46d10b51c..9724b6ba5 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp @@ -150,6 +150,22 @@ namespace nd4j { } if (ctx.isInplace()) { + if (Environment::getInstance()->isProfiling() && node != nullptr) { + if (ctx.isFastPath()) { + // + } else { + for (auto p: *ctx.inputs()) { + auto var = ctx.variable(p); + if (var->variableType() == VariableType::NDARRAY) { + NDArray *array = var->getNDArray(); + + node->addInputShape(array->shapeInfo()); + node->addOutputShape(array->shapeInfo()); + } + } + } + } + // do nothing, getZ result will do the trick return static_cast(ctx.width()); } else { @@ -192,6 +208,10 @@ namespace nd4j { auto inputTime = std::chrono::duration_cast(inputEnd - inputStart).count(); node->setInputTime(inputTime); + // saving output shapes in profile + for (int e = 0; e < inSha.size(); e++) + node->addInputShape(inSha.at(e)); + shapeStart = std::chrono::system_clock::now(); } @@ -204,6 +224,10 @@ namespace nd4j { auto prepTime = std::chrono::duration_cast(shapeEnd - shapeStart).count(); node->setShapeFunctionTime(prepTime); + // saving output shapes in profile + for (int e = 0; e < outSha->size(); e++) + node->addOutputShape(outSha->at(e)); + arrayStart = std::chrono::system_clock::now(); } @@ -562,7 +586,7 @@ namespace nd4j { block->setInnerTime(outerTime); } - if (Environment::getInstance()->isProfiling()) { + if (Environment::getInstance()->isProfiling() && !block->isFastPath()) { auto fp = block->getVariableSpace()->flowPath(); if (fp != nullptr) { auto p = fp->profile(); diff --git a/libnd4j/include/ops/declarable/impl/LegacyOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyOp.cpp index f93df63f1..e9920c409 100644 --- 
a/libnd4j/include/ops/declarable/impl/LegacyOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyOp.cpp @@ -23,11 +23,11 @@ namespace nd4j { namespace ops { - LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", true) { + LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) { _numInputs = numInputs; } - LegacyOp::LegacyOp(int numInputs, int opNum) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", true) { + LegacyOp::LegacyOp(int numInputs, int opNum) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) { _opNum = opNum; _numInputs = numInputs; } diff --git a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp index 07c7234f5..49f896be1 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp @@ -25,11 +25,11 @@ namespace nd4j { namespace ops { LegacyPairwiseTransformOp::LegacyPairwiseTransformOp() : LegacyOp::LegacyOp(2) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyPairwiseTransformOp::LegacyPairwiseTransformOp(int opNum) : LegacyOp::LegacyOp(2, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyPairwiseTransformOp::clone() { diff --git a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp index b1261b37c..856bfdeaf 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyScalarOp::LegacyScalarOp() : LegacyOp::LegacyOp(1) { - // no-op + this->getOpDescriptor()->allowInplace(true); } LegacyScalarOp::LegacyScalarOp(int opNum) : LegacyOp::LegacyOp(1, opNum){ - // no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyScalarOp::clone() { @@ -66,6 +66,7 @@ namespace nd4j { NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y->buffer(), y->shapeInfo(), y->specialBuffer(), y->specialShapeInfo(), extras.argumentsAsT(z->dataType())); + NDArray::registerSpecialUse({z}, {x, y}); } else if (block.getTArguments()->size() > 0) { auto y = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext()); @@ -78,10 +79,9 @@ namespace nd4j { NDArray::prepareSpecialUse({z}, {x, _scalar}); NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), _scalar->buffer(), _scalar->shapeInfo(), _scalar->specialBuffer(), _scalar->specialShapeInfo(), extras.argumentsAsT(z->dataType())); - } - manager.synchronize(); - STORE_RESULT(*z); + NDArray::registerSpecialUse({z}, {x, _scalar}); + } return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp index 49fef3af0..6b097c3af 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyTransformSameOp::LegacyTransformSameOp() : LegacyOp::LegacyOp(1) { - // just a 
no-op + this->getOpDescriptor()->allowInplace(true); } LegacyTransformSameOp::LegacyTransformSameOp(int opNum) : LegacyOp::LegacyOp(1, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyTransformSameOp::clone() { diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp index 19a51191a..a390a458c 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyTransformStrictOp::LegacyTransformStrictOp() : LegacyOp::LegacyOp(1) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyTransformStrictOp::LegacyTransformStrictOp(int opNum) : LegacyOp::LegacyOp(1, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyTransformStrictOp::clone() { diff --git a/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp b/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp index 5139a95cc..417fc0605 100644 --- a/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp +++ b/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp @@ -50,6 +50,9 @@ namespace nd4j { _scalar = isScalar; } + void OpDescriptor::allowInplace(bool reallyAllow){ + _allowsInplace = reallyAllow; + } bool OpDescriptor::operator==(const OpDescriptor& other) const { if (_hash == -1 && other._hash == -1) diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp index 3371b16ad..26aeacaa3 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp @@ -471,9 +471,9 @@ PLATFORM_IMPL(lstmLayer, ENGINE_CPU) { if(cI) cIR = new NDArray(cI->reshape(cI->ordering(), {1,dirDim,bS,nOut})); if(hL) - hLR = new NDArray(hL->reshape(hL->ordering(), {1,dirDim,bS,nOut})); + hLR = new NDArray(hL->reshape(hL->ordering(), {1,dirDim,bS,nOut}, false)); if(cL) - cLR = new NDArray(cL->reshape(cL->ordering(), {1,dirDim,bS,nOut})); + cLR = new NDArray(cL->reshape(cL->ordering(), {1,dirDim,bS,nOut}, false)); lstmLayerMKLDNN(xP, WxR, WrR, bR, hIR, cIR, params, hP, hLR, cLR); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp new file mode 100644 index 000000000..53d18e3cd --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp @@ -0,0 +1,294 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
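Taken together, the legacy-op hunks above change the base LegacyOp constructor to register as not-inplace by default and let individual legacy op families opt back in through the new OpDescriptor::allowInplace setter. Consolidated from the hunks above (a sketch of the pattern, assuming the signatures shown in the diff):

    // OpDescriptor gains an explicit mutator:
    void OpDescriptor::allowInplace(bool reallyAllow) {
        _allowsInplace = reallyAllow;
    }

    // A legacy op that is genuinely safe to execute in place now opts in explicitly,
    // instead of inheriting "inplace allowed" from the base constructor:
    LegacyScalarOp::LegacyScalarOp() : LegacyOp::LegacyOp(1) {
        this->getOpDescriptor()->allowInplace(true);
    }
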
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author Yurii Shyrma (iuriish@yahoo.com) +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + + +namespace nd4j { +namespace ops { +namespace platforms { + +////////////////////////////////////////////////////////////////////////// +static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const bool transX, const bool transY) { + + // mkl works with following + // [M,K] x [K,N] = [M,N] + // [bS, M,K] x [bS, K,N] = [bS, M,N] + + // possible input cases not supported by mkl, however we'll perform permut/reshape procedures in order to fit requirements + // [4] x [4] = [1] --> [1,4] x [4,1] = [1,1] + // [4] x [4,5] = [5] --> [1,4] x [4,5] = [1,5] + // [4,5] x [5] = [4] --> [4,5] x [5,1] = [4,1] + // [2,3, 4,5] x [2,3, 5,4] = [2,3, 4,4] --> [6, 4,5] x [6, 5,4] = [6, 4,4] + // [2,2,3, 4,5] x [2,2,3, 5,4] = [2,2,3, 4,4] --> [12, 4,5] x [12, 5,4] = [12, 4,4] + + const auto xRank = x->rankOf(); + const auto yRank = y->rankOf(); + const auto zRank = z->rankOf(); + + std::vector permut; + + // fill permutation vector appropriately if transposition is required + if((transX && xRank > 1) || (transY && yRank > 1)) { + + const int rank = xRank >= yRank ? xRank : yRank; + permut.resize(rank); + std::iota(std::begin(permut), std::end(permut), 0); + permut[rank-2] = rank - 1; + permut[rank-1] = rank - 2; + } + + const NDArray* xT = (transX && xRank > 1) ? new NDArray(x->permute(permut)) : x; + const NDArray* yT = (transY && yRank > 1) ? new NDArray(y->permute(permut)) : y; + + const NDArray* xTR = xRank <= 3 ? xT : new NDArray(xT->reshape(xT->ordering(), {xT->lengthOf() / (xT->sizeAt(-2) * xT->sizeAt(-1)), xT->sizeAt(-2), xT->sizeAt(-1)})); + const NDArray* yTR = xRank <= 3 ? yT : new NDArray(yT->reshape(yT->ordering(), {yT->lengthOf() / (yT->sizeAt(-2) * yT->sizeAt(-1)), yT->sizeAt(-2), yT->sizeAt(-1)})); + NDArray* zR = xRank <= 3 ? z : new NDArray(z->reshape(z->ordering(), {z->lengthOf() / (z->sizeAt(-2) * z->sizeAt(-1)), z->sizeAt(-2), z->sizeAt(-1)})/*, false*/); + + // [M,K] x [K,N] = [M,N] + const int M = (xRank > 1) ? xTR->sizeAt(-2) : 1; + const int K = (xRank > 1) ? xTR->sizeAt(-1) : xTR->lengthOf(); + const int N = (yRank > 1) ? yTR->sizeAt(-1) : 1; + const int bS = (xRank > 2) ? xTR->sizeAt(0) : 1; // [bS, M,K] x [bS, K,N] = [bS, M,N] + + dnnl::memory::dims xShape = xRank < 3 ? dnnl::memory::dims({M, K}) : dnnl::memory::dims({bS, M, K}); + dnnl::memory::dims yShape = xRank < 3 ? dnnl::memory::dims({K, N}) : dnnl::memory::dims({bS, K, N}); + dnnl::memory::dims zShape = xRank < 3 ? dnnl::memory::dims({M, N}) : dnnl::memory::dims({bS, M, N}); + + dnnl::memory::format_tag format = xRank < 3 ? 
dnnl::memory::format_tag::ab : dnnl::memory::format_tag::abc; + + // x type + dnnl::memory::data_type xType; + if(x->dataType() == DataType::FLOAT32) + xType = dnnl::memory::data_type::f32; + else if(x->dataType() == DataType::HALF) + xType = dnnl::memory::data_type::f16; + else if(x->dataType() == DataType::BFLOAT16) + xType = dnnl::memory::data_type::bf16; + else if(x->dataType() == DataType::UINT8) + xType = dnnl::memory::data_type::u8; + else + xType = dnnl::memory::data_type::s8; + + // y type + dnnl::memory::data_type yType = xType; + if(y->dataType() == DataType::UINT8) + yType = dnnl::memory::data_type::u8; + else if(y->dataType() == DataType::INT8) + yType = dnnl::memory::data_type::s8; + + // z type + dnnl::memory::data_type zType = xType; + if(z->dataType() == DataType::FLOAT32) + zType = dnnl::memory::data_type::f32; + else if(z->dataType() == DataType::INT32) + zType = dnnl::memory::data_type::s32; + else if(z->dataType() == DataType::UINT8) + zType = dnnl::memory::data_type::u8; + else if(z->dataType() == DataType::INT8) + zType = dnnl::memory::data_type::s8; + + // memory descriptors for arrays + + // x + dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, dnnl::memory::format_tag::any); + dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, format); + if(xTR->ews() != 1 || xTR->ordering() != 'c') { + x_user_md.data.format_kind = dnnl_blocked; // overrides format + x_user_md.data.format_desc.blocking.strides[0] = xRank == 1 ? 1 : xTR->strideAt(0); + x_user_md.data.format_desc.blocking.strides[1] = xRank == 1 ? xTR->strideAt(0) : xTR->strideAt(1); + if(xRank > 2) + x_user_md.data.format_desc.blocking.strides[2] = xTR->strideAt(2); + } + + // y + dnnl::memory::desc y_mkl_md = dnnl::memory::desc(yShape, yType, dnnl::memory::format_tag::any); + dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, format); + if(yTR->ews() != 1 || yTR->ordering() != 'c') { + y_user_md.data.format_kind = dnnl_blocked; // overrides format + y_user_md.data.format_desc.blocking.strides[0] = yRank == 1 ? 1 : yTR->strideAt(0); + y_user_md.data.format_desc.blocking.strides[1] = yRank == 1 ? yTR->strideAt(0) : yTR->strideAt(1); + if(yRank > 2) + y_user_md.data.format_desc.blocking.strides[2] = yTR->strideAt(2); + } + + // z + dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, dnnl::memory::format_tag::any); + dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, format); + if(zR->ews() != 1 || zR->ordering() != 'c') { + z_user_md.data.format_kind = dnnl_blocked; // overrides format + z_user_md.data.format_desc.blocking.strides[0] = zRank == 1 ? 1 : zR->strideAt(0); + z_user_md.data.format_desc.blocking.strides[1] = zRank == 1 ? 
zR->strideAt(0) : zR->strideAt(1); + if(zRank > 2) + z_user_md.data.format_desc.blocking.strides[2] = zR->strideAt(2); + } + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + + // Create attributes (to handle alpha and beta if necessary) + dnnl::primitive_attr attr; // it is empty since we have usual values for alpha (=1) and beta (=0) + + // operation primitive description + dnnl::matmul::desc op_desc(x_mkl_md, y_mkl_md, z_mkl_md); + dnnl::matmul::primitive_desc op_prim_desc(op_desc, attr, engine); + + // arguments (memory buffers) necessary for calculations + std::unordered_map args; + + dnnl::stream stream(engine); + + // provide memory buffers and check whether reorder is required + + // input + auto x_user_mem = dnnl::memory(x_user_md, engine, xTR->getBuffer()); + const bool xReorder = op_prim_desc.src_desc() != x_user_mem.get_desc(); + auto x_mkl_mem = xReorder ? dnnl::memory(op_prim_desc.src_desc(), engine) : x_user_mem; + if (xReorder) + dnnl::reorder(x_user_mem, x_mkl_mem).execute(stream, x_user_mem, x_mkl_mem); + args[DNNL_ARG_SRC] = x_mkl_mem; + + // y + auto y_user_mem = dnnl::memory(y_user_md, engine, yTR->getBuffer()); + const bool yReorder = op_prim_desc.weights_desc() != y_user_mem.get_desc(); + auto y_mkl_mem = yReorder ? dnnl::memory(op_prim_desc.weights_desc(), engine) : y_user_mem; + if (yReorder) + dnnl::reorder(y_user_mem, y_mkl_mem).execute(stream, y_user_mem, y_mkl_mem); + args[DNNL_ARG_WEIGHTS] = y_mkl_mem; + + // z + auto z_user_mem = dnnl::memory(z_user_md, engine, zR->getBuffer()); + const bool zReorder = op_prim_desc.dst_desc() != z_user_mem.get_desc(); + auto z_mkl_mem = zReorder ? dnnl::memory(op_prim_desc.dst_desc(), engine) : z_user_mem; + args[DNNL_ARG_DST] = z_mkl_mem; + + // run calculations + dnnl::matmul(op_prim_desc).execute(stream, args); + + // reorder outputs if necessary + if (zReorder) + dnnl::reorder(z_mkl_mem, z_user_mem).execute(stream, z_mkl_mem, z_user_mem); + + stream.wait(); + + if(zR->getBuffer() != z->getBuffer()) + z->assign(zR); + + if(zR != z) + delete zR; + if(xTR != xT) + delete xTR; + if(xT != x) + delete xT; + if(yTR != yT) + delete yTR; + if(yT != y) + delete yT; + + // shape::printArray(z_mkl_mem.map_data(),8); +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_IMPL(matmul, ENGINE_CPU) { + + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + auto z = OUTPUT_VARIABLE(0); + + if(x->isEmpty() || y->isEmpty()) + return Status::OK(); + + const int iSize = (int) block.getIArguments()->size(); + int transX = iSize > 0 ? INT_ARG(0) : 0; + int transY = iSize > 1 ? INT_ARG(1) : 0; + const int transZ = iSize > 2 ? INT_ARG(2) : 0; + + const int xRank = x->rankOf(); + const int yRank = y->rankOf(); + const int zRank = z->rankOf(); + + if (transZ) { + x = INPUT_VARIABLE(1); + y = INPUT_VARIABLE(0); + bool temp = transX; + transX = !transY; + transY = !temp; + } + + const int xLastDim = transX ? -2 : -1; + const int yLastDim = transY ? -2 : -1; + const int xLastButOneDim = transX ? -1 : -2; + const int yLastButOneDim = transY ? 
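The x/y/z handling above follows the usual oneDNN pattern: wrap the existing NDArray buffer in a dnnl::memory described by the user layout, and only reorder into the primitive's preferred layout when the two descriptors differ. A condensed sketch of that pattern for one operand; user_md, prim_desc, engine, stream, args and userBuffer stand in for the corresponding objects in the code above:

    // Wrap the caller's buffer, reorder only if the primitive prefers another layout.
    auto user_mem = dnnl::memory(user_md, engine, userBuffer);
    const bool needReorder = prim_desc.src_desc() != user_mem.get_desc();
    auto mkl_mem = needReorder ? dnnl::memory(prim_desc.src_desc(), engine) : user_mem;
    if (needReorder)
        dnnl::reorder(user_mem, mkl_mem).execute(stream, user_mem, mkl_mem);
    args[DNNL_ARG_SRC] = mkl_mem;      // DNNL_ARG_WEIGHTS / DNNL_ARG_DST for y and z
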
-1 : -2; + + // ******* input validation ******* // + REQUIRE_TRUE(xRank > 0 && yRank > 0, 0, "MATMUL MKLDNN OP: input arrays must have rank bigger than 0 (should not be scalars), but got instead: x rank = %i, y rank = %i !", xRank, yRank); + + if (xRank == 1 && yRank == 1) { // dot case, output is scalar (or vector with length = 1) + REQUIRE_TRUE(x->lengthOf() == y->lengthOf(), 0,"MATMUL MKLDNN OP: since input arrays are vectors they must have the same length, but got x length = %i, y length = %i !",x->lengthOf(), y->lengthOf()); + } else if (xRank == 1 && yRank == 2) { // vector x matrix, i.e. [4] x [4,5] = [5], output is vector + REQUIRE_TRUE(x->lengthOf() == y->sizeAt(yLastButOneDim), 0, "MATMUL MKLDNN OP: input arrays have inconsistent shapes for vector-matrix product: x %s, y %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); + } else if (xRank == 2 && yRank == 1) { // matrix x vector , i.e. [4,5] x [5] = [4], output is vector + REQUIRE_TRUE(x->sizeAt(xLastDim) == y->lengthOf(), 0, "MATMUL MKLDNN OP: input arrays have inconsistent shapes for matrix-vector product: x %s, y %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); + } else { + REQUIRE_TRUE(xRank == yRank && yRank == zRank, 0, "MATMUL MKLDNN OP: input and output arrays must have the same rank, but got instead: x rank = %i, y rank = %i, z rank = %i !", xRank, yRank, zRank); + REQUIRE_TRUE(x->sizeAt(xLastDim) == y->sizeAt(yLastButOneDim) && x->sizeAt(xLastButOneDim) == z->sizeAt(-2) && y->sizeAt(yLastDim) == z->sizeAt(-1), 0, "MATMUL MKLDNN OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), ShapeUtils::shapeAsString(z).c_str()); + + if (xRank > 2) // outer dims must be the same + for (int i = 0; i < xRank - 2; ++i) + REQUIRE_TRUE(x->sizeAt(i) == y->sizeAt(i) && y->sizeAt(i) == z->sizeAt(i), 0, "MATMUL MKLDNN OP: input/output arrays have inconsistent shapes for matrix product: x %s, y %s, z %s !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str(), ShapeUtils::shapeAsString(z).c_str()); + } + // ******* end of input validation ******* // + + matmulMKLDNN(x, y, z, transX, transY); + + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_CHECK(matmul, ENGINE_CPU) { + + auto x = INPUT_VARIABLE(0); + auto y = INPUT_VARIABLE(1); + + auto z = INPUT_VARIABLE(0); + + const DataType xType = x->dataType(); + const DataType yType = y->dataType(); + const DataType zType = z->dataType(); + + + return block.isUseMKLDNN() && x->rankOf() < 3 && + ( + (xType==DataType::FLOAT32 && yType==DataType::FLOAT32 && zType==DataType::FLOAT32) || + (xType==DataType::HALF && yType==DataType::HALF && zType==DataType::FLOAT32) || + (xType==DataType::BFLOAT16 && yType==DataType::BFLOAT16 && zType==DataType::BFLOAT16) || + ((xType==DataType::UINT8 || xType==DataType::INT8) && (yType==DataType::UINT8 || yType==DataType::INT8) && (zType==DataType::UINT8 || zType==DataType::INT8 || zType==DataType::INT32 || zType==DataType::FLOAT32)) + ); +} + + +} +} +} diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h index c8b34a6c0..10adf533d 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h +++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h @@ -84,6 +84,8 @@ namespace nd4j{ 
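The new mkldnn matmul above normalizes every supported case to the 2-D/3-D forms oneDNN expects ([M,K] x [K,N] or [bS,M,K] x [bS,K,N]) by optionally permuting for transposition and collapsing the leading batch dimensions. A worked example of that bookkeeping for rank-4 inputs (numbers illustrative):

    // x: [2,3, 4,5], y: [2,3, 5,4], z: [2,3, 4,4]   (transX = transY = false)
    // collapse batch dims:  xTR: [6, 4, 5]   yTR: [6, 5, 4]   zR: [6, 4, 4]
    //
    // M  = xTR->sizeAt(-2) = 4
    // K  = xTR->sizeAt(-1) = 5
    // N  = yTR->sizeAt(-1) = 4
    // bS = xTR->sizeAt(0)  = 6
    //
    // oneDNN then runs a plain batched GEMM [6,4,5] x [6,5,4] = [6,4,4]; the result
    // lands in z either through a shared buffer or via the explicit z->assign(zR).
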
DECLARE_PLATFORM(depthwise_conv2d, ENGINE_CPU); DECLARE_PLATFORM(depthwise_conv2d_bp, ENGINE_CPU); + + DECLARE_PLATFORM(matmul, ENGINE_CPU); } } diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp index 1a35ecd47..8ef8032bb 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp @@ -19,8 +19,10 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0); + + BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp index be8edad04..5bb518d76 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp index 915983bb0..27b68e732 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp index d2f59137d..80e2258c7 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp index 29caeae84..e34b0c528 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp index 489d1fc6a..96797cc98 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , 
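The specials_double_N.cpp / specials_single_N.cpp edits in this and the following hunks only re-point each compilation unit at the split headers; the underlying pattern is explicit template instantiation sliced into chunks, so that no single translation unit has to instantiate the full LIBND4J_TYPES x LIBND4J_TYPES product. A toy, self-contained illustration of that pattern is below; the names are made up and the real code drives this with BUILD_DOUBLE_TEMPLATE and the LIBND4J_TYPES_N chunk macros.

// Toy illustration only.
#include <cstdio>

template <typename X, typename Y>
struct PairOp {
    static void run() { std::printf("%zu <- %zu\n", sizeof(Y), sizeof(X)); }
};

// "chunk 0" (would live in its own .cpp): one slice of the type product
template struct PairOp<float, float>;
template struct PairOp<float, double>;
// "chunk 1" (another .cpp): the next slice
template struct PairOp<double, float>;
template struct PairOp<double, double>;

int main() {
    PairOp<float, double>::run();   // links against the chunk-0 instantiation
    return 0;
}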
LIBND4J_TYPES, LIBND4J_TYPES_5); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp index 6f50c4682..70c7f3990 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp index 03a31221f..e2d1df0e9 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp index 074f09238..25e14d39f 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp index 8de7c663b..f3b4cbcb6 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp index 3e841dfae..4d1575123 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp index 59a215c20..b50c487b7 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp index 77617173d..972b936dd 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp +++ 
b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp index 2c19c3bc6..9eb99b238 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp index cd6babb61..6558d7284 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp index b54028b42..d89652899 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp index 4ca54e7b1..40c9598ee 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp index 3d843ca4c..e49ace221 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp index d8dc34f1c..973b25edc 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp 
b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp index 2c12f2803..b3bf0beeb 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9); diff --git a/libnd4j/include/ops/impl/gemm.cpp b/libnd4j/include/ops/impl/gemm.cpp index 2779bdadf..efd57a7c5 100644 --- a/libnd4j/include/ops/impl/gemm.cpp +++ b/libnd4j/include/ops/impl/gemm.cpp @@ -34,7 +34,7 @@ namespace nd4j { // handle transpose in parallel auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { for (int c = 0; c < cols; c++) { int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); @@ -73,7 +73,7 @@ namespace nd4j { C[r] = z; } else { auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) + for (auto r = start; r < stop; r++) C[r] = z; }; samediff::Threads::parallel_for(func, 0, length); @@ -130,7 +130,7 @@ namespace nd4j { auto aT = TRANS == CblasTrans ? reinterpret_cast(nd4j::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { int aIdx = linearIndexC(M, N, r, 0); auto aX = aT + aIdx; diff --git a/libnd4j/include/ops/impl/specials.hpp b/libnd4j/include/ops/impl/specials.hpp deleted file mode 100644 index 207ca5964..000000000 --- a/libnd4j/include/ops/impl/specials.hpp +++ /dev/null @@ -1,667 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com, created on 07.10.2017. -// @author Yurii Shyrma (iuriish@yahoo.com) -// - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nd4j { - -/** -* Concatneate multi array of the same shape together -* along a particular dimension -*/ -template -void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { - const uint numOfArrs = inArrs.size(); - - int outDim; - const bool isOutputVector = output.isCommonVector(outDim); - - if(isOutputVector || (axis == 0 && output.ordering() == 'c')) { - - bool allVectorsOrScalars = true; - const uint outEws = isOutputVector ? 
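The gemm.cpp hunks just above (and many later hunks in this patch) replace "r += increment" with "r++" inside PRAGMA_THREADS_FOR bodies, which reads as: each worker is handed a dense [start, stop) slice, so the inner loop should step by one. Whether increment is always 1 at those call sites is an assumption on my part; the sketch below is a hypothetical chunked partitioner to show the intended shape, not the actual samediff::Threads implementation.

// Hypothetical chunked parallel_for; real samediff::Threads semantics may differ.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

void parallel_for_chunked(const std::function<void(int64_t, int64_t)>& body,
                          int64_t start, int64_t stop, int numThreads) {
    std::vector<std::thread> pool;
    const int64_t span = (stop - start + numThreads - 1) / numThreads;
    for (int t = 0; t < numThreads; t++) {
        const int64_t s = start + t * span;
        const int64_t e = std::min(stop, s + span);
        if (s >= e) break;
        // each worker gets a dense [s, e) slice, so its body iterates with r++
        pool.emplace_back([&body, s, e] { body(s, e); });
    }
    for (auto& th : pool) th.join();
}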
output.stridesOf()[outDim] : output.ews(); - - std::vector nonUnityDim(numOfArrs); - std::vector zOffset(numOfArrs); - - for(int i = 0; i < numOfArrs; i++) { - allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i])); - if(!allVectorsOrScalars) - break; - if(i == 0) zOffset[0] = 0; - else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf(); - } - - if(allVectorsOrScalars) { - - T* outBuff = output.bufferAsT(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - const Nd4jLong arrLen = inArrs[r]->lengthOf(); - const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; - - T *z = outBuff + zOffset[r]; - T *x = inArrs[r]->bufferAsT(); - - if (outEws == 1 && xEws == 1) - for (Nd4jLong e = 0; e < arrLen; e++) - z[e] = x[e]; - else - for (Nd4jLong e = 0; e < arrLen; e++) - z[e * outEws] = x[e * xEws]; - } - }; - - samediff::Threads::parallel_tad(func, 0, numOfArrs); - return; - } - } - - const int rank = inArrs[0]->rankOf(); - const int rank2 = 2*rank; - std::vector> indices(numOfArrs, std::vector(rank2,0)); - - // take into account indices for first array - indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis); - - // loop through the rest of input arrays - for(int i = 1; i < numOfArrs; ++i) { - indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from - indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) - } - - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - auto temp = output(indices[i], true); - nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); - } - }; - - samediff::Threads::parallel_tad(func, 0, numOfArrs); -} - -/** -* Concatneate multi array of the same shape together -* along a particular dimension -*/ -template -void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPointer *data, Nd4jPointer *inputShapeInfo, void *vresult, Nd4jLong *resultShapeInfo) { - auto result = reinterpret_cast(vresult); - std::vector inputs(numArrays); - - NDArray output(static_cast(result), static_cast(resultShapeInfo)); - - for(int i = 0; i < numArrays; ++i) - inputs[i] = new NDArray(static_cast(data[i]), static_cast(inputShapeInfo[i])); - - nd4j::SpecialMethods::concatCpuGeneric(inputs, output, dimension); - - for(int i = 0; i < numArrays; ++i) - delete inputs[i]; -} - -/** - * This kernel accumulates X arrays, and stores result into Z - * - * @tparam T - * @param x - * @param z - * @param n - * @param length - */ - template - void SpecialMethods::accumulateGeneric(void **vx, void *vz, Nd4jLong *zShapeInfo, int n, const Nd4jLong length) { - auto z = reinterpret_cast(vz); - auto x = reinterpret_cast(vx); - - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - for (auto ar = 0L; ar < n; ar++) { - z[i] += x[ar][i]; - } - } - }; - - samediff::Threads::parallel_for(func, 0, length); - } - - -/** - * This kernel averages X input arrays, and stores result to Z - * - * @tparam T - * @param x - * @param z - * @param n - * @param length - * @param propagate - */ - template - void SpecialMethods::averageGeneric(void **vx, void *vz, Nd4jLong *zShapeInfo, int n, const Nd4jLong length, bool propagate) { - auto z = reinterpret_cast(vz); - auto x = reinterpret_cast(vx); - - if (z == nullptr) { - //code branch for absent Z - z = x[0]; - - PRAGMA_OMP_SIMD - for 
(uint64_t i = 0; i < length; i++) { - z[i] /= static_cast(n); - } - - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - for (Nd4jLong ar = 1; ar < n; ar++) { - z[i] += x[ar][i] / static_cast(n); - } - } - }; - samediff::Threads::parallel_for(func, 0, length); - - // instead of doing element-wise propagation, we just issue memcpy to propagate data - for (Nd4jLong ar = 1; ar < n; ar++) { - memcpy(x[ar], z, length * sizeof(T)); - } - } else { - // code branch for existing Z - - // memset before propagation - memset(z, 0, length * sizeof(T)); - - // aggregation step - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - for (Nd4jLong ar = 0; ar < n; ar++) { - z[i] += x[ar][i] / static_cast(n); - } - } - }; - samediff::Threads::parallel_for(func, 0, length); - - // instead of doing element-wise propagation, we just issue memcpy to propagate data - for (Nd4jLong ar = 0; ar < n; ar++) { - memcpy(x[ar], z, length * sizeof(T)); - } - } - } - - template - Nd4jLong SpecialMethods::getPosition(Nd4jLong *xShapeInfo, Nd4jLong index) { - auto xEWS = shape::elementWiseStride(xShapeInfo); - - if (xEWS == 1) - return index; - else if (xEWS > 1) - return index * xEWS; - else - return shape::getIndexOffset(index, xShapeInfo); - } - - template - void SpecialMethods::quickSort_parallel_internal(T* array, Nd4jLong *xShapeInfo, int left, int right, int cutoff, bool descending) { - - int i = left, j = right; - T tmp; - T pivot = array[getPosition(xShapeInfo, (left + right) / 2)]; - - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (array[getPosition(xShapeInfo, i)] > pivot) - i++; - while (array[getPosition(xShapeInfo, j)] < pivot) - j--; - if (i <= j) { - tmp = array[getPosition(xShapeInfo, i)]; - array[getPosition(xShapeInfo, i)] = array[getPosition(xShapeInfo, j)]; - array[getPosition(xShapeInfo, j)] = tmp; - i++; - j--; - } - } else { - while (array[getPosition(xShapeInfo, i)] < pivot) - i++; - while (array[getPosition(xShapeInfo, j)] > pivot) - j--; - if (i <= j) { - tmp = array[getPosition(xShapeInfo, i)]; - array[getPosition(xShapeInfo, i)] = array[getPosition(xShapeInfo, j)]; - array[getPosition(xShapeInfo, j)] = tmp; - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - void SpecialMethods::quickSort_parallel(void *varray, Nd4jLong *xShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal(array, xShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - - } - - - - template - int SpecialMethods::nextPowerOf2(int number) { - int pos = 0; - - while (number > 0) { - pos++; - number = number >> 1; - } - return (int) pow(2, pos); - } - - template - int SpecialMethods::lastPowerOf2(int number) { - int p = 1; - while (p <= number) - p <<= 1; - - p >>= 1; - return p; - } - - - template - void SpecialMethods::sortGeneric(void *vx, Nd4jLong *xShapeInfo, bool descending) { - auto x = reinterpret_cast(vx); - - quickSort_parallel(x, xShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - } - - template - void SpecialMethods::sortTadGeneric(void *vx, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool descending) { - auto x = reinterpret_cast(vx); - - //quickSort_parallel(x, xShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - Nd4jLong 
xLength = shape::length(xShapeInfo); - Nd4jLong xTadLength = shape::tadLength(xShapeInfo, dimension, dimensionLength); - int numTads = xLength / xTadLength; - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - T *dx = x + tadOffsets[r]; - - quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); - } - }; - samediff::Threads::parallel_tad(func, 0, numTads); - } - - - template - void SpecialMethods::decodeBitmapGeneric(void *dx, Nd4jLong N, void *vz, Nd4jLong *zShapeInfo) { - auto dz = reinterpret_cast(vz); - auto x = reinterpret_cast(dx); - Nd4jLong lim = N / 16 + 5; - - FloatBits2 fb; - fb.i_ = x[2]; - float threshold = fb.f_; - - - auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { - for (int bitId = 0; bitId < 16; bitId++) { - bool hasBit = (x[e] & 1 << (bitId)) != 0; - bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; - - if (hasBit) { - if (hasSign) - dz[(e - 4) * 16 + bitId] -= static_cast(threshold); - else - dz[(e - 4) * 16 + bitId] += static_cast(threshold); - } else if (hasSign) { - dz[(e - 4) * 16 + bitId] -= static_cast(threshold / 2); - } - } - } - }; - - samediff::Threads::parallel_for(func, 4, lim); - } - - template - void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { - auto x = reinterpret_cast(dx); - auto z = reinterpret_cast(dz); - - - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - z[i] = static_cast(x[i]); - } - }; - - samediff::Threads::parallel_for(func, 0, N); - }; - BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); - - template - Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { - auto dx = reinterpret_cast(vx); - -//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) - auto func = PRAGMA_REDUCE_LONG { - Nd4jLong retVal = 0L; - - for (auto x = start; x < stop; x += increment) { - int byte = 0; - int byteId = x / 16 + 4; - - for (int f = 0; f < 16; f++) { - Nd4jLong e = x + f; - - if (e >= N) - continue; - - T val = dx[e]; - T abs = nd4j::math::nd4j_abs(val); - - int bitId = e % 16; - - if (abs >= (T) threshold) { - byte |= 1 << (bitId); - retVal++; - - if (val < (T) 0.0f) { - byte |= 1 << (bitId + 16); - dx[e] += static_cast(threshold); - } else { - dx[e] -= static_cast(threshold); - } - } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { - byte |= 1 << (bitId + 16); - dx[e] += static_cast(threshold / 2); - - retVal++; - } - } - - dz[byteId] = byte; - } - - return retVal; - }; - return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); - } - - template - void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { - int i = left, j = right; - X ktmp; - X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; - - Y vtmp; - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) - i++; - while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = values[shape::getIndexOffset(i, 
yShapeInfo)]; - values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; - values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } else { - while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) - i++; - while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; - values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; - values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { - int i = left, j = right; - X ktmp; - Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; - - Y vtmp; - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) - i++; - while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; - value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; - value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } else { - while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) - i++; - while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; - value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; - value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - auto values = reinterpret_cast(yarray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - } - - template - static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - auto values = reinterpret_cast(yarray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - } - - template - void DoubleMethods::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { - quickSort_parallel_key(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), 
descending); - } - - template - void DoubleMethods::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { - quickSort_parallel_value(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - } - - template - void DoubleMethods::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); - - auto xLength = shape::length(xShapeInfo); - auto xTadLength = shape::length(packX.primaryShapeInfo()); - auto numTads = packX.numberOfTads(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; - - quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } - }; - - samediff::Threads::parallel_tad(func, 0, numTads); - } - - template - void DoubleMethods::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); - - auto xLength = shape::length(xShapeInfo); - auto xTadLength = shape::length(packX.primaryShapeInfo()); - auto numTads = packX.numberOfTads(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; - - quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } - }; - - samediff::Threads::parallel_tad(func, 0, numTads); - } - - //BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES); - //BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES); -} - diff --git a/libnd4j/include/ops/impl/specials_double.hpp b/libnd4j/include/ops/impl/specials_double.hpp new file mode 100644 index 000000000..73f50c772 --- /dev/null +++ b/libnd4j/include/ops/impl/specials_double.hpp @@ -0,0 +1,270 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com, created on 07.10.2017. 
+// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { + + + template + void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { + auto x = reinterpret_cast(dx); + auto z = reinterpret_cast(dz); + + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + z[i] = static_cast(x[i]); + } + }; + + samediff::Threads::parallel_for(func, 0, N); + }; + + + template + void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { + int i = left, j = right; + X ktmp; + X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; + + Y vtmp; + + { + /* PARTITION PART */ + while (i <= j) { + if (descending) { + while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) + i++; + while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } else { + while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) + i++; + while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } + } + + } + + // + + if ( ((right-left) + void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { + int i = left, j = right; + X ktmp; + Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; + + Y vtmp; + + { + /* PARTITION PART */ + while (i <= j) { + if (descending) { + while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) + i++; + while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } else { + while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) + i++; + while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + 
j--; + } + } + } + + } + + // + + if ( ((right-left) + static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + auto values = reinterpret_cast(yarray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + } + + template + static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + auto values = reinterpret_cast(yarray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + } + + template + void DoubleMethods::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { + quickSort_parallel_key(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void DoubleMethods::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { + quickSort_parallel_value(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void DoubleMethods::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { + auto x = reinterpret_cast(vx); + auto y = reinterpret_cast(vy); + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + + auto xLength = shape::length(xShapeInfo); + auto xTadLength = shape::length(packX.primaryShapeInfo()); + auto numTads = packX.numberOfTads(); + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; + + quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); + } + + template + void DoubleMethods::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { + auto x = reinterpret_cast(vx); + auto y = reinterpret_cast(vy); + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + + auto xLength = shape::length(xShapeInfo); + auto xTadLength = shape::length(packX.primaryShapeInfo()); + auto numTads = packX.numberOfTads(); + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; + + quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); + } +} + diff --git a/libnd4j/include/ops/impl/specials_single.hpp b/libnd4j/include/ops/impl/specials_single.hpp new file mode 100644 index 
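The sortByKey / sortTadByKey routines above keep two buffers in lockstep: every swap applied to the key array is mirrored on the value array at the same logical index. A compact single-threaded equivalent is sketched below, assuming plain contiguous buffers; the real code walks shape-info offsets and parallelises per TAD.

// Simplified, contiguous-buffer illustration only.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

template <typename K, typename V>
void sort_by_key(std::vector<K>& keys, std::vector<V>& values, bool descending) {
    std::vector<size_t> idx(keys.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](size_t a, size_t b) {
        return descending ? keys[a] > keys[b] : keys[a] < keys[b];
    });
    std::vector<K> k2(keys.size());
    std::vector<V> v2(values.size());
    for (size_t i = 0; i < idx.size(); i++) {
        k2[i] = keys[idx[i]];
        v2[i] = values[idx[i]];
    }
    keys.swap(k2);
    values.swap(v2);
}

int main() {
    std::vector<float> k = {3.f, 1.f, 2.f};
    std::vector<int>   v = {30, 10, 20};
    sort_by_key(k, v, false);
    std::printf("%g %g %g | %d %d %d\n", k[0], k[1], k[2], v[0], v[1], v[2]);  // 1 2 3 | 10 20 30
}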
000000000..ad63ee490 --- /dev/null +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -0,0 +1,520 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com, created on 07.10.2017. +// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { +/** +* Concatneate multi array of the same shape together +* along a particular dimension +*/ +// template +// void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { +// const uint numOfArrs = inArrs.size(); + +// int outDim; +// const bool isOutputVector = output.isCommonVector(outDim); + +// if(isOutputVector || (axis == 0 && output.ordering() == 'c')) { + +// bool allVectorsOrScalars = true; +// const uint outEws = isOutputVector ? output.stridesOf()[outDim] : output.ews(); + +// std::vector nonUnityDim(numOfArrs); +// std::vector zOffset(numOfArrs); + +// for(int i = 0; i < numOfArrs; i++) { +// allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i])); +// if(!allVectorsOrScalars) +// break; +// if(i == 0) zOffset[0] = 0; +// else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf(); +// } + +// if(allVectorsOrScalars) { + +// T* outBuff = output.bufferAsT(); + +// auto func = PRAGMA_THREADS_FOR { +// for (auto r = start; r < stop; r += increment) { +// const Nd4jLong arrLen = inArrs[r]->lengthOf(); +// const uint xEws = (arrLen == 1) ? 
1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + +// T *z = outBuff + zOffset[r]; +// T *x = inArrs[r]->bufferAsT(); + +// if (outEws == 1 && xEws == 1) +// for (Nd4jLong e = 0; e < arrLen; e++) +// z[e] = x[e]; +// else +// for (Nd4jLong e = 0; e < arrLen; e++) +// z[e * outEws] = x[e * xEws]; +// } +// }; + +// samediff::Threads::parallel_tad(func, 0, numOfArrs); +// return; +// } +// } + +// const int rank = inArrs[0]->rankOf(); +// const int rank2 = 2*rank; +// std::vector> indices(numOfArrs, std::vector(rank2,0)); + +// // take into account indices for first array +// indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis); + +// // loop through the rest of input arrays +// for(int i = 1; i < numOfArrs; ++i) { +// indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from +// indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) +// } + +// auto func = PRAGMA_THREADS_FOR { +// for (auto i = start; i < stop; i += increment) { +// auto temp = output(indices[i], true); +// nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); +// } +// }; + +// samediff::Threads::parallel_tad(func, 0, numOfArrs); +// } + +template +void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { + + const int numOfInArrs = inArrs.size(); + const auto sizeofT = output.sizeOfT(); + + T* zBuff = output.bufferAsT(); + + bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1; + + if(luckCase1) { + for (uint i = 0; i < numOfInArrs; ++i) { + luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1; + if(!luckCase1) + break; + } + } + + if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f + + T* z = zBuff; + for (uint i = 0; i < numOfInArrs; ++i) { + const auto memAmountToCopy = inArrs[i]->lengthOf(); + memcpy(z, inArrs[i]->bufferAsT(), memAmountToCopy * sizeofT); + z += memAmountToCopy; + } + return; + } + + const bool isZcontin = output.strideAt(axis) == 1 && output.ordering() == 'c'; + bool areInputsContin = true; + bool allSameOrder = true; + + if(isZcontin) { + for (uint i = 0; i < numOfInArrs; ++i) { + areInputsContin &= inArrs[i]->strideAt(axis) == 1; + allSameOrder &= inArrs[i]->ordering() == output.ordering(); + if(!areInputsContin || !allSameOrder) + break; + } + } + + const bool luckCase2 = isZcontin && areInputsContin && allSameOrder; + + if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array + + const uint zDim = output.sizeAt(axis); + + for (uint i = 0; i < output.lengthOf() / zDim; ++i) { + T* z = zBuff + zDim * i; + + for (uint j = 0; j < inArrs.size(); ++j) { + const auto xDim = inArrs[j]->sizeAt(axis); + const T* x = inArrs[j]->bufferAsT() + xDim * i; + memcpy(z, x, xDim * sizeofT); + z += xDim; + } + } + + return; + } + + // general case + auto func = PRAGMA_THREADS_FOR { + + Nd4jLong coords[MAX_RANK]; + + for (auto i = start; i < stop; i += increment) { + + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); + + uint inArrIdx = 0; + uint xDim = inArrs[inArrIdx]->sizeAt(axis); + + while (coords[axis] >= xDim) { + coords[axis] -= xDim; + xDim = 
inArrs[++inArrIdx]->sizeAt(axis); + } + + const T* x = inArrs[inArrIdx]->bufferAsT(); + const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords); + + zBuff[zOffset] = x[xOffset]; + } + }; + + samediff::Threads::parallel_for(func, 0, output.lengthOf()); +} + +/** +* Concatneate multi array of the same shape together +* along a particular dimension +*/ +template +void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPointer *data, Nd4jPointer *inputShapeInfo, void *vresult, Nd4jLong *resultShapeInfo) { + auto result = reinterpret_cast(vresult); + std::vector inputs(numArrays); + + NDArray output(static_cast(result), static_cast(resultShapeInfo)); + + for(int i = 0; i < numArrays; ++i) + inputs[i] = new NDArray(static_cast(data[i]), static_cast(inputShapeInfo[i])); + + nd4j::SpecialMethods::concatCpuGeneric(inputs, output, dimension); + + for(int i = 0; i < numArrays; ++i) + delete inputs[i]; +} + + +/** + * This kernel accumulates X arrays, and stores result into Z + * + * @tparam T + * @param x + * @param z + * @param n + * @param length + */ + template + void SpecialMethods::accumulateGeneric(void **vx, void *vz, Nd4jLong *zShapeInfo, int n, const Nd4jLong length) { + auto z = reinterpret_cast(vz); + auto x = reinterpret_cast(vx); + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + for (auto ar = 0L; ar < n; ar++) { + z[i] += x[ar][i]; + } + } + }; + + samediff::Threads::parallel_for(func, 0, length); + } + + +/** + * This kernel averages X input arrays, and stores result to Z + * + * @tparam T + * @param x + * @param z + * @param n + * @param length + * @param propagate + */ + template + void SpecialMethods::averageGeneric(void **vx, void *vz, Nd4jLong *zShapeInfo, int n, const Nd4jLong length, bool propagate) { + auto z = reinterpret_cast(vz); + auto x = reinterpret_cast(vx); + + if (z == nullptr) { + //code branch for absent Z + z = x[0]; + + PRAGMA_OMP_SIMD + for (uint64_t i = 0; i < length; i++) { + z[i] /= static_cast(n); + } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + for (Nd4jLong ar = 1; ar < n; ar++) { + z[i] += x[ar][i] / static_cast(n); + } + } + }; + samediff::Threads::parallel_for(func, 0, length); + + // instead of doing element-wise propagation, we just issue memcpy to propagate data + for (Nd4jLong ar = 1; ar < n; ar++) { + memcpy(x[ar], z, length * sizeof(T)); + } + } else { + // code branch for existing Z + + // memset before propagation + memset(z, 0, length * sizeof(T)); + + // aggregation step + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + for (Nd4jLong ar = 0; ar < n; ar++) { + z[i] += x[ar][i] / static_cast(n); + } + } + }; + samediff::Threads::parallel_for(func, 0, length); + + // instead of doing element-wise propagation, we just issue memcpy to propagate data + for (Nd4jLong ar = 0; ar < n; ar++) { + memcpy(x[ar], z, length * sizeof(T)); + } + } + } + + template + Nd4jLong SpecialMethods::getPosition(Nd4jLong *xShapeInfo, Nd4jLong index) { + auto xEWS = shape::elementWiseStride(xShapeInfo); + + if (xEWS == 1) + return index; + else if (xEWS > 1) + return index * xEWS; + else + return shape::getIndexOffset(index, xShapeInfo); + } + + template + void SpecialMethods::quickSort_parallel_internal(T* array, Nd4jLong *xShapeInfo, int left, int right, int cutoff, bool descending) { + + int i = left, j = right; + T tmp; + T pivot = array[getPosition(xShapeInfo, (left + right) / 2)]; + + + { + /* PARTITION PART */ + while (i <= j) { + if 
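The rewritten concatCpuGeneric above is a three-tier dispatch: whole-array memcpy when the output and all inputs are contiguous in matching order (luckCase1), per-slice memcpy when the concat axis has stride 1 everywhere (luckCase2), and an index2coords walk otherwise. The first tier in isolation reduces to the sketch below, assuming float buffers and axis-0 concatenation of C-ordered arrays.

// Sketch of the "luckCase1" fast path only: contiguous inputs concatenated
// along axis 0 degenerate to back-to-back memcpy.
#include <cstring>
#include <vector>

void concat_axis0_contiguous(const std::vector<const float*>& ins,
                             const std::vector<size_t>& lengths,
                             float* out) {
    float* z = out;
    for (size_t i = 0; i < ins.size(); i++) {
        std::memcpy(z, ins[i], lengths[i] * sizeof(float));
        z += lengths[i];
    }
}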
(descending) { + while (array[getPosition(xShapeInfo, i)] > pivot) + i++; + while (array[getPosition(xShapeInfo, j)] < pivot) + j--; + if (i <= j) { + tmp = array[getPosition(xShapeInfo, i)]; + array[getPosition(xShapeInfo, i)] = array[getPosition(xShapeInfo, j)]; + array[getPosition(xShapeInfo, j)] = tmp; + i++; + j--; + } + } else { + while (array[getPosition(xShapeInfo, i)] < pivot) + i++; + while (array[getPosition(xShapeInfo, j)] > pivot) + j--; + if (i <= j) { + tmp = array[getPosition(xShapeInfo, i)]; + array[getPosition(xShapeInfo, i)] = array[getPosition(xShapeInfo, j)]; + array[getPosition(xShapeInfo, j)] = tmp; + i++; + j--; + } + } + } + + } + + // + + if ( ((right-left) + void SpecialMethods::quickSort_parallel(void *varray, Nd4jLong *xShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal(array, xShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + + } + + + + template + int SpecialMethods::nextPowerOf2(int number) { + int pos = 0; + + while (number > 0) { + pos++; + number = number >> 1; + } + return (int) pow(2, pos); + } + + template + int SpecialMethods::lastPowerOf2(int number) { + int p = 1; + while (p <= number) + p <<= 1; + + p >>= 1; + return p; + } + + + template + void SpecialMethods::sortGeneric(void *vx, Nd4jLong *xShapeInfo, bool descending) { + auto x = reinterpret_cast(vx); + + quickSort_parallel(x, xShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void SpecialMethods::sortTadGeneric(void *vx, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool descending) { + auto x = reinterpret_cast(vx); + + //quickSort_parallel(x, xShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + Nd4jLong xLength = shape::length(xShapeInfo); + Nd4jLong xTadLength = shape::tadLength(xShapeInfo, dimension, dimensionLength); + int numTads = xLength / xTadLength; + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + T *dx = x + tadOffsets[r]; + + quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); + } + }; + samediff::Threads::parallel_tad(func, 0, numTads); + } + + + template + void SpecialMethods::decodeBitmapGeneric(void *dx, Nd4jLong N, void *vz, Nd4jLong *zShapeInfo) { + auto dz = reinterpret_cast(vz); + auto x = reinterpret_cast(dx); + Nd4jLong lim = N / 16 + 5; + + FloatBits2 fb; + fb.i_ = x[2]; + float threshold = fb.f_; + + + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e++) { + for (int bitId = 0; bitId < 16; bitId++) { + bool hasBit = (x[e] & 1 << (bitId)) != 0; + bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; + + if (hasBit) { + if (hasSign) + dz[(e - 4) * 16 + bitId] -= static_cast(threshold); + else + dz[(e - 4) * 16 + bitId] += static_cast(threshold); + } else if (hasSign) { + dz[(e - 4) * 16 + bitId] -= static_cast(threshold / 2); + } + } + } + }; + + samediff::Threads::parallel_for(func, 4, lim); + } + + template + Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { + auto dx = reinterpret_cast(vx); + +//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong retVal = 0L; + + for (auto x = start; x < stop; x += increment) { + int byte = 0; + int byteId = x / 16 + 4; + + 
for (int f = 0; f < 16; f++) { + Nd4jLong e = x + f; + + if (e >= N) + continue; + + T val = dx[e]; + T abs = nd4j::math::nd4j_abs(val); + + int bitId = e % 16; + + if (abs >= (T) threshold) { + byte |= 1 << (bitId); + retVal++; + + if (val < (T) 0.0f) { + byte |= 1 << (bitId + 16); + dx[e] += static_cast(threshold); + } else { + dx[e] -= static_cast(threshold); + } + } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { + byte |= 1 << (bitId + 16); + dx[e] += static_cast(threshold / 2); + + retVal++; + } + } + + dz[byteId] = byte; + } + + return retVal; + }; + return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); + } +} + diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index a25aa36ec..354f8e328 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -167,7 +167,7 @@ namespace randomOps { if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T prob = rng->relativeT(e); T cumProb = (T) 0.0f; for (Nd4jLong f = 0; f < yLength; f++) { @@ -330,7 +330,7 @@ namespace randomOps { const T epsilon = static_cast(1e-5); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto epm = e + middle; // we need to get random values @@ -440,7 +440,7 @@ namespace randomOps { nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { - for (Nd4jLong e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -549,7 +549,7 @@ namespace randomOps { //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -690,7 +690,7 @@ namespace randomOps { const T epsilon = static_cast(1e-5); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { if (z[e] > mean + ds || z[e] < mean - ds) { z[e] = step(rng, mean, stddev, e, middle, z[e]); @@ -818,7 +818,7 @@ namespace randomOps { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto epm = e + middle; // we need to get random values diff --git a/libnd4j/include/platformmath.h b/libnd4j/include/platformmath.h index b7cbe3745..b58e8f7f6 100644 --- a/libnd4j/include/platformmath.h +++ b/libnd4j/include/platformmath.h @@ -326,6 +326,11 @@ namespace nd4j { #endif } + template <> + math_def FORCEINLINE bfloat16 p_floor(bfloat16 value) { + return static_cast(floorf((float)value)); + } + template <> math_def FORCEINLINE double p_floor(double value) { return floor(value); @@ -352,6 +357,11 @@ namespace nd4j { #endif } + template <> + math_def FORCEINLINE bfloat16 p_ceil(bfloat16 value) { + return static_cast(ceilf((float)value)); + } + template <> math_def FORCEINLINE double p_ceil(double value) { return ceil(value); @@ -374,6 +384,12 @@ namespace nd4j { return static_cast(roundf((float) val)); } + template <> + math_def FORCEINLINE bfloat16 p_round(bfloat16 value) { + return static_cast(roundf((float)value)); + } + + template <> math_def FORCEINLINE 
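For the encode/decode bitmap kernels above: each 32-bit word covers 16 elements, with the low 16 bits marking "magnitude reached the threshold" and the high 16 bits carrying the sign, while the first four ints hold header data (including the threshold as raw float bits). The sketch below shows only the element-to-bit mapping; it deliberately omits the threshold/2 case, the in-place residual update of the source array, and the header layout.

// Simplified element -> bit mapping in the spirit of encodeBitmapGeneric.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const float threshold = 0.5f;
    std::vector<float> x = {0.7f, -0.9f, 0.1f, -0.6f};
    int word = 0;                        // one 32-bit word covers 16 elements
    for (int e = 0; e < (int) x.size(); e++) {
        if (std::fabs(x[e]) >= threshold) {
            word |= 1 << e;              // "value present" bit
            if (x[e] < 0.0f)
                word |= 1 << (e + 16);   // sign bit in the upper half
        }
    }
    std::printf("0x%08x\n", word);       // presence bits 0,1,3 plus sign bits 17,19
}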
double p_round(double value) { return round(value); diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h index b412befd8..48021d734 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/templatemath.h @@ -127,6 +127,32 @@ namespace nd4j { template math_def inline Z nd4j_erfc(T num); + math_def inline int32_t floatToRawIntBits(float d) { + union { + float f; + int32_t i; + } tmp; + tmp.f = d; + return tmp.i; + } + + math_def inline float intBitsToFloat(int32_t i) { + union { + float f; + int32_t i; + } tmp; + tmp.i = i; + return tmp.f; + } + + math_def inline float mulsignf(float x, float y) { + return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31))); + } + + math_def inline float copysignfk(float x, float y) { + return intBitsToFloat((floatToRawIntBits(x) & ~(1 << 31)) ^ (floatToRawIntBits(y) & (1 << 31))); + } + template math_def inline Z nd4j_sigmoid(T val) { return (Z) 1.0f / ((Z) 1.0f + nd4j_exp(-val)); @@ -660,6 +686,11 @@ namespace nd4j { * @param val2 * @return */ + template <> + math_def inline float nd4j_pow(float val, float val2) { + return p_pow(val, val2); + } + template math_def inline Z nd4j_pow(X val, Y val2) { return p_pow(static_cast(val), static_cast(val2)); @@ -767,10 +798,23 @@ namespace nd4j { } + math_def inline float neu_tanh(float val, float sign) { + float e(M_E); + float av = sign * val; + auto p = nd4j::math::nd4j_pow(e, -av * 2.f); + return (1 - p) / (1 + p); + } + + template <> + math_def inline float nd4j_tanh(float val) { + float sign = copysignfk(1.0f, val); + return sign * neu_tanh(val, sign); + } + + template math_def inline Z nd4j_tanh(X val) { return val <= 0 ? neg_tanh(val) : pos_tanh(val); - //return p_tanh(static_cast(val)); } template diff --git a/libnd4j/include/types/types.h b/libnd4j/include/types/types.h index 92fada8d3..7322c6bd5 100644 --- a/libnd4j/include/types/types.h +++ b/libnd4j/include/types/types.h @@ -159,6 +159,38 @@ (nd4j::DataType::INT64, Nd4jLong), \ (nd4j::DataType::BFLOAT16, bfloat16) +#define NUMERIC_TYPES_0 \ + (nd4j::DataType::HALF, float16) + +#define NUMERIC_TYPES_1 \ + (nd4j::DataType::FLOAT32, float) + +#define NUMERIC_TYPES_2 \ + (nd4j::DataType::DOUBLE, double) + +#define NUMERIC_TYPES_3 \ + (nd4j::DataType::INT8, int8_t), \ + (nd4j::DataType::BFLOAT16, bfloat16) + +#define NUMERIC_TYPES_4 \ + (nd4j::DataType::UINT8, uint8_t) + +#define NUMERIC_TYPES_5 \ + (nd4j::DataType::UINT16, uint16_t) + +#define NUMERIC_TYPES_6 \ + (nd4j::DataType::UINT32, uint32_t) + +#define NUMERIC_TYPES_7 \ + (nd4j::DataType::UINT64, uint64_t) + +#define NUMERIC_TYPES_8 \ + (nd4j::DataType::INT16, int16_t) + +#define NUMERIC_TYPES_9 \ + (nd4j::DataType::INT32, int32_t), \ + (nd4j::DataType::INT64, Nd4jLong) + #define GENERIC_NUMERIC_TYPES \ (nd4j::DataType::HALF, float16), \ diff --git a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp index 1f6000f06..9b6d06ec6 100644 --- a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp @@ -179,6 +179,7 @@ TEST_F(BroadcastableOpsTests, Test_Minimum_1) { auto z = result->at(0); ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); delete result; diff --git a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp index 9a8f09b87..bc2ae2152 100644 --- a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp @@ 
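The new nd4j_tanh<float> specialization in templatemath.h above leans on the odd symmetry tanh(-x) = -tanh(x): it extracts the sign with a copysign bit trick, evaluates (1 - e^{-2|x|}) / (1 + e^{-2|x|}) on the non-negative half, and reapplies the sign. A plain-math sanity check of that identity, using std::copysign instead of the patch's bit-level copysignfk helper:

// Numerical sanity sketch only.
#include <cmath>
#include <cstdio>

float tanh_via_sign(float x) {
    const float sign = std::copysign(1.0f, x);
    const float p = std::exp(-2.0f * sign * x);   // e^(-2|x|), always <= 1
    return sign * (1.0f - p) / (1.0f + p);
}

int main() {
    for (float x : {-3.0f, -0.5f, 0.0f, 0.5f, 3.0f})
        std::printf("x=% .2f  approx=% .6f  std=% .6f\n",
                    x, tanh_via_sign(x), std::tanh(x));
}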
-54,7 +54,7 @@ TEST_F(BroadcastMultiDimTest,MultimDimTest) { tad->tadOnlyShapeInfo, //tadShapeInfo tad->tadOffsets, //tadOffset tad->tadOnlyShapeInfo, //tadShapeInfoZ - tad->tadOffsets, 0, tad->numTads); //tadOffsetZ + tad->tadOffsets, nd4j::LoopKind::COMMON, 0, tad->numTads); //tadOffsetZ for(int i = 0; i < 30; i++) { ASSERT_EQ(dataAssertion[i],result[i]); } diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index f538eb9cd..17ae714cd 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -52,7 +52,7 @@ elseif(WIN32) set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") endif() else() - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index 9df949267..507a507af 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -321,6 +321,280 @@ TEST_F(DeclarableOpsTests1, TestTensorDot4) { delete results; } +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot5) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {2,4,2,4}, {44,110,160, 66,132, 38, 88,154, 68,170,224,102,204, 82,136,238, 92,230,288,138,276,126,184,322, 116,290,352,174,348,170,232,406, 76,190,160,114,228,182,152,266, 100,250,224,150,300,226,200,350, 124,310,288,186,372,270,248,434, 148,370,352,222,444,314,296,518}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot6) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {2,4,2,4}, {22, 66,110,154, 44, 88,132,176, 34,102,170,238, 68,136,204,272, 46,138,230,322, 92,184,276,368, 58,174,290,406,116,232,348,464, 38,114,190,266, 76,152,228,304, 50,150,250,350,100,200,300,400, 62,186,310,434,124,248,372,496, 74,222,370,518,148,296,444,592}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot7) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 
2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {2,4,2,4}, {76,166,112,106,196, 62,136,226, 60,174,208, 98,212,230,136,250, 76,214,336,122,260,174,168,306, 124,286,240,178,340,150,232,394, 100,226,176,142,268,106,184,310, 84,234,272,134,284,274,184,334, 100,274,400,158,332,218,216,390, 148,346,304,214,412,194,280,478}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot8) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {2,4,2,4}, {30, 90,150,210, 60,120,180,240, 38,114,190,266, 76,152,228,304, 46,138,230,322, 92,184,276,368, 54,162,270,378,108,216,324,432, 42,126,210,294, 84,168,252,336, 50,150,250,350,100,200,300,400, 58,174,290,406,116,232,348,464, 66,198,330,462,132,264,396,528}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot9) { + + // NDArray z('f',{2,2,3}, nd4j::DataType::DOUBLE); + // z.linspace(1); + // z.printShapeInfo(); + // z.printIndexedBuffer(); + // z.reshapei('c', {4,3}); + // z.printShapeInfo(); + // z.printIndexedBuffer(); + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {3,4,4,3}, {14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,0,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; +} + + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot10) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {4,4}, {114,258,402,546, 138,314,490,666, 162,370,578,786, 186,426,666,906}); + + 
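// ------------------------------------------------------------------------------------------------
// Note on the integer args these TestTensorDot cases pass to tensormmul: they read as
//   {numAxes_x, axes_x..., numAxes_y, axes_y...},
// so the {2,0,1, 2,0,2} used just below contracts axes {0,1} of x (sizes 2,3) against axes {0,2}
// of y (sizes 2,3), leaving x axis 2 and y axis 1 and hence the {4,4} expectation above.
// Minimal reference sketch of that contraction for plain row-major buffers (illustrative only,
// not part of the patch; the refTensorDot10 name is hypothetical):
//
//   // z[i][j] = sum_{a,b} x[a][b][i] * y[a][j][b],  x: [2][3][4], y: [2][4][3], z: [4][4]
//   static void refTensorDot10(const float x[2][3][4], const float y[2][4][3], float z[4][4]) {
//       for (int i = 0; i < 4; i++)
//           for (int j = 0; j < 4; j++) {
//               float s = 0.f;
//               for (int a = 0; a < 2; a++)
//                   for (int b = 0; b < 3; b++)
//                       s += x[a][b][i] * y[a][j][b]; // x axis 0 pairs with y axis 0, x axis 1 with y axis 2
//               z[i][j] = s;
//           }
//   }
// ------------------------------------------------------------------------------------------------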
nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot11) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {4,4}, {98,218,338,458, 134,302,470,638, 170,386,602,818, 206,470,734,998}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot12) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {4,4}, {272,292,312,332, 368,396,424,452, 464,500,536,572, 560,604,648,692}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot13) { + + auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {3,3}, {640,560,640, 576,624,576, 640,560,640}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot14) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {3,3}, {648,600,520, 648,536,648, 520,600,648}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot15) { + + auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); + auto y = NDArrayFactory::create('f', {4,2,3}, 
{2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); + auto expected = NDArrayFactory::create('c', {3,3}, {624,624,624, 656,656,656, 624,624,624}); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(expected.isSameShape(result)); + ASSERT_TRUE(expected.equalsTo(result)); + + delete results; + +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot16) { + + NDArray x('c', {1}, std::vector{2}, nd4j::DataType::FLOAT32); + NDArray y('c', {2,1,2}, {1,2,3,4}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2,2}, {2,4,6,8}, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul op; + auto results = op.evaluate({&x, &y}, {}, {1,0, 1,1}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto *result = results->at(0); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests1, TestTensorDot17) { + + NDArray x('f', {16,16}, nd4j::DataType::FLOAT32); + NDArray y('f', {1000,16}, nd4j::DataType::FLOAT32); + NDArray z('c', {16,1000}, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul op; + auto status = op.execute({&x, &y}, {&z}, {}, {1,1, 1,1}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, status); +} + ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, DivergentCheck1) { auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation("switch"); @@ -1067,40 +1341,6 @@ TEST_F(DeclarableOpsTests1, MultiplyScalarScalar1) { delete exp; } -TEST_F(DeclarableOpsTests1, TestMatMul1) { - auto x = NDArrayFactory::create_('c', {3, 5}); - x->linspace(1); - - auto y = NDArrayFactory::create_('c', {5, 3}); - y->linspace(1); - - float _expB[]{135.0f, 310.0f, 485.0f, 150.0f, 350.0f, 550.0f, 165.0f, 390.0f, 615.0f}; - Nd4jLong _expS[] {2, 3, 3, 1, 3, 0, 1, 102}; // expected shape - ArrayOptions::setDataType(_expS, nd4j::DataType::FLOAT32); - NDArray exp(_expB, _expS); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - variableSpace->putVariable(-2, y); - variableSpace->putVariable(1, new Variable()); - - auto block = new Context(1, variableSpace, false); - block->fillInputs({-1, -2}); - - nd4j::ops::matmul op; - - Nd4jStatus status = op.execute(block); - ASSERT_EQ(ND4J_STATUS_OK, status); - ASSERT_TRUE(variableSpace->hasVariable(1)); - - auto result = variableSpace->getVariable(1)->getNDArray(); - - ASSERT_TRUE(result->equalsTo(&exp)); - - delete block; - delete variableSpace; -} - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestSoftMax_bp_1) { @@ -1608,36 +1848,6 @@ TEST_F(DeclarableOpsTests1, TestGemv1) { #endif -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests1, Reshape1) { - const std::vector xShape = {5,4,3}; - const std::vector yShape = {3,5,4}; - - auto x = NDArrayFactory::create_('f', xShape); - auto y = NDArrayFactory::create_('f', yShape); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - - auto block = new Context(1, variableSpace, true); - block->fillInputs({-1}); - std::vector* arguments = block->getIArguments(); - arguments->push_back(-y->ordering()); - arguments->push_back(3); - arguments->push_back(5); - arguments->push_back(4); - - 
nd4j::ops::reshape reshape; - - reshape.execute(block); - - ASSERT_TRUE(x->isSameShape(y)); - - delete y; - delete block; - delete variableSpace; -} - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, Reshape2) { const std::vector xShape = {5,4,3}; @@ -1748,37 +1958,8 @@ TEST_F(DeclarableOpsTests1, Reshape7){ ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, Transpose1) { - auto x = NDArrayFactory::create_('c', {3,5,2}); - auto exp = NDArrayFactory::create_('f', {2,5,3}); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - - auto block = new Context(1, variableSpace, true); // in-place - block->fillInputs({-1}); - nd4j::ops::transpose transpose; - - Nd4jStatus status = transpose.execute(block); - ASSERT_EQ(ND4J_STATUS_OK, status); - // ASSERT_TRUE(x.isSameShapeStrict(exp)); - - for (int e = 0; e < x->rankOf() * 2 + 2; e++) { - ASSERT_EQ(x->getShapeInfo()[e], exp->getShapeInfo()[e]); - } -// ASSERT_EQ(x.getShapeInfo()[x.rankOf() * 2 + 2],-exp.getShapeInfo()[x.rankOf() * 2 + 2]); - ASSERT_EQ(x->getShapeInfo()[x->rankOf() * 2 + 3], exp->getShapeInfo()[x->rankOf() * 2 + 3]); - - delete exp; - delete block; - delete variableSpace; - -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests1, Transpose2) { - auto x = NDArrayFactory::create_('c', {3,5,2}); - auto exp = NDArrayFactory::create_('f', {2,5,3}); + auto exp = NDArrayFactory::create_('c', {2,5,3}); auto variableSpace = new VariableSpace(); variableSpace->putVariable(-1, x); @@ -1792,12 +1973,10 @@ TEST_F(DeclarableOpsTests1, Transpose2) { ASSERT_EQ(ND4J_STATUS_OK, status); auto result = variableSpace->getVariable(block->getNodeId())->getNDArray(); - // ASSERT_TRUE(result->isSameShapeStrict(exp)); - for (int e = 0; e < result->rankOf() * 2 + 2; e++) { - ASSERT_EQ(result->getShapeInfo()[e], exp->getShapeInfo()[e]); - } - //ASSERT_EQ(result->getShapeInfo()[x.rankOf() * 2 + 2],-exp.getShapeInfo()[x.rankOf() * 2 + 2]); - ASSERT_EQ(result->getShapeInfo()[x->rankOf() * 2 + 3], exp->getShapeInfo()[x->rankOf() * 2 + 3]); + + ASSERT_TRUE(exp->isSameShape(result)); + ASSERT_TRUE(exp->dataType() == result->dataType()); + ASSERT_TRUE(exp->ordering() == result->ordering()); delete exp; delete block; @@ -1805,44 +1984,12 @@ TEST_F(DeclarableOpsTests1, Transpose2) { } -////////////////////////////////////////////////////////////////////// -// in-place -TEST_F(DeclarableOpsTests1, Permute1) { - - Nd4jLong shapeX[] = {3, 5, 10, 15, 150, 15, 1, 0, 1, 99}; - Nd4jLong shapeExp[] = {3, 15, 5, 10, 1, 150, 15, 0, 0, 99}; - const std::vector perm = {2, 0, 1}; - ArrayOptions::setDataType(shapeX, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shapeExp, nd4j::DataType::FLOAT32); - - auto x = new NDArray(shapeX,true); - auto exp = new NDArray(shapeExp,true); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - - auto block = new Context(1, variableSpace, true); // in-place - block->fillInputs({-1}); - std::vector* arguments = block->getIArguments(); - *arguments = perm; // set dimensions to be permuted - - nd4j::ops::permute permute; - Nd4jStatus status = permute.execute(block); - ASSERT_EQ(ND4J_STATUS_OK, status); - - ASSERT_TRUE(x->isSameShapeStrict(*exp)); - - delete exp; - delete block; - delete variableSpace; -} - ////////////////////////////////////////////////////////////////////// // not-in-place -TEST_F(DeclarableOpsTests1, Permute2) { 
+TEST_F(DeclarableOpsTests1, Permute1) { - Nd4jLong shapeX[] = {3, 5, 10, 15, 150, 15, 1, 0, 1, 99}; - Nd4jLong shapeExp[] = {3, 15, 5, 10, 1, 150, 15, 0, 0, 99}; + Nd4jLong shapeX[] = {3, 5,10,15, 150,15,1, 0,1,99}; + Nd4jLong shapeExp[] = {3, 15,5,10, 50,10,1, 0,1,99}; const std::vector perm = {2, 0, 1}; ArrayOptions::setDataType(shapeX, nd4j::DataType::FLOAT32); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index a0722f9d0..484719a45 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -3087,6 +3087,10 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_03_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif NDArray x = NDArrayFactory::create('c', {2,4,5,3}); NDArray exp = NDArrayFactory::create('c', {2,4,5,3},{ diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp index 6025216f9..e5eaa9a6a 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp @@ -708,30 +708,6 @@ TEST_F(DeclarableOpsTests12, multiUnique_2) { ASSERT_TRUE(nd4j::ops::helpers::multiUnique(arrayList)); } -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests12, tensormmul_6) { - - NDArray x('c', {1}, std::vector{2}, nd4j::DataType::FLOAT32); - NDArray y('c', {2,1,2}, {1,2,3,4}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,2}, {2,4,6,8}, nd4j::DataType::FLOAT32); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,0, 1,1}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - // exp.printShapeInfo(); - // result->printShapeInfo(); - // result->printIndexedBuffer(); - - ASSERT_TRUE(exp.isSameShape(result)); - ASSERT_TRUE(exp.equalsTo(result)); - - delete results; - -} - //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, reduceMeanBp_4) { @@ -2824,16 +2800,9 @@ TEST_F(DeclarableOpsTests12, QR_Test_1_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, QR_Test_2) { - auto in = NDArrayFactory::create('c', {5,3}, { - 12., -51., 4., 6., 167., -68., -4., 24., -41., -1., 1., 0., 2., 0., 3. 
- }); - auto expQ = NDArrayFactory::create('c', {5, 3}, { - 0.8464148, 0.3912908, -0.3431241, -0.42320737, -0.9040873, 0.02927014, 0.28213826, -0.17042054, -0.93285596, 0.07053456, -0.01404065, 0.00109937, -0.14106913, 0.0166551, 0.10577161 - }); - - auto expR = NDArrayFactory::create('c', {3,3}, { - -14.177447, -20.666622, 13.401566, 0., -175.04254, 70.080315, 0., 0., 35.201546 - }); + auto in = NDArrayFactory::create('c', {5,3}, {12., -51., 4., 6., 167., -68., -4., 24., -41., -1., 1., 0., 2., 0., 3.}); + auto expQ = NDArrayFactory::create('c', {5, 3}, {0.8464148,0.3912908,-0.3431241,-0.42320737, -0.9040873,0.02927014,0.28213826, -0.17042054, -0.93285596,0.07053456, -0.01404065,0.00109937,-0.14106913,0.0166551,0.10577161}); + auto expR = NDArrayFactory::create('c', {3,3}, {-14.177447,-20.666622,13.401566,0.,-175.04254,70.080315,0.,0.,35.201546}); nd4j::ops::qr op; auto res = op.evaluate({&in}, {}, {}, {false}); @@ -2843,8 +2812,6 @@ TEST_F(DeclarableOpsTests12, QR_Test_2) { auto r = res->at(1); ASSERT_TRUE(q->isSameShape(expQ)); ASSERT_TRUE(r->isSameShape(expR)); -// q->printIndexedBuffer("Orthogonal 5x5"); -// r->printIndexedBuffer("Upper triangular 5x3"); nd4j::ops::matmul opMul; auto res2 = opMul.evaluate({q, r}); //MmulHelper::matmul(q, r, &in, false, false); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 600004ec2..3672a4c20 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -78,6 +78,11 @@ TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_1) { } TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_2) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {5}, {1, 2, 3, std::numeric_limits::infinity(), 5}); auto y = NDArrayFactory::create('c', {5}, {1, 2, 3, -std::numeric_limits::infinity(), 5}); @@ -332,6 +337,10 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_max_1) { } TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif auto e = NDArrayFactory::create('c', {1, 0}); nd4j::ops::reduce_sum sumOp; @@ -343,6 +352,10 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) { } TEST_F(DeclarableOpsTests14, test_empty_reduce_mean_1) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif auto e = NDArrayFactory::create('c', {1, 0}); nd4j::ops::reduce_mean sumOp; @@ -533,13 +546,13 @@ TEST_F(DeclarableOpsTests14, repeat_5) { delete result; } ///////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests14, Test_scalar_broadcast_SpecialCaseTest) { +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest) { auto y = NDArray('c', { 3 }, nd4j::DataType::FLOAT32); auto x = NDArray('c', { 5, 2, 1 }, nd4j::DataType::FLOAT32); auto e = NDArray('c', { 5, 2, 3 }, { 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9., 10., 10., 10., 11., 11., 11. 
}, nd4j::DataType::FLOAT32); - + y.assign(1.0); x.linspace(1.0); @@ -553,3 +566,1119 @@ TEST_F(DeclarableOpsTests14, Test_scalar_broadcast_SpecialCaseTest) { delete result; } +///////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest2) { + + auto y = NDArray('c', { 1, 3 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 5, 2, 1 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('c', { 5, 2, 3 }, { 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9., 10., 10., 10., 11., 11., 11. }, nd4j::DataType::FLOAT32); + + y.assign(1.0); + x.linspace(1.0); + + nd4j::ops::add op; + auto result = op.evaluate({ &x, &y }); + ASSERT_EQ(Status::OK(), result->status()); + + auto res = *result->at(0); + + ASSERT_EQ(e, res); + + delete result; +} + +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest3) { + + auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 3, 5, 4 }, { 10., 11., 12., 13., 20., 22., 24., 26., 30., 33., 36., 39., 40., 44., 48., 52., 50., 55., 60., 65., 84., 90., 96., 102., 98., 105., 112., 119., 112., 120., 128., 136., 126., 135., 144., 153., 140., 150., 160., 170., 198., 209., 220., 231., 216., 228., 240., 252., 234., 247., 260., 273., 252., 266., 280., 294., 270., 285., 300., 315. }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), y, z); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest4) { + + auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4 }, { 10., 11., 12., 13.,20., 22., 24., 26.,30., 33., 36., 39.,40., 44., 48., 52.,50., 55., 60., 65.,84., 90., 96., 102.,98., 105., 112., 119.,112., 120., 128., 136.,126., 135., 144., 153.,140., 150., 160., 170.,198., 209., 220., 231.,216., 228., 240., 252.,234., 247., 260., 273.,252., 266., 280., 294.,270., 285., 300., 315.,352., 368., 384., 400.,374., 391., 408., 425.,396., 414., 432., 450.,418., 437., 456., 475.,440., 460., 480., 500.,546., 567., 588., 609.,572., 594., 616., 638.,598., 621., 644., 667.,624., 648., 672., 696.,650., 675., 700., 725.,780., 806., 832., 858.,810., 837., 864., 891.,840., 868., 896., 924.,870., 899., 928., 957.,900., 930., 960., 990. 
}, nd4j::DataType::FLOAT32); + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), y, z); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest5) { + + auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 3, 5, 4 }, { 0.1, 0.090909, 0.083333, 0.076923,0.2, 0.181818, 0.166667, 0.153846,0.3, 0.272727, 0.250000, 0.230769,0.4, 0.363636, 0.333333, 0.307692,0.5, 0.454545, 0.416667, 0.384615, 0.428571, 0.400000, 0.375000, 0.352941, 0.500000, 0.466667, 0.437500, 0.411765, 0.571429, 0.533333, 0.500000, 0.470588, 0.642857, 0.600000, 0.562500, 0.529412, 0.714286, 0.666667, 0.625000, 0.588235, 0.611111, 0.578947, 0.550000, 0.523810, 0.666667, 0.631579, 0.600000, 0.571429, 0.722222, 0.684211, 0.650000, 0.619048, 0.777778, 0.736842, 0.700000, 0.666667, 0.833333, 0.789474, 0.750000, 0.714286 }, nd4j::DataType::FLOAT32); + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Divide(), y, z); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest6) { + + auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4 }, { 0.1, 0.090909, 0.083333, 0.076923,0.2, 0.181818, 0.166667, 0.153846,0.3, 0.272727, 0.250000, 0.230769,0.4, 0.363636, 0.333333, 0.307692,0.5, 0.454545, 0.416667, 0.384615, 0.428571, 0.400000, 0.375000, 0.352941, 0.500000, 0.466667, 0.437500, 0.411765, 0.571429, 0.533333, 0.500000, 0.470588, 0.642857, 0.600000, 0.562500, 0.529412, 0.714286, 0.666667, 0.625000, 0.588235,0.611111, 0.578947, 0.550000, 0.523810,0.666667, 0.631579, 0.600000, 0.571429,0.722222, 0.684211, 0.650000, 0.619048,0.777778, 0.736842, 0.700000, 0.666667,0.833333, 0.789474, 0.750000, 0.714286, 0.727273, 0.695652, 0.666667, 0.64, 0.772727, 0.739130, 0.708333, 0.68, 0.818182, 0.782609, 0.750000, 0.72, 0.863636, 0.826087, 0.791667, 0.76, 0.909091, 0.869565, 0.833333, 0.80, 0.807692, 0.777778, 0.750000, 0.724138, 0.846154, 0.814815, 0.785714, 0.758621, 0.884615, 0.851852, 0.821429, 0.793103, 0.923077, 0.888889, 0.857143, 0.827586, 0.961538, 0.925926, 0.892857, 0.862069, 0.866667, 0.838710, 0.812500, 0.787879, 0.900000, 0.870968, 0.843750, 0.818182, 0.933333, 0.903226, 0.875000, 0.848485, 0.966667, 0.935484, 0.906250, 0.878788, 1.000000, 0.967742, 0.937500, 0.909091 }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Divide(), y, z); + ASSERT_EQ(e, z); +} + +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest7) { + + auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 3, 5, 4 }, { -9., -10., -11., -12.,-8., -9., -10., -11., -7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-8., -9., -10., 
-11.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-7., -8.000000, -9.000000, -10.00,-6.000000, -7.000000, -8.000000, -9.000,-5.000000, -6.000000, -7.000000, -8.000,-4.000000, -5.000000, -6.000000, -7.000,-3.000000, -4.000000, -5.000000, -6.000 }, nd4j::DataType::FLOAT32); + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Subtract(), y, z); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest8) { + + auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4 }, { -9.0, -10., -11., -12.,-8., -9., -10., -11.0,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-8., -9., -10., -11.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-1., -2., -3., -4.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-1., -2., -3., -4., 0., -1., -2., -3. }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Subtract(), y, z); + ASSERT_EQ(e, z); +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test1) { + + auto x = NDArrayFactory::create('c', {3, 4}); + auto y = NDArrayFactory::create('c', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123., 40., 92., 144., 45., 105., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test2) { + + auto x = NDArrayFactory::create('c', {3, 4}); + auto y = NDArrayFactory::create('f', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test3) { + + auto x = NDArrayFactory::create('f', {3, 4}); + auto y = NDArrayFactory::create('c', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test4) { + + auto x = 
NDArrayFactory::create ('f', {3, 4}); + auto y = NDArrayFactory::create('f', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test5) { + + auto x = NDArrayFactory::create('c', {4, 3}); + auto y = NDArrayFactory::create('c', {4, 3}); + auto exp = NDArrayFactory::create('f', {3, 3}, {83., 94., 105., 94., 107., 120., 105., 120., 135.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test6) { + + auto x = NDArrayFactory::create('c', {4, 3}); + auto y = NDArrayFactory::create('f', {3, 4}); + auto exp = NDArrayFactory::create('f', {3, 3}, {35., 40., 45., 79., 92., 105., 123., 144., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test7) { + + auto x = NDArrayFactory::create('c', {5, 3,4}); + auto y = NDArrayFactory::create('f', {5, 3,4}); + auto exp = NDArrayFactory::create('f',{5, 3,3}, {3. , 84.6, 281.4, 593.4, 1020.6, 7. , 107.8, 323.8, 655. , 1101.4,11. , 131. , 366.2, 716.6, 1182.2, + 7. , 107.8, 323.8, 655. , 1101.4,17.4, 137.4, 372.6, 723. , 1188.6,27.8, 167. , 421.4, 791. , 1275.8, + 11. , 131. , 366.2, 716.6, 1182.2,27.8, 167. , 421.4, 791. , 1275.8,44.6, 203. , 476.6, 865.4, 1369.4,}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test8) { + + auto x = NDArrayFactory::create('c', {2,5, 3,4}); + auto y = NDArrayFactory::create('f', {2,5, 3,4}); + auto exp = NDArrayFactory::create('f',{2,5, 3,3}, {3. , 1563. , 84.6, 2220.6, 281.4, 2993.4, 593.4, 3881.4,1020.6, 4884.6, 7. , 1663. , 107.8, 2339.8, 323.8, 3131.8, 655. , 4039. ,1101.4, 5061.4, + 11. , 1763. , 131. , 2459. , 366.2, 3270.2, 716.6, 4196.6,1182.2, 5238.2, 7. , 1663. , 107.8, 2339.8, 323.8, 3131.8, 655. , 4039. ,1101.4, 5061.4, + 17.4, 1769.4, 137.4, 2465.4, 372.6, 3276.6, 723. , 4203. ,1188.6, 5244.6, 27.8, 1875.8, 167. , 2591. , 421.4, 3421.4, 791. , 4367. ,1275.8, 5427.8, + 11. , 1763. , 131. , 2459. , 366.2, 3270.2, 716.6, 4196.6,1182.2, 5238.2, 27.8, 1875.8, 167. , 2591. , 421.4, 3421.4, 791. , 4367. ,1275.8, 5427.8, + 44.6, 1988.6, 203. , 2723. 
, 476.6, 3572.6, 865.4, 4537.4,1369.4, 5617.4}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test9) { + + auto x = NDArrayFactory::create('c', {2,5, 4,3}); + auto y = NDArrayFactory::create('f', {2,5, 3,4}); + auto exp = NDArrayFactory::create('f',{2,5, 3,3}, {7. , 1639. , 103. , 2311. , 314.2, 3098.2, 640.6, 4000.6,1082.2, 5018.2, 8. , 1664. , 108.8, 2340.8, 324.8, 3132.8, 656. , 4040. ,1102.4, 5062.4, + 9. , 1689. , 114.6, 2370.6, 335.4, 3167.4, 671.4, 4079.4,1122.6, 5106.6, 15.8, 1743.8, 131. , 2435. , 361.4, 3241.4, 707. , 4163. ,1167.8, 5199.8, + 18.4, 1770.4, 138.4, 2466.4, 373.6, 3277.6, 724. , 4204. ,1189.6, 5245.6, 21. , 1797. , 145.8, 2497.8, 385.8, 3313.8, 741. , 4245. ,1211.4, 5291.4, + 24.6, 1848.6, 159. , 2559. , 408.6, 3384.6, 773.4, 4325.4,1253.4, 5381.4, 28.8, 1876.8, 168. , 2592. , 422.4, 3422.4, 792. , 4368. ,1276.8, 5428.8, + 33. , 1905. , 177. , 2625. , 436.2, 3460.2, 810.6, 4410.6,1300.2, 5476.2}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +TEST_F(DeclarableOpsTests14, matmul_test10) { + + auto x = NDArrayFactory::create_('c', {3, 5}); + x->linspace(1); + + auto y = NDArrayFactory::create_('c', {5, 3}); + y->linspace(1); + + float _expB[]{135.0f, 310.0f, 485.0f, 150.0f, 350.0f, 550.0f, 165.0f, 390.0f, 615.0f}; + Nd4jLong _expS[] {2, 3, 3, 1, 3, 0, 1, 102}; // expected shape + ArrayOptions::setDataType(_expS, nd4j::DataType::FLOAT32); + NDArray exp(_expB, _expS); + + auto variableSpace = new VariableSpace(); + variableSpace->putVariable(-1, x); + variableSpace->putVariable(-2, y); + variableSpace->putVariable(1, new Variable()); + + auto block = new Context(1, variableSpace, false); + block->fillInputs({-1, -2}); + + nd4j::ops::matmul op; + + Nd4jStatus status = op.execute(block); + ASSERT_EQ(ND4J_STATUS_OK, status); + ASSERT_TRUE(variableSpace->hasVariable(1)); + + auto result = variableSpace->getVariable(1)->getNDArray(); + + ASSERT_TRUE(result->equalsTo(&exp)); + + delete block; + delete variableSpace; +} + +TEST_F(DeclarableOpsTests14, matmul_test11) { + auto A = NDArrayFactory::create('c', {3, 3}); + auto B = NDArrayFactory::create('c', {3, 1}); + auto exp = NDArrayFactory::create('c', {3, 1}, {14.00f, 32.00f, 50.00f}); + + A.linspace(1); + B.linspace(1); + + nd4j::ops::matmul op; + + auto result = op.evaluate({&A, &B}, {}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test12) { + auto x= NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); + auto y= NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); + auto exp= NDArrayFactory::create('f', {4, 4}, {38.0, 44.0, 50.0, 56.0, 83.0, 98.0, 113.0, 128.0, 128.0, 152.0, 176.0, 200.0, 173.0, 206.0, 239.0, 272.0}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {1, 1}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = 
result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + + delete result; +} + + +TEST_F(DeclarableOpsTests14, matmul_test13) { + auto x= NDArrayFactory::create('c', {1, 3}, {1, 2, 3}); + auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); + auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {1, 0}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + //z->printIndexedBuffer("z"); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test14) { + auto x= NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); + auto y= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); + auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {0, 1}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + //z->printIndexedBuffer("z"); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test15) { + auto x= NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); + auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); + auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + //z->printIndexedBuffer("z"); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test16) { + auto x= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); + auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); + auto exp= NDArrayFactory::create('f', {4, 4}, {1,2, 3, 4,2,4, 6, 8,3,6, 9,12,4,8,12,16}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + //z->printIndexedBuffer("z"); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +TEST_F(DeclarableOpsTests14, matmul_test17) { + auto x = NDArrayFactory::create('c', {1, 2}, {2.0f, 2.0f}); + auto y = NDArrayFactory::create('c', {2, 1}, {2.0f, 2.0f}); + auto exp = NDArrayFactory::create('c', {1, 1}, {8.0f}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + ASSERT_EQ(exp, *result->at(0)); + + delete result; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test18) { + + auto x = NDArrayFactory::create('c', {1, 4, 3}); + auto y = NDArrayFactory::create('f', {1, 3, 4}); + auto exp = NDArrayFactory::create('f', {1, 3, 3}, {35., 40., 45., 79., 92., 105., 123., 144., 165.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test19) { + + auto x = NDArrayFactory::create('c', {4, 1}); + auto y = NDArrayFactory::create('f', {1, 4}); + 
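// ------------------------------------------------------------------------------------------------
// In these matmul tests the trailing integer args act as transpose flags {transX, transY[, transZ]}:
// matmul_test5 passes {1} to use x^T, matmul_test6 passes {1,1} to transpose both inputs, and
// matmul_test21 passes {0,0,1} to transpose the result as well. For this test (matmul_test19),
// {1,1} turns x:{4,1} into {1,4} and y:{1,4} into {4,1}, so the product is a {1,1} dot product:
//   x.linspace(1.)       -> 1, 2, 3, 4
//   y.linspace(0.5, 0.5) -> 0.5, 1, 1.5, 2
//   1*0.5 + 2*1 + 3*1.5 + 4*2 = 15, which is the expected value just below.
// ------------------------------------------------------------------------------------------------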
auto exp = NDArrayFactory::create('f', {1, 1}, {15}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + ASSERT_EQ(Status::OK(), results->status()); + + auto z = results->at(0); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test20) { + + auto x = NDArrayFactory::create('c', {1, 4, 1}); + auto y = NDArrayFactory::create('f', {1, 1, 4}); + auto exp = NDArrayFactory::create('f', {1, 1, 1}, {15}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + + ASSERT_EQ(Status::OK(), results->status()); + auto z = results->at(0); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test21) { + + auto x = NDArrayFactory::create('c', {2, 3}); + auto y = NDArrayFactory::create('c', {3, 5}); + auto exp = NDArrayFactory::create('f', {5, 2}, {23. , 26. , 29. , 32. , 35., 50. , 57.5, 65. , 72.5, 80.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {0, 0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test22) { + + auto x = NDArrayFactory::create('c', {3, 2}); + auto y = NDArrayFactory::create('c', {3, 5}); + auto exp = NDArrayFactory::create('f', {5, 2}, {37. , 41.5, 46. , 50.5, 55., 46. , 52. , 58. , 64. , 70.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test23) { + + auto x = NDArrayFactory::create('c', {3, 2}); + auto y = NDArrayFactory::create('c', {3, 5}); + auto exp = NDArrayFactory::create('f', {5, 2}, {37. , 41.5, 46. , 50.5, 55., 46. , 52. , 58. , 64. , 70.}); + + x.linspace(1.); + y.linspace(0.5, 0.5); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test24) { + + auto x = NDArrayFactory::create('c', {2,2, 3,5}); + auto y = NDArrayFactory::create('c', {2,2, 4,3}); + auto exp = NDArrayFactory::create('f',{2,2, 4,5}, {4.6, 281.8, 89.2, 582.4, 10. , 314.2,108.1, 628.3, 15.4, 346.6,127. , 674.2, 20.8, 379. ,145.9, 720.1, 5.2, 289.6, 93.4, 593.8, + 11.5, 322.9,113.2, 640.6, 17.8, 356.2,133. , 687.4, 24.1, 389.5,152.8, 734.2, 5.8, 297.4, 97.6, 605.2, 13. , 331.6,118.3, 652.9, + 20.2, 365.8,139. , 700.6, 27.4, 400. ,159.7, 748.3, 6.4, 305.2,101.8, 616.6, 14.5, 340.3,123.4, 665.2, 22.6, 375.4,145. , 713.8, + 30.7, 410.5,166.6, 762.4, 7. , 313. ,106. , 628. , 16. , 349. ,128.5, 677.5, 25. , 385. ,151. , 727. , 34. , 421. 
,173.5, 776.5}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test25) { + + auto x = NDArrayFactory::create('f', {4, 3}); + auto y = NDArrayFactory::create('c', {4}); + auto exp = NDArrayFactory::create('f',{3}, {7., 8., 9.}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 0}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test26) { + + auto x = NDArrayFactory::create('f', {3}); + auto y = NDArrayFactory::create('c', {4, 3}); + auto exp = NDArrayFactory::create('f',{4}, {1.4, 3.2, 5., 6.8}); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {0, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test27) { + + auto x = NDArrayFactory::create('f', {1, 1}); + auto y = NDArrayFactory::create('c', {1, 1}); + auto exp = NDArrayFactory::create('f',{1, 1}, {0.2}); + + x.linspace(2.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test28) { + + auto x = NDArrayFactory::create('f', {1, 1}); + auto y = NDArrayFactory::create('c', {1, 1}); + auto exp = NDArrayFactory::create('f',{1, 1}, {0.2}); + + x.linspace(2.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1,1,1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} + + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test29) { + + auto x = NDArrayFactory::create('f', {1}); + auto y = NDArrayFactory::create('c', {1, 1}); + auto exp = NDArrayFactory::create('f',{1}, {0.2}); + + x.linspace(2.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test30) { + + auto x = NDArrayFactory::create('f', {1,1}); + auto y = NDArrayFactory::create('c', {1}); + auto exp = NDArrayFactory::create('f',{1}, {0.2}); + + x.linspace(2.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1}); + auto z = results->at(0); + + 
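// ------------------------------------------------------------------------------------------------
// matmul_test25..matmul_test36 cover the degenerate rank-1 / single-element shapes, where matmul
// collapses to matrix-vector products and dot products. For example matmul_test31 contracts two
// length-4 vectors into a scalar:
//   x.linspace(1.)       -> 1, 2, 3, 4
//   y.linspace(0.1, 0.1) -> 0.1, 0.2, 0.3, 0.4
//   1*0.1 + 2*0.2 + 3*0.3 + 4*0.4 = 3.0   (the scalar expectation used there)
// and this test (matmul_test30) reduces to the single product 2 * 0.1 = 0.2.
// ------------------------------------------------------------------------------------------------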
ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test31) { + + auto x = NDArrayFactory::create('f', {4}); + auto y = NDArrayFactory::create('c', {4}); + auto exp = NDArrayFactory::create(3.); + + x.linspace(1.); + y.linspace(0.1, 0.1); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test32) { + + auto x = NDArrayFactory::create('f', {1}, {2.}); + auto y = NDArrayFactory::create('c', {1}, {3.}); + auto exp = NDArrayFactory::create(6.); + + nd4j::ops::matmul op; + auto results = op.evaluate({&x, &y}, {}, {1, 1}); + auto z = results->at(0); + + ASSERT_EQ(Status::OK(), results->status()); + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete results; +} +///////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test33) { + auto x = NDArrayFactory::create('c', {4, 3}); + auto y = NDArrayFactory::create('c', {4, 1}); + auto exp = NDArrayFactory::create('c',{ 3, 1}, {70, 80, 90}); + + x.linspace(1); + y.linspace(1); + + nd4j::ops::matmul op; + auto result = op.evaluate({&x, &y}, {}, {1, 0}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} +////////////////////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test34) { + auto a = NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); + auto exp = NDArrayFactory::create('c', {3}, {30, 70, 110}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&a, &b}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} +///////////////////////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test35) { + auto a = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); + auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + auto exp = NDArrayFactory::create('c', {3}, {70, 80, 90}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&a, &b}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} +//////////////////////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test36) { + auto a = NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); + auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + auto exp = NDArrayFactory::create('c', {1, 3}, {70, 80, 90}); + + nd4j::ops::matmul op; + auto result = op.evaluate({&a, &b}); + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} 
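// ------------------------------------------------------------------------------------------------
// The next test (matmul_test37) exercises the batched case: with iArgs {0,1} the last two dims of b
// are used transposed, so each {128,64} slice of a is multiplied by the transpose of the matching
// {128,64} slice of b, giving a {128,128} block per batch. Since a and b are filled with 1.0, every
// output element is a sum of 64 products 1.0*1.0, i.e. 64, which is what cExp asserts. Minimal
// single-slice reference sketch (illustrative only; the refBatchSlice name is hypothetical):
//
//   // c[i][j] = sum_k a[i][k] * b[j][k]   (b used transposed), row-major slices
//   static void refBatchSlice(const float* a, const float* b, float* c, int M, int N, int K) {
//       for (int i = 0; i < M; i++)
//           for (int j = 0; j < N; j++) {
//               float s = 0.f;
//               for (int k = 0; k < K; k++)
//                   s += a[i * K + k] * b[j * K + k];
//               c[i * N + j] = s;
//           }
//   }
//
//   // With M = N = 128, K = 64 and all-ones inputs, every entry of c comes out as 64.
// ------------------------------------------------------------------------------------------------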
+////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, matmul_test37) { + + NDArray a('c', {32, 12, 128, 64}, nd4j::DataType::FLOAT32); + NDArray b('c', {32, 12, 128, 64}, nd4j::DataType::FLOAT32); + NDArray c('c', {32,12,128,128}, nd4j::DataType::FLOAT32); + NDArray cExp('c', {32,12,128,128}, nd4j::DataType::FLOAT32); + + a = 1; + b = 1; + cExp = 64; //Each entry in output c is sum of 64 (1.0 x 1.0) multiplications + + nd4j::ops::matmul op; + auto status = op.execute({&a, &b}, {&c}, {}, {0,1}); + + ASSERT_EQ(ND4J_STATUS_OK, status); + + ASSERT_TRUE(cExp.isSameShape(c)); + ASSERT_TRUE(cExp.equalsTo(c)); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_3D_1) { + + // x[4, 12, 128] * y[4, 128] = z[4, 12, 128] + + auto x = NDArray('c', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 5 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 60.000000, 77.000000, 96.000000, 117.000000, 140.000000, 110.000000, 132.000000, 156.000000, 182.000000, 210.000000, 240.000000, 272.000000, 306.000000, 342.000000, 380.000000, 315.000000, 352.000000, 391.000000, 432.000000, 475.000000, 390.000000, 432.000000, 476.000000, 522.000000, 570.000000 }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Multiply, { 0,2 }, y, z); + //z.printBuffer(); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_3D_2) { + + auto x = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5 }, { 0.100000, 0.181818, 0.250000, 0.307692, 0.357143, 0.600000, 0.636364, 0.666667, 0.692308, 0.714286, 1.100000, 1.090909, 1.083333, 1.076923, 1.071429, 1.066667, 1.062500, 1.058824, 1.055556, 1.052632, 1.400000, 1.375000, 1.352941, 1.333333, 1.315789, 1.733333, 1.687500, 1.647059, 1.611111, 1.578947 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_4D_1) { + + auto x = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 5, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto e = NDArray('c', { 2, 3, 5, 4 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 90.000000, 112.000000, 136.000000, 162.000000, 190.000000, 220.000000, 252.000000, 286.000000, 322.000000, 360.000000, 400.000000, 442.000000, 486.000000, 532.000000, 580.000000, 210.000000, 242.000000, 276.000000, 312.000000, 350.000000, 390.000000, 432.000000, 476.000000, 522.000000, 570.000000, 620.000000, 672.000000, 726.000000, 782.000000, 840.000000, 900.000000, 962.000000, 1026.000000, 1092.000000, 1160.000000, 410.000000, 462.000000, 516.000000, 572.000000, 630.000000, 690.000000, 752.000000, 816.000000, 
882.000000, 950.000000, 1020.000000, 1092.000000, 1166.000000, 1242.000000, 1320.000000, 1400.000000, 1482.000000, 1566.000000, 1652.000000, 1740.000000, 1830.000000, 1922.000000, 2016.000000, 2112.000000, 2210.000000, 2310.000000, 2412.000000, 2516.000000, 2622.000000, 2730.000000, 2840.000000, 2952.000000, 3066.000000, 3182.000000, 3300.000000, 3420.000000, 3542.000000, 3666.000000, 3792.000000, 3920.000000, 2430.000000, 2542.000000, 2656.000000, 2772.000000, 2890.000000, 3010.000000, 3132.000000, 3256.000000, 3382.000000, 3510.000000, 3640.000000, 3772.000000, 3906.000000, 4042.000000, 4180.000000, 4320.000000, 4462.000000, 4606.000000, 4752.000000, 4900.000000, 3030.000000, 3162.000000, 3296.000000, 3432.000000, 3570.000000, 3710.000000, 3852.000000, 3996.000000, 4142.000000, 4290.000000, 4440.000000, 4592.000000, 4746.000000, 4902.000000, 5060.000000, 5220.000000, 5382.000000, 5546.000000, 5712.000000, 5880.000000 }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Multiply, { 0,2,3 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_4D_2) { + + auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5, 4 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000,0.181818,0.250000,0.307692,0.357143,0.400000,0.437500,0.470588,0.500000,0.526316,0.550000,0.571429, 0.590909,0.608696,0.625000,0.640000, 0.653846,0.666667,0.678571,0.689655, 2.100000,2.000000,1.916667, 1.846154, 1.785714, 1.733333,1.687500, 1.647059,1.611111, 1.578947,1.550000, 1.523810,1.500000, 1.478261,1.458333, 1.440000,1.423077, 1.407407,1.392857, 1.379310,4.100000, 3.818182,3.583333, 3.384615, 3.214286, 3.066667,2.937500, 2.823529,2.722222, 2.631579,2.550000, 2.476191,2.409091, 2.347826,2.291667, 2.240000,2.192308, 2.148148,2.107143, 2.068965,2.033333, 2.000000,1.968750, 1.939394,1.911765, 1.885714,1.861111, 1.837838,1.815789, 1.794872,1.775000, 1.756098,1.738095, 1.720930,1.704545, 1.688889,1.673913, 1.659575,1.645833,1.632653,2.700000,2.645161,2.593750,2.545455,2.500000,2.457143,2.416667,2.378378,2.342105,2.307692,2.275000,2.243902,2.214286,2.186047,2.159091,2.133333,2.108696,2.085106,2.062500,2.040816,3.366667,3.290323,3.218750,3.151515,3.088235,3.028571,2.972222,2.918919,2.868421,2.820513,2.775000,2.731707,2.690476,2.651163,2.613636,2.577778,2.543478,2.510638,2.479167,2.448980 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2,3 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_4D_3) { + + auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.454545, 0.545455, 0.636364, 0.727273, 0.750000, 0.833333, 0.916667, 1.000000, 1.000000, 1.076923, 1.153846, 1.230769, 1.214286, 1.285714, 1.357143, 1.428571, 2.100000, 2.200000, 2.300000, 2.400000, 2.272727, 2.363636, 2.454545, 
2.545455, 2.416667, 2.500000, 2.583333, 2.666667, 2.538461, 2.615385, 2.692308, 2.769231, 2.642857, 2.714286, 2.785714, 2.857143, 4.100000, 4.200000, 4.300000, 4.400000, 4.090909, 4.181818, 4.272727, 4.363636, 4.083333, 4.166667, 4.250000, 4.333333, 4.076923, 4.153846, 4.230769, 4.307693, 4.071429, 4.142857, 4.214286, 4.285714, 4.066667, 4.133333, 4.200000, 4.266667, 4.062500, 4.125000, 4.187500, 4.250000, 4.058824, 4.117647, 4.176471, 4.235294, 4.055555, 4.111111, 4.166667, 4.222222, 4.052631, 4.105263, 4.157895, 4.210526, 5.400000, 5.466667, 5.533333, 5.600000, 5.312500, 5.375000, 5.437500, 5.500000, 5.235294, 5.294117, 5.352941, 5.411765, 5.166667, 5.222222, 5.277778, 5.333333, 5.105263, 5.157895, 5.210526, 5.263158, 6.733333, 6.800000, 6.866667, 6.933333, 6.562500, 6.625000, 6.687500, 6.750000, 6.411765, 6.470588, 6.529412, 6.588235, 6.277778, 6.333333, 6.388889, 6.444445, 6.157895, 6.210526, 6.263158, 6.315790 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_4D_4) { + + // x[4, 12, 128, 128] * y[4, 1, 128, 1] = z[4, 12, 128, 128] + + auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 1, 5, 1 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.454545, 0.545455, 0.636364, 0.727273, 0.750000, 0.833333, 0.916667, 1.000000, 1.000000, 1.076923, 1.153846, 1.230769, 1.214286, 1.285714, 1.357143, 1.428571, 2.100000, 2.200000, 2.300000, 2.400000, 2.272727, 2.363636, 2.454545, 2.545455, 2.416667, 2.500000, 2.583333, 2.666667, 2.538461, 2.615385, 2.692308, 2.769231, 2.642857, 2.714286, 2.785714, 2.857143, 4.100000, 4.200000, 4.300000, 4.400000, 4.090909, 4.181818, 4.272727, 4.363636, 4.083333, 4.166667, 4.250000, 4.333333, 4.076923, 4.153846, 4.230769, 4.307693, 4.071429, 4.142857, 4.214286, 4.285714, 4.066667, 4.133333, 4.200000, 4.266667, 4.062500, 4.125000, 4.187500, 4.250000, 4.058824, 4.117647, 4.176471, 4.235294, 4.055555, 4.111111, 4.166667, 4.222222, 4.052631, 4.105263, 4.157895, 4.210526, 5.400000, 5.466667, 5.533333, 5.600000, 5.312500, 5.375000, 5.437500, 5.500000, 5.235294, 5.294117, 5.352941, 5.411765, 5.166667, 5.222222, 5.277778, 5.333333, 5.105263, 5.157895, 5.210526, 5.263158, 6.733333, 6.800000, 6.866667, 6.933333, 6.562500, 6.625000, 6.687500, 6.750000, 6.411765, 6.470588, 6.529412, 6.588235, 6.277778, 6.333333, 6.388889, 6.444445, 6.157895, 6.210526, 6.263158, 6.315790 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Divide(), y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_5D_1) { + // x[4, 12, 128, 128, 128] * y[4, 1, 128, 128, 128] = z[4, 12, 128, 128, 128] + auto x = NDArray('c', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 2, 1, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm 
+ auto e = NDArray('c', { 2, 3, 5, 4, 3 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 90.000000, 112.000000, 136.000000, 162.000000, 190.000000, 220.000000, 252.000000, 286.000000, 322.000000, 360.000000, 400.000000, 442.000000, 486.000000, 532.000000, 580.000000, 630.000000, 682.000000, 736.000000, 792.000000, 850.000000, 910.000000, 972.000000, 1036.000000, 1102.000000, 1170.000000, 1240.000000, 1312.000000, 1386.000000, 1462.000000, 1540.000000, 1620.000000, 1702.000000, 1786.000000, 1872.000000, 1960.000000, 2050.000000, 2142.000000, 2236.000000, 2332.000000, 2430.000000, 2530.000000, 2632.000000, 2736.000000, 2842.000000, 2950.000000, 3060.000000, 3172.000000, 3286.000000, 3402.000000, 3520.000000, 3640.000000, 3762.000000, 3886.000000, 4012.000000, 4140.000000, 610.000000, 682.000000, 756.000000, 832.000000, 910.000000, 990.000000, 1072.000000, 1156.000000, 1242.000000, 1330.000000, 1420.000000, 1512.000000, 1606.000000, 1702.000000, 1800.000000, 1900.000000, 2002.000000, 2106.000000, 2212.000000, 2320.000000, 2430.000000, 2542.000000, 2656.000000, 2772.000000, 2890.000000, 3010.000000, 3132.000000, 3256.000000, 3382.000000, 3510.000000, 3640.000000, 3772.000000, 3906.000000, 4042.000000, 4180.000000, 4320.000000, 4462.000000, 4606.000000, 4752.000000, 4900.000000, 5050.000000, 5202.000000, 5356.000000, 5512.000000, 5670.000000, 5830.000000, 5992.000000, 6156.000000, 6322.000000, 6490.000000, 6660.000000, 6832.000000, 7006.000000, 7182.000000, 7360.000000, 7540.000000, 7722.000000, 7906.000000, 8092.000000, 8280.000000, 1210.000000, 1342.000000, 1476.000000, 1612.000000, 1750.000000, 1890.000000, 2032.000000, 2176.000000, 2322.000000, 2470.000000, 2620.000000, 2772.000000, 2926.000000, 3082.000000, 3240.000000, 3400.000000, 3562.000000, 3726.000000, 3892.000000, 4060.000000, 4230.000000, 4402.000000, 4576.000000, 4752.000000, 4930.000000, 5110.000000, 5292.000000, 5476.000000, 5662.000000, 5850.000000, 6040.000000, 6232.000000, 6426.000000, 6622.000000, 6820.000000, 7020.000000, 7222.000000, 7426.000000, 7632.000000, 7840.000000, 8050.000000, 8262.000000, 8476.000000, 8692.000000, 8910.000000, 9130.000000, 9352.000000, 9576.000000, 9802.000000, 10030.000000, 10260.000000, 10492.000000, 10726.000000, 10962.000000, 11200.000000, 11440.000000, 11682.000000, 11926.000000, 12172.000000, 12420.000000, 12670.000000, 12922.000000, 13176.000000, 13432.000000, 13690.000000, 13950.000000, 14212.000000, 14476.000000, 14742.000000, 15010.000000, 15280.000000, 15552.000000, 15826.000000, 16102.000000, 16380.000000, 16660.000000, 16942.000000, 17226.000000, 17512.000000, 17800.000000, 18090.000000, 18382.000000, 18676.000000, 18972.000000, 19270.000000, 19570.000000, 19872.000000, 20176.000000, 20482.000000, 20790.000000, 21100.000000, 21412.000000, 21726.000000, 22042.000000, 22360.000000, 22680.000000, 23002.000000, 23326.000000, 23652.000000, 23980.000000, 24310.000000, 24642.000000, 24976.000000, 25312.000000, 25650.000000, 25990.000000, 26332.000000, 26676.000000, 27022.000000, 27370.000000, 27720.000000, 28072.000000, 28426.000000, 28782.000000, 29140.000000, 29500.000000, 29862.000000, 30226.000000, 30592.000000, 30960.000000, 16870.000000, 17182.000000, 17496.000000, 17812.000000, 18130.000000, 18450.000000, 18772.000000, 19096.000000, 19422.000000, 19750.000000, 20080.000000, 20412.000000, 20746.000000, 21082.000000, 21420.000000, 21760.000000, 22102.000000, 22446.000000, 22792.000000, 23140.000000, 23490.000000, 23842.000000, 24196.000000, 24552.000000, 24910.000000, 
25270.000000, 25632.000000, 25996.000000, 26362.000000, 26730.000000, 27100.000000, 27472.000000, 27846.000000, 28222.000000, 28600.000000, 28980.000000, 29362.000000, 29746.000000, 30132.000000, 30520.000000, 30910.000000, 31302.000000, 31696.000000, 32092.000000, 32490.000000, 32890.000000, 33292.000000, 33696.000000, 34102.000000, 34510.000000, 34920.000000, 35332.000000, 35746.000000, 36162.000000, 36580.000000, 37000.000000, 37422.000000, 37846.000000, 38272.000000, 38700.000000, 21070.000000, 21442.000000, 21816.000000, 22192.000000, 22570.000000, 22950.000000, 23332.000000, 23716.000000, 24102.000000, 24490.000000, 24880.000000, 25272.000000, 25666.000000, 26062.000000, 26460.000000, 26860.000000, 27262.000000, 27666.000000, 28072.000000, 28480.000000, 28890.000000, 29302.000000, 29716.000000, 30132.000000, 30550.000000, 30970.000000, 31392.000000, 31816.000000, 32242.000000, 32670.000000, 33100.000000, 33532.000000, 33966.000000, 34402.000000, 34840.000000, 35280.000000, 35722.000000, 36166.000000, 36612.000000, 37060.000000, 37510.000000, 37962.000000, 38416.000000, 38872.000000, 39330.000000, 39790.000000, 40252.000000, 40716.000000, 41182.000000, 41650.000000, 42120.000000, 42592.000000, 43066.000000, 43542.000000, 44020.000000, 44500.000000, 44982.000000, 45466.000000, 45952.000000, 46440.000000 }, nd4j::DataType::FLOAT32); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), y, z); + // z.printBuffer(); + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_5D_2) { + + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.181818, 0.250000, 0.307692, 0.357143, 0.400000, 0.437500, 0.470588, 0.500000, 0.526316, 0.550000, 0.571429, 0.590909, 0.608696, 0.625000, 0.640000, 0.653846, 0.666667, 0.678571, 0.689655, 0.700000, 0.709677, 0.718750, 0.727273, 0.735294, 0.742857, 0.750000, 0.756757, 0.763158, 0.769231, 0.775000, 0.780488, 0.785714, 0.790698, 0.795455, 0.800000, 0.804348, 0.808511, 0.812500, 0.816327, 0.820000, 0.823529, 0.826923, 0.830189, 0.833333, 0.836364, 0.839286, 0.842105, 0.844828, 0.847458, 0.850000, 0.852459, 0.854839, 0.857143, 0.859375, 0.861538, 0.863636, 0.865672, 0.867647, 0.869565, 6.100000, 5.636364, 5.250000, 4.923077, 4.642857, 4.400000, 4.187500, 4.000000, 3.833333, 3.684211, 3.550000, 3.428571, 3.318182, 3.217391, 3.125000, 3.040000, 2.961539, 2.888889, 2.821429, 2.758621, 2.700000, 2.645161, 2.593750, 2.545455, 2.500000, 2.457143, 2.416667, 2.378378, 2.342105, 2.307692, 2.275000, 2.243902, 2.214286, 2.186047, 2.159091, 2.133333, 2.108696, 2.085106, 2.062500, 2.040816, 2.020000, 2.000000, 1.980769, 1.962264, 1.944444, 1.927273, 1.910714, 1.894737, 1.879310, 1.864407, 1.850000, 1.836066, 1.822581, 1.809524, 1.796875, 1.784615, 1.772727, 1.761194, 1.750000, 1.739130, 12.100000, 11.090909, 10.250000, 9.538462, 8.928572, 8.400000, 7.937500, 7.529412, 7.166667, 6.842105, 6.550000, 6.285714, 6.045455, 5.826087, 5.625000, 5.440000, 5.269231, 5.111111, 4.964286, 4.827586, 4.700000, 4.580645, 4.468750, 4.363636, 4.264706, 4.171429, 4.083333, 4.000000, 3.921053, 3.846154, 3.775000, 3.707317, 3.642857, 3.581395, 3.522727, 3.466667, 3.413043, 3.361702, 3.312500, 3.265306, 3.220000, 
3.176471, 3.134615, 3.094340, 3.055556, 3.018182, 2.982143, 2.947368, 2.913793, 2.881356, 2.850000, 2.819672, 2.790323, 2.761905, 2.734375, 2.707692, 2.681818, 2.656716, 2.632353, 2.608696, 2.585714, 2.563380, 2.541667, 2.520548, 2.500000, 2.480000, 2.460526, 2.441558, 2.423077, 2.405063, 2.387500, 2.370370, 2.353658, 2.337349, 2.321429, 2.305882, 2.290698, 2.275862, 2.261364, 2.247191, 2.233333, 2.219780, 2.206522, 2.193548, 2.180851, 2.168421, 2.156250, 2.144330, 2.132653, 2.121212, 2.110000, 2.099010, 2.088235, 2.077670, 2.067308, 2.057143, 2.047170, 2.037383, 2.027778, 2.018349, 2.009091, 2.000000, 1.991071, 1.982301, 1.973684, 1.965217, 1.956897, 1.948718, 1.940678, 1.932773, 1.925000, 1.917355, 1.909836, 1.902439, 1.895161, 1.888000, 1.880952, 1.874016, 1.867188, 1.860465, 3.442857, 3.408451, 3.375000, 3.342466, 3.310811, 3.280000, 3.250000, 3.220779, 3.192308, 3.164557, 3.137500, 3.111111, 3.085366, 3.060241, 3.035714, 3.011765, 2.988372, 2.965517, 2.943182, 2.921348, 2.900000, 2.879121, 2.858696, 2.838710, 2.819149, 2.800000, 2.781250, 2.762887, 2.744898, 2.727273, 2.710000, 2.693069, 2.676471, 2.660194, 2.644231, 2.628572, 2.613208, 2.598131, 2.583333, 2.568807, 2.554545, 2.540540, 2.526786, 2.513274, 2.500000, 2.486957, 2.474138, 2.461539, 2.449152, 2.436975, 2.425000, 2.413223, 2.401639, 2.390244, 2.379032, 2.368000, 2.357143, 2.346457, 2.335938, 2.325581, 4.300000, 4.253521, 4.208333, 4.164383, 4.121622, 4.080000, 4.039474, 4.000000, 3.961539, 3.924051, 3.887500, 3.851852, 3.817073, 3.783133, 3.750000, 3.717647, 3.686047, 3.655172, 3.625000, 3.595506, 3.566667, 3.538461, 3.510870, 3.483871, 3.457447, 3.431579, 3.406250, 3.381443, 3.357143, 3.333333, 3.310000, 3.287129, 3.264706, 3.242718, 3.221154, 3.200000, 3.179245, 3.158879, 3.138889, 3.119266, 3.100000, 3.081081, 3.062500, 3.044248, 3.026316, 3.008696, 2.991379, 2.974359, 2.957627, 2.941176, 2.925000, 2.909091, 2.893443, 2.878049, 2.862903, 2.848000, 2.833333, 2.818898, 2.804688, 2.790698 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2,3,4 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_5D_3) { + + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + // recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.500000, 0.600000, 0.700000, 0.800000, 0.900000, 1.000000, 1.100000, 1.200000, 1.181818, 1.272727, 1.363636, 1.454545, 1.545455, 1.636364, 1.727273, 1.818182, 1.909091, 2.000000, 2.090909, 2.181818, 2.083333, 2.166667, 2.250000, 2.333333, 2.416667, 2.500000, 2.583333, 2.666667, 2.750000, 2.833333, 2.916667, 3.000000, 2.846154, 2.923077, 3.000000, 3.076923, 3.153846, 3.230769, 3.307692, 3.384615, 3.461539, 3.538461, 3.615385, 3.692308, 3.500000, 3.571429, 3.642857, 3.714286, 3.785714, 3.857143, 3.928571, 4.000000, 4.071429, 4.142857, 4.214286, 4.285714, 6.100000, 6.200000, 6.300000, 6.400000, 6.500000, 6.600000, 6.700000, 6.800000, 6.900000, 7.000000, 7.100000, 7.200000, 6.636364, 6.727273, 6.818182, 6.909091, 7.000000, 7.090909, 7.181818, 7.272727, 7.363636, 7.454545, 7.545455, 7.636364, 7.083333, 7.166667, 7.250000, 7.333333, 7.416667, 
7.500000, 7.583333, 7.666667, 7.750000, 7.833333, 7.916667, 8.000000, 7.461538, 7.538462, 7.615385, 7.692307, 7.769231, 7.846154, 7.923077, 8.000000, 8.076923, 8.153846, 8.230769, 8.307693, 7.785714, 7.857143, 7.928571, 8.000000, 8.071428, 8.142858, 8.214286, 8.285714, 8.357142, 8.428572, 8.500000, 8.571428, 12.100000, 12.200000, 12.300000, 12.400000, 12.500000, 12.600000, 12.700000, 12.800000, 12.900000, 13.000000, 13.100000, 13.200000, 12.090909, 12.181818, 12.272727, 12.363636, 12.454545, 12.545455, 12.636364, 12.727273, 12.818182, 12.909091, 13.000000, 13.090909, 12.083333, 12.166667, 12.250000, 12.333333, 12.416667, 12.500000, 12.583333, 12.666667, 12.750000, 12.833333, 12.916667, 13.000000, 12.076923, 12.153846, 12.230769, 12.307693, 12.384615, 12.461538, 12.538462, 12.615385, 12.692307, 12.769231, 12.846154, 12.923077, 12.071428, 12.142858, 12.214286, 12.285714, 12.357142, 12.428572, 12.500000, 12.571428, 12.642858, 12.714286, 12.785714, 12.857142, 12.066667, 12.133333, 12.200000, 12.266666, 12.333333, 12.400000, 12.466666, 12.533334, 12.600000, 12.666667, 12.733334, 12.800000, 12.062500, 12.125000, 12.187500, 12.250000, 12.312500, 12.375000, 12.437500, 12.500000, 12.562500, 12.625000, 12.687500, 12.750000, 12.058824, 12.117647, 12.176471, 12.235294, 12.294118, 12.352942, 12.411765, 12.470589, 12.529411, 12.588235, 12.647058, 12.705882, 12.055555, 12.111111, 12.166667, 12.222222, 12.277778, 12.333333, 12.388889, 12.444445, 12.500000, 12.555555, 12.611111, 12.666667, 12.052631, 12.105263, 12.157895, 12.210526, 12.263158, 12.315789, 12.368421, 12.421053, 12.473684, 12.526316, 12.578947, 12.631579, 16.066668, 16.133333, 16.200001, 16.266666, 16.333334, 16.400000, 16.466667, 16.533333, 16.600000, 16.666666, 16.733334, 16.799999, 15.812500, 15.875000, 15.937500, 16.000000, 16.062500, 16.125000, 16.187500, 16.250000, 16.312500, 16.375000, 16.437500, 16.500000, 15.588235, 15.647058, 15.705882, 15.764706, 15.823529, 15.882353, 15.941176, 16.000000, 16.058823, 16.117647, 16.176470, 16.235294, 15.388889, 15.444445, 15.500000, 15.555555, 15.611111, 15.666667, 15.722222, 15.777778, 15.833333, 15.888889, 15.944445, 16.000000, 15.210526, 15.263158, 15.315789, 15.368421, 15.421053, 15.473684, 15.526316, 15.578947, 15.631579, 15.684211, 15.736842, 15.789474, 20.066668, 20.133333, 20.200001, 20.266666, 20.333334, 20.400000, 20.466667, 20.533333, 20.600000, 20.666666, 20.733334, 20.799999, 19.562500, 19.625000, 19.687500, 19.750000, 19.812500, 19.875000, 19.937500, 20.000000, 20.062500, 20.125000, 20.187500, 20.250000, 19.117647, 19.176470, 19.235294, 19.294117, 19.352942, 19.411764, 19.470589, 19.529411, 19.588236, 19.647058, 19.705883, 19.764706, 18.722221, 18.777779, 18.833334, 18.888889, 18.944445, 19.000000, 19.055555, 19.111111, 19.166666, 19.222221, 19.277779, 19.333334, 18.368422, 18.421053, 18.473684, 18.526316, 18.578947, 18.631578, 18.684210, 18.736841, 18.789474, 18.842106, 18.894737, 18.947369 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + + ASSERT_EQ(e, z); +} +/////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, Test_broadcast_5D_4) { + + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto y = NDArray('f', { 2, 1, 5, 1, 1 }, nd4j::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + // 
recieved by main algorithm + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.500000, 0.600000, 0.700000, 0.800000, 0.900000, 1.000000, 1.100000, 1.200000, 1.181818, 1.272727, 1.363636, 1.454545, 1.545455, 1.636364, 1.727273, 1.818182, 1.909091, 2.000000, 2.090909, 2.181818, 2.083333, 2.166667, 2.250000, 2.333333, 2.416667, 2.500000, 2.583333, 2.666667, 2.750000, 2.833333, 2.916667, 3.000000, 2.846154, 2.923077, 3.000000, 3.076923, 3.153846, 3.230769, 3.307692, 3.384615, 3.461539, 3.538461, 3.615385, 3.692308, 3.500000, 3.571429, 3.642857, 3.714286, 3.785714, 3.857143, 3.928571, 4.000000, 4.071429, 4.142857, 4.214286, 4.285714, 6.100000, 6.200000, 6.300000, 6.400000, 6.500000, 6.600000, 6.700000, 6.800000, 6.900000, 7.000000, 7.100000, 7.200000, 6.636364, 6.727273, 6.818182, 6.909091, 7.000000, 7.090909, 7.181818, 7.272727, 7.363636, 7.454545, 7.545455, 7.636364, 7.083333, 7.166667, 7.250000, 7.333333, 7.416667, 7.500000, 7.583333, 7.666667, 7.750000, 7.833333, 7.916667, 8.000000, 7.461538, 7.538462, 7.615385, 7.692307, 7.769231, 7.846154, 7.923077, 8.000000, 8.076923, 8.153846, 8.230769, 8.307693, 7.785714, 7.857143, 7.928571, 8.000000, 8.071428, 8.142858, 8.214286, 8.285714, 8.357142, 8.428572, 8.500000, 8.571428, 12.100000, 12.200000, 12.300000, 12.400000, 12.500000, 12.600000, 12.700000, 12.800000, 12.900000, 13.000000, 13.100000, 13.200000, 12.090909, 12.181818, 12.272727, 12.363636, 12.454545, 12.545455, 12.636364, 12.727273, 12.818182, 12.909091, 13.000000, 13.090909, 12.083333, 12.166667, 12.250000, 12.333333, 12.416667, 12.500000, 12.583333, 12.666667, 12.750000, 12.833333, 12.916667, 13.000000, 12.076923, 12.153846, 12.230769, 12.307693, 12.384615, 12.461538, 12.538462, 12.615385, 12.692307, 12.769231, 12.846154, 12.923077, 12.071428, 12.142858, 12.214286, 12.285714, 12.357142, 12.428572, 12.500000, 12.571428, 12.642858, 12.714286, 12.785714, 12.857142, 12.066667, 12.133333, 12.200000, 12.266666, 12.333333, 12.400000, 12.466666, 12.533334, 12.600000, 12.666667, 12.733334, 12.800000, 12.062500, 12.125000, 12.187500, 12.250000, 12.312500, 12.375000, 12.437500, 12.500000, 12.562500, 12.625000, 12.687500, 12.750000, 12.058824, 12.117647, 12.176471, 12.235294, 12.294118, 12.352942, 12.411765, 12.470589, 12.529411, 12.588235, 12.647058, 12.705882, 12.055555, 12.111111, 12.166667, 12.222222, 12.277778, 12.333333, 12.388889, 12.444445, 12.500000, 12.555555, 12.611111, 12.666667, 12.052631, 12.105263, 12.157895, 12.210526, 12.263158, 12.315789, 12.368421, 12.421053, 12.473684, 12.526316, 12.578947, 12.631579, 16.066668, 16.133333, 16.200001, 16.266666, 16.333334, 16.400000, 16.466667, 16.533333, 16.600000, 16.666666, 16.733334, 16.799999, 15.812500, 15.875000, 15.937500, 16.000000, 16.062500, 16.125000, 16.187500, 16.250000, 16.312500, 16.375000, 16.437500, 16.500000, 15.588235, 15.647058, 15.705882, 15.764706, 15.823529, 15.882353, 15.941176, 16.000000, 16.058823, 16.117647, 16.176470, 16.235294, 15.388889, 15.444445, 15.500000, 15.555555, 15.611111, 15.666667, 15.722222, 15.777778, 15.833333, 15.888889, 15.944445, 16.000000, 15.210526, 15.263158, 15.315789, 15.368421, 15.421053, 15.473684, 15.526316, 15.578947, 15.631579, 15.684211, 15.736842, 15.789474, 20.066668, 20.133333, 20.200001, 20.266666, 20.333334, 20.400000, 20.466667, 20.533333, 20.600000, 20.666666, 20.733334, 20.799999, 19.562500, 19.625000, 19.687500, 19.750000, 19.812500, 19.875000, 19.937500, 20.000000, 20.062500, 20.125000, 20.187500, 20.250000, 19.117647, 19.176470, 19.235294, 
19.294117, 19.352942, 19.411764, 19.470589, 19.529411, 19.588236, 19.647058, 19.705883, 19.764706, 18.722221, 18.777779, 18.833334, 18.888889, 18.944445, 19.000000, 19.055555, 19.111111, 19.166666, 19.222221, 19.277779, 19.333334, 18.368422, 18.421053, 18.473684, 18.526316, 18.578947, 18.631578, 18.684210, 18.736841, 18.789474, 18.842106, 18.894737, 18.947369 }, nd4j::DataType::FLOAT32); + + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + e.assign(eC); + + x.linspace(1.f); + y.linspace(10.f); + z.assign(0.f); + + x.applyTrueBroadcast(BroadcastOpsTuple::Divide(), y, z); + + ASSERT_EQ(e, z); +} + +// @Test +// public void testMmulRank4_simple(){ + +// INDArray arr1 = Nd4j.ones(DataType.FLOAT, 32, 12, 128, 64); +// INDArray arr2 = Nd4j.ones(DataType.FLOAT, 32, 12, 128, 64); + +// DynamicCustomOp op = DynamicCustomOp.builder("matmul") +// .addInputs(arr1, arr2) +// .addIntegerArguments(0, 1) //Transpose arr2 only +// .build(); + +// List shapes = op.calculateOutputShape(); +// assertEquals(1, shapes.size()); +// long[] shape = new long[]{32,12,128,128}; +// assertArrayEquals(shape, shapes.get(0).getShape()); + +// INDArray out = Nd4j.create(DataType.FLOAT, shape); + +// op.setOutputArgument(0, out); +// Nd4j.exec(op); +// // System.out.println(out); + +// INDArray exp = Nd4j.valueArrayOf(shape, 64.0, DataType.FLOAT); //Each entry in output is sum of 64 (1.0 x 1.0) multiplications +// assertEquals(exp, out); +// } + + diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index d154039f3..199630d4e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -584,6 +584,11 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_1) { } TEST_F(DeclarableOpsTests15, test_check_numeric_2) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {3},{1.f, 2.f, std::numeric_limits::infinity()}); auto y = NDArrayFactory::string("should trigger"); auto z = NDArrayFactory::create('c', {3} ); @@ -598,6 +603,11 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_2) { } TEST_F(DeclarableOpsTests15, test_check_numeric_3) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {3},{1.f, 2.f, std::numeric_limits::quiet_NaN()}); auto y = NDArrayFactory::string("should trigger"); auto z = NDArrayFactory::create('c', {3} ); @@ -1530,6 +1540,10 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test10) { } TEST_F(DeclarableOpsTests15, Pow_BP_Test11) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif NDArray xB('c', { 3,2,1 }, { .4, 3, 5, .8, -9, -12 }, nd4j::DataType::FLOAT32); NDArray yB('c', { 1,2,3 }, { 3, -2, .4, -4, 10, .8 }, nd4j::DataType::FLOAT32); @@ -1560,3 +1574,447 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test11) { delete resultsB; } +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP1) { + + NDArray A('c', { 1, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 1, 2, 4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.1 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 1, 2, 3 }, { 3.3, 8.5, 13.36, 3.7, 9.54, 15. 
}, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 1, 2, 4 }, { 3.38, 4.04, 4.7, 5.13, 3.83, 4.58, 5.33, 5.82 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,0,1, 2,0,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP2) { + + NDArray A('c', { 1, 2, 3 }, { 2,2,2, 2,2,2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 1, 2, 3 }, { 3,3,3,3, 3,3 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 1 }, { 1 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(B.isSameShape(*dLdAbp)); + ASSERT_TRUE(B.equalsTo(*dLdAbp)); + + ASSERT_TRUE(A.isSameShape(*dLdBbp)); + ASSERT_TRUE(A.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP3) { + + NDArray A('c', { 3, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 4, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + + NDArray dA('c', { 3, 2, 2 }, { 3.9, 4., 4.1, 4.2, 9.82, 10.08, 10.34, 10.6, 15.74, 16.16, 16.58, 17. 
}, nd4j::DataType::FLOAT32); + NDArray dB('c', { 4, 2, 2 }, { 4.07, 4.22, 4.37, 4.52, 4.82, 5., 5.18, 5.36, 5.57, 5.78, 5.99, 6.2, 6.32, 6.56, 6.8, 7.04 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP4) { + + NDArray A('c', { 3, 4, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 2 }, { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 3, 4, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84 , 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP5) { + + NDArray A('c', { 3, 4, 1, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 2, 1 }, { 1.1,1.2,1.3,1.4,1.5,1.6 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 3, 4, 1, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84, 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP6) { + + NDArray A('c', { 2, 2, 2 }, { 2,2, 2,2, 2,2, 2,2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 2, 2 }, { 3,3, 3,3, 3,3, 3,3 }, nd4j::DataType::FLOAT32); + + auto dLdC = NDArrayFactory::create(1.f); + + nd4j::ops::tensormmul_bp op_bp; + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 3,0,1,2, 3,0,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(B.isSameShape(*dLdAbp)); + ASSERT_TRUE(B.equalsTo(*dLdAbp)); + + ASSERT_TRUE(A.isSameShape(*dLdBbp)); + 
ASSERT_TRUE(A.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP7) { + + NDArray A('c', { 3, 4, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 2, 1 }, { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 3, 4, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84, 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP8) { + + NDArray A('c', { 1, 1, 4, 3 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 1, 1, 4, 2 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 2 }, { 1.1,1.2,1.3,1.4,1.5,1.6 }, nd4j::DataType::FLOAT32); + + NDArray dLdA('c', { 1, 1, 4, 3 }, { 20., 23.4, 26.8, 23.35, 27.25, 31.15, 3.97, 4.67, 5.37, 20.88, 24.66, 28.44 }, nd4j::DataType::FLOAT32); + NDArray dLdB('c', { 1, 1, 4, 2 }, { 11.84, 12.68, 39.98, 43.192, 20.65, 22.36, 165.7, 178.4 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 3,0,1,2, 3,0,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dLdA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dLdA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dLdB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dLdB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP9) { + + NDArray A('c', { 3, 2, 2, 1 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 4, 2, 2 ,1 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 4, 1 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + + NDArray dA('c', { 3, 2, 2, 1 }, { 3.9, 4., 4.1, 4.2, 9.82, 10.08, 10.34, 10.6, 15.74, 16.16, 16.58, 17. 
}, nd4j::DataType::FLOAT32); + NDArray dB('c', { 4, 2, 2, 1 }, { 4.07, 4.22, 4.37, 4.52, 4.82, 5., 5.18, 5.36, 5.57, 5.78, 5.99, 6.2, 6.32, 6.56, 6.8, 7.04 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP10) { + + NDArray A('c', { 1, 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 1, 2, 2 ,4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 1, 3, 1, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + + + NDArray dA('c', { 1, 2, 2, 3 }, { 3.3, 8.5, 13.7, 3.7, 9.54, 15.38, 4.1, 10.58, 17.06, 4.5, 11.62, 18.74 }, nd4j::DataType::FLOAT32); + NDArray dB('c', { 1, 2, 2, 4 }, { 3.38, 4.04, 4.7, 5.36, 3.83, 4.58, 5.33, 6.08, 4.28, 5.12, 5.96, 6.8, 4.73, 5.66, 6.59, 7.52 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP11) { + + NDArray A('c', { 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 2 ,4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + + + NDArray dA('c', { 2, 2, 3 }, { 3.3, 8.5, 13.7, 3.7, 9.54, 15.38, 4.1, 10.58, 17.06, 4.5, 11.62, 18.74 }, nd4j::DataType::FLOAT32); + NDArray dB('c', { 2, 2, 4 }, { 3.38, 4.04, 4.7, 5.36, 3.83, 4.58, 5.33, 6.08, 4.28, 5.12, 5.96, 6.8, 4.73, 5.66, 6.59, 7.52 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,0,1, 2,0,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP12) { + + NDArray A('c', { 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); + NDArray B('c', { 2, 2 ,3 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2 }, nd4j::DataType::FLOAT32); + NDArray dLdC('c', { 2, 3, 2, 3 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2., 2.1, 2.2, 
2.3, 2.4, + 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, nd4j::DataType::FLOAT32); + + NDArray dA('c', { 2, 2, 3 }, { 7.66, 20.26, 32.86, 8.29, 21.97, 35.65, 45.46, 58.06, 70.66, 49.33, 63.01, 76.69 }, nd4j::DataType::FLOAT32); + NDArray dB('c', { 2, 2, 3 }, { 25.86, 27.36, 28.86, 28.74, 30.42, 32.1, 30.36, 31.86, 33.36, 33.78, 35.46, 37.14 }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP13) { + + NDArray A('c', { 3, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::DOUBLE); + NDArray B('c', { 3, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2 }, nd4j::DataType::DOUBLE); + NDArray dLdC('c', { 3, 2, 3, 2 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2., 2.1, 2.2, 2.3, 2.4, + 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, nd4j::DataType::DOUBLE); + + NDArray dA('c', { 3, 2, 2 }, { 7.79, 20.57, 8.21, 21.71, 33.35, 46.13, 35.21, 48.71, 58.91, 71.69, 62.21, 75.71 }, nd4j::DataType::DOUBLE); + NDArray dB('c', { 3, 2, 2 }, { 26.49, 28.02, 28.41, 30.06, 29.55, 31.08, 31.71, 33.36, 32.61, 34.14, 35.01, 36.66 }, nd4j::DataType::DOUBLE); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP14) { + + NDArray A('c', { 2, 2, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, nd4j::DataType::DOUBLE); + + NDArray B('c', { 2, 2, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::DOUBLE); + + NDArray dLdC('c', { 2, 2, 2, 2, 2, 2 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, + 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::DOUBLE); + + NDArray dA('c', { 2, 2, 2, 2 }, { 13.88, 37.24, 13.88, 37.24, 15.32, 41.24, 15.32, 41.24, 13.88, 37.24, 13.88, 37.24, 15.32, 41.24, 15.32, 41.24 }, nd4j::DataType::DOUBLE); + NDArray dB('c', { 2, 2, 2, 2 }, { 10.76, 12.88, 15., 17.12, 12.36, 14.8, 17.24, 19.68, 19.24, 21.36, 23.48, 25.6, 22.12, 24.56, 27., 29.44 }, nd4j::DataType::DOUBLE); + + nd4j::ops::tensormmul_bp op_bp; + + auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); + + ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); + + auto* dLdAbp = resultsBP->at(0); + auto* dLdBbp = resultsBP->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdAbp)); + ASSERT_TRUE(dA.equalsTo(*dLdAbp)); + + 
ASSERT_TRUE(dB.isSameShape(*dLdBbp)); + ASSERT_TRUE(dB.equalsTo(*dLdBbp)); + + delete resultsBP; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP15) { + + NDArray A('c', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::FLOAT32); + NDArray B('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::FLOAT32); + + NDArray dLdC('f', { 2, 2 }, { 23.0, 24.44, 2.0, 26. }, nd4j::DataType::FLOAT32); + + NDArray dA('c', { 2, 2, 3 }, { 27., 127., 227., 77., 177., 277., 76.44, 278.20001, 479.96002, 177.32, 379.08001, 580.839966 }, nd4j::DataType::FLOAT32); + NDArray dB('f', { 2, 2, 3 }, { 194.08, 184., 336.4, 268., 241.52, 212., 383.839996, 296., 288.96002, 240., 431.27999, 324. }, nd4j::DataType::FLOAT32); + + nd4j::ops::tensormmul_bp op; + auto results = op.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2,2,1,2 }); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto* dLdA = results->at(0); + auto* dLdB = results->at(1); + + ASSERT_TRUE(dA.isSameShape(*dLdA)); + ASSERT_TRUE(dA.equalsTo(*dLdA)); + + ASSERT_TRUE(dB.isSameShape(*dLdB)); + ASSERT_TRUE(dB.equalsTo(*dLdB)); + + delete results; +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP16) { + + NDArray A('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); + NDArray B('c', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); + + NDArray dLdC('c', { 2, 2 }, nd4j::DataType::DOUBLE); + + const OpArgsHolder argsHolderFF({ &A, &B }, {}, { 2,1,2, 2,1,2 }); + const OpArgsHolder argsHolderBP({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }); + + nd4j::ops::tensormmul op; + nd4j::ops::tensormmul_bp op_bp; + + const bool isGradCorrect = GradCheck::checkGrad(op, op_bp, argsHolderFF, argsHolderBP, {1,0}); + ASSERT_TRUE(isGradCorrect); +} +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests15, TestTensorMmul_BP17) { + + NDArray A('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); + NDArray B('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. 
}, nd4j::DataType::DOUBLE); + + NDArray dLdC('c', { 2, 2 }, nd4j::DataType::DOUBLE); + + const OpArgsHolder argsHolderFF({ &A, &B }, {}, { 2,1,2, 2,1,2 }); + const OpArgsHolder argsHolderBP({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }); + + nd4j::ops::tensormmul op; + nd4j::ops::tensormmul_bp op_bp; + + const bool isGradCorrect = GradCheck::checkGrad(op, op_bp, argsHolderFF, argsHolderBP, { 1,0 }); + ASSERT_TRUE(isGradCorrect); +} + diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp index 93864af8c..2c7737a31 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp @@ -48,5 +48,16 @@ TEST_F(DeclarableOpsTests18, test_bitcast_1) { auto status = op.execute({&x}, {&z}, {}, {(Nd4jLong) nd4j::DataType::INT64}, {}); ASSERT_EQ(Status::OK(), status); + ASSERT_EQ(e, z); +} + +TEST_F(DeclarableOpsTests18, test_tanh_1) { + auto x = NDArrayFactory::create('c', {8}, {0.23f, -0.23f, 0.35f, -0.35f, 0.64f, -0.64f, 100000.f, -100000.f}); + auto z = x.ulike(); + auto e = NDArrayFactory::create('c', {8}, {0.226028f, -0.226028f, 0.336376f, -0.336376f, 0.564900f, -0.564900f, 1.f, -1.f}); + + nd4j::ops::tanh op; + op.execute({&x}, {&z}); + ASSERT_EQ(e, z); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp index 9883a9d79..b0a547a7d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp @@ -66,4 +66,14 @@ TEST_F(DeclarableOpsTests19, test_conv1d_bp_1) { delete result; +} + +TEST_F(DeclarableOpsTests19, test_squeeze_1) { + auto x = NDArrayFactory::create('c', {3, 4, 1}); + auto e = NDArrayFactory::create('c', {3, 4}); + int axis = 2; + + nd4j::ops::squeeze op; + auto status = op.execute({&x}, {&e}, {axis}); + ASSERT_EQ(Status::OK(), status); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp index fa129b1af..029a392f7 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp @@ -397,27 +397,6 @@ TEST_F(DeclarableOpsTests2, NLP_Cbow_Test_1) { delete result; } -TEST_F(DeclarableOpsTests2, YetAnotherMatmulTest_1) { - auto A = NDArrayFactory::create('c', {3, 3}); - auto B = NDArrayFactory::create('c', {3, 1}); - auto exp = NDArrayFactory::create('c', {3, 1}, {14.00f, 32.00f, 50.00f}); - - A.linspace(1); - B.linspace(1); - - nd4j::ops::matmul op; - - auto result = op.evaluate({&A, &B}, {}, {}); - - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - TEST_F(DeclarableOpsTests2, Test_Squeeze_1) { auto x = NDArrayFactory::create('c', {2, 1, 3, 1, 1, 1, 4}); x.linspace(1); @@ -578,246 +557,6 @@ TEST_F(DeclarableOpsTests2, Test_Concat_BP_1) { } -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot5) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {2,4,2,4}, {44,110,160, 66,132, 38, 88,154, 68,170,224,102,204, 82,136,238, 92,230,288,138,276,126,184,322, 
116,290,352,174,348,170,232,406, 76,190,160,114,228,182,152,266, 100,250,224,150,300,226,200,350, 124,310,288,186,372,270,248,434, 148,370,352,222,444,314,296,518}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot6) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {2,4,2,4}, {22, 66,110,154, 44, 88,132,176, 34,102,170,238, 68,136,204,272, 46,138,230,322, 92,184,276,368, 58,174,290,406,116,232,348,464, 38,114,190,266, 76,152,228,304, 50,150,250,350,100,200,300,400, 62,186,310,434,124,248,372,496, 74,222,370,518,148,296,444,592}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot7) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {2,4,2,4}, {76,166,112,106,196, 62,136,226, 60,174,208, 98,212,230,136,250, 76,214,336,122,260,174,168,306, 124,286,240,178,340,150,232,394, 100,226,176,142,268,106,184,310, 84,234,272,134,284,274,184,334, 100,274,400,158,332,218,216,390, 148,346,304,214,412,194,280,478}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot8) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {2,4,2,4}, {30, 90,150,210, 60,120,180,240, 38,114,190,266, 76,152,228,304, 46,138,230,322, 92,184,276,368, 54,162,270,378,108,216,324,432, 42,126,210,294, 84,168,252,336, 50,150,250,350,100,200,300,400, 58,174,290,406,116,232,348,464, 66,198,330,462,132,264,396,528}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot9) { - - // NDArray z('f',{2,2,3}, nd4j::DataType::DOUBLE); - // z.linspace(1); - // z.printShapeInfo(); - // z.printIndexedBuffer(); - // 
z.reshapei('c', {4,3}); - // z.printShapeInfo(); - // z.printIndexedBuffer(); - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {3,4,4,3}, {14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {1,0,1,0}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; -} - - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot10) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {4,4}, {114,258,402,546, 138,314,490,666, 162,370,578,786, 186,426,666,906}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot11) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {4,4}, {98,218,338,458, 134,302,470,638, 170,386,602,818, 206,470,734,998}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot12) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {4,4}, {272,292,312,332, 368,396,424,452, 464,500,536,572, 560,604,648,692}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - 
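For readers tracing the integer arguments used by tensormmul and tensormmul_bp in the tests above and below (this encoding is inferred from the shapes in these tests, not quoted from documentation): the list reads {na, a_axes..., nb, b_axes...}, i.e. the number of contraction axes for the first input followed by those axes, then the same for the second input. As a worked example, TestTensorDot10 passes {2,0,1, 2,0,2}: x of shape {2,3,4} is contracted over axes {0,1} and y of shape {2,4,3} over axes {0,2}, leaving x axis 2 (size 4) and y axis 1 (size 4), which matches the {4,4} expected output. The tensormmul_bp tests above appear to use the same encoding when shaping dLdA and dLdB.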
-//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot13) { - - auto x = NDArrayFactory::create('c', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {3,3}, {640,560,640, 576,624,576, 640,560,640}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot14) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {3,3}, {648,600,520, 648,536,648, 520,600,648}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests2, TestTensorDot15) { - - auto x = NDArrayFactory::create('f', {2,3,4}, {1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15, 1,3,5,7,9,11,13,15}); - auto y = NDArrayFactory::create('f', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); - auto expected = NDArrayFactory::create('c', {3,3}, {624,624,624, 656,656,656, 624,624,624}); - - nd4j::ops::tensormmul op; - auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *result = results->at(0); - - ASSERT_TRUE(expected.isSameShape(result)); - ASSERT_TRUE(expected.equalsTo(result)); - - delete results; - -} //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_1) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp index 04816b2b2..e7e95afcb 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp @@ -789,120 +789,6 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_Validation_2) { } } -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_1) { - auto x= NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); - auto y= NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); - auto exp= NDArrayFactory::create('f', {4, 4}, {38.0, 44.0, 50.0, 56.0, 83.0, 98.0, 113.0, 128.0, 128.0, 152.0, 176.0, 200.0, 173.0, 206.0, 239.0, 272.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {1, 1}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - - delete result; -} - - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_2) { - auto x= NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); - auto y= NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 
12}); - auto exp= NDArrayFactory::create('f', {3, 3}, {70.0, 158.0, 246.0, 80.0, 184.0, 288.0, 90.0, 210.0, 330.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {0, 0}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - - delete result; -} - - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_3) { - auto x= NDArrayFactory::create('c', {1, 3}, {1, 2, 3}); - auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {1, 0}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - //z->printIndexedBuffer("z"); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_4) { - auto x= NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); - auto y= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); - auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {0, 1}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - //z->printIndexedBuffer("z"); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_5) { - auto x= NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); - auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - //z->printIndexedBuffer("z"); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests3, Test_Manual_Gemm_6) { - auto x= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); - auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - auto exp= NDArrayFactory::create('f', {4, 4}, {1,2, 3, 4,2,4, 6, 8,3,6, 9,12,4,8,12,16}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - //z->printIndexedBuffer("z"); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - TEST_F(DeclarableOpsTests3, Test_ReverseDivide_1) { auto x= NDArrayFactory::create('c', {1, 3}, {2, 2, 2}); auto y= NDArrayFactory::create('c', {1, 3}, {4, 6, 8}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp index f04d24395..1fb700779 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp @@ -809,26 +809,6 @@ TEST_F(DeclarableOpsTests4, Test_Reshape_Again) { delete result; } -TEST_F(DeclarableOpsTests4, Test_Gemv_Transpose_1) { - auto x = NDArrayFactory::create('c', {4, 3}); - auto y = NDArrayFactory::create('c', {4, 1}); - auto exp = NDArrayFactory::create('c',{ 3, 1}, {70, 80, 90}); - - x.linspace(1); - y.linspace(1); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {1, 0}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - 
ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - TEST_F(DeclarableOpsTests4, Test_Split_1) { auto x = NDArrayFactory::create('c', {5, 30}); auto sizes = NDArrayFactory::create('c', {1, 3}, {4, 15, 11}); @@ -1166,57 +1146,6 @@ TEST_F(DeclarableOpsTests4, Test_Cross_3) { delete result; } -TEST_F(DeclarableOpsTests4, Test_Matmul_YATS_1) { - auto a = NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); - auto exp = NDArrayFactory::create('c', {3}, {30, 70, 110}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&a, &b}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests4, Test_Matmul_YATS_2) { - auto a = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); - auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - auto exp = NDArrayFactory::create('c', {3}, {70, 80, 90}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&a, &b}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - -TEST_F(DeclarableOpsTests4, Test_Matmul_YATS_3) { - auto a = NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - auto exp = NDArrayFactory::create('c', {1, 3}, {70, 80, 90}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&a, &b}); - ASSERT_EQ(ND4J_STATUS_OK, result->status()); - - auto z = result->at(0); - - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete result; -} - TEST_F(DeclarableOpsTests4, Test_Add_119) { auto a = NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index 0a6f8e5e8..7a9bc1648 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -5019,20 +5019,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_7) { delete result; } -TEST_F(DeclarableOpsTests7, Test_Matmul_Once_Again) { - auto x = NDArrayFactory::create('c', {1, 2}, {2.0f, 2.0f}); - auto y = NDArrayFactory::create('c', {2, 1}, {2.0f, 2.0f}); - auto exp = NDArrayFactory::create('c', {1, 1}, {8.0f}); - - nd4j::ops::matmul op; - auto result = op.evaluate({&x, &y}, {}, {}); - ASSERT_EQ(Status::OK(), result->status()); - - ASSERT_EQ(exp, *result->at(0)); - - delete result; -} - TYPED_TEST(TypedDeclarableOpsTests7, Test_Pnorm_Once_Again) { auto input = NDArrayFactory::create('c', {1, 1, 5, 5}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f}); auto exp = NDArrayFactory::create('c', {1, 1, 5, 5}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 2c4655b31..773e1dc18 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -300,6 +300,8 @@ 
TEST_F(DeclarableOpsTests9, concat_test3) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); + output->printBuffer(); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -620,12 +622,12 @@ TEST_F(DeclarableOpsTests9, concat_test18) { // we crate bunch of arrays, filled with specific values for (int e = 0; e < 2000; e++) { - auto array = NDArrayFactory::create_('c', {1, 300}); + auto array = NDArrayFactory::create_('c', {1, 300}); array->assign(e); context.setInputArray(e, array, true); } - auto z = NDArrayFactory::create('c', {2000, 300}); + auto z = NDArrayFactory::create('c', {2000, 300}); context.setOutputArray(0, &z, false); context.setIArguments(&axis, 1); @@ -633,8 +635,10 @@ TEST_F(DeclarableOpsTests9, concat_test18) { op.execute(&context); for (int e = 0; e < 2000; e++) { + auto exp = NDArrayFactory::create('c', {300}); + exp.assign(e); auto row = z.tensorAlongDimension(e, {1}); - ASSERT_NEAR((float) e, row.e(0), 1e-5f); + ASSERT_EQ(exp, row); } } @@ -932,208 +936,6 @@ TEST_F(DeclarableOpsTests9, tile_test1) { delete results; } - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test1) { - - auto x = NDArrayFactory::create('c', {3, 4}); - auto y = NDArrayFactory::create('c', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123., 40., 92., 144., 45., 105., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test2) { - - auto x = NDArrayFactory::create('c', {3, 4}); - auto y = NDArrayFactory::create('f', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test3) { - - auto x = NDArrayFactory::create('f', {3, 4}); - auto y = NDArrayFactory::create('c', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test4) { - - auto x = NDArrayFactory::create ('f', {3, 4}); - auto y = NDArrayFactory::create('f', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 79., 123.,40., 92., 144.,45.,105., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - 
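For context on the matmul tests removed above and below, and not part of the patch itself: their integer arguments are transpose flags, covering {transposeX, transposeY} plus an optional third flag that transposes the result (see matmul_test13 further down). A minimal illustrative fragment, with shapes taken from the deleted Test_Manual_Gemm cases and the <double> type assumed for illustration:

auto x = NDArrayFactory::create<double>('c', {3, 4});
auto y = NDArrayFactory::create<double>('c', {4, 3});
x.linspace(1.);
y.linspace(0.5, 0.5);

nd4j::ops::matmul op;

// no integer args: plain x * y, result shape {3, 3}
auto plain = op.evaluate({&x, &y}, {}, {});
// {1, 1}: transpose both inputs first, i.e. x^T * y^T, result shape {4, 4}
auto transposed = op.evaluate({&x, &y}, {}, {1, 1});

ASSERT_EQ(ND4J_STATUS_OK, plain->status());
ASSERT_EQ(ND4J_STATUS_OK, transposed->status());

delete plain;
delete transposed;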
-////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test5) { - - auto x = NDArrayFactory::create('c', {4, 3}); - auto y = NDArrayFactory::create('c', {4, 3}); - auto exp = NDArrayFactory::create('f', {3, 3}, {83., 94., 105., 94., 107., 120., 105., 120., 135.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test6) { - - auto x = NDArrayFactory::create('c', {4, 3}); - auto y = NDArrayFactory::create('f', {3, 4}); - auto exp = NDArrayFactory::create('f', {3, 3}, {35., 40., 45., 79., 92., 105., 123., 144., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test7) { - - auto x = NDArrayFactory::create('c', {5, 3,4}); - auto y = NDArrayFactory::create('f', {5, 3,4}); - auto exp = NDArrayFactory::create('f',{5, 3,3}, {3. , 84.6, 281.4, 593.4, 1020.6, 7. , 107.8, 323.8, 655. , 1101.4,11. , 131. , 366.2, 716.6, 1182.2, - 7. , 107.8, 323.8, 655. , 1101.4,17.4, 137.4, 372.6, 723. , 1188.6,27.8, 167. , 421.4, 791. , 1275.8, - 11. , 131. , 366.2, 716.6, 1182.2,27.8, 167. , 421.4, 791. , 1275.8,44.6, 203. , 476.6, 865.4, 1369.4,}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test8) { - - auto x = NDArrayFactory::create('c', {2,5, 3,4}); - auto y = NDArrayFactory::create('f', {2,5, 3,4}); - auto exp = NDArrayFactory::create('f',{2,5, 3,3}, {3. , 1563. , 84.6, 2220.6, 281.4, 2993.4, 593.4, 3881.4,1020.6, 4884.6, 7. , 1663. , 107.8, 2339.8, 323.8, 3131.8, 655. , 4039. ,1101.4, 5061.4, - 11. , 1763. , 131. , 2459. , 366.2, 3270.2, 716.6, 4196.6,1182.2, 5238.2, 7. , 1663. , 107.8, 2339.8, 323.8, 3131.8, 655. , 4039. ,1101.4, 5061.4, - 17.4, 1769.4, 137.4, 2465.4, 372.6, 3276.6, 723. , 4203. ,1188.6, 5244.6, 27.8, 1875.8, 167. , 2591. , 421.4, 3421.4, 791. , 4367. ,1275.8, 5427.8, - 11. , 1763. , 131. , 2459. , 366.2, 3270.2, 716.6, 4196.6,1182.2, 5238.2, 27.8, 1875.8, 167. , 2591. , 421.4, 3421.4, 791. , 4367. ,1275.8, 5427.8, - 44.6, 1988.6, 203. , 2723. 
, 476.6, 3572.6, 865.4, 4537.4,1369.4, 5617.4}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test9) { - - auto x = NDArrayFactory::create('c', {2,5, 4,3}); - auto y = NDArrayFactory::create('f', {2,5, 3,4}); - auto exp = NDArrayFactory::create('f',{2,5, 3,3}, {7. , 1639. , 103. , 2311. , 314.2, 3098.2, 640.6, 4000.6,1082.2, 5018.2, 8. , 1664. , 108.8, 2340.8, 324.8, 3132.8, 656. , 4040. ,1102.4, 5062.4, - 9. , 1689. , 114.6, 2370.6, 335.4, 3167.4, 671.4, 4079.4,1122.6, 5106.6, 15.8, 1743.8, 131. , 2435. , 361.4, 3241.4, 707. , 4163. ,1167.8, 5199.8, - 18.4, 1770.4, 138.4, 2466.4, 373.6, 3277.6, 724. , 4204. ,1189.6, 5245.6, 21. , 1797. , 145.8, 2497.8, 385.8, 3313.8, 741. , 4245. ,1211.4, 5291.4, - 24.6, 1848.6, 159. , 2559. , 408.6, 3384.6, 773.4, 4325.4,1253.4, 5381.4, 28.8, 1876.8, 168. , 2592. , 422.4, 3422.4, 792. , 4368. ,1276.8, 5428.8, - 33. , 1905. , 177. , 2625. , 436.2, 3460.2, 810.6, 4410.6,1300.2, 5476.2}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, TestDropout_BP_1) { @@ -1325,325 +1127,6 @@ TEST_F(DeclarableOpsTests9, Test_AlphaDropout_BP_1) { delete ress2; } -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test10) { - - auto x = NDArrayFactory::create('c', {1, 4, 3}); - auto y = NDArrayFactory::create('f', {1, 3, 4}); - auto exp = NDArrayFactory::create('f', {1, 3, 3}, {35., 40., 45., 79., 92., 105., 123., 144., 165.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test11) { - - auto x = NDArrayFactory::create('c', {4, 1}); - auto y = NDArrayFactory::create('f', {1, 4}); - auto exp = NDArrayFactory::create('f', {1, 1}, {15}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - ASSERT_EQ(Status::OK(), results->status()); - - auto z = results->at(0); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test12) { - - auto x = NDArrayFactory::create('c', {1, 4, 1}); - auto y = NDArrayFactory::create('f', {1, 1, 4}); - auto exp = NDArrayFactory::create('f', {1, 1, 1}, {15}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - - ASSERT_EQ(Status::OK(), results->status()); - auto z = results->at(0); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - 
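One more reviewer note, not part of the patch: the deleted matmul_test7 through matmul_test12 exercise the rank-3 path, where the leading dimension acts as a batch and each {3,4} x {4,3} slice is multiplied independently, giving a {5,3,3} output. A hedged sketch of that usage, with arbitrary data and the <double> type assumed for illustration:

auto x = NDArrayFactory::create<double>('c', {5, 3, 4});
auto y = NDArrayFactory::create<double>('c', {5, 4, 3});
auto exp = NDArrayFactory::create<double>('c', {5, 3, 3});   // used for the shape check only
x.linspace(1.);
y.linspace(0.1, 0.1);

nd4j::ops::matmul op;
auto results = op.evaluate({&x, &y}, {}, {});

ASSERT_EQ(ND4J_STATUS_OK, results->status());
ASSERT_TRUE(exp.isSameShape(results->at(0)));

delete results;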
-////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test13) { - - auto x = NDArrayFactory::create('c', {2, 3}); - auto y = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('f', {5, 2}, {23. , 26. , 29. , 32. , 35., 50. , 57.5, 65. , 72.5, 80.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {0, 0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test14) { - - auto x = NDArrayFactory::create('c', {3, 2}); - auto y = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('f', {5, 2}, {37. , 41.5, 46. , 50.5, 55., 46. , 52. , 58. , 64. , 70.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test15) { - - auto x = NDArrayFactory::create('c', {3, 2}); - auto y = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('f', {5, 2}, {37. , 41.5, 46. , 50.5, 55., 46. , 52. , 58. , 64. , 70.}); - - x.linspace(1.); - y.linspace(0.5, 0.5); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test16) { - - auto x = NDArrayFactory::create('c', {2,2, 3,5}); - auto y = NDArrayFactory::create('c', {2,2, 4,3}); - auto exp = NDArrayFactory::create('f',{2,2, 4,5}, {4.6, 281.8, 89.2, 582.4, 10. , 314.2,108.1, 628.3, 15.4, 346.6,127. , 674.2, 20.8, 379. ,145.9, 720.1, 5.2, 289.6, 93.4, 593.8, - 11.5, 322.9,113.2, 640.6, 17.8, 356.2,133. , 687.4, 24.1, 389.5,152.8, 734.2, 5.8, 297.4, 97.6, 605.2, 13. , 331.6,118.3, 652.9, - 20.2, 365.8,139. , 700.6, 27.4, 400. ,159.7, 748.3, 6.4, 305.2,101.8, 616.6, 14.5, 340.3,123.4, 665.2, 22.6, 375.4,145. , 713.8, - 30.7, 410.5,166.6, 762.4, 7. , 313. ,106. , 628. , 16. , 349. ,128.5, 677.5, 25. , 385. ,151. , 727. , 34. , 421. 
,173.5, 776.5}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test17) { - - auto x = NDArrayFactory::create('f', {4, 3}); - auto y = NDArrayFactory::create('c', {4}); - auto exp = NDArrayFactory::create('f',{3}, {7., 8., 9.}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 0}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test18) { - - auto x = NDArrayFactory::create('f', {3}); - auto y = NDArrayFactory::create('c', {4, 3}); - auto exp = NDArrayFactory::create('f',{4}, {1.4, 3.2, 5., 6.8}); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {0, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test19) { - - auto x = NDArrayFactory::create('f', {1, 1}); - auto y = NDArrayFactory::create('c', {1, 1}); - auto exp = NDArrayFactory::create('f',{1, 1}, {0.2}); - - x.linspace(2.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test20) { - - auto x = NDArrayFactory::create('f', {1, 1}); - auto y = NDArrayFactory::create('c', {1, 1}); - auto exp = NDArrayFactory::create('f',{1, 1}, {0.2}); - - x.linspace(2.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1,1,1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test21) { - - auto x = NDArrayFactory::create('f', {1}); - auto y = NDArrayFactory::create('c', {1, 1}); - auto exp = NDArrayFactory::create('f',{1}, {0.2}); - - x.linspace(2.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test22) { - - auto x = NDArrayFactory::create('f', {1,1}); - auto y = NDArrayFactory::create('c', {1}); - auto exp = NDArrayFactory::create('f',{1}, {0.2}); - - x.linspace(2.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), 
results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test23) { - - auto x = NDArrayFactory::create('f', {4}); - auto y = NDArrayFactory::create('c', {4}); - auto exp = NDArrayFactory::create(3.); - - x.linspace(1.); - y.linspace(0.1, 0.1); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - - -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, matmul_test24) { - - auto x = NDArrayFactory::create('f', {1}, {2.}); - auto y = NDArrayFactory::create('c', {1}, {3.}); - auto exp = NDArrayFactory::create(6.); - - nd4j::ops::matmul op; - auto results = op.evaluate({&x, &y}, {}, {1, 1}); - auto z = results->at(0); - - ASSERT_EQ(Status::OK(), results->status()); - ASSERT_TRUE(exp.isSameShape(z)); - ASSERT_TRUE(exp.equalsTo(z)); - - delete results; -} - TEST_F(DeclarableOpsTests9, test_range_int_1) { auto x0 = NDArrayFactory::create(0); auto x1 = NDArrayFactory::create(2); @@ -2043,34 +1526,6 @@ TEST_F(DeclarableOpsTests9, cumprod_test1) { const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1}, {1, 1},GradCheck::MEAN); ASSERT_TRUE(isGradCorrect); - - //************************************// -/* exclusive = 1; reverse = 0; - - result = op.execute({&inputC, &axis}, {}, {exclusive, reverse}); - ASSERT_EQ(Status::OK(), result->status()); - z = result->at(0); - ASSERT_TRUE(expTF.equalsTo(z)); - delete result; -*/ - //************************************// -/* exclusive = 0; reverse = 1; - - result = op.execute({&inputC, &axis}, {}, {exclusive, reverse}); - ASSERT_EQ(Status::OK(), result->status()); - z = result->at(0); - ASSERT_TRUE(expFT.equalsTo(z)); - delete result; -*/ - //************************************// -/* exclusive = 1; reverse = 1; - - result = op.execute({&inputC, &axis}, {}, {exclusive, reverse}); - ASSERT_EQ(Status::OK(), result->status()); - z = result->at(0); - ASSERT_TRUE(expTT.equalsTo(z)); - delete result; -*/ } //////////////////////////////////////////////////////////////////////////////// @@ -2079,11 +1534,6 @@ TEST_F(DeclarableOpsTests9, cumprod_test2) { auto inputC = NDArrayFactory::create('c', {2, 2}); auto axis = NDArrayFactory::create(1.); -// auto expFF = NDArrayFactory::create('c', {3, 5}, {1., 2., 6., 24., 120., 6., 42., 336., 3024., 30240.,11., 132.,1716., 24024.,360360.}); -// auto expTF = NDArrayFactory::create('c', {3, 5}, {1, 1, 2, 6, 24,1, 6, 42, 336, 3024,1, 11, 132, 1716, 24024}); - -// auto expFT = NDArrayFactory::create('c', {3, 5}, {120, 120, 60, 20, 5,30240, 5040, 720, 90, 10,360360, 32760, 2730, 210, 15}); //+++ -// auto expTT = NDArrayFactory::create('c', {3, 5}, {120, 60, 20, 5, 1,5040, 720, 90, 10, 1,32760, 2730, 210, 15, 1}); auto gradO = NDArrayFactory::create('c', {2, 2}); int exclusive, reverse; diff --git a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp index 3717c488b..a234e6d50 100644 --- a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp @@ -161,23 +161,6 @@ TEST_F(EmptyTests, Test_Reshape_1) { delete result; } -TEST_F(EmptyTests, Test_Reshape_2) { - auto vector = NDArrayFactory::create('c', {1}, {119.0f}); - 
auto exp = NDArrayFactory::create(119.0f); - auto empty = NDArrayFactory::empty_(); - - nd4j::ops::reshape op; - auto result = op.evaluate({&vector, empty}, {}, {}, {}, {}, true); - - ASSERT_EQ(Status::OK(), result->status()); - - ASSERT_EQ(exp, *result->at(0)); - ASSERT_EQ(exp, vector); - - delete empty; - delete result; -} - TEST_F(EmptyTests, Test_Reshape_3) { auto x = NDArrayFactory::create('c', {1, 0, 0, 2}); auto y = NDArrayFactory::create('c', {2}, {10, 0}); diff --git a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp index d83e85f67..b01c9f98a 100644 --- a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp @@ -64,8 +64,11 @@ TEST_F(MklDnnTests, helpers_includer) { nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CPU maxpool3d_bp; nd4j::ops::platforms::PLATFORM_lrn_ENGINE_CPU lrn; + nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CPU batchnorm; - printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm}); + nd4j::ops::platforms::PLATFORM_matmul_ENGINE_CPU matmul; + + printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm, &matmul}); #endif } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp index e3dc1aefc..6d5366396 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp @@ -956,7 +956,7 @@ TEST_F(NDArrayTest2, subarray_1) { float buffExpX3[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000}; Nd4jLong shapeExpX4[] = {3, 2, 1, 4, 12, 4, 1, 8192, 0, 99}; float buffExpX4[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000}; - Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 0, 99}; + Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 4, 99}; float buffExpX5[] = {4.000000, 8.000000, 12.000000, 16.000000, 20.000000, 24.000000}; Nd4jLong shapeExpY0[] = {1, 2, 1, 8192, 1, 102}; diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 9f75beca1..93fb5d6b3 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -65,6 +65,246 @@ TEST_F(PlaygroundTests, test_avx) { nd4j_printf("Optimal level: %i; Binary level: %i;\n", ::optimalLevel(), ::binaryLevel()); } + +TEST_F(PlaygroundTests, test_biasAdd_1) { + auto x = NDArrayFactory::create('c', {512, 3072}); + auto y = NDArrayFactory::create('c', {3072}); + + std::vector values; + + nd4j::ops::biasadd op; + + for (int e = 0; e < 100; e++) { + auto timeStart = std::chrono::system_clock::now(); + + op.execute({&x, &y}, {&x}); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); +} + + + +TEST_F(PlaygroundTests, test_bert_1) { + // this test will run ONLY if this model exists + if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb") < 0) + return; + + auto graph = 
GraphExecutioner::importFromFlatBuffers("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb"); + + auto t = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext.numpy"); + auto u = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_1.numpy"); + auto v = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_4.numpy"); + auto z = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model_output.numpy"); + + //graph->printOut(); + + graph->tagInplaceNodes(); + + graph->getVariableSpace()->putVariable(85,0, t); + graph->getVariableSpace()->putVariable(86,0, u); + graph->getVariableSpace()->putVariable(87,0, v); + +/* + // validating graph now + auto status = GraphExecutioner::execute(graph); + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); + + auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); + ASSERT_EQ(z, *array); +*/ + + nd4j::Environment::getInstance()->setProfiling(true); + auto profile = GraphProfilingHelper::profile(graph, 1); + + profile->printOut(); + + nd4j::Environment::getInstance()->setProfiling(false); + delete profile; + +/* + std::vector values; + + for (int e = 0; e < 1; e++) { + auto timeStart = std::chrono::system_clock::now(); + + GraphExecutioner::execute(graph); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); +*/ + delete graph; +} + +TEST_F(PlaygroundTests, test_bert_2) { + // this test will run ONLY if this model exists + if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_like_ops.fb") < 0) + return; + + auto graph = GraphExecutioner::importFromFlatBuffers("/home/raver119/Downloads/Bert_minimal_model/bert_like_ops.fb"); + + //graph->printOut(); + + graph->tagInplaceNodes(); + + +/* + // validating graph now + auto status = GraphExecutioner::execute(graph); + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); + + auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); + ASSERT_EQ(z, *array); +*/ + + nd4j::Environment::getInstance()->setProfiling(true); + auto profile = GraphProfilingHelper::profile(graph, 1); + + profile->printOut(); + + nd4j::Environment::getInstance()->setProfiling(false); + delete profile; + +/* + std::vector values; + + for (int e = 0; e < 1; e++) { + auto timeStart = std::chrono::system_clock::now(); + + GraphExecutioner::execute(graph); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); +*/ + delete graph; +} + +TEST_F(PlaygroundTests, test_one_off_ops_1) { + auto x = NDArrayFactory::create('c', {4, 128, 768}); + auto y = NDArrayFactory::create('c', {4, 128, 1}); + auto z = x.ulike(); + + nd4j::ops::squaredsubtract op; + op.execute({&x, &y}, {&z}); +} + + +/* + +TEST_F(PlaygroundTests, test_broadcast_1) { + int pool = 1000; + std::vector aX(pool); + std::vector aY(pool); + std::vector aZ(pool); + + for (int e = 0; e < pool; e++) { + aX[e] = 
NDArrayFactory::create_('c', {512, 3072}); + aY[e] = NDArrayFactory::create_('c', {3072}); + aZ[e] = NDArrayFactory::create_('c', {512, 3072}); + + aX[e]->assign(119 * (e+1)); + aY[e]->assign(119 * (e+3)); + } + + std::vector values; + Context ctx(1); + + nd4j::ops::biasadd op; + + for (int e = 0; e < 1000; e++) { + auto x = aX[e < pool ? e : e % pool]; + auto y = aY[e < pool ? e : e % pool]; + auto z = aZ[e < pool ? e : e % pool]; + + auto timeStart = std::chrono::system_clock::now(); + + //op.execute({x, y}, {z}); + nd4j::ops::helpers::addBias(ctx, *x, *y, *z, false); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); + + for (int e = 0; e < pool; e++) { + delete aX[e]; + delete aY[e]; + delete aZ[e]; + } +} + + +/* +TEST_F(PlaygroundTests, test_broadcast_1) { + int pool = 500; + std::vector aX(pool); + std::vector aY(pool); + std::vector aZ(pool); + + for (int e = 0; e < pool; e++) { + aX[e] = NDArrayFactory::create_('c', {512, 3072}); + aY[e] = NDArrayFactory::create_('c', {768}); + aZ[e] = NDArrayFactory::create_('c', {512, 3072}); + + aX[e]->assign( (e+1) / 119); + aY[e]->assign( (e+3) / 119); + } + + + + std::vector values; + + for (int e = 0; e < 1000; e++) { + auto x = aX[e < pool ? e : e % pool]; + auto y = aY[e < pool ? e : e % pool]; + auto z = aZ[e < pool ? e : e % pool]; + + auto timeStart = std::chrono::system_clock::now(); + + //x->applyTrueBroadcast(BroadcastOpsTuple::Multiply(), *y, *z); + x->applyTransform(transform::Tanh, *z, nullptr); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); + + for (int e = 0; e < pool; e++) { + delete aX[e]; + delete aY[e]; + delete aZ[e]; + } +} + +*/ /* TEST_F(PlaygroundTests, test_s_0) { diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp index a8f430fe3..fb0d7991a 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp @@ -43,7 +43,7 @@ public: Nd4jLong shape[3] = {3,4,5}; Nd4jLong *shapeBuffer; ThreeDTest() { - shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); } ~ThreeDTest() { delete[] shapeBuffer; @@ -196,11 +196,11 @@ public: int dimensionLength = 2; int dimension[2] = {2,3}; Nd4jLong tadAssertionC[10] = {3,4,4,1,4,1,16,16384,1,99}; - Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,1,16384,1,102}; + Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,16,16384,1,102}; }; -TEST_F(LeadingOnes,OnesTest) { +TEST_F(LeadingOnes,OnesTest) { shape::TAD *cTad = new shape::TAD; cTad->init(shapeBufferC,dimension,dimensionLength); @@ -222,7 +222,7 @@ TEST_F(LeadingOnes,OnesTest) { class NormalThreeFourFive : public testing::Test { public: - Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 0, 102}; + Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 5, 99}; Nd4jLong inputShapeBuffer[10] = {3,3,4,5,20,5,1,16384,1,99}; int dimensionLength = 2; int dimension[2] = {0,1}; @@ -243,7 +243,7 @@ class DimensionWarning : public testing::Test { public: int dimensionLength = 
2; int dimensions[2] = {0,1}; - Nd4jLong shape[3] = {1,5,1}; + Nd4jLong shape[3] = {1,5,1}; Nd4jLong *shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); ~DimensionWarning() { @@ -324,7 +324,7 @@ public: int dimensionFour = 0; int dimensionLength = 1; FourDTest() { - threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); + threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); fourDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 4, fourDShape); } ~FourDTest() { @@ -491,7 +491,7 @@ TEST_F(LabelTest,LabelTad) { delete tad; } -TEST_F(ExpectedValuesTest,TadTest) { +TEST_F(ExpectedValuesTest,TadTest) { auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, mainShape); shape::TAD *tad = new shape::TAD; tad->init(shapeBuffer,testDimensions,3); @@ -528,7 +528,7 @@ TEST_F(ThreeDTest,TensorAlongDimensionTest) { } -TEST_F(NumTadTests,TadTest) { +TEST_F(NumTadTests,TadTest) { auto shape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, this->shape); shape::TAD *tad = new shape::TAD; tad->init(shape,&dimension,1); @@ -539,7 +539,7 @@ TEST_F(NumTadTests,TadTest) { } TEST_F(TADStall,TestStall) { - auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); shape::TAD *tad = new shape::TAD; tad->init(0,shapeInfo,this->dimensions,3); tad->createTadOnlyShapeInfo(); @@ -564,7 +564,7 @@ TEST_F(PermuteTest,PermuteShapeBufferTest) { Nd4jLong shapeToPermute[4] = {5,3,2,6}; Nd4jLong permutedOrder[4] = {6,2,3,5}; auto shapeBufferOriginal = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); - auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); + auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); shape::permuteShapeBufferInPlace(shapeBufferOriginal,normalOrder,shapeBufferOriginal); EXPECT_TRUE(arrsEquals(4,assertionShapeBuffer,shapeBufferOriginal)); @@ -585,9 +585,9 @@ TEST_F(ElementWiseStrideTest,ElementWiseStrideTest) { TEST_F(SliceVectorTest,RowColumnVectorTest) { Nd4jLong rowVectorShape[2] = {1,5}; - auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); + auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); Nd4jLong colVectorShape[2] = {5,1}; - auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, colVectorShape); + auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, colVectorShape); Nd4jLong *sliceRow = shape::sliceOfShapeBuffer(0,rowVectorShapeInfo); EXPECT_TRUE(arrsEquals(2,rowVectorShapeInfo,sliceRow)); Nd4jLong *scalarSliceInfo = shape::createScalarShapeInfo(); @@ -608,7 +608,7 @@ TEST_F(SliceTensorTest,TestSlice) { Nd4jLong shape[3] = {3,3,2}; auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); Nd4jLong sliceShape[2] = {3,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong 
*testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -619,9 +619,9 @@ TEST_F(SliceTensorTest,TestSlice) { TEST_F(SliceMatrixTest,TestSlice) { Nd4jLong shape[2] = {3,2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); Nd4jLong sliceShape[2] = {1,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -664,13 +664,13 @@ TEST_F(TensorTwoFromFourDDimTest,TadTwoFromFourDimTest) { //Along dimension 1,2: expect matrix with shape [cols,dim2] //Along dimension 1,3: expect matrix with shape [cols,dim3] //Along dimension 2,3: expect matrix with shape [dim2,dim3] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; EXPECT_TRUE(arrsEquals(shape::rank(expectedShapeBuffer),expectedShape,shape::shapeOf(testShapeBuffer))); @@ -687,14 +687,14 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { //Along dimension 0,1: expect matrix with shape [rows,cols] //Along dimension 0,2: expect matrix with shape [rows,dim2] //Along dimension 1,2: expect matrix with shape [cols,dim2] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; Nd4jLong *expectedStride = expectedStrides[i]; @@ -715,7 +715,7 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { TEST_F(TensorOneDimTest,TadDimensionsForTensor) { Nd4jLong shape[3] = {rows,cols,dim2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); for(int i = 0; i < rank; i++) { //Along dimension 0: expect row vector with length 'dims[i]' @@ -737,14 +737,14 @@ TEST_F(TensorOneDimTest,TadDimensionsForTensor) { TEST_F(MatrixTest,TadDimensionsForMatrix) { Nd4jLong shape[2] = {rows,cols}; - auto 
shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); shape::TAD *dimZero = new shape::TAD; dimZero->init(shapeBuffer,&dims[0],1); shape::TAD *dimOne = new shape::TAD; dimOne->init(shapeBuffer,&dims[1],1); //Along dimension 0: expect row vector with length 'rows' - Nd4jLong rowVectorShape[2] = {1,rows}; + Nd4jLong rowVectorShape[2] = {1,rows}; auto expectedDimZeroShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); dimZero->createTadOnlyShapeInfo(); Nd4jLong *testDimZero = dimZero->tadOnlyShapeInfo; @@ -753,7 +753,7 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { delete[] expectedDimZeroShape; //Along dimension 1: expect row vector with length 'cols' - Nd4jLong rowVectorColShape[2] {1,cols}; + Nd4jLong rowVectorColShape[2] {1,cols}; auto expectedDimOneShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorColShape); dimOne->createTadOnlyShapeInfo(); Nd4jLong *testDimOneShape = dimOne->tadOnlyShapeInfo; @@ -767,12 +767,12 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { } TEST_F(VectorTest,VectorTadShape) { - Nd4jLong rowVector[2] = {2,2}; + Nd4jLong rowVector[2] = {2,2}; auto rowBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVector); int rowDimension = 1; Nd4jLong columnVector[2] = {2,2}; - auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); + auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); int colDimension = 0; @@ -811,7 +811,7 @@ TEST_F(VectorTest,LinspaceCombinationTest) { int len = rows * cols; double *linspaced = linspace(1,rows * cols,len); Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); delete[] shapeBuffer; delete[] linspaced; diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index a852a0c4c..fbba329e3 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -273,7 +273,7 @@ add_executable(runtests ${LOOPS_SOURCES} ../../blas/cpu/NativeOps.cpp ../../blas ../../include/cnpy/cnpy.cpp ../../include/nd4jmemset.h ../../include/nd4jmalloc.h ../../blas/Environment.cpp ../../blas/Environment.h ${EXEC_SOURCES} ${HELPERS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} - ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES} ../../include/loops/cpu/compilation_units/TrueBroadcastHelper_1.cpp) + ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES}) target_link_libraries(runtests gtest ${MKLDNN} gtest_main ${BLAS_LIBRARIES}) diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index e7ddcda11..db2c941e9 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -4250,14 +4250,20 @@ public native 
@Cast("bool") boolean isOptimalRequirementsMet(); * set new order and shape in case of suitable array length (in-place operation) * order - order to set * shape - shape to set - * + * copyToNewBuff - if true then old buffer will be copied to new buffer if last one will be allocated after reshaping * if there was permute applied before or there are weird strides, then new buffer is allocated for array */ + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -4267,8 +4273,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -6203,6 +6212,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include @Namespace("nd4j::graph") @NoOffset public static class NodeProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ @@ -6235,11 +6245,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setObjectsSize(@Cast("Nd4jLong") long bytes); public native void setTotalSize(@Cast("Nd4jLong") long bytes); + public native void addInputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native @Cast("Nd4jLong") long getActivationsSize(); public native @Cast("Nd4jLong") long getTemporarySize(); public native @Cast("Nd4jLong") long getObjectsSize(); public native @Cast("Nd4jLong") long getTotalSize(); + public native @Cast("Nd4jLong") long getExecutionTime(); + public native @StdString @ByRef @Cast({"char*", "std::string*"}) BytePointer name(); public native void merge(NodeProfile other); @@ -6835,9 +6854,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") LongBuffer oldShape, int newRank, @Cast("Nd4jLong*") LongBuffer newShape, @Cast("bool") boolean isFOrder); @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") long[] oldShape, int newRank, @Cast("Nd4jLong*") long[] newShape, @Cast("bool") boolean isFOrder); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongPointer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongBuffer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") long[] oldShapeInfo, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + /** + * newShapeInfo contains rank, shape and order only, no strides/ews/type + */ + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] 
oldShapeInfo, @Cast("Nd4jLong*") long[] newShapeInfo); /** * Get the shape info buffer @@ -7145,6 +7170,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongPointer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongBuffer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") long[] shapeInfo); + + /** + * shape - input inShape is shape only, not shapeInfo + * returns number of non-unity dimensions in inShape + */ + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongPointer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongBuffer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") long[] inShape); + /** * Returns whether the * given shape is a vector or not @@ -7163,9 +7197,9 @@ public static final int PREALLOC_SIZE = 33554432; * Returns the shape portion of an information * buffer */ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] shapeInfo); /** * Return a copy of a buffer. 
@@ -7708,18 +7742,18 @@ public static final int PREALLOC_SIZE = 33554432; * @return the double at the specified index */ - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords, @Cast("Nd4jLong") long 
baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords); @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer createShapeInfo(@Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer stride, int rank); @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer createShapeInfo(@Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer stride, int rank); @@ -7903,40 +7937,22 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong*") LongBuffer offsets); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets, byte order/*='c'*/); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, 
@Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); - // deduce element-wise stride - // if array is scalar or unit length vector then ews = 1 - // if array is common vector then ews = stride of non-unity dimension - // if strides are normal set ews = 1, otherwise ews = 0 - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len); - // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are 
normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); /** * processes whole set of sub-arrays @@ -7946,7 +7962,7 @@ public static final int PREALLOC_SIZE = 33554432; * numOfSubArrs - number of sub-arrays, size of subArrOffsets is equal to numOfSubArrs * dimsSize - size of dimsToExclude, if dimsSize = array rank or dimsSize = 0 it means sub-array is whole array, copy of wholeShapeInfo and one zero offset will be returned * dimsToExclude - MUST BE SORTED, dimensions to evaluate sub-array along, i.e. 
when shape is [2,3,4,5] and dimsToExclude={0,2}, then there will be 8 sub-arrays with shape [3,5] - * subArrShapeInfo - output argument, contains shapeInfo common for all sub-arrays + * subArrShapeInfo - output argument, contains shapeInfo (same for all sub-arrays) * subArrOffsets - output argument, contains successive sub-arrays offsets from original this-buffer * keepUnitiesInShape - if false then eliminate unities from sub-array shapeInfo, for example {1,a,1,b} -> {a,b} */ @@ -7957,6 +7973,24 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets, @Cast("bool") boolean keepUnitiesInShape/*=false*/); @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets); + /** + * for example inShapeInfo is {3, 2,1,4, 4,4,1, 16384,1,99} + * then output shapeNoUnities will contain {2,4, 4,1} - that is only shape and strides, no rank/type/ews/order + * stridesNoUnities will point on strides in shapeNoUnities that is on {4,1} + * returns number of non-unity dimensions in inShapeInfo + * if there is no unities in inShapeInfo, then no copy procedure will be performed and shapeNoUnities/stridesNoUnities will point on corresponding places in inShapeInfo + */ + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongPointer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef long[] stridesNoUnities); + + /** + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, int dimsSize, @Const IntPointer dimsToExclude, @Cast("Nd4jLong*") LongPointer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, int dimsSize, @Const IntBuffer dimsToExclude, @Cast("Nd4jLong*") LongBuffer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] outShapeInfo); @@ -8186,6 +8220,8 @@ public static final int PREALLOC_SIZE = 33554432; * @param rank the rank of the shape */ +////////////////////////////////////////////////////////////////////// + /** * Returns whether the * given shape is a vector or not @@ -8735,69 +8771,60 @@ public static final int PREALLOC_SIZE = 33554432; // return true; // } 
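Editorial aside (not part of the patch): the hunks above rename getOffset's index argument to coords and add shape::excludeUnitiesFromShapeInfo, whose javadoc gives the worked example {3, 2,1,4, 4,4,1, 16384,1,99} -> shape {2,4} with strides {4,1}. The plain-Java sketch below illustrates both behaviours under the same shapeInfo layout assumption as the previous sketch. It is illustrative only: it always copies, whereas the native version returns pointers into the original buffer when no unity dimensions are present, and the real getOffset also has ews-based shortcuts.

public class ShapeInfoOffsets {
    // Linear buffer offset for the given coordinates: baseOffset + sum of coords[i] * strides[i]
    // (what shape::getOffset computes; strides start at index rank + 1 of the shapeInfo buffer).
    static long getOffset(long[] shapeInfo, long[] coords, long baseOffset) {
        int rank = (int) shapeInfo[0];
        long offset = baseOffset;
        for (int i = 0; i < rank; i++)
            offset += coords[i] * shapeInfo[1 + rank + i];
        return offset;
    }

    // Drops unity dimensions (and their strides) from a shapeInfo buffer and returns the
    // remaining {shape..., strides...} pairs, as in the excludeUnitiesFromShapeInfo javadoc.
    static long[] excludeUnities(long[] shapeInfo) {
        int rank = (int) shapeInfo[0];
        java.util.List<Long> shape = new java.util.ArrayList<>();
        java.util.List<Long> strides = new java.util.ArrayList<>();
        for (int i = 0; i < rank; i++) {
            if (shapeInfo[1 + i] != 1) {
                shape.add(shapeInfo[1 + i]);
                strides.add(shapeInfo[1 + rank + i]);
            }
        }
        long[] out = new long[shape.size() * 2];
        for (int i = 0; i < shape.size(); i++) {
            out[i] = shape.get(i);
            out[shape.size() + i] = strides.get(i);
        }
        return out;
    }

    public static void main(String[] args) {
        long[] shapeInfo = {3, 2, 1, 4, 4, 4, 1, 16384, 1, 99};                   // shape [2,1,4], strides [4,4,1]
        System.out.println(getOffset(shapeInfo, new long[]{1, 0, 2}, 0));          // 1*4 + 0*4 + 2*1 = 6
        System.out.println(java.util.Arrays.toString(excludeUnities(shapeInfo)));  // [2, 4, 4, 1]
    }
}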
-// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, const bool isFOrder, Nd4jLong* newShapeInfo) { +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { // // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements // // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo -// const int newOrder = isFOrder ? 102 : 99; -// const int oldOrder = oldShapeInfo[2 * oldRank + 3]; - // newShapeInfo[0] = newRank; // memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); -// Nd4jLong* newStrides = shape::stride(newShapeInfo); -// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); +// Nd4jLong* newStrides = shape::stride(newShapeInfo); +// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); // const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); -// int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; - +// Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; // while (newStart < newRank && oldStart < oldRank) { // newDim = newShape[newStart]; // oldDim = oldShape[oldStart]; -// while (newDim != oldDim) +// while (newDim != oldDim && newDim > 0 && oldDim > 0) // if (newDim < oldDim) newDim *= newShape[newStop++]; // else oldDim *= oldShape[oldStop++]; // // ------ Check whether the original axes can be combined ------ // -// for (int i = oldStart; i < oldStop - 1; i++) { - -// if(oldShape[i] == 1) { // ignore strides like {...,1,1,...} -// if(oldOrder == 102) ++oldStart; +// for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { +// if(oldShape[i] == 1) // skip unity-dimension and its stride // continue; -// } - -// if(oldOrder == 102 && oldStrides[i + 1] != oldShape[i] * oldStrides[i]) -// return false; // not contiguous enough -// if(oldOrder == 99 && oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) -// return false; // not contiguous enough +// while((i + step) < oldRank && oldShape[i + step] == 1) +// ++step; // skip following unity-dimensions and its strides if such are present +// if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) +// return false; // not contiguous enough // } -// // ------ Calculate new strides for all axes currently worked with ------ // -// if(isFOrder) { -// newStrides[newStart] = oldStrides[oldStart]; -// for (int i = newStart + 1; i < newStop; ++i) -// newStrides[i] = newStrides[i - 1] * newShape[i - 1]; -// } -// else { -// newStrides[newStop - 1] = oldStrides[oldStop - 1]; -// for (int i = newStop - 1; i > newStart; --i) -// newStrides[i - 1] = newStrides[i] * newShape[i]; -// } +// newStrides[newStop - 1] = oldStrides[oldStop - 1]; +// for (int i = newStop - 1; i > newStart; --i) +// newStrides[i - 1] = newStrides[i] * newShape[i]; // newStart = newStop++; // oldStart = oldStop++; // } -// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order -// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews -// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type +// // rest of strides should be unities (if there is remainder in strides 
space, that is newStart < newRank) +// for (int i = newStart; i < newRank; ++i) +// newStrides[i] = 1; + +// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order +// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews +// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type // return true; // } +////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// // this function checks the consistence of dimensions with array rank (negative dimensions, too large dimensions, too big number of dimensions) @@ -8838,9 +8865,198 @@ public static final int PREALLOC_SIZE = 33554432; ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); +// const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); +// const char zOrder = shape::order(zShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); + +// if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = zOffsets = nullptr; +// } +// else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { +// xOffsets = yOffsets = nullptr; +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { +// xOffsets = zOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { +// yOffsets = zOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// } +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, yOrder); +// } +// } +// } +// else if(zEws == 1) { +// zOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, zOrder); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, zOrder); +// } +// 
} +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = zOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// yOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// zOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); + +// if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = nullptr; +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// } +// } ////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////// @@ -9065,6 +9281,9 @@ public static final int PREALLOC_SIZE = 33554432; // returns TRUE if this op allows in-place execution public native @Cast("bool") boolean allowsInplace(); + // this method allows you to enable/disable inplace call for a given op + public native void allowInplace(@Cast("bool") boolean reallyAllow); + // this method returns opNum (applicable for legacy XYZ ops only) public native int getOpNum(); diff --git 
a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index 49d088f27..71614c20f 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -4253,14 +4253,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * set new order and shape in case of suitable array length (in-place operation) * order - order to set * shape - shape to set - * + * copyToNewBuff - if true then old buffer will be copied to new buffer if last one will be allocated after reshaping * if there was permute applied before or there are weird strides, then new buffer is allocated for array */ + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -4270,8 +4276,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -6206,6 +6215,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include @Namespace("nd4j::graph") @NoOffset public static class NodeProfile 
extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -6238,11 +6248,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setObjectsSize(@Cast("Nd4jLong") long bytes); public native void setTotalSize(@Cast("Nd4jLong") long bytes); + public native void addInputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native @Cast("Nd4jLong") long getActivationsSize(); public native @Cast("Nd4jLong") long getTemporarySize(); public native @Cast("Nd4jLong") long getObjectsSize(); public native @Cast("Nd4jLong") long getTotalSize(); + public native @Cast("Nd4jLong") long getExecutionTime(); + public native @StdString @ByRef @Cast({"char*", "std::string*"}) BytePointer name(); public native void merge(NodeProfile other); @@ -6838,9 +6857,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") LongBuffer oldShape, int newRank, @Cast("Nd4jLong*") LongBuffer newShape, @Cast("bool") boolean isFOrder); @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") long[] oldShape, int newRank, @Cast("Nd4jLong*") long[] newShape, @Cast("bool") boolean isFOrder); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongPointer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongBuffer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") long[] oldShapeInfo, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + /** + * newShapeInfo contains rank, shape and order only, no strides/ews/type + */ + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, @Cast("Nd4jLong*") LongBuffer 
newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, @Cast("Nd4jLong*") long[] newShapeInfo); /** * Get the shape info buffer @@ -7148,6 +7173,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongPointer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongBuffer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") long[] shapeInfo); + + /** + * shape - input inShape is shape only, not shapeInfo + * returns number of non-unity dimensions in inShape + */ + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongPointer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongBuffer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") long[] inShape); + /** * Returns whether the * given shape is a vector or not @@ -7166,9 +7200,9 @@ public static final int PREALLOC_SIZE = 33554432; * Returns the shape portion of an information * buffer */ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] shapeInfo); /** * Return a copy of a buffer. 
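Editorial aside (not part of the patch): the hunks above add an explicit copyToNewBuff flag to reshapei/reshape and give shape::reshapeC a new signature, while the commented-out implementation earlier in this diff documents when a reshape can keep the existing buffer (strides contiguous within each group of merged axes) and when a new buffer must be allocated. The plain-Java sketch below shows only the simplest form of that decision, full c-order contiguity; it is an editorial illustration with a hypothetical class name, not the native algorithm, which is more permissive because it checks contiguity per merged axis group.

public class ReshapeNoCopySketch {
    // Simplified check: returns true when a c-ordered array with the given shape/strides is fully
    // contiguous, so any c-ordered reshape can reuse the same buffer. Unity dimensions are skipped,
    // echoing the "skip unity-dimension and its stride" step in the commented reshapeC code above.
    static boolean cOrderReshapeWithoutCopy(long[] shape, long[] strides) {
        long expected = 1;
        for (int i = shape.length - 1; i >= 0; i--) {
            if (shape[i] == 1)
                continue;                 // unity dimensions do not constrain the layout
            if (strides[i] != expected)
                return false;             // "not contiguous enough" -> a new buffer is needed
            expected *= shape[i];
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(cOrderReshapeWithoutCopy(new long[]{2, 3, 4}, new long[]{12, 4, 1})); // true
        System.out.println(cOrderReshapeWithoutCopy(new long[]{2, 3, 4}, new long[]{1, 2, 6}));  // false (f-order strides)
    }
}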
@@ -7906,40 +7940,22 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong*") LongBuffer offsets); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets, byte order/*='c'*/); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") 
@ByPtrRef LongBuffer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); - // deduce element-wise stride - // if array is scalar or unit length vector then ews = 1 - // if array is common vector then ews = stride of non-unity dimension - // if strides are normal set ews = 1, otherwise ews = 0 - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len); - // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); + @Namespace("shape") public static native void 
checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); /** * processes whole set of sub-arrays @@ -7949,7 +7965,7 @@ public static final int PREALLOC_SIZE = 33554432; * numOfSubArrs - number of sub-arrays, size of subArrOffsets is equal to numOfSubArrs * dimsSize - size of dimsToExclude, if dimsSize = array rank or dimsSize = 0 it means sub-array is whole array, copy of wholeShapeInfo and one zero offset will be returned * dimsToExclude - MUST BE SORTED, dimensions to evaluate sub-array along, i.e. when shape is [2,3,4,5] and dimsToExclude={0,2}, then there will be 8 sub-arrays with shape [3,5] - * subArrShapeInfo - output argument, contains shapeInfo common for all sub-arrays + * subArrShapeInfo - output argument, contains shapeInfo (same for all sub-arrays) * subArrOffsets - output argument, contains successive sub-arrays offsets from original this-buffer * keepUnitiesInShape - if false then eliminate unities from sub-array shapeInfo, for example {1,a,1,b} -> {a,b} */ @@ -7960,6 +7976,24 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets, @Cast("bool") boolean keepUnitiesInShape/*=false*/); @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets); + /** + * for example inShapeInfo is {3, 2,1,4, 4,4,1, 16384,1,99} + * then output shapeNoUnities will contain {2,4, 4,1} - that is only shape and strides, no rank/type/ews/order + * stridesNoUnities will point on strides in shapeNoUnities that is on {4,1} + * returns number of non-unity dimensions in inShapeInfo + * if there is no unities in inShapeInfo, then no copy procedure will be performed and shapeNoUnities/stridesNoUnities will point on corresponding places in inShapeInfo + */ + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongPointer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, 
@Cast("Nd4jLong*&") @ByPtrRef long[] shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef long[] stridesNoUnities); + + /** + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, int dimsSize, @Const IntPointer dimsToExclude, @Cast("Nd4jLong*") LongPointer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, int dimsSize, @Const IntBuffer dimsToExclude, @Cast("Nd4jLong*") LongBuffer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] outShapeInfo); @@ -8189,6 +8223,8 @@ public static final int PREALLOC_SIZE = 33554432; * @param rank the rank of the shape */ +////////////////////////////////////////////////////////////////////// + /** * Returns whether the * given shape is a vector or not @@ -8738,69 +8774,60 @@ public static final int PREALLOC_SIZE = 33554432; // return true; // } -// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, const bool isFOrder, Nd4jLong* newShapeInfo) { +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { // // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements // // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo -// const int newOrder = isFOrder ? 
102 : 99; -// const int oldOrder = oldShapeInfo[2 * oldRank + 3]; - // newShapeInfo[0] = newRank; // memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); -// Nd4jLong* newStrides = shape::stride(newShapeInfo); -// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); +// Nd4jLong* newStrides = shape::stride(newShapeInfo); +// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); // const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); -// int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; - +// Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; // while (newStart < newRank && oldStart < oldRank) { // newDim = newShape[newStart]; // oldDim = oldShape[oldStart]; -// while (newDim != oldDim) +// while (newDim != oldDim && newDim > 0 && oldDim > 0) // if (newDim < oldDim) newDim *= newShape[newStop++]; // else oldDim *= oldShape[oldStop++]; // // ------ Check whether the original axes can be combined ------ // -// for (int i = oldStart; i < oldStop - 1; i++) { - -// if(oldShape[i] == 1) { // ignore strides like {...,1,1,...} -// if(oldOrder == 102) ++oldStart; +// for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { +// if(oldShape[i] == 1) // skip unity-dimension and its stride // continue; -// } - -// if(oldOrder == 102 && oldStrides[i + 1] != oldShape[i] * oldStrides[i]) -// return false; // not contiguous enough -// if(oldOrder == 99 && oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) -// return false; // not contiguous enough +// while((i + step) < oldRank && oldShape[i + step] == 1) +// ++step; // skip following unity-dimensions and its strides if such are present +// if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) +// return false; // not contiguous enough // } -// // ------ Calculate new strides for all axes currently worked with ------ // -// if(isFOrder) { -// newStrides[newStart] = oldStrides[oldStart]; -// for (int i = newStart + 1; i < newStop; ++i) -// newStrides[i] = newStrides[i - 1] * newShape[i - 1]; -// } -// else { -// newStrides[newStop - 1] = oldStrides[oldStop - 1]; -// for (int i = newStop - 1; i > newStart; --i) -// newStrides[i - 1] = newStrides[i] * newShape[i]; -// } +// newStrides[newStop - 1] = oldStrides[oldStop - 1]; +// for (int i = newStop - 1; i > newStart; --i) +// newStrides[i - 1] = newStrides[i] * newShape[i]; // newStart = newStop++; // oldStart = oldStop++; // } -// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order -// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews -// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type +// // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) +// for (int i = newStart; i < newRank; ++i) +// newStrides[i] = 1; + +// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order +// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews +// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type // return true; // } +////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// // this function checks the consistence of dimensions with array rank (negative dimensions, too large dimensions, too big number of dimensions) @@ -8841,9 +8868,198 @@ public static final int PREALLOC_SIZE = 33554432; 
////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); +// const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); +// const char zOrder = shape::order(zShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); + +// if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = zOffsets = nullptr; +// } +// else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { +// xOffsets = yOffsets = nullptr; +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { +// xOffsets = zOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { +// yOffsets = zOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// } +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, yOrder); +// } +// } +// } +// else if(zEws == 1) { +// zOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, zOrder); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, zOrder); +// } +// } +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = zOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// yOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// 
PRAGMA_OMP_SECTION
+//            {
+//                xOffsets = new Nd4jLong[len];
+//                shape::calcOffsets(xShapeInfo, xOffsets);
+//            }
+//            PRAGMA_OMP_SECTION
+//            {
+//                yOffsets = new Nd4jLong[len];
+//                shape::calcOffsets(yShapeInfo, yOffsets);
+//            }
+//        }
+//        zOffsets = xOffsets;
+//    }
+//    else {
+//        PRAGMA_OMP_PARALLEL_SECTIONS
+//        {
+//            PRAGMA_OMP_SECTION
+//            {
+//                xOffsets = new Nd4jLong[len];
+//                shape::calcOffsets(xShapeInfo, xOffsets);
+//            }
+//            PRAGMA_OMP_SECTION
+//            {
+//                yOffsets = new Nd4jLong[len];
+//                shape::calcOffsets(yShapeInfo, yOffsets);
+//            }
+//            PRAGMA_OMP_SECTION
+//            {
+//                zOffsets = new Nd4jLong[len];
+//                shape::calcOffsets(zShapeInfo, zOffsets);
+//            }
+//        }
+//    }
+// }
+
+//////////////////////////////////////////////////////////////////////
+// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) {
+
+//    // we assume all array have same length
+//    const Nd4jLong len = shape::length(xShapeInfo);
+
+//    const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo);
+//    const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo);
+
+//    const char xOrder = shape::order(xShapeInfo);
+//    const char yOrder = shape::order(yShapeInfo);
+
+//    const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo);
+
+//    if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) {
+//        xOffsets = yOffsets = nullptr;
+//    }
+//    else if(xEws == 1) {
+//        xOffsets = nullptr;
+//        yOffsets = new Nd4jLong[len];
+//        shape::calcOffsets(yShapeInfo, yOffsets, xOrder);
+//    }
+//    else if(yEws == 1) {
+//        yOffsets = nullptr;
+//        xOffsets = new Nd4jLong[len];
+//        shape::calcOffsets(xShapeInfo, xOffsets, yOrder);
+//    }
+//    else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {
+//        xOffsets = new Nd4jLong[len];
+//        shape::calcOffsets(xShapeInfo, xOffsets);
+//        yOffsets = xOffsets;
+//    }
+//    else {
+//        PRAGMA_OMP_PARALLEL_SECTIONS
+//        {
+//            PRAGMA_OMP_SECTION
+//            {
+//                xOffsets = new Nd4jLong[len];
+//                shape::calcOffsets(xShapeInfo, xOffsets);
+//            }
+//            PRAGMA_OMP_SECTION
+//            {
+//                yOffsets = new Nd4jLong[len];
+//                shape::calcOffsets(yShapeInfo, yOffsets);
+//            }
+//        }
+//    }
+// }
 //////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
@@ -11382,6 +11598,9 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
 
         // returns TRUE if this op allows in-place execution
         public native @Cast("bool") boolean allowsInplace();
 
+        // this method allows you to enable/disable inplace call for a given op
+        public native void allowInplace(@Cast("bool") boolean reallyAllow);
+
         // this method returns opNum (applicable for legacy XYZ ops only)
         public native int getOpNum();
@@ -21093,7 +21312,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
         public permute() { super((Pointer)null); allocate(); }
         private native void allocate();
         public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
-    }
+    }
 // #endif
 
 // #if NOT_EXCLUDED(OP_reshapeas)
@@ -21111,7 +21330,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
         public reshapeas() { super((Pointer)null); allocate(); }
         private native void allocate();
         public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
-    }
+    }
 // #endif
 
 // #if NOT_EXCLUDED(OP_transpose)
@@ -22222,7 +22441,22 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
         public tensormmul() { super((Pointer)null); allocate(); }
         private native void allocate();
         public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
-    }
+    }
+    @Namespace("nd4j::ops") public static class tensormmul_bp extends DeclarableCustomOp {
+        static { Loader.load(); }
+        /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+        public tensormmul_bp(Pointer p) { super(p); }
+        /** Native array allocator. Access with {@link Pointer#position(long)}. */
+        public tensormmul_bp(long size) { super((Pointer)null); allocateArray(size); }
+        private native void allocateArray(long size);
+        @Override public tensormmul_bp position(long position) {
+            return (tensormmul_bp)super.position(position);
+        }
+
+        public tensormmul_bp() { super((Pointer)null); allocate(); }
+        private native void allocate();
+        public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
+    }
 // #endif
 
 /**
diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java
index 210c4b703..3788c434e 100644
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/imports/TFGraphs/TFGraphTestAllSameDiff.java
@@ -123,7 +123,12 @@ public class TFGraphTestAllSameDiff {   //Note: Can't extend BaseNd4jTest here a
             //AB 2020/01/07 - Known issues
             "bitcast/from_float64_to_int64",
             "bitcast/from_rank2_float64_to_int64",
-            "bitcast/from_float64_to_uint64"
+            "bitcast/from_float64_to_uint64",
+
+            // 2020/02/14 - new ops which are not passing yet
+            "linear_solve/.*",
+            "triangular_solve/.*",
+            "lstsq/.*"
     };
 
     /* As per TFGraphTestList.printArraysDebugging - this field defines a set of regexes for test cases that should have
diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/dimensionalityreduction/TestPCA.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/dimensionalityreduction/TestPCA.java
index 981495eac..4cfe9f1be 100644
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/dimensionalityreduction/TestPCA.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/dimensionalityreduction/TestPCA.java
@@ -61,7 +61,7 @@ public class TestPCA extends BaseNd4jTest {
             assertEquals("Reconstructed matrix is very different from the original.", 0.0, Diff.getDouble(i), 1.0);
         }
     }
-    
+
     @Test
     public void testFactorSVDTransposed() {
         int m = 4;
diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java
index 90e9015b1..bad97296f 100644
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java
@@ -16,6 +16,7 @@
 
 package org.nd4j.linalg.shape.concat;
 
+import lombok.extern.slf4j.Slf4j;
 import lombok.val;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -43,6 +44,7 @@ import static org.junit.Assert.assertTrue;
 /**
  * @author Adam Gibson
  */
+@Slf4j
 @RunWith(Parameterized.class)
 public class ConcatTestsC extends BaseNd4jTest {
 
@@ -309,7 +311,11 @@ public class ConcatTestsC extends BaseNd4jTest {
         for (int e = 0; e < 20000; e++)
             list.add(Nd4j.create(DataType.INT, 1, 300).assign(e));
 
+        val timeStart = System.nanoTime();
         val result = Nd4j.concat(0, list.toArray(new INDArray[list.size()]));
+        val timeEnd = System.nanoTime();
+
+        log.info("Time: {} us", (timeEnd - timeStart) / 1000);
 
         for (int e = 0; e < 20000; e++)
             assertEquals((float) e, result.getRow(e).meanNumber().floatValue(), 1e-5f);
diff --git a/nd4s/src/test/scala/org/nd4s/NDArrayExtractionTest.scala b/nd4s/src/test/scala/org/nd4s/NDArrayExtractionTest.scala
index 02474f771..65a2bddf2 100644
--- a/nd4s/src/test/scala/org/nd4s/NDArrayExtractionTest.scala
+++ b/nd4s/src/test/scala/org/nd4s/NDArrayExtractionTest.scala
@@ -123,7 +123,7 @@ trait NDArrayExtractionTestBase extends FlatSpec { self: OrderingForTest =>
     val expectedSlice = expectedArray.slice(0)
     val actualSlice = expectedArray(0, ->)
 
-    Console.println(expectedSlice)
+//    Console.println(expectedSlice)
 
     assert(actualSlice == expectedSlice)
   }
diff --git a/nd4s/src/test/scala/org/nd4s/samediff/TrainingTest.scala b/nd4s/src/test/scala/org/nd4s/samediff/TrainingTest.scala
index d51707ee1..553e59df2 100644
--- a/nd4s/src/test/scala/org/nd4s/samediff/TrainingTest.scala
+++ b/nd4s/src/test/scala/org/nd4s/samediff/TrainingTest.scala
@@ -28,7 +28,7 @@ class TrainingTest extends FlatSpec with Matchers {
       val unused3 = unused1.div(unused2)
       val loss1 = add.std("l1", true)
       val loss2 = mmul.mean("l2")
-      Console.println(sd.summary)
+//      Console.println(sd.summary)
       if (i == 0) {
         sd.setLossVariables("l1", "l2")
         sd.createGradFunction()
diff --git a/pom.xml b/pom.xml
index 9b32f25ae..f02800d9e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -226,6 +226,7 @@
         1.0.0-SNAPSHOT
         1.0.0-SNAPSHOT
         1.0.0-SNAPSHOT
+        1.9.13
         5.1
@@ -561,6 +562,7 @@
                 org.deeplearning4j
                 dl4j-test-resources
                 ${dl4j-test-resources.version}
+                ${dl4j-test-resources.classifier}
                 test
@@ -854,7 +856,7 @@
                 arm
-
+
diff --git a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/learning/HistoryProcessorTest.java b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/learning/HistoryProcessorTest.java
index a1c28ce60..26ec0708f 100644
--- a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/learning/HistoryProcessorTest.java
+++ b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/learning/HistoryProcessorTest.java
@@ -43,8 +43,8 @@ public class HistoryProcessorTest {
         hp.add(a);
         INDArray[] h = hp.getHistory();
         assertEquals(4, h.length);
-        System.out.println(Arrays.toString(a.shape()));
-        System.out.println(Arrays.toString(h[0].shape()));
+//        System.out.println(Arrays.toString(a.shape()));
+//        System.out.println(Arrays.toString(h[0].shape()));
         assertEquals( 1, h[0].shape()[0]);
         assertEquals(a.shape()[0], h[0].shape()[1]);
         assertEquals(a.shape()[1], h[0].shape()[2]);
diff --git a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/network/ac/ActorCriticTest.java b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/network/ac/ActorCriticTest.java
index dc4814220..c43c26d50 100644
--- a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/network/ac/ActorCriticTest.java
+++ b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/network/ac/ActorCriticTest.java
@@ -100,8 +100,8 @@ public class ActorCriticTest {
                 double error2 = gradient2 - gradient.getDouble(1);
                 double relError1 = error1 / gradient.getDouble(0);
                 double relError2 = error2 / gradient.getDouble(1);
-                System.out.println(gradient.getDouble(0) + " " + gradient1 + " " + relError1);
-                System.out.println(gradient.getDouble(1) + " " + gradient2 + " " + relError2);
+//                System.out.println(gradient.getDouble(0) + " " + gradient1 + " " + relError1);
+//                System.out.println(gradient.getDouble(1) + " " + gradient2 + " " + relError2);
                 assertTrue(gradient.getDouble(0) < maxRelError || Math.abs(relError1) < maxRelError);
                 assertTrue(gradient.getDouble(1) < maxRelError || Math.abs(relError2) < maxRelError);
             }
diff --git a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/policy/PolicyTest.java b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/policy/PolicyTest.java
index f97457a52..2262f1789 100644
--- a/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/policy/PolicyTest.java
+++ b/rl4j/rl4j-core/src/test/java/org/deeplearning4j/rl4j/policy/PolicyTest.java
@@ -158,7 +158,7 @@ public class PolicyTest {
         for (int i = 0; i < 100; i++) {
             count[policy.nextAction(input)]++;
         }
-        System.out.println(count[0] + " " + count[1] + " " + count[2] + " " + count[3]);
+//        System.out.println(count[0] + " " + count[1] + " " + count[2] + " " + count[3]);
         assertTrue(count[0] < 20);
         assertTrue(count[1] < 30);
         assertTrue(count[2] < 40);