From 6de00bf75fbe7d183779a82789839029889b3824 Mon Sep 17 00:00:00 2001
From: raver119
Date: Wed, 13 Nov 2019 17:15:18 +0300
Subject: [PATCH] [WIP] Weekly update of repo (#8390)

* [WIP] Fix compilation after nd4j changes (#37)
* Fix compilation.
* Some tests fixed
* Disable tests temporarily.
* Restored test
* Tests restored.
* Test restored.
* [WIP] perf tests (#40)
* special maxpool test
Signed-off-by: raver119
* special maxpool test
Signed-off-by: raver119
* Shyrma bnorm bp (#41)
Batchnorm backprop mkldnn
* Add SameDiff memory reuse memory manager (array cache) (#39)
* Attention op comments
Signed-off-by: AlexDBlack
* ArrayCacheMemoryMgr - first pass
Signed-off-by: AlexDBlack
* Tweak array cache for use with SameDiff identity arrays
Signed-off-by: AlexDBlack
* ArrayCacheMemoryMgr javadoc and properly get max memory
Signed-off-by: AlexDBlack
* LRU cache policy + add tests
Signed-off-by: AlexDBlack
* Fixes
Signed-off-by: AlexDBlack
* Resize arrays internally if required for ArrayCacheMemoryMgr
Signed-off-by: AlexDBlack
* Test improvement
Signed-off-by: AlexDBlack
* Small polish
Signed-off-by: AlexDBlack
* SameDiff op runtime benchmarking listener (#42)
Signed-off-by: AlexDBlack
* INLINE_LOOPS for windows
Signed-off-by: raver119
* [WIP] ThreadPool (#8)
This PR removes OpenMP use in 95% of cases
---
 .../CompareTrainingImplementations.java | 1 +
 .../deeplearning4j/util/ConvolutionUtils.java | 14 +-
 libnd4j/CMakeLists.txt | 22 +-
 libnd4j/CMakeLists.txt.mkldnn.in | 2 +-
 libnd4j/blas/CMakeLists.txt | 27 +-
 libnd4j/blas/Environment.cpp | 7 +-
 libnd4j/blas/NDArray.h | 2 +-
 libnd4j/blas/NDArray.hpp | 5 +-
 libnd4j/blas/NativeOpExecutioner.h | 6 +-
 libnd4j/blas/NativeOps.h | 5 +-
 libnd4j/blas/cpu/NDArray.cpp | 122 +-
 libnd4j/blas/cpu/NDArrayLambda.hpp | 191 +-
 libnd4j/blas/cpu/NativeOpExecutioner.cpp | 441 ++--
 libnd4j/blas/cpu/NativeOps.cpp | 349 +--
 libnd4j/blas/cuda/NativeOps.cu | 43 +-
 libnd4j/buildnativeoperations.sh | 38 +-
 libnd4j/include/array/DataTypeConversions.h | 37 +-
 libnd4j/include/buffer.h | 1 +
 libnd4j/include/cnpy/cnpy.h | 30 +-
 libnd4j/include/dll.h | 3 +
 libnd4j/include/execution/BlockingQueue.h | 52 +
 libnd4j/include/execution/CallableInterface.h | 94 +
 .../include/execution/CallableWithArguments.h | 92 +
 libnd4j/include/execution/ThreadPool.h | 71 +
 libnd4j/include/execution/Threads.h | 160 ++
 libnd4j/include/execution/Ticket.h | 67 +
 .../include/execution/impl/BlockingQueue.cpp | 73 +
 .../execution/impl/CallableInterface.cpp | 213 ++
 .../execution/impl/CallableWithArguments.cpp | 103 +
 libnd4j/include/execution/impl/ThreadPool.cpp | 194 ++
 libnd4j/include/execution/impl/Threads.cpp | 641 +++++
 libnd4j/include/execution/impl/Ticket.cpp | 94 +
 libnd4j/include/graph/Node.h | 1 +
 libnd4j/include/graph/impl/Graph.cpp | 3 +-
 libnd4j/include/graph/impl/Node.cpp | 69 +-
 libnd4j/include/helpers/Loops.h | 924 +++----
 libnd4j/include/helpers/TAD.h | 2 +-
 .../helpers/benchmark/MatrixBenchmark.h | 1 -
 libnd4j/include/helpers/cpu/MmulHelper.cpp | 76 +-
 .../helpers/cpu/TrueBroadcastHelper.cpp | 1 +
 .../helpers/cpu/loops/IndexReductionLoops.cpp | 266 +-
 .../helpers/cpu/loops/Reduction3Loops_0.cpp | 24 +-
 .../helpers/cpu/loops/Reduction3Loops_1.cpp | 24 +-
 .../helpers/cpu/loops/Reduction3Loops_2.cpp | 24 +-
 .../helpers/cpu/loops/Reduction3Loops_3.cpp | 24 +-
 .../helpers/cpu/loops/ReductionLoops.hpp | 1 +
 .../helpers/cpu/loops/ReductionLoops_bool.cpp | 12 +-
 .../cpu/loops/ReductionLoops_float_0.cpp | 13 +-
 .../cpu/loops/ReductionLoops_float_1.cpp | 13 +-
 .../cpu/loops/ReductionLoops_float_2.cpp
| 13 +- .../cpu/loops/ReductionLoops_float_3.cpp | 13 +- .../helpers/cpu/loops/ReductionLoops_long.cpp | 13 +- .../helpers/cpu/loops/ReductionLoops_same.cpp | 12 +- .../helpers/cuda/TrueBroadcastHelper.cu | 1 + .../include/helpers/impl/AttentionHelper.cpp | 10 +- libnd4j/include/helpers/impl/BlasHelper.cpp | 24 +- libnd4j/include/helpers/impl/DebugHelper.cpp | 18 +- libnd4j/include/helpers/impl/GradCheck.cpp | 2 - .../include/helpers/impl/OmpLaunchHelper.cpp | 6 +- libnd4j/include/loops/aggregates.h | 66 - libnd4j/include/loops/broadcasting.h | 19 +- libnd4j/include/loops/broadcasting_bool.h | 19 +- libnd4j/include/loops/broadcasting_int.h | 19 +- libnd4j/include/loops/cpu/broadcasting.hpp | 130 +- .../include/loops/cpu/broadcasting_bool.cpp | 117 +- .../include/loops/cpu/broadcasting_int.cpp | 137 +- libnd4j/include/loops/cpu/indexreduce.cpp | 62 +- libnd4j/include/loops/cpu/pairwise.hpp | 200 +- libnd4j/include/loops/cpu/pairwise2.hpp | 106 - libnd4j/include/loops/cpu/pairwise_bool.cpp | 201 +- libnd4j/include/loops/cpu/pairwise_int.cpp | 201 +- libnd4j/include/loops/cpu/random.cpp | 139 +- .../include/loops/cpu/reduce/reduce_bool.cpp | 104 +- .../include/loops/cpu/reduce/reduce_float.cpp | 121 +- .../include/loops/cpu/reduce/reduce_long.cpp | 117 +- .../include/loops/cpu/reduce/reduce_same.cpp | 123 +- libnd4j/include/loops/cpu/reduce3.cpp | 101 +- libnd4j/include/loops/cpu/scalar.hpp | 112 +- libnd4j/include/loops/cpu/scalar_bool.cpp | 116 +- libnd4j/include/loops/cpu/scalar_int.cpp | 118 +- .../include/loops/cpu/summarystatsreduce.cpp | 54 +- .../loops/cpu/transform/transform_any.cpp | 18 +- .../loops/cpu/transform/transform_bool.cpp | 18 +- .../loops/cpu/transform/transform_float.cpp | 16 +- .../loops/cpu/transform/transform_same.cpp | 14 +- .../loops/cpu/transform/transform_strict.cpp | 17 +- libnd4j/include/loops/cuda/aggregates.cu | 145 -- libnd4j/include/loops/cuda/broadcasting.cu | 78 - .../include/loops/cuda/broadcasting_bool.cu | 70 - .../include/loops/cuda/broadcasting_int.cu | 69 - libnd4j/include/loops/cuda/indexreduce.cu | 26 - libnd4j/include/loops/cuda/pairwise.cu | 52 - libnd4j/include/loops/cuda/pairwise_bool.cu | 57 - libnd4j/include/loops/cuda/pairwise_int.cu | 57 - libnd4j/include/loops/cuda/random.cu | 33 - libnd4j/include/loops/cuda/reduce3.chpp | 2 +- libnd4j/include/loops/cuda/reduce3.cu | 49 - libnd4j/include/loops/cuda/scalar_bool.cu | 35 - libnd4j/include/loops/cuda/scalar_int.cu | 34 - .../include/loops/cuda/summarystatsreduce.cu | 67 - .../loops/cuda/transform/transform_any.cu | 11 - .../loops/cuda/transform/transform_bool.cu | 11 - .../loops/cuda/transform/transform_float.cu | 12 - .../loops/cuda/transform/transform_same.cu | 11 - .../loops/cuda/transform/transform_strict.cu | 11 - .../include/loops/impl/type_conversions.cpp | 42 +- libnd4j/include/loops/indexreduce.h | 7 +- libnd4j/include/loops/legacy_ops.h | 3 - libnd4j/include/loops/pairwise_bool.h | 25 +- libnd4j/include/loops/pairwise_int.h | 24 +- libnd4j/include/loops/pairwise_transform.h | 22 +- libnd4j/include/loops/random.h | 3 +- libnd4j/include/loops/reduce3.h | 20 +- libnd4j/include/loops/reduce_bool.h | 13 +- libnd4j/include/loops/reduce_float.h | 13 +- libnd4j/include/loops/reduce_long.h | 13 +- libnd4j/include/loops/reduce_same.h | 14 +- libnd4j/include/loops/scalar.h | 15 +- libnd4j/include/loops/scalar_bool.h | 15 +- libnd4j/include/loops/scalar_int.h | 18 +- libnd4j/include/loops/summarystatsreduce.h | 4 +- libnd4j/include/loops/transform_any.h | 15 +- 
libnd4j/include/loops/transform_bool.h | 15 +- libnd4j/include/loops/transform_float.h | 14 +- libnd4j/include/loops/transform_same.h | 14 +- libnd4j/include/loops/transform_strict.h | 17 +- libnd4j/include/msvc.h | 39 + libnd4j/include/op_boilerplate.h | 3 +- libnd4j/include/openmp_pragmas.h | 40 +- libnd4j/include/ops/aggregate_ops.h | 996 ------- libnd4j/include/ops/declarable/BooleanOp.h | 1 - .../include/ops/declarable/BroadcastableOp.h | 1 - .../ops/declarable/DeclarableCustomOp.h | 1 - .../include/ops/declarable/DeclarableListOp.h | 3 +- libnd4j/include/ops/declarable/DeclarableOp.h | 2 +- .../ops/declarable/DeclarableReductionOp.h | 1 - libnd4j/include/ops/declarable/LegacyOp.h | 1 + libnd4j/include/ops/declarable/LogicOp.h | 1 - libnd4j/include/ops/declarable/OpTuple.h | 2 +- .../ops/declarable/generic/blas/axpy.cpp | 20 +- .../ops/declarable/generic/datatypes/cast.cpp | 8 - .../ops/declarable/generic/nn/batchnorm.cpp | 164 +- .../nn/multi_head_dot_product_attention.cpp | 21 +- .../declarable/generic/parity_ops/argmax.cpp | 2 +- .../declarable/generic/parity_ops/argmin.cpp | 2 +- .../recurrent/dynamicBidirectionalRNN.cpp | 6 +- .../generic/transforms/reverseSequence.cpp | 16 +- .../declarable/helpers/cpu/BarnesHutTsne.cpp | 39 +- .../declarable/helpers/cpu/activations.cpp | 103 +- .../ops/declarable/helpers/cpu/addBias.cpp | 65 +- .../ops/declarable/helpers/cpu/adjust_hue.cpp | 58 +- .../helpers/cpu/adjust_saturation.cpp | 62 +- .../declarable/helpers/cpu/batched_gemm.cpp | 34 +- .../ops/declarable/helpers/cpu/batchnorm.cpp | 20 +- .../ops/declarable/helpers/cpu/betaInc.cpp | 12 +- .../ops/declarable/helpers/cpu/col2im.cpp | 90 +- .../declarable/helpers/cpu/compare_elem.cpp | 43 +- .../ops/declarable/helpers/cpu/confusion.cpp | 18 +- .../declarable/helpers/cpu/convolutions.cpp | 1419 +++++----- .../ops/declarable/helpers/cpu/cross.cpp | 17 +- .../ops/declarable/helpers/cpu/d_t_s.cpp | 67 +- .../ops/declarable/helpers/cpu/diag.cpp | 1 - .../ops/declarable/helpers/cpu/dilation2d.cpp | 40 +- .../ops/declarable/helpers/cpu/dropout.cpp | 34 +- .../ops/declarable/helpers/cpu/dynamic.cpp | 39 +- .../helpers/cpu/extract_patches.cpp | 65 +- .../ops/declarable/helpers/cpu/gather.cpp | 33 +- .../ops/declarable/helpers/cpu/hamming.cpp | 47 +- .../ops/declarable/helpers/cpu/hashcode.cpp | 45 +- .../helpers/cpu/histogramFixedWidth.cpp | 20 +- .../ops/declarable/helpers/cpu/im2col.cpp | 76 +- .../declarable/helpers/cpu/image_resize.cpp | 149 +- .../helpers/cpu/image_suppression.cpp | 3 +- .../ops/declarable/helpers/cpu/ismax.cpp | 15 +- .../declarable/helpers/cpu/legacy_helper.cpp | 1 + .../ops/declarable/helpers/cpu/lrn.cpp | 378 +-- .../ops/declarable/helpers/cpu/lstm.cpp | 14 +- .../declarable/helpers/cpu/matrixSetDiag.cpp | 29 +- .../helpers/cpu/matrix_diag_part.cpp | 13 +- .../declarable/helpers/cpu/nth_element.cpp | 14 +- .../ops/declarable/helpers/cpu/one_hot.cpp | 63 +- .../ops/declarable/helpers/cpu/percentile.cpp | 2 +- .../ops/declarable/helpers/cpu/polyGamma.cpp | 11 +- .../ops/declarable/helpers/cpu/range.cpp | 10 +- .../ops/declarable/helpers/cpu/reverse.cpp | 117 +- .../ops/declarable/helpers/cpu/s_t_b.cpp | 112 +- .../ops/declarable/helpers/cpu/s_t_d.cpp | 73 +- .../ops/declarable/helpers/cpu/scatter.cpp | 105 +- .../ops/declarable/helpers/cpu/segment.cpp | 323 ++- .../declarable/helpers/cpu/sequence_mask.cpp | 14 +- .../ops/declarable/helpers/cpu/sg_cb.cpp | 310 +-- .../ops/declarable/helpers/cpu/sru.cpp | 206 +- .../ops/declarable/helpers/cpu/stack.cpp | 18 +- 
.../ops/declarable/helpers/cpu/top_k.cpp | 25 +- .../ops/declarable/helpers/cpu/transforms.cpp | 519 ++-- .../ops/declarable/helpers/cpu/zeta.cpp | 10 +- .../include/ops/declarable/helpers/cross.h | 18 +- .../ops/declarable/helpers/cuda/col2im.cppc | 138 - .../ops/declarable/helpers/cuda/im2col.cppc | 129 - .../declarable/helpers/cuda/legacy/relu.cu | 1 + .../declarable/helpers/cuda/legacy/tanh.cu | 1 + .../declarable/helpers/cuda/legacy_helper.cu | 1 + .../ops/declarable/helpers/cuda/transforms.cu | 3 +- .../include/ops/declarable/helpers/helpers.h | 1 + .../ops/declarable/helpers/impl/choose.cpp | 1 + .../ops/declarable/helpers/impl/unique.cpp | 15 +- .../include/ops/declarable/helpers/matmul.h | 1 - .../include/ops/declarable/impl/BooleanOp.cpp | 4 - .../ops/declarable/impl/BroadcastableOp.cpp | 4 - .../declarable/impl/DeclarableCustomOp.cpp | 4 - .../ops/declarable/impl/DeclarableListOp.cpp | 4 - .../declarable/impl/DeclarableReductionOp.cpp | 8 +- .../ops/declarable/impl/LegacyReduce3Op.cpp | 5 +- .../declarable/impl/LegacyReduceBoolOp.cpp | 5 +- .../declarable/impl/LegacyReduceFloatOp.cpp | 5 +- .../declarable/impl/LegacyReduceLongOp.cpp | 5 +- .../declarable/impl/LegacyReduceSameOp.cpp | 3 +- .../ops/declarable/impl/LegacyStatsOp.cpp | 5 +- .../declarable/platform/mkldnn/batchnorm.cpp | 130 +- .../ops/declarable/platform/mkldnn/conv3d.cpp | 3 + libnd4j/include/ops/impl/gemm.cpp | 87 +- libnd4j/include/ops/impl/specials.cpp | 252 +- libnd4j/include/ops/ops.h | 36 - .../include/ops/special_accumulation_ops.h | 213 -- libnd4j/include/ops/special_ops.h | 2293 ----------------- libnd4j/include/ops/special_random_ops.h | 176 +- libnd4j/include/ops/specials.h | 7 +- .../benchmarking/impl/FullBenchmarkSuit.cpp | 3 +- .../benchmarking/impl/LightBenchmarkSuit.cpp | 19 +- libnd4j/include/pointercast.h | 1 + libnd4j/include/templatemath.h | 43 +- libnd4j/pom.xml | 2 + .../layers_tests/BooleanOpsTests.cpp | 2 +- .../layers_tests/BroadcastableOpsTests.cpp | 8 +- .../tests_cpu/layers_tests/BrodcastTests.cpp | 2 +- libnd4j/tests_cpu/layers_tests/CMakeLists.txt | 21 +- .../layers_tests/ConditionalTests.cpp | 1 - .../layers_tests/ConstantShapeHelperTests.cpp | 4 +- .../layers_tests/ConvolutionTests1.cpp | 180 +- .../layers_tests/DataTypesValidationTests.cpp | 4 +- .../layers_tests/DeclarableOpsTests1.cpp | 53 +- .../layers_tests/DeclarableOpsTests10.cpp | 42 +- .../layers_tests/DeclarableOpsTests11.cpp | 11 - .../layers_tests/DeclarableOpsTests12.cpp | 18 - .../layers_tests/DeclarableOpsTests13.cpp | 1 - .../layers_tests/DeclarableOpsTests14.cpp | 12 +- .../layers_tests/DeclarableOpsTests15.cpp | 1 - .../layers_tests/DeclarableOpsTests16.cpp | 11 + .../layers_tests/DeclarableOpsTests2.cpp | 2 - .../layers_tests/DeclarableOpsTests4.cpp | 8 - .../layers_tests/DeclarableOpsTests5.cpp | 72 +- .../layers_tests/DeclarableOpsTests6.cpp | 49 +- .../layers_tests/DeclarableOpsTests7.cpp | 83 +- .../layers_tests/DeclarableOpsTests8.cpp | 1008 ++++---- .../layers_tests/DeclarableOpsTests9.cpp | 115 +- libnd4j/tests_cpu/layers_tests/EmptyTests.cpp | 3 - .../tests_cpu/layers_tests/HelpersTests1.cpp | 5 +- .../tests_cpu/layers_tests/IndexingTests.cpp | 5 - .../layers_tests/JavaInteropCudaTests.cu | 2 - .../layers_tests/JavaInteropTests.cpp | 25 +- libnd4j/tests_cpu/layers_tests/LambdaTests.cu | 9 - .../tests_cpu/layers_tests/LegacyOpsTests.cpp | 24 +- .../layers_tests/NDArrayCudaBasicsTests.cu | 42 +- .../tests_cpu/layers_tests/NDArrayTests.cpp | 5 - .../tests_cpu/layers_tests/NDArrayTests2.cpp | 19 - 
.../tests_cpu/layers_tests/NativeOpsTests.cpp | 7 +- .../layers_tests/OmpLaunchHelperTests.cpp | 28 - libnd4j/tests_cpu/layers_tests/OpsArena.cpp | 200 -- .../tests_cpu/layers_tests/ParityOpsTests.cpp | 6 +- .../layers_tests/PerformanceTests.cpp | 95 + .../layers_tests/PlaygroundTests.cpp | 191 +- libnd4j/tests_cpu/layers_tests/RNGTests.cpp | 1 - .../tests_cpu/layers_tests/ReduceTests.cpp | 6 +- .../tests_cpu/layers_tests/ShapeTests2.cpp | 1 - libnd4j/tests_cpu/layers_tests/TadTests.cpp | 7 - .../tests_cpu/layers_tests/ThreadsTests.cpp | 233 ++ .../tests_cpu/layers_tests/WorkspaceTests.cpp | 2 - .../tests_cpu/libnd4j_tests/CMakeLists.txt | 6 +- libnd4j/tests_cpu/run_tests.sh | 25 +- .../functions/DifferentialFunction.java | 2 +- .../debugging/OpBenchmarkListener.java | 189 ++ .../samediff/internal/InferenceSession.java | 42 +- .../internal/memory/ArrayCacheMemoryMgr.java | 292 +++ .../api/ops/impl/reduce3/EqualsWithEps.java | 2 +- .../java/org/nd4j/nativeblas/Nd4jCuda.java | 2 +- .../cpu/nativecpu/CpuMemoryManager.java | 2 +- .../java/org/nd4j/nativeblas/Nd4jCpu.java | 66 +- .../nd4j/autodiff/samediff/MemoryMgrTest.java | 119 + nd4s/build.sbt | 2 +- nd4s/src/main/scala/org/nd4s/Implicits.scala | 2 +- .../org/nd4s/samediff/ConstructionTest.scala | 6 +- .../scala/org/nd4s/samediff/MathTest.scala | 14 +- .../org/nd4s/samediff/SameDiffTest.scala | 21 +- 293 files changed, 9700 insertions(+), 12064 deletions(-) create mode 100644 libnd4j/include/execution/BlockingQueue.h create mode 100644 libnd4j/include/execution/CallableInterface.h create mode 100644 libnd4j/include/execution/CallableWithArguments.h create mode 100644 libnd4j/include/execution/ThreadPool.h create mode 100644 libnd4j/include/execution/Threads.h create mode 100644 libnd4j/include/execution/Ticket.h create mode 100644 libnd4j/include/execution/impl/BlockingQueue.cpp create mode 100644 libnd4j/include/execution/impl/CallableInterface.cpp create mode 100644 libnd4j/include/execution/impl/CallableWithArguments.cpp create mode 100644 libnd4j/include/execution/impl/ThreadPool.cpp create mode 100644 libnd4j/include/execution/impl/Threads.cpp create mode 100644 libnd4j/include/execution/impl/Ticket.cpp delete mode 100644 libnd4j/include/loops/aggregates.h delete mode 100644 libnd4j/include/loops/cpu/pairwise2.hpp delete mode 100644 libnd4j/include/loops/cuda/aggregates.cu create mode 100644 libnd4j/include/msvc.h delete mode 100644 libnd4j/include/ops/aggregate_ops.h delete mode 100644 libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc delete mode 100644 libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc delete mode 100644 libnd4j/include/ops/special_accumulation_ops.h delete mode 100644 libnd4j/include/ops/special_ops.h delete mode 100644 libnd4j/tests_cpu/layers_tests/OpsArena.cpp create mode 100644 libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp create mode 100644 libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp create mode 100644 nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java create mode 100644 nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java create mode 100644 nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java 
b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java index 12564f01a..fa0fc335f 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java @@ -98,6 +98,7 @@ public class CompareTrainingImplementations extends BaseDL4JTest { SDVariable diff = sd.f().squaredDifference(a1, label); SDVariable lossMse = diff.mean(); + lossMse.markAsLoss(); IUpdater updater; double lr; diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java index d5c8ee1f6..56421bc00 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java @@ -35,6 +35,7 @@ import org.nd4j.linalg.api.ops.Op; import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastCopyOp; import org.nd4j.linalg.api.ops.impl.layers.convolution.MaxPooling2D; import org.nd4j.linalg.api.ops.impl.layers.convolution.config.Pooling2DConfig; +import org.nd4j.linalg.api.ops.impl.transforms.custom.Assign; import org.nd4j.linalg.api.shape.Shape; import org.nd4j.linalg.exception.ND4JArraySizeException; import org.nd4j.linalg.factory.NDArrayFactory; @@ -482,23 +483,12 @@ public class ConvolutionUtils { return reshape5dTo2d(format, mask, workspaceMgr, type); } else { //Need to broadcast first - IntArrayList broadcastDims = new IntArrayList(); - for(int i=0; i) endif() + #if(WIN32) + # message("CPU on Windows: enabling /EHsc") + # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14") + # SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc /bigobj /std:c++14") + #endif() + # we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. 
same applies to BLAS + if (NOT BLAS_LIBRARIES) + set(BLAS_LIBRARIES "") + endif() target_link_libraries(${LIBND4J_NAME} ${MKLDNN} ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${BLAS_LIBRARIES} ${CPU_FEATURES}) if ("${LIBND4J_ALL_OPS}" AND "${LIBND4J_BUILD_MINIFIER}") diff --git a/libnd4j/blas/Environment.cpp b/libnd4j/blas/Environment.cpp index 0c23f61be..90c391cf1 100644 --- a/libnd4j/blas/Environment.cpp +++ b/libnd4j/blas/Environment.cpp @@ -24,6 +24,8 @@ #include #include "Environment.h" #include +#include +#include #ifdef _OPENMP @@ -49,6 +51,7 @@ namespace nd4j { _precBoost.store(false); _leaks.store(false); _dataType.store(nd4j::DataType::FLOAT32); + _maxThreads = std::thread::hardware_concurrency(); #ifndef ANDROID const char* omp_threads = std::getenv("OMP_NUM_THREADS"); @@ -86,9 +89,7 @@ namespace nd4j { cudaSetDevice(0); delete[] devProperties; #else -#ifdef _OPENMP - omp_set_nested(1); -#endif + #endif } diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index 10847f882..de2488f9d 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1678,7 +1679,6 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// size_t NDArray::sizeOfT() const { - return DataTypeUtils::sizeOfElement(_dataType); } diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 2a601033a..c4a631cf5 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -2478,7 +2478,6 @@ double NDArray::getTrace() const { double sum = 0.; -PRAGMA_OMP_PARALLEL_FOR_ARGS(reduction(OMP_SUMT:sum) OMP_IF(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) for(int i = 0; i < minDim; ++i) sum += e(i * offset); @@ -3275,7 +3274,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { // regular numeric types NDArray tmp(nd4j::DataType::FLOAT32, getContext()); // scalar = 0 - ExtraArguments extras({eps}); + ExtraArguments extras({0.0, 0.0, eps}); NDArray::prepareSpecialUse({&tmp}, {this, other}); NativeOpExecutioner::execReduce3Scalar(getContext(), reduce3::EqualsWithEps, getBuffer(), getShapeInfo(), @@ -3288,7 +3287,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { synchronize("NDArray::equalsTo"); - if (tmp.e(0) > 0) + if (tmp.e(0) != 0) return false; return true; diff --git a/libnd4j/blas/NativeOpExecutioner.h b/libnd4j/blas/NativeOpExecutioner.h index cae7a4e56..fb2ca58f0 100644 --- a/libnd4j/blas/NativeOpExecutioner.h +++ b/libnd4j/blas/NativeOpExecutioner.h @@ -24,10 +24,10 @@ #include #include -#include #include #include #include +#include /** * Native op executioner: @@ -624,10 +624,6 @@ static void execTransformBool(nd4j::LaunchContext *lc, void *vrealArguments, int numRealArguments) { - auto arguments = reinterpret_cast(varguments); - auto realArguments = reinterpret_cast(vrealArguments); - - functions::aggregate::AggregatedFunction::exec(opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); } diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/blas/NativeOps.h index b2679f537..b10b3807a 100755 --- a/libnd4j/blas/NativeOps.h +++ b/libnd4j/blas/NativeOps.h @@ -55,7 +55,6 @@ #define ND4J_EXPORT #endif #include -#include /* int tad_threshold = 1; @@ -1430,7 +1429,11 @@ static const char* getNpyArrayNameFromMap(void *map, int index){ for(; it != end; ++it, ++cnt){ if (cnt == index){ // FIXME: @fariz, 
this is a leak! +#ifdef _MSC_VER + return const_cast(_strdup(it->first.c_str())); +#else return const_cast(strdup(it->first.c_str())); +#endif } } throw std::runtime_error("No array at index."); diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp index 03c7c53e1..dc9d09231 100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/blas/cpu/NDArray.cpp @@ -98,24 +98,27 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target->getShapeInfo()); - std::vector coords(zRank); - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, target->getShapeInfo(), coords); + const auto zOffset = shape::getOffset(target->getShapeInfo(), coords); - shape::index2coords(i, target->getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(target->getShapeInfo(), coords.data()); + // if( (row + upper < col) || (row + lower > col) ) + if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) + z[zOffset] = value; + else if (this != target) { // when this and target are different arrays + if (xRank != zRank) + coords[0] = coords[1]; - // if( (row + upper < col) || (row + lower > col) ) - if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) - z[zOffset] = value; - else if(this != target) { // when this and target are different arrays - if(xRank != zRank) - coords[0] = coords[1]; - const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords.data()); - z[zOffset] = x[xOffset]; + const auto xOffset = areSameOffsets ? 
zOffset : shape::getOffset(getShapeInfo(), coords); + z[zOffset] = x[xOffset]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void NDArray::fillAsTriangular, (const float val, int lower, int upper, const char direction, NDArray* target), LIBND4J_TYPES); @@ -140,7 +143,7 @@ void NDArray::setIdentity() { minDim = shape[i]; float v = 1.0f; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) + for(int i = 0; i < minDim; ++i) templatedSet(buffer(), i*offset, this->dataType(), &v); } @@ -151,12 +154,15 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) { auto x = reinterpret_cast(xBuffer); auto y = reinterpret_cast(yBuffer); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static)) - for (Nd4jLong i = 0; i < length; ++i) { - auto temp = x[i]; - x[i] = y[i]; - y[i] = temp; - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto temp = x[i]; + x[i] = y[i]; + y[i] = temp; + } + }; + + samediff::Threads::parallel_for(func, 0, length); } BUILD_SINGLE_TEMPLATE(template void templatedSwap, (void *xBuffer, void *yBuffer, Nd4jLong length), LIBND4J_TYPES); @@ -262,21 +268,26 @@ NDArray NDArray::tile(const std::vector& reps) const { auto xType = this->dataType(); if(result.ordering() == 'c') { // ews == 1 always here - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < resultLen; ++i) { - auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); - BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign, (result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); + BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); + } + }; - } + samediff::Threads::parallel_for(func, 0, resultLen); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i=0; itemplate templatedAssign, (result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto xOffset = result.getOffset(i); + auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); + BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); + } + }; + + samediff::Threads::parallel_for(func, 0, resultLen); } result.tickWriteHost(); return result; @@ -337,14 +348,7 @@ void NDArray::tile(NDArray& target) const { // looping through _buffer goes automatically by means of getSubArrayIndex applying const auto ews = target.ews(); const auto targetLen = target.lengthOf(); - if(target.ordering() == 'c' && ews == 1) { // ews == 1 always here - - for (Nd4jLong i = 0; i < targetLen; ++i) { - auto yOffset = shape::subArrayOffset(i, target.getShapeInfo(), getShapeInfo()); - BUILD_DOUBLE_SELECTOR(target.dataType(), dataType(), templatedDoubleAssign, (target.getBuffer(), i, getBuffer(), yOffset), LIBND4J_TYPES, LIBND4J_TYPES); - } - } - else if(target.ordering() == 'c' && ews > 1) { + if(target.ordering() == 'c' && ews >= 1) { for(Nd4jLong i=0; i coords(rank); - // loop through input array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong 
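// --------------------------------------------------------------------------
// A minimal, self-contained sketch of the loop-parallelization pattern used in
// the conversions above: PRAGMA_THREADS_FOR wraps the loop body in a lambda
// taking (thread_id, start, stop, increment), and samediff::Threads::parallel_for
// splits [start, stop) into per-thread chunks. The real implementation lives in
// libnd4j/include/execution/Threads.h and the ThreadPool added by this PR; the
// helper below only illustrates the idea with std::thread, and its name,
// signature and chunking policy are assumptions, not the library's actual API.
#include <thread>
#include <vector>
#include <functional>
#include <cstdint>
#include <algorithm>

static void sketch_parallel_for(const std::function<void(int, int64_t, int64_t, int64_t)> &func,
                                int64_t start, int64_t stop, int64_t increment = 1,
                                int numThreads = (int) std::thread::hardware_concurrency()) {
    const int64_t span = stop - start;
    // never launch more threads than there are loop iterations, and at least one
    numThreads = (int) std::max<int64_t>(1, std::min<int64_t>(numThreads, span));
    const int64_t chunk = span / numThreads;

    std::vector<std::thread> workers;
    for (int t = 0; t < numThreads; t++) {
        const int64_t s = start + t * chunk;
        const int64_t e = (t == numThreads - 1) ? stop : s + chunk;  // last thread takes the remainder
        workers.emplace_back(func, t, s, e, increment);
    }
    for (auto &w : workers)
        w.join();
}

// Usage, mirroring the templatedSwap conversion above:
//   auto func = [&](int thread_id, int64_t start, int64_t stop, int64_t increment) {
//       for (auto i = start; i < stop; i += increment)
//           std::swap(x[i], y[i]);
//   };
//   sketch_parallel_for(func, 0, length);
// --------------------------------------------------------------------------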
coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); - - if(repSize > 1) { - for (uint j = 0; j < repSize; ++j) { - coords[axis] -= repeats[j]; - if (coords[axis] < 0) { - coords[axis] = j; - break; + if (repSize > 1) { + for (uint j = 0; j < repSize; ++j) { + coords[axis] -= repeats[j]; + if (coords[axis] < 0) { + coords[axis] = j; + break; + } } - } - } - else - coords[axis] /= repeats[0]; + } else + coords[axis] /= repeats[0]; - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - } + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/blas/cpu/NDArrayLambda.hpp b/libnd4j/blas/cpu/NDArrayLambda.hpp index ecf2aa9ed..6ce8e6823 100644 --- a/libnd4j/blas/cpu/NDArrayLambda.hpp +++ b/libnd4j/blas/cpu/NDArrayLambda.hpp @@ -32,33 +32,40 @@ void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std:: if (this->ordering() == second->ordering() && this->ordering() == third->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == second->ews() && this->ews() == third->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(f[e], s[e], t[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e], s[e], t[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto tOffset = this->getOffset(e); + auto uOffset = second->getOffset(e); + auto vOffset = third->getOffset(e); - auto tOffset = this->getOffset(e); - auto uOffset = second->getOffset(e); - auto vOffset = third->getOffset(e); + f[tOffset] = func(f[tOffset], s[uOffset], t[vOffset]); + } + }; - f[tOffset] = func(f[tOffset], s[uOffset], t[vOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto tOffset = this->getOffset(e); + auto uOffset = second->getOffset(e); + auto vOffset = third->getOffset(e); + auto zOffset = target->getOffset(e); - auto tOffset = this->getOffset(e); - auto uOffset = second->getOffset(e); - auto vOffset = third->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[tOffset], s[uOffset], t[vOffset]); + } + }; - z[zOffset] = func(f[tOffset], s[uOffset], t[vOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -103,31 +110,38 @@ void NDArray::applyPairwiseLambda(const NDArray* other, const std::functionordering() == other->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == other->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(f[e], s[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e], s[e]); + }; + + 
samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); + f[xOffset] = func(f[xOffset], s[yOffset]); + } + }; - f[xOffset] = func(f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[xOffset], s[yOffset]); + } + }; - z[zOffset] = func(f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -161,29 +175,36 @@ void NDArray::applyLambda(const std::function& func, NDArray* target) { if (this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) - z[e] = func(f[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); - auto xOffset = this->getOffset(e); + f[xOffset] = func(f[xOffset]); + } + }; - f[xOffset] = func(f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[xOffset]); + } + }; - z[zOffset] = func(f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -217,29 +238,36 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr if (this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(e, f[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(e, f[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); - auto xOffset = this->getOffset(e); + f[xOffset] = func(e, f[xOffset]); + } + }; - f[xOffset] = func(e, f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(e, 
f[xOffset]); + } + }; - z[zOffset] = func(e, f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -282,31 +310,38 @@ void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::functionordering() == other->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == other->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func((Nd4jLong) e, f[e], s[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func((Nd4jLong) e, f[e], s[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); + f[xOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); + } + }; - f[xOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); + } + }; - z[zOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } diff --git a/libnd4j/blas/cpu/NativeOpExecutioner.cpp b/libnd4j/blas/cpu/NativeOpExecutioner.cpp index 22fd9eca4..dc27c1cce 100644 --- a/libnd4j/blas/cpu/NativeOpExecutioner.cpp +++ b/libnd4j/blas/cpu/NativeOpExecutioner.cpp @@ -20,6 +20,8 @@ #include "NativeOpExecutioner.h" #include +#include + #include #include #include @@ -50,11 +52,14 @@ #include #include #include +#include +#include #ifdef _OPENMP #include +#include #endif @@ -78,9 +83,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, int op void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -111,9 +114,7 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -149,9 +150,7 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -160,7 +159,16 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, 
tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); #endif } @@ -179,9 +187,7 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); -#ifdef _OPENMP - omp_set_nested(1); -#endif + if (!nd4j::Environment::getInstance()->isExperimentalBuild()) if ((yType != xType && yType != nd4j::DataType::BOOL) || xType != zType) @@ -190,7 +196,15 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); #endif } @@ -208,15 +222,21 @@ void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } void 
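// --------------------------------------------------------------------------
// Why numTads = xLen / yLen in the broadcast paths above: broadcasting y over x
// along the chosen dimensions splits x into sub-arrays (TADs), each with y's
// shape, and samediff::Threads::parallel_tad hands whole TADs to threads rather
// than single elements. A small worked example (the shapes are illustrative only):
#include <cstdint>
#include <cassert>

int main() {
    const int64_t xLen = 32 * 128;  // x: [32, 128]
    const int64_t yLen = 128;       // y: [128], broadcast along dimension 1
    const int64_t numTads = xLen / yLen;
    assert(numTads == 32);          // 32 rows, each processed as one TAD
    // parallel_tad(func, 0, numTads) then gives each thread a contiguous range
    // of rows; the inverse-broadcast variants swap the roles of x and y, which
    // is why they compute numTads = yLen / xLen instead.
    return 0;
}
// --------------------------------------------------------------------------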
NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, @@ -231,9 +251,7 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -243,7 +261,15 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, if (yType != xType || nd4j::DataType::BOOL != zType) throw nd4j::datatype_exception::build("NativeOps::execInverseBroadcastBool both operands must have same data type", xType, yType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } @@ -260,9 +286,7 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -274,7 +298,15 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execBroadcastInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, @@ -289,21 +321,27 @@ void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform", zType, xType, yType); + throw nd4j::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt", zType, xType, yType); if 
(!nd4j::DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execBroadcastInt requires integer data type", zType); + throw nd4j::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt,::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } //////////////////////////////////////////////////////////////////////// @@ -328,9 +366,7 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -339,7 +375,15 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::pairwise_transforms::PairWiseTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::pairwise_transforms::PairWiseTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::pairwise_transforms::PairWiseTransform, + ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), + LIBND4J_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + #endif } @@ -353,9 +397,7 @@ void NativeOpExecutioner::execPairwiseBoolTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -367,7 +409,13 @@ void NativeOpExecutioner::execPairwiseBoolTransform(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::pairwise_transforms::PairWiseBoolTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::pairwise_transforms::PairWiseBoolTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, 
nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -380,9 +428,7 @@ void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -394,7 +440,13 @@ void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execSPairwiseInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::pairwise_transforms::PairWiseIntTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::pairwise_transforms::PairWiseIntTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), INTEGER_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -417,14 +469,22 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 
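// --------------------------------------------------------------------------
// The thread-count heuristic used in the pairwise/scalar paths above: request
// roughly one thread per 1024 output elements, clamped to [1, maxThreads], so
// small arrays stay single-threaded and large ones do not oversubscribe the
// pool. (The reduce paths make a similar choice: when LoopKind deduces
// SMALLARR2DX they fall back to a single thread.) A standalone sketch of the
// arithmetic; the 1024 granularity is taken from the calls above, everything
// else here is illustrative:
#include <cstdint>
#include <algorithm>
#include <cstdio>

static int threadsFor(int64_t zLen, int maxThreads) {
    return (int) std::max<int64_t>(1, std::min<int64_t>(zLen / 1024, maxThreads));
}

int main() {
    std::printf("%d\n", threadsFor(500, 16));      // 1  -> below one chunk, run serially
    std::printf("%d\n", threadsFor(8192, 16));     // 8  -> 8 chunks of 1024 elements
    std::printf("%d\n", threadsFor(1000000, 16));  // 16 -> capped at maxThreads
    return 0;
}
// --------------------------------------------------------------------------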
1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -437,14 +497,22 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -457,14 +525,22 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -477,14 +553,22 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, LONG_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -503,9 +587,7 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -521,9 +603,7 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); @@ -539,9 +619,7 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -557,9 +635,7 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -591,10 +667,6 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, void *dY, Nd4jLong *dYShapeInfo, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -623,15 +695,13 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, void *dY, Nd4jLong *dYShapeInfo, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, 
hYShapeInfo, hZ, hZShapeInfo, nullptr, 1), LIBND4J_TYPES, FLOAT_TYPES); - + //BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, nullptr, 0), LIBND4J_TYPES, FLOAT_TYPES); + NativeOpExecutioner::execReduce3Scalar(lc, opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo); } //////////////////////////////////////////////////////////////////////// @@ -647,14 +717,31 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *xTadOnlyShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *yTadOnlyShapeInfo, Nd4jLong *yTadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength), LIBND4J_TYPES, FLOAT_TYPES); + const auto xLen = shape::length(hXShapeInfo); + const auto yLen = shape::length(hYShapeInfo); + + nd4j::TadPack tadPack; + + if(xLen == yLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + else if(yLen > xLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); + } + else { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } @@ -671,15 +758,19 @@ void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), LIBND4J_TYPES, FLOAT_TYPES); -// BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), LIBND4J_TYPES, FLOAT_TYPES); + auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + + // TODO: make it 2d + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } //////////////////////////////////////////////////////////////////////// @@ -696,15 +787,31 @@ void 
NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); -// BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + const auto xLen = shape::length(hXShapeInfo); + const auto yLen = shape::length(hYShapeInfo); + + nd4j::TadPack tadPack; + + if(xLen == yLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + else if(yLen > xLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); + } + else { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } @@ -729,9 +836,7 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, void *hScalar, Nd4jLong *hScalarShapeInfo, void *dScalar, Nd4jLong *dScalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -743,7 +848,13 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, if (xType != yType || xType != zType) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, allowParallelism), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform,::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), LIBND4J_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + #endif } @@ -760,9 +871,7 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -774,7 +883,13 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, if (xType != yType || xType != zType) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); + #endif } @@ -789,9 +904,7 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hSscalarShapeInfo); @@ -803,7 +916,13 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
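// Editor's note: the execScalar / execScalarBool / execScalarInt hunks above all cap
// parallelism at roughly one thread per 1024 output elements, clamped to
// [1, maxThreads], and force a single thread when allowParallelism is false. A minimal
// sketch of that heuristic as a standalone helper; the helper name is hypothetical and
// not part of the patch.
#include <algorithm>
#include <cstdint>

static int pickThreadCount(int64_t zLen, int maxThreads, bool allowParallelism) {
    if (!allowParallelism)
        return 1;
    return static_cast<int>(std::max<int64_t>(1, std::min<int64_t>(zLen / 1024, maxThreads)));
}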
1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -819,9 +938,7 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -833,7 +950,12 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); } //////////////////////////////////////////////////////////////////////// @@ -847,9 +969,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hSscalarShapeInfo); @@ -861,7 +981,13 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt", nd4j::DataType::INT32, zType); - BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), INTEGER_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -877,9 +1003,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -891,7 +1015,12 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); } //////////////////////////////////////////////////////////////////////// @@ -912,9 +1041,7 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -940,9 +1067,7 @@ void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -972,10 +1097,6 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1002,14 +1123,14 @@ void NativeOpExecutioner::execTransformFloat(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformFloat, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformFloat, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, 
nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1021,14 +1142,14 @@ void NativeOpExecutioner::execTransformBool(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformBool, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformBool, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, BOOL_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1040,14 +1161,14 @@ void NativeOpExecutioner::execTransformAny(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformAny, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets, allowParallelism), LIBND4J_TYPES, LIBND4J_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformAny, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, LIBND4J_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1059,14 +1180,14 @@ void NativeOpExecutioner::execTransformSame(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformSame, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformSame, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1078,14 +1199,14 @@ void NativeOpExecutioner::execTransformStrict(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - 
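// Editor's note: unlike the reduce and scalar hunks, the execTransform* hunks above use
// a PRAGMA_THREADS_DO closure: each worker receives only (thread_id, numThreads) and the
// transform kernel partitions the elements internally instead of being handed a
// precomputed [start, stop) span. The sketch below shows that shape with std::thread; the
// strided partitioning is one plausible scheme and not necessarily the exact one the
// Transform* kernels use.
#include <thread>
#include <vector>
#include <cstdint>

static void transformChunk(uint64_t threadId, uint64_t numThreads, const float *x, float *z, int64_t n) {
    // each worker touches every numThreads-th element, starting at its own id
    for (int64_t i = static_cast<int64_t>(threadId); i < n; i += static_cast<int64_t>(numThreads))
        z[i] = x[i] * 2.0f;   // placeholder element-wise op
}

static void parallelDoSketch(const float *x, float *z, int64_t n, uint64_t numThreads) {
    std::vector<std::thread> workers;
    for (uint64_t t = 0; t < numThreads; t++)
        workers.emplace_back(transformChunk, t, numThreads, x, z, n);
    for (auto &w : workers)
        w.join();
}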
BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformStrict, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), FLOAT_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformStrict, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), FLOAT_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1095,9 +1216,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1116,9 +1235,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1139,9 +1256,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hZShapeInfo); diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index 7449bb022..151f5c883 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -36,6 +35,7 @@ #include #include #include +#include #include @@ -75,6 +75,7 @@ bool experimentalSupport = false; #include #include #include +#include #ifdef CPU_FEATURES #include @@ -1152,10 +1153,7 @@ void initializeFunctions(Nd4jPointer *functions) { * @param flags optional parameter */ Nd4jPointer mallocHost(Nd4jLong memorySize, int flags) { - Nd4jPointer pointer = (Nd4jPointer) malloc(memorySize); - if (pointer == 0) - return 0L; - return pointer; + return reinterpret_cast(new int8_t[memorySize]); } /** @@ -1179,7 +1177,7 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) { * @param pointer pointer that'll be freed */ int freeHost(Nd4jPointer pointer) { - free(reinterpret_cast(pointer)); + delete[] reinterpret_cast(pointer); return 1L; } @@ -1364,37 +1362,37 @@ void pullRowsGeneric(void *vx, int elementsPerThread = n / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (int idx = 0; idx < n; idx++) { - auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; - auto zTadOffsetForBlock = zTadOffsets[idx]; + auto func = PRAGMA_THREADS_FOR { + for (auto idx = start; idx < stop; idx += increment) { + auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; + auto zTadOffsetForBlock = zTadOffsets[idx]; - auto rX = hX + xTadOffsetForBlock; - auto rZ = hZ + zTadOffsetForBlock; + auto rX = hX + xTadOffsetForBlock; + auto rZ = hZ + zTadOffsetForBlock; - if (xEWS == 1 && zEWS == 1) { - - PRAGMA_OMP_SIMD - for (int i = 0; i < tadLength; i++ ) { - rZ[i] = rX[i]; - } - } else if (xEWS >= 1 && zEWS >= 1) { - - PRAGMA_OMP_SIMD - for (int i = 0; i < tadLength; i++ ) { 
- rZ[i * zEWS] = rX[i * xEWS]; + if (xEWS == 1 && zEWS == 1) { + PRAGMA_OMP_SIMD + for (int i = 0; i < tadLength; i++) { + rZ[i] = rX[i]; + } + } else if (xEWS >= 1 && zEWS >= 1) { + PRAGMA_OMP_SIMD + for (int i = 0; i < tadLength; i++) { + rZ[i * zEWS] = rX[i * xEWS]; + } + } else { + for (int i = 0; i < tadLength; i++) { + auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo); + auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo); + hZ[zOffset] = hX[xOffset]; + } } } - else { - for (int i = 0; i < tadLength; i++) { - auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo); - auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo); - hZ[zOffset] = hX[xOffset]; - } - } - } + }; + + samediff::Threads::parallel_tad(func, 0, n, 1, _threads); } void pullRows(Nd4jPointer *extraPointers, @@ -1433,30 +1431,29 @@ void tearGeneric(void *vx, auto zEWS = shape::elementWiseStride(hZShapeInfo); auto numTads = shape::length(hXShapeInfo) / tadLength; - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong i = 0; i < numTads; i++) { - auto hZ = reinterpret_cast(targets[i]); - auto s = hX + tadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto hZ = reinterpret_cast(targets[i]); + auto s = hX + tadOffsets[i]; - if (zEWS == 1 && tadEWS == 1) { - - PRAGMA_OMP_SIMD - for (Nd4jLong j = 0; j < tadLength; j++) { - hZ[j] = s[j]; - } - } else if (zEWS > 0 && tadEWS > 0) { - - PRAGMA_OMP_SIMD - for (Nd4jLong j = 0; j < tadLength; j++) { - hZ[j * zEWS] = s[j * tadEWS]; + if (zEWS == 1 && tadEWS == 1) { + PRAGMA_OMP_SIMD + for (Nd4jLong j = 0; j < tadLength; j++) { + hZ[j] = s[j]; + } + } else if (zEWS > 0 && tadEWS > 0) { + PRAGMA_OMP_SIMD + for (Nd4jLong j = 0; j < tadLength; j++) { + hZ[j * zEWS] = s[j * tadEWS]; + } + } else { + for (Nd4jLong j = 0; j < tadLength; j++) + hZ[shape::getIndexOffset(j, hZShapeInfo)] = s[shape::getIndexOffset(j, tadShapeInfo)]; } } - else { + }; - for (Nd4jLong j = 0; j < tadLength; j++) - hZ[shape::getIndexOffset(j, hZShapeInfo)] = s[shape::getIndexOffset(j, tadShapeInfo)]; - } - } + samediff::Threads::parallel_tad(func,0, numTads); } void tear(Nd4jPointer *extraPointers, @@ -1557,57 +1554,60 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS auto dX = reinterpret_cast(hX); auto dZ = reinterpret_cast(dz); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(N) - for (int f = 0; f < N; f++) { - auto hX = reinterpret_cast(dX[f]); - //auto hZ = reinterpret_cast(dZ[f]); + auto func = PRAGMA_THREADS_FOR { + for (auto f = start; f < stop; f += increment) { + auto hX = reinterpret_cast(dX[f]); + //auto hZ = reinterpret_cast(dZ[f]); - auto xShapeInfo = hXShapeInfo[f]; - auto tadOffset = reinterpret_cast(tadOffsets[f]); + auto xShapeInfo = hXShapeInfo[f]; + auto tadOffset = reinterpret_cast(tadOffsets[f]); - const auto tadLength = shape::length(tadOnlyShapeInfo[f]); - auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo[f]); - auto tadRank = shape::rank(tadOnlyShapeInfo[f]); - auto numTads = shape::length(hXShapeInfo[f]) / tadLength; + const auto tadLength = shape::length(tadOnlyShapeInfo[f]); + auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo[f]); + auto tadRank = shape::rank(tadOnlyShapeInfo[f]); + auto numTads = shape::length(hXShapeInfo[f]) / tadLength; - auto tadShape = shape::shapeOf(tadOnlyShapeInfo[f]); - auto tadStride = shape::stride(tadOnlyShapeInfo[f]); + auto tadShape = shape::shapeOf(tadOnlyShapeInfo[f]); + auto tadStride = 
shape::stride(tadOnlyShapeInfo[f]); - if (shape::rank(xShapeInfo) == 1) { - auto xLength = shape::length(xShapeInfo); - auto ews = shape::elementWiseStride(xShapeInfo); - for (Nd4jLong r = 0; r < xLength; r++) { - auto swapIdx = shuffleMap[r]; - if (swapIdx < 0) - continue; + if (shape::rank(xShapeInfo) == 1) { + auto xLength = shape::length(xShapeInfo); + auto ews = shape::elementWiseStride(xShapeInfo); + for (Nd4jLong r = 0; r < xLength; r++) { + auto swapIdx = shuffleMap[r]; + if (swapIdx < 0) + continue; - nd4j::math::nd4j_swap(hX[r*ews], hX[swapIdx*ews]); - } - } else { - for (Nd4jLong r = 0; r < numTads; r++) { - if (shuffleMap[r] < 0) - continue; + nd4j::math::nd4j_swap(hX[r * ews], hX[swapIdx * ews]); + } + } else { + for (Nd4jLong r = 0; r < numTads; r++) { + if (shuffleMap[r] < 0) + continue; - auto oldOffset = tadOffset[r]; - auto newOffset = tadOffset[shuffleMap[r]]; + auto oldOffset = tadOffset[r]; + auto newOffset = tadOffset[shuffleMap[r]]; - auto rX = hX + oldOffset; - auto rY = hX + newOffset; + auto rX = hX + oldOffset; + auto rY = hX + newOffset; - if (tadEWS == 1) { - for (Nd4jLong i = 0; i < tadLength; i++) { - nd4j::math::nd4j_swap(rX[i], rY[i]); - } - } else { - for (Nd4jLong i = 0; i < tadLength; i++) { - auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); - nd4j::math::nd4j_swap(hX[offset + oldOffset], hX[offset + newOffset]); + if (tadEWS == 1) { + for (Nd4jLong i = 0; i < tadLength; i++) { + nd4j::math::nd4j_swap(rX[i], rY[i]); + } + } else { + for (Nd4jLong i = 0; i < tadLength; i++) { + auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); + nd4j::math::nd4j_swap(hX[offset + oldOffset], hX[offset + newOffset]); + } } } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, N); } void shuffle(Nd4jPointer *extras, @@ -1772,72 +1772,9 @@ void execAggregate(Nd4jPointer *extraPointers,int opNum, void *realArguments, int numRealArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, NativeOpExecutioner::execAggregate, (nullptr, opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), FLOAT_TYPES); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } -template -void _batchExecutor(Nd4jPointer *extraPointers, - int numAggregates, - int opNum, - int maxArgs, - int maxShapes, - int maxIntArrays, - int maxIntArraySize, - int maxIdx, - int maxReals, - void *ptrToArguments, - nd4j::DataType dtype) { - // probably, we don't want too much threads as usually - int _threads = nd4j::math::nd4j_min(numAggregates, omp_get_max_threads()); - - nd4j::PointersHelper helper(ptrToArguments, - numAggregates, - maxArgs, - maxShapes, - maxIntArrays, - maxIntArraySize, - maxIdx, - maxReals); - - // special case here, we prefer spread arrangement here, all threads are detached from each other - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (int i = 0; i < numAggregates; i++) { - auto intArrays = new int *[maxIntArrays]; - - auto arguments = helper.getArguments(i); - auto shapes = helper.getShapeArguments(i); - auto idxArg = helper.getIndexArguments(i); - auto realArg = helper.getRealArguments(i); - - for (int e = 0; e < maxIntArrays; e++) { - intArrays[e] = helper.getIntArrayArguments(i, e); - } - - execAggregate(extraPointers, - opNum, - reinterpret_cast(arguments), - helper.getNumArguments(i), - shapes, - 
helper.getNumShapeArguments(i), - idxArg, - helper.getNumIndexArguments(i), - intArrays, - helper.getNumIntArrayArguments(i), - realArg, - helper.getNumRealArguments(i), - dtype); - - delete [] intArrays; - } -} -BUILD_SINGLE_TEMPLATE(template void _batchExecutor, (Nd4jPointer *extraPointers, int numAggregates, int opNum, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments, nd4j::DataType dtype), FLOAT_TYPES); - void batchExecutor(Nd4jPointer *extraPointers, int numAggregates, int opNum, @@ -1849,12 +1786,7 @@ void batchExecutor(Nd4jPointer *extraPointers, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, _batchExecutor, (extraPointers, numAggregates, opNum, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments, dtype), FLOAT_TYPES); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } + } void execAggregateBatch(Nd4jPointer *extraPointers, @@ -1868,12 +1800,7 @@ void execAggregateBatch(Nd4jPointer *extraPointers, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, _batchExecutor, (extraPointers, numAggregates, opNum, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments, dtype), FLOAT_TYPES); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } + } @@ -2094,27 +2021,21 @@ const char* getAllCustomOps() { template FORCEINLINE int estimateThresholdGeneric(Nd4jPointer *extraPointers, Nd4jPointer hX, int N, T threshold) { auto buffer = reinterpret_cast(hX); - int span = (N / 6) + 8; - int cnt = 0; - - PRAGMA_OMP_PARALLEL_REDUCTION(+:cnt) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int stop = span * (tid + 1); - if (stop > N) - stop = N; + auto func = PRAGMA_REDUCE_LONG { + int64_t cnt = 0; PRAGMA_OMP_SIMD - for (int e = start; e < stop; e++) { + for (auto e = start; e < stop; e++) { auto v = nd4j::math::nd4j_abs(buffer[e]); if (v >= threshold) cnt++; } - } - return cnt; + return cnt; + }; + + return samediff::Threads::parallel_long(func, LAMBDA_AL { return _old + _new; }, 0, N); } @@ -2776,58 +2697,51 @@ static void _scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSub void* vIindexes, Nd4jLong* hIndicesShapeInfo, void* dIindexes, Nd4jLong* dIndicesShapeInfo) { auto hIindexes = reinterpret_cast(vIindexes); - - int numThreads = omp_get_max_threads(); - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { - for (int i = 0; i < numOfSubArrs; ++i) { - - int threadIndex = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + for (int i = 0; i < numOfSubArrs; ++i) { + int threadIndex = thread_id; const auto xIndex = hIindexes[i]; const bool isOwner = xIndex < numThreads ? 
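// Editor's note: estimateThresholdGeneric (further below in this file's hunks) is
// rewritten from an OpenMP reduction into a PRAGMA_REDUCE_LONG body that returns a
// per-span count, combined through samediff::Threads::parallel_long with an
// "_old + _new" aggregator. This is a self-contained sketch of that reduction shape
// using std::thread and explicit per-thread partials, not the library machinery.
#include <thread>
#include <vector>
#include <cmath>
#include <cstdint>

static int64_t countAboveThreshold(const float *buffer, int64_t n, float threshold, int numThreads) {
    std::vector<int64_t> partials(numThreads, 0);
    std::vector<std::thread> workers;
    int64_t span = n / numThreads;

    for (int t = 0; t < numThreads; t++) {
        int64_t start = t * span;
        int64_t stop = (t == numThreads - 1) ? n : start + span;
        workers.emplace_back([=, &partials]() {
            int64_t cnt = 0;
            for (int64_t e = start; e < stop; e++)
                if (std::fabs(buffer[e]) >= threshold)
                    cnt++;
            partials[t] = cnt;
        });
    }
    for (auto &w : workers)
        w.join();

    int64_t total = 0;
    for (auto p : partials)   // aggregator step: _old + _new
        total += p;
    return total;
}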
threadIndex == xIndex : threadIndex == xIndex % numThreads; if (!isOwner) continue; - NDArray inSubArr( - reinterpret_cast(hX) + (hXOffsets[hIindexes[i]] * DataTypeUtils::sizeOf(hXShapeInfo)), - hXShapeInfo); - NDArray updSubArr(reinterpret_cast(hY) + (hYOffsets[i] * DataTypeUtils::sizeOf(hXShapeInfo)), - hYShapeInfo); + NDArray inSubArr(reinterpret_cast(hX) + (hXOffsets[hIindexes[i]] * DataTypeUtils::sizeOf(hXShapeInfo)), hXShapeInfo); + NDArray updSubArr(reinterpret_cast(hY) + (hYOffsets[i] * DataTypeUtils::sizeOf(hXShapeInfo)), hYShapeInfo); if (inSubArr.lengthOf() != updSubArr.lengthOf()) { continue; } - switch (opCode) { - case 0: - inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); - break; - case 1: - inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); - break; - case 2: - inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); - break; - case 3: - inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); - break; - case 4: - inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); - break; - case 5: - inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); - break; - case 6: - inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); - break; - default: - continue; + switch (opCode) { + case 0: + inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); + break; + case 1: + inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); + break; + case 2: + inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); + break; + case 3: + inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); + break; + case 4: + inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); + break; + case 5: + inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); + break; + case 6: + inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); + break; + default: + continue; + } } - } - } + }; + samediff::Threads::parallel_do(func); } //////////////////////////////////////////////////////////////////////// @@ -2847,6 +2761,7 @@ void scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSubArrs, } } + void inspectArray(Nd4jPointer *extraPointers, Nd4jPointer buffer, Nd4jLong *shapeInfo, Nd4jPointer specialBuffer, Nd4jLong *specialShapeInfo, Nd4jPointer debugInfo) { try { auto p = reinterpret_cast(debugInfo); diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 2db1aa128..2af0e3783 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +34,8 @@ #include #include #include -// FIXME: we need cuda-specific implementations #include +#include #include #include #include @@ -1723,11 +1724,7 @@ void execScalarTad(Nd4jPointer *extraPointers, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::scalar::ScalarTransform, ::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, - 
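// Editor's note: the _scatterUpdate hunk above runs every worker over the full index
// list but lets a worker apply an update only when it "owns" the target index
// (direct mapping for small index spaces, modulo wrap-around otherwise), so two updates
// aimed at the same sub-array are always serialized on one thread. Sketch of that
// ownership rule; applyUpdate is a hypothetical stand-in for the applyPairwiseTransform
// calls.
#include <cstdint>

static bool ownsIndex(int64_t xIndex, int threadId, int numThreads) {
    return xIndex < numThreads ? threadId == xIndex
                               : threadId == xIndex % numThreads;
}

static void scatterWorker(int threadId, int numThreads, const int64_t *indices, int64_t numOfSubArrs) {
    for (int64_t i = 0; i < numOfSubArrs; i++) {
        if (!ownsIndex(indices[i], threadId, numThreads))
            continue;
        // applyUpdate(indices[i], i);   // hypothetical: add/sub/mul/... on the owned sub-array
    }
}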
::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, - dZShapeInfo, dScalars, extraParams, dimension, - dimensionLength, tadShapeInfo, tadOffsets, - tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); #endif DEBUG_KERNEL(stream, opNum); @@ -1750,23 +1747,7 @@ void execAggregate(Nd4jPointer *extraPointers, void *realArguments, int numRealArguments, nd4j::DataType dtype) { - try { - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - int numBlocks = getDeviceId(extraPointers[2]); - int numThreads = getDeviceId(extraPointers[3]); - int shmem = getDeviceId(extraPointers[4]); - dim3 launchDims = dim3(numBlocks, numThreads, shmem); - - BUILD_SINGLE_SELECTOR(dtype, functions::aggregate::AggregatedFunction, - ::aggregateKernelGeneric(launchDims, stream, opNum, arguments, numArguments, shapes, - numShapes, indexArguments, numIndexArguments, intArrays, - numIntArrays, realArguments, numRealArguments), FLOAT_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "execAggregateFloat(...) failed"); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } void batchExecutor(Nd4jPointer *extraPointers, @@ -1788,25 +1769,7 @@ void execAggregateBatch(Nd4jPointer *extraPointers, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - // not implemented yet - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - int numBlocks = getDeviceId(extraPointers[2]); - int numThreads = getDeviceId(extraPointers[3]); - int shmem = getDeviceId(extraPointers[4]); - dim3 launchDims = dim3(numAggregates, numThreads, shmem); - - BUILD_SINGLE_SELECTOR(dtype, functions::aggregate::AggregatedFunction, - ::aggregateBatchKernelGeneric(launchDims, stream, opNum, numAggregates, maxArgs, - maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, - ptrToArguments), FLOAT_TYPES); - - DEBUG_KERNEL(stream, opNum); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/buildnativeoperations.sh b/libnd4j/buildnativeoperations.sh index 599c4f250..56e225a5d 100755 --- a/libnd4j/buildnativeoperations.sh +++ b/libnd4j/buildnativeoperations.sh @@ -53,6 +53,7 @@ CLEAN="false" MINIFIER="false" TESTS="false" VERBOSE="false" +VERBOSE_ARG="VERBOSE=1" HELPER= NAME= while [[ $# > 0 ]] @@ -291,38 +292,37 @@ case "$OS" in macosx*) # Do something under Mac OS X platform - if [ "$CHIP" == "cuda" ]; then + #if [ "$CHIP" == "cuda" ]; then export CC=clang export CXX=clang++ - PARALLEL="false" - else - export CC="$(ls -1 /usr/local/bin/gcc-? | head -n 1)" - export CXX="$(ls -1 /usr/local/bin/g++-? | head -n 1)" PARALLEL="true" - fi + #else + # export CC="$(ls -1 /usr/local/bin/gcc-? | head -n 1)" + # export CXX="$(ls -1 /usr/local/bin/g++-? 
| head -n 1)" + # PARALLEL="true" + #fi export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_MACOSX_RPATH=ON -DAPPLE_BUILD=true" ;; windows*) - # Do something under Windows NT platform - if [ "$CHIP" == "cuda" ]; then + # Do something under Windows NT platform + if [ "$CHIP" == "cuda" ]; then export CMAKE_COMMAND="cmake -G \"Ninja\"" export MAKE_COMMAND="ninja" export CC="cl.exe" export CXX="cl.exe" PARALLEL="true" - else + VERBOSE_ARG="-v" + else export CMAKE_COMMAND="cmake -G \"MSYS Makefiles\"" export MAKE_COMMAND="make" - - # Sam, do we really need this? export CC=/mingw64/bin/gcc export CXX=/mingw64/bin/g++ PARALLEL="true" + fi - fi - # Try some defaults for Visual Studio 2013 if user has not run vcvarsall.bat or something - if [ -z "${VCINSTALLDIR:-}" ]; then + # Try some defaults for Visual Studio 2013 if user has not run vcvarsall.bat or something + if [ -z "${VCINSTALLDIR:-}" ]; then export VisualStudioVersion=12.0 export VSINSTALLDIR="C:\\Program Files (x86)\\Microsoft Visual Studio $VisualStudioVersion" export VCINSTALLDIR="$VSINSTALLDIR\\VC" @@ -332,10 +332,10 @@ case "$OS" in export LIB="$VCINSTALLDIR\\LIB\\amd64;$WindowsSdkDir\\lib\\winv6.3\\um\\x64" export LIBPATH="$VCINSTALLDIR\\LIB\\amd64;$WindowsSdkDir\\References\\CommonConfiguration\\Neutral" export PATH="$PATH:$VCINSTALLDIR\\BIN\\amd64:$WindowsSdkDir\\bin\\x64:$WindowsSdkDir\\bin\\x86" - fi - # Make sure we are using 64-bit MinGW-w64 - export PATH=/mingw64/bin/:$PATH - # export GENERATOR="MSYS Makefiles" + fi + # Make sure we are using 64-bit MinGW-w64 + export PATH=/mingw64/bin/:/mingw64/lib:$PATH + # export GENERATOR="MSYS Makefiles" ;; esac @@ -534,6 +534,6 @@ if [ "$PARALLEL" == "true" ]; then MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ" fi if [ "$VERBOSE" == "true" ]; then - MAKE_ARGUMENTS="$MAKE_ARGUMENTS VERBOSE=1" + MAKE_ARGUMENTS="$MAKE_ARGUMENTS $VERBOSE_ARG" fi eval $MAKE_COMMAND $MAKE_ARGUMENTS && cd ../../.. diff --git a/libnd4j/include/array/DataTypeConversions.h b/libnd4j/include/array/DataTypeConversions.h index 677401954..3af77ca39 100644 --- a/libnd4j/include/array/DataTypeConversions.h +++ b/libnd4j/include/array/DataTypeConversions.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace nd4j { template @@ -50,9 +51,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; @@ -105,9 +109,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; @@ -130,9 +137,12 @@ namespace nd4j { #else - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? 
static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; } @@ -153,9 +163,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; } diff --git a/libnd4j/include/buffer.h b/libnd4j/include/buffer.h index e2aa70046..79197753d 100755 --- a/libnd4j/include/buffer.h +++ b/libnd4j/include/buffer.h @@ -26,6 +26,7 @@ #ifdef __CUDACC__ #include #include +#include #endif #include diff --git a/libnd4j/include/cnpy/cnpy.h b/libnd4j/include/cnpy/cnpy.h index ac7fef863..06ff3336d 100644 --- a/libnd4j/include/cnpy/cnpy.h +++ b/libnd4j/include/cnpy/cnpy.h @@ -97,10 +97,10 @@ namespace cnpy { * @param t * @return */ - char mapType(const std::type_info &t); + ND4J_EXPORT char mapType(const std::type_info &t); template - char mapType(); + ND4J_EXPORT char mapType(); /** * @@ -111,7 +111,7 @@ namespace cnpy { * @return */ template - std::vector createNpyHeader(const void *data, + ND4J_EXPORT std::vector createNpyHeader(const void *data, const unsigned int *shape, const unsigned int ndims, unsigned int wordSize = 4); @@ -126,7 +126,7 @@ namespace cnpy { * @param ndims * @param fortranOrder */ - void parseNpyHeader(FILE *fp, + ND4J_EXPORT void parseNpyHeader(FILE *fp, unsigned int &wordSize, unsigned int *&shape, unsigned int &ndims, @@ -143,7 +143,7 @@ namespace cnpy { * @param ndims * @param fortran_order */ - void parseNpyHeaderPointer( + ND4J_EXPORT void parseNpyHeaderPointer( const char *header, unsigned int& word_size, unsigned int*& shape, @@ -156,7 +156,7 @@ namespace cnpy { * @param global_header_size * @param global_header_offset */ - void parseZipFooter(FILE *fp, + ND4J_EXPORT void parseZipFooter(FILE *fp, unsigned short &nrecs, unsigned int &global_header_size, unsigned int &global_header_offset); @@ -167,14 +167,14 @@ namespace cnpy { * @param varname * @return */ - NpyArray npzLoad(std::string fname, std::string varname); + ND4J_EXPORT NpyArray npzLoad(std::string fname, std::string varname); /** * * @param fname * @return */ - NpyArray npyLoad(std::string fname); + ND4J_EXPORT NpyArray npyLoad(std::string fname); /** * Parse the numpy header from @@ -187,7 +187,7 @@ namespace cnpy { * @param ndims * @param fortranOrder */ - void parseNpyHeaderStr(std::string header, + ND4J_EXPORT void parseNpyHeaderStr(std::string header, unsigned int &wordSize, unsigned int *&shape, unsigned int &ndims, @@ -199,14 +199,14 @@ namespace cnpy { * @param fp * @return */ - int * shapeFromFile(FILE *fp); + ND4J_EXPORT int* shapeFromFile(FILE *fp); /** * * @param data * @return */ - int * shapeFromPointer(char *data); + ND4J_EXPORT int* shapeFromPointer(char *data); /** * Load the numpy array from the given file. 
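// Editor's note: the cnpy.h hunks above only add ND4J_EXPORT to declarations that were
// already present, so the symbols stay visible across the shared-library boundary. A
// minimal sketch of the usual export-macro pattern such an annotation relies on, under
// the assumption that the non-Windows branch expands to nothing (dll.h, shown below,
// defines the real ND4J_EXPORT); MY_EXPORT and exportedFunction are hypothetical names.
#ifdef _WIN32
  #define MY_EXPORT __declspec(dllexport)
#else
  #define MY_EXPORT
#endif

MY_EXPORT int exportedFunction(int x);   // visible to consumers of the DLL / shared object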
@@ -250,7 +250,7 @@ namespace cnpy { * @param ndims * @param fortran_order */ - void parseNpyHeader(std::string header, + ND4J_EXPORT void parseNpyHeader(std::string header, unsigned int &word_size, unsigned int *&shape, unsigned int &ndims, @@ -273,7 +273,7 @@ namespace cnpy { template - void npy_save(std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w"); + ND4J_EXPORT void npy_save(std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w"); } @@ -284,8 +284,8 @@ namespace cnpy { * @param rhs * @return */ -template -std::vector& operator+=(std::vector& lhs, const T rhs); + template + ND4J_EXPORT std::vector& operator+=(std::vector& lhs, const T rhs); #endif diff --git a/libnd4j/include/dll.h b/libnd4j/include/dll.h index 4b5a71eec..91d5a7677 100644 --- a/libnd4j/include/dll.h +++ b/libnd4j/include/dll.h @@ -20,6 +20,9 @@ #ifndef NATIVEOPERATIONS_DLL_H #define NATIVEOPERATIONS_DLL_H + +#include + #ifdef _WIN32 //#include # define ND4J_EXPORT __declspec(dllexport) diff --git a/libnd4j/include/execution/BlockingQueue.h b/libnd4j/include/execution/BlockingQueue.h new file mode 100644 index 000000000..a78196dfc --- /dev/null +++ b/libnd4j/include/execution/BlockingQueue.h @@ -0,0 +1,52 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_BLOCKINGQUEUE_H +#define SAMEDIFF_BLOCKINGQUEUE_H + +#include +#include +#include +#include +#include + +namespace samediff { + template + class BlockingQueue { + private: + std::queue _queue; + std::mutex _lock; + std::atomic _size; + std::atomic _available; + + std::condition_variable _condition; + public: + BlockingQueue(int queueSize); + ~BlockingQueue() = default; + T poll(); + void put(const T &t); + + bool available(); + void markAvailable(); + void markUnavailable(); + }; +} + +#endif //DEV_TESTS_BLOCKINGQUEUE_H diff --git a/libnd4j/include/execution/CallableInterface.h b/libnd4j/include/execution/CallableInterface.h new file mode 100644 index 000000000..7e5502af1 --- /dev/null +++ b/libnd4j/include/execution/CallableInterface.h @@ -0,0 +1,94 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_CALLABLEINTERFACE_H +#define SAMEDIFF_CALLABLEINTERFACE_H + +#include +#include +#include +#include +#include +#include +#include + +namespace samediff { + /** + * This class is suited for passing functions to execution threads without queues + */ + class CallableInterface { + private: + // parallel_for functions + FUNC_1D _function_1d; + FUNC_2D _function_2d; + FUNC_3D _function_3d; + + // parallel function + FUNC_DO _function_do; + + // reduction functions + FUNC_RL _function_rl; + FUNC_RD _function_rd; + + std::array _arguments; + + volatile int _branch = 0; + volatile uint32_t _thread_id = 0; + volatile uint32_t _num_threads = 0; + + std::atomic _finished; + std::atomic _filled; + std::atomic _available; + + std::condition_variable _starter; + std::condition_variable _finisher; + + int64_t* _lptr = nullptr; + double* _dptr = nullptr; + + std::mutex _ms; + std::mutex _mf; + public: + CallableInterface(); + ~CallableInterface() = default; + + void waitForTask(); + void waitForCompletion(); + + void fill(int thread_id, int num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void fill(int thread_id, int num_threads, double *dpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x); + + void fill(int thread_id, int num_threads, FUNC_DO func); + void fill(int thread_id, int num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void fill(int thread_id, int num_threads, FUNC_2D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + void fill(int thread_id, int num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + + bool available(); + void markAvailable(); + void markUnavailable(); + + void finish(); + + void execute(); + }; +} + + +#endif //DEV_TESTS_CALLABLEINTERFACE_H diff --git a/libnd4j/include/execution/CallableWithArguments.h b/libnd4j/include/execution/CallableWithArguments.h new file mode 100644 index 000000000..ebf1f0019 --- /dev/null +++ b/libnd4j/include/execution/CallableWithArguments.h @@ -0,0 +1,92 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef DEV_TESTS_CALLABLEWITHARGUMENTS_H +#define DEV_TESTS_CALLABLEWITHARGUMENTS_H + +#include +#include +#include +#include +#include + +namespace samediff { + class CallableWithArguments { + FUNC_DO _function_do; + FUNC_1D _function_1d; + FUNC_2D _function_2d; + FUNC_3D _function_3d; + + std::vector _arguments; + + std::atomic _finished; + + std::condition_variable _condition; + + std::mutex _lock; + + int _dimensions = 0; + + uint64_t _threadId; + uint64_t _numThreads; + public: + CallableWithArguments(FUNC_DO func, uint64_t thread_id, uint64_t numThreads); + CallableWithArguments(FUNC_1D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x); + CallableWithArguments(FUNC_2D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y); + CallableWithArguments(FUNC_3D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y, int64_t start_z, int64_t stop_z, int64_t increment_z); + + + /** + * This method returns number of dimensions + * @return + */ + int dimensions(); + + /** + * This method checks if this callable is finished + * @return + */ + bool finished(); + + /** + * this method marks this Callable as finished + */ + void finish(); + + /** + * This method blocks until callable is finished + */ + void waitUntilFinished(); + + std::vector& arguments(); + FUNC_DO function_do(); + FUNC_1D function_1d(); + FUNC_2D function_2d(); + FUNC_3D function_3d(); + + + uint64_t threadId(); + + uint64_t numThreads(); + }; +} + + +#endif //DEV_TESTS_CALLABLEWITHARGUMENTS_H diff --git a/libnd4j/include/execution/ThreadPool.h b/libnd4j/include/execution/ThreadPool.h new file mode 100644 index 000000000..e17b4b540 --- /dev/null +++ b/libnd4j/include/execution/ThreadPool.h @@ -0,0 +1,71 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_THREADPOOL_H +#define SAMEDIFF_THREADPOOL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace samediff { + class ThreadPool { + private: + static ThreadPool* _INSTANCE; + + std::vector _threads; + std::vector*> _queues; + std::vector _interfaces; + + std::mutex _lock; + std::atomic _available; + std::queue _tickets; + protected: + ThreadPool(); + ~ThreadPool(); + public: + static ThreadPool* getInstance(); + + /** + * This method returns list of pointers to threads ONLY if num_threads of threads were available upon request, returning empty list otherwise + * @param num_threads + * @return + */ + Ticket* tryAcquire(int num_threads); + + /** + * This method marks specified number of threads as released, and available for use + * @param num_threads + */ + void release(int num_threads = 1); + + void release(Ticket *ticket); + }; +} + + +#endif //DEV_TESTS_THREADPOOL_H diff --git a/libnd4j/include/execution/Threads.h b/libnd4j/include/execution/Threads.h new file mode 100644 index 000000000..683220b61 --- /dev/null +++ b/libnd4j/include/execution/Threads.h @@ -0,0 +1,160 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// +#ifndef SAMEDIFF_THREADS_H +#define SAMEDIFF_THREADS_H + +#include +#include +#include +#include +#include + +namespace samediff { + class ThreadsHelper { + public: + static int numberOfThreads(int maxThreads, uint64_t numberOfElements); + static int numberOfThreads2d(int maxThreads, uint64_t iters_x, uint64_t iters_y); + static int numberOfThreads3d(int maxThreads, uint64_t iters_x, uint64_t iters_y, uint64_t iters_z); + static int pickLoop2d(int numThreads, uint64_t iters_x, uint64_t iters_y); + static int pickLoop3d(int numThreads, uint64_t iters_x, uint64_t iters_y, uint64_t iters_z); + }; + + class Span { + private: + int64_t _startX, _stopX, _incX; + public: + Span(int64_t start_x, int64_t stop_x, int64_t inc_x); + ~Span() = default; + + int64_t startX() const; + int64_t stopX() const; + int64_t incX() const; + + static Span build(uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x); + }; + + class Span2 { + private: + int64_t _startX, _stopX, _incX; + int64_t _startY, _stopY, _incY; + public: + Span2(int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + ~Span2() = default; + + int64_t startX() const; + int64_t startY() const; + + int64_t stopX() const; + int64_t stopY() const; + + int64_t incX() const; + int64_t incY() const; + + static Span2 build(int loop, uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + }; + + class Span3 { + private: + int64_t _startX, _stopX, _incX; + int64_t _startY, _stopY, _incY; + int64_t _startZ, _stopZ, _incZ; + public: + Span3(int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + ~Span3() = default; + + int64_t startX() const; + int64_t startY() const; + int64_t startZ() const; + + int64_t stopX() const; + int64_t stopY() const; + int64_t stopZ() const; + + int64_t incX() const; + int64_t incY() const; + int64_t incZ() const; + + static Span3 build(int loop, uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + }; + + class Threads { + public: + /** + * This function executes 1 dimensional loop for a given number of threads + * PLEASE NOTE: this function can use smaller number of threads than requested. 
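// Illustrative sketch: the 1D thread-count heuristic declared above (implemented in
// Threads.cpp further down) is min(max(1, numberOfElements / 1024), maxThreads), i.e.
// roughly one thread per 1024 elements, capped at the configured maximum. Include path assumed.
#include <execution/Threads.h>
#include <cassert>

static void sketch_thread_count_heuristic() {
    using samediff::ThreadsHelper;
    assert(ThreadsHelper::numberOfThreads(16, 500)     == 1);   // loop too small to split
    assert(ThreadsHelper::numberOfThreads(16, 10000)   == 9);   // 10000 / 1024 == 9
    assert(ThreadsHelper::numberOfThreads(16, 1000000) == 16);  // capped at maxThreads
}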
+ * + * @param function + * @param numThreads + * @param start + * @param stop + * @param increment + * @return + */ + static int parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static int parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + /** + * + * @param function + * @param numThreads + * @param start_x + * @param stop_x + * @param inc_x + * @param start_y + * @param stop_y + * @param inc_y + * @return + */ + static int parallel_for(FUNC_2D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads(), bool debug = false); + + /** + * + * @param function + * @param numThreads + * @param start_x + * @param stop_x + * @param inc_x + * @param start_y + * @param stop_y + * @param inc_y + * @param start_z + * @param stop_z + * @param inc_z + * @return + */ + static int parallel_for(FUNC_3D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + /** + * + * @param function + * @param numThreads + * @return + */ + static int parallel_do(FUNC_DO function, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static int64_t parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static double parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + }; +} + + +#endif //SAMEDIFF_THREADS_H diff --git a/libnd4j/include/execution/Ticket.h b/libnd4j/include/execution/Ticket.h new file mode 100644 index 000000000..e4152b66a --- /dev/null +++ b/libnd4j/include/execution/Ticket.h @@ -0,0 +1,67 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
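// Illustrative sketch: a typical call into the 1D frontend declared above. The functor
// receives (thread_id, start, stop, increment) and must touch only its own sub-range;
// the return value is the number of threads that actually ran, which may be smaller
// than requested. Assumes FUNC_1D accepts a std::function-compatible lambda.
#include <execution/Threads.h>
#include <vector>

static void sketch_parallel_for_1d(std::vector<float> &data) {
    samediff::Threads::parallel_for(
        [&data](uint64_t /*thread_id*/, int64_t start, int64_t stop, int64_t inc) {
            for (auto i = start; i < stop; i += inc)
                data[i] *= 2.0f;                 // each thread scales its own slice
        },
        0, static_cast<int64_t>(data.size()));   // increment and numThreads use the defaults
}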
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_TICKET_H +#define SAMEDIFF_TICKET_H + +#include +#include +#include +#include +#include +#include + +namespace samediff { + class Ticket { + private: + bool _acquired = false; + std::vector*> _queues; + std::vector _callables; + std::vector _interfaces; + + uint32_t _acquiredThreads = 0; + public: + explicit Ticket(const std::vector*> &queues); + Ticket(); + ~Ticket() = default; + + bool acquired(); + + void acquiredThreads(uint32_t threads); + + void attach(uint32_t thread_id, CallableInterface *interface); + + // deprecated one + void enqueue(int thread_id, CallableWithArguments* callable); + + void enqueue(uint32_t thread_id, uint32_t num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void enqueue(uint32_t thread_id, uint32_t num_threads, double *lpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x); + + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_DO func); + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_2D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_, int64_t stop_z, int64_t inc_z); + + void waitAndRelease(); + }; +} + + +#endif //DEV_TESTS_TICKET_H diff --git a/libnd4j/include/execution/impl/BlockingQueue.cpp b/libnd4j/include/execution/impl/BlockingQueue.cpp new file mode 100644 index 000000000..ff483fd28 --- /dev/null +++ b/libnd4j/include/execution/impl/BlockingQueue.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +namespace samediff { + template + BlockingQueue::BlockingQueue(int queueSize) { + _size = 0; + _available = true; + } + + template + T BlockingQueue::poll() { + // locking untill there's something within queue + std::unique_lock lock(_lock); + _condition.wait(lock, [&]{ return this->_size.load() != 0; }); + + T t(std::move(_queue.front())); + _queue.pop(); + _size--; + return t; + } + + template + void BlockingQueue::put(const T &t) { + { + // locking before push, unlocking after + std::unique_lock lock(_lock); + _queue.push(t); + _size++; + } + + // notifying condition + _condition.notify_one(); + } + + template + bool BlockingQueue::available() { + return _available.load(); + } + + template + void BlockingQueue::markAvailable() { + _available = true; + } + + template + void BlockingQueue::markUnavailable() { + _available = false; + } + + template class BlockingQueue; +} diff --git a/libnd4j/include/execution/impl/CallableInterface.cpp b/libnd4j/include/execution/impl/CallableInterface.cpp new file mode 100644 index 000000000..a719af848 --- /dev/null +++ b/libnd4j/include/execution/impl/CallableInterface.cpp @@ -0,0 +1,213 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
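// Illustrative sketch: the queue above is a minimal blocking channel - put() pushes the
// item under the lock and notifies, poll() blocks on the condition variable until _size
// becomes non-zero. Note the .cpp appears to instantiate the template only for its
// internal callable pointer type, so the int payload here is purely illustrative.
#include <execution/BlockingQueue.h>
#include <cstdio>
#include <thread>

static void sketch_blocking_queue() {
    samediff::BlockingQueue<int> queue(2);   // queueSize is currently a hint only

    std::thread consumer([&queue]() {
        auto v = queue.poll();               // blocks until the producer calls put()
        printf("got %i\n", v);
    });

    queue.put(42);
    consumer.join();
}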
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include + +namespace samediff { + CallableInterface::CallableInterface() { + // initial state is available + _available = true; + _filled = false; + _finished = false; + } + + bool CallableInterface::available() { + return _available.load(); + } + + void CallableInterface::markUnavailable() { + _available = false; + } + + void CallableInterface::markAvailable() { + _available = true; + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_DO func) { + _function_do = std::move(func); + + _branch = 0; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_1D func, int64_t startX, int64_t stopX, int64_t incX) { + _function_1d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _branch = 1; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_2D func, int64_t startX, int64_t stopX, int64_t incX, int64_t start_y, int64_t stop_y, int64_t inc_y) { + _function_2d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + _arguments[3] = start_y; + _arguments[4] = stop_y; + _arguments[5] = inc_y; + + _branch = 2; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_3D func, int64_t startX, int64_t stopX, int64_t incX, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z) { + _function_3d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + _arguments[3] = start_y; + _arguments[4] = stop_y; + _arguments[5] = inc_y; + _arguments[6] = start_z; + _arguments[7] = stop_z; + _arguments[8] = inc_z; + + _branch = 3; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, int64_t *lptr, FUNC_RL func, int64_t startX, int64_t stopX, int64_t incX) { + _function_rl = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _lptr = lptr; + + _branch = 4; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, double *dptr, FUNC_RD func, int64_t startX, int64_t stopX, int64_t incX) { + _function_rd = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _dptr = dptr; + + _branch = 5; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::waitForTask() { + // block until task is available + std::unique_lock lock(_ms); + _starter.wait(lock, [&]{ return _filled.load(); }); + } + + void 
CallableInterface::waitForCompletion() { + //while (!_finished.load()); + + // block until finished + std::unique_lock lock(_mf); + _finisher.wait(lock, [&] { return _finished.load(); }); + } + + void CallableInterface::finish() { + // mark as finished + { + std::unique_lock l(_mf); + _finished.store(true); + } + _finisher.notify_one(); + } + + void CallableInterface::execute() { + // mark it as consumed + _filled = false; + + // actually executing op + switch (_branch) { + case 0: + _function_do(_thread_id, _num_threads); + break; + case 1: + _function_1d(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + case 2: + _function_2d(_thread_id, _arguments[0], _arguments[1], _arguments[2], _arguments[3], _arguments[4], _arguments[5]); + break; + case 3: + _function_3d(_thread_id, _arguments[0], _arguments[1], _arguments[2], _arguments[3], _arguments[4], _arguments[5], _arguments[6], _arguments[7], _arguments[8]); + break; + case 4: + _lptr[0] = _function_rl(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + case 5: + _dptr[0] = _function_rd(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + } + + // notify that thread finished the job + this->finish(); + } +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/CallableWithArguments.cpp b/libnd4j/include/execution/impl/CallableWithArguments.cpp new file mode 100644 index 000000000..8f17622b7 --- /dev/null +++ b/libnd4j/include/execution/impl/CallableWithArguments.cpp @@ -0,0 +1,103 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
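// Illustrative sketch: the two-condition-variable handshake implemented above, for a
// single task. The submitter fills the interface and waits on the finish condition; the
// worker waits for a task, executes it, and finish() (called at the end of execute())
// signals completion. Assumes FUNC_1D accepts a std::function-compatible lambda.
#include <execution/CallableInterface.h>
#include <thread>

static void sketch_callable_interface() {
    samediff::CallableInterface ci;

    // worker side: this mirrors executionLoopWithInterface_ below, but for one task only
    std::thread worker([&ci]() {
        ci.waitForTask();    // blocks until fill() sets _filled and notifies _starter
        ci.execute();        // runs the stored functor, then calls finish()
    });

    // submitter side: this is what Ticket::enqueue() does for a FUNC_1D job
    ci.fill(/*threadID=*/0, /*numThreads=*/1,
            [](uint64_t /*thread_id*/, int64_t start, int64_t stop, int64_t inc) {
                for (auto i = start; i < stop; i += inc) { /* per-element work */ }
            },
            /*startX=*/0, /*stopX=*/128, /*incX=*/1);

    ci.waitForCompletion();  // blocks until execute() called finish()
    worker.join();
}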
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include + +namespace samediff { + CallableWithArguments::CallableWithArguments(FUNC_DO func, uint64_t thread_id, uint64_t numThreads) { + _function_do = func; + _finished = false; + _threadId = thread_id; + _numThreads = numThreads; + _dimensions = 0; + } + + CallableWithArguments::CallableWithArguments(FUNC_3D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y, int64_t start_z, int64_t stop_z, int64_t increment_z) { + _function_3d = func; + _arguments = {start_x, stop_x, increment_x, start_y, stop_y, increment_y, start_z, stop_z, increment_z}; + _finished = false; + _threadId = thread_id; + _dimensions = 3; + } + + CallableWithArguments::CallableWithArguments(FUNC_1D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x) { + _function_1d = func; + _arguments = {start_x, stop_x, increment_x}; + _finished = false; + _threadId = thread_id; + _dimensions = 1; + } + + CallableWithArguments::CallableWithArguments(FUNC_2D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y) { + _function_2d = func; + _arguments = {start_x, stop_x, increment_x, start_y, stop_y, increment_y}; + _finished = false; + _threadId = thread_id; + _dimensions = 2; + } + + int CallableWithArguments::dimensions() { + return _dimensions; + } + + std::vector& CallableWithArguments::arguments() { + return _arguments; + } + + bool CallableWithArguments::finished() { + return _finished.load(); + } + + void CallableWithArguments::finish() { + std::lock_guard lock(_lock); + _finished = true; + _condition.notify_one(); + } + + void CallableWithArguments::waitUntilFinished() { + std::unique_lock lock(_lock); + _condition.wait(lock, [&]{ return _finished.load(); }); + } + + + FUNC_1D CallableWithArguments::function_1d() { + return _function_1d; + } + + FUNC_2D CallableWithArguments::function_2d() { + return _function_2d; + } + + FUNC_DO CallableWithArguments::function_do() { + return _function_do; + } + + FUNC_3D CallableWithArguments::function_3d() { + return _function_3d; + } + + uint64_t CallableWithArguments::threadId() { + return _threadId; + } + + uint64_t CallableWithArguments::numThreads() { + return _numThreads; + } +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/ThreadPool.cpp b/libnd4j/include/execution/impl/ThreadPool.cpp new file mode 100644 index 000000000..5d9e2d5eb --- /dev/null +++ b/libnd4j/include/execution/impl/ThreadPool.cpp @@ -0,0 +1,194 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +//#include +#endif + +namespace samediff { + + // this function executed once per thread, it polls functions from queue, and executes them via wrapper + static void executionLoop_(int thread_id, BlockingQueue *queue) { + while (true) { + // this method blocks until there's something within queue + auto c = queue->poll(); + //nd4j_printf("ThreadPool: starting thread %i\n", c->threadId()); + switch (c->dimensions()) { + case 0: { + c->function_do()(c->threadId(), c->numThreads()); + c->finish(); + } + break; + case 1: { + auto args = c->arguments(); + c->function_1d()(c->threadId(), args[0], args[1], args[2]); + c->finish(); + } + break; + case 2: { + auto args = c->arguments(); + c->function_2d()(c->threadId(), args[0], args[1], args[2], args[3], args[4], args[5]); + c->finish(); + //nd4j_printf("ThreadPool: finished thread %i\n", c->threadId()); + } + break; + case 3: { + auto args = c->arguments(); + c->function_3d()(c->threadId(), args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8]); + c->finish(); + } + break; + default: + throw std::runtime_error("Don't know what to do with provided Callable"); + } + } + } + + static void executionLoopWithInterface_(int thread_id, CallableInterface *c) { + while (true) { + // blocking here until there's something to do + c->waitForTask(); + + // execute whatever we have + c->execute(); + } + } + + ThreadPool::ThreadPool() { + // TODO: number of threads must reflect number of cores for UMA system. In case of NUMA it should be per-device pool + // FIXME: on mobile phones this feature must NOT be used + _available = nd4j::Environment::getInstance()->maxThreads(); + + _queues.resize(_available.load()); + _threads.resize(_available.load()); + _interfaces.resize(_available.load()); + + // creating threads here + for (int e = 0; e < _available.load(); e++) { + _queues[e] = new BlockingQueue(2); + _interfaces[e] = new CallableInterface(); + _threads[e] = new std::thread(executionLoopWithInterface_, e, _interfaces[e]); + _tickets.push(new Ticket()); + // _threads[e] = new std::thread(executionLoop_, e, _queues[e]); + + // TODO: add other platforms here as well + // now we must set affinity, and it's going to be platform-specific thing +#ifdef LINUX_BUILD + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(e, &cpuset); + int rc = pthread_setaffinity_np(_threads[e]->native_handle(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) + throw std::runtime_error("Failed to set pthread affinity"); +#endif + /* +#if defined(_WIN32) || defined(_WIN64) + // we can't set affinity to more than 64 cores + if (e <= 64) { + auto mask = (static_cast(1) << e); + auto result = SetThreadAffinityMask(_threads[e]->native_handle(), mask); + if (!result) + throw std::runtime_error("Failed to set pthread affinity"); + } + + // that's fine. 
no need for time_critical here + SetThreadPriority(_threads[e]->native_handle(), THREAD_PRIORITY_HIGHEST); +#endif + */ + } + } + + ThreadPool::~ThreadPool() { + // TODO: implement this one properly + for (int e = 0; e < _queues.size(); e++) { + // stop each and every thread + + // release queue and thread + //delete _queues[e]; + //delete _threads[e]; + } + } + + static std::mutex _lmutex; + + ThreadPool* ThreadPool::getInstance() { + std::unique_lock lock(_lmutex); + if (!_INSTANCE) + _INSTANCE = new ThreadPool(); + + return _INSTANCE; + } + + void ThreadPool::release(int numThreads) { + _available += numThreads; + } + + Ticket* ThreadPool::tryAcquire(int numThreads) { + //std::vector*> queues; + + Ticket *t = nullptr; + // we check for threads availability first + bool threaded = false; + { + // we lock before checking availability + std::unique_lock lock(_lock); + if (_available >= numThreads) { + threaded = true; + _available -= numThreads; + + // getting a ticket from the queue + t = _tickets.front(); + _tickets.pop(); + + // ticket must contain information about number of threads for the current session + t->acquiredThreads(numThreads); + + // filling ticket with executable interfaces + for (int e = 0, i = 0; e < _queues.size() && i < numThreads; e++) { + if (_interfaces[e]->available()) { + t->attach(i++, _interfaces[e]); + _interfaces[e]->markUnavailable(); + } + } + } + } + + // we either dispatch tasks to threads, or run single-threaded + if (threaded) { + return t; + } else { + // if there's no threads available - return nullptr + return nullptr; + } + } + + void ThreadPool::release(samediff::Ticket *ticket) { + // returning ticket back to the queue + std::unique_lock lock(_lock); + _tickets.push(ticket); + } + + + ThreadPool* ThreadPool::_INSTANCE = 0; +} diff --git a/libnd4j/include/execution/impl/Threads.cpp b/libnd4j/include/execution/impl/Threads.cpp new file mode 100644 index 000000000..f5ae5b5eb --- /dev/null +++ b/libnd4j/include/execution/impl/Threads.cpp @@ -0,0 +1,641 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// +#include +#include +#include +#include +#include +#include +#include + + +namespace samediff { + + int ThreadsHelper::numberOfThreads(int maxThreads, uint64_t numberOfElements) { + // let's see how many threads we actually need first + auto optimalThreads = nd4j::math::nd4j_max(1, numberOfElements / 1024); + + // now return the smallest value + return nd4j::math::nd4j_min(optimalThreads, maxThreads); + } + + Span3::Span3(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) { + _startX = startX; + _startY = startY; + _startZ = startZ; + _stopX = stopX; + _stopY = stopY; + _stopZ = stopZ; + _incX = incX; + _incY = incY; + _incZ = incZ; + } + + Span3 Span3::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) { + switch (loop) { + case 1: { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + return Span3(s, e, incX, startY, stopY, incY, startZ, stopZ, incZ); + } + break; + case 2: { + auto span = (stopY - startY) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopY; + + return Span3(startX, stopX, incX, s, e, incY, startZ, stopZ, incZ); + } + break; + case 3: { + auto span = (stopZ - startZ) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopZ; + + return Span3(startX, stopX, incX, startY, stopY, incY, s, e, incZ); + } + break; + default: + throw std::runtime_error(""); + } + return Span3(startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + } + + Span::Span(int64_t startX, int64_t stopX, int64_t incX) { + _startX = startX; + _stopX = stopX; + _incX = incX; + } + + Span Span::build(uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX) { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + return Span(s, e, incX); + } + + Span2::Span2(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY) { + _startX = startX; + _startY = startY; + _stopX = stopX; + _stopY = stopY; + _incX = incX; + _incY = incY; + } + + + Span2 Span2::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY) { + + switch (loop) { + case 1: { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + return Span2(s, e, incX, startY, stopY, incY); + } + break; + case 2: { + auto span = (stopY - startY) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopY; + + return Span2(startX, stopX, incX, s, e, incY); + } + break; + default: + throw std::runtime_error(""); + } + } + + int64_t Span::startX() const { + return _startX; + } + + int64_t Span::stopX() const { + return _stopX; + } + + int64_t Span::incX() const { + return _incX; + } + + int64_t Span2::startX() const { + return _startX; + } + + int64_t Span2::startY() const { + return _startY; + } + + 
int64_t Span2::stopX() const { + return _stopX; + } + + int64_t Span2::stopY() const { + return _stopY; + } + + int64_t Span2::incX() const { + return _incX; + } + + int64_t Span2::incY() const { + return _incY; + } + + int64_t Span3::startX() const { + return _startX; + } + + int64_t Span3::startY() const { + return _startY; + } + + int64_t Span3::startZ() const { + return _startZ; + } + + int64_t Span3::stopX() const { + return _stopX; + } + + int64_t Span3::stopY() const { + return _stopY; + } + + int64_t Span3::stopZ() const { + return _stopZ; + } + + int64_t Span3::incX() const { + return _incX; + } + + int64_t Span3::incY() const { + return _incY; + } + + int64_t Span3::incZ() const { + return _incZ; + } + + int ThreadsHelper::pickLoop2d(int numThreads, uint64_t itersX, uint64_t itersY) { + // if one of dimensions is definitely too small - we just pick the other one + if (itersX < numThreads && itersY >= numThreads) + return 2; + if (itersY < numThreads && itersX >= numThreads) + return 1; + + // next step - we pick the most balanced dimension + auto remX = itersX % numThreads; + auto remY = itersY % numThreads; + auto splitY = itersY / numThreads; + + // if there's no remainder left in some dimension - we're picking that dimension, because it'll be the most balanced work distribution + if (remX == 0) + return 1; + if (remY == 0) + return 2; + + // if there's no loop without a remainder - we're picking one with smaller remainder + if (remX < remY) + return 1; + if (remY < remX && splitY >= 64) // we don't want too small splits over last dimension, or vectorization will fail + return 2; + // if loops are equally sized - give the preference to the first thread + return 1; + } + + + static int threads_(int maxThreads, uint64_t elements) { + + if (elements == maxThreads) { + return maxThreads; + } + else if (elements > maxThreads) { + // if we have full load across thread, or at least half of threads can be utilized + auto rem = elements % maxThreads; + if (rem == 0 || rem >= maxThreads / 3) + return maxThreads; + else + return threads_(maxThreads - 1, elements); + + } + else if (elements < maxThreads) { + return elements; + } + + return 1; + } + + int ThreadsHelper::numberOfThreads2d(int maxThreads, uint64_t iters_x, uint64_t iters_y) { + // in some cases there's nothing to think about, part 1 + if (iters_x < maxThreads && iters_y < maxThreads) + return nd4j::math::nd4j_max(iters_x, iters_y); + + auto remX = iters_x % maxThreads; + auto remY = iters_y % maxThreads; + + // in some cases there's nothing to think about, part 2 + if ((iters_x >= maxThreads && remX == 0 )|| (iters_y >= maxThreads && remY == 0)) + return maxThreads; + + // at this point we suppose that there's no loop perfectly matches number of our threads + // so let's pick something as equal as possible + if (iters_x > maxThreads || iters_y > maxThreads) + return maxThreads; + else + return numberOfThreads2d(maxThreads - 1, iters_x, iters_y); + } + + int ThreadsHelper::numberOfThreads3d(int maxThreads, uint64_t itersX, uint64_t itersY, uint64_t itersZ) { + // we don't want to run underloaded threads + if (itersX * itersY * itersZ <= 32) + return 1; + + auto remX = itersX % maxThreads; + auto remY = itersY % maxThreads; + auto remZ = itersZ % maxThreads; + + // if we have perfect balance across one of dimensions - just go for it + if ((itersX >= maxThreads && remX == 0) || (itersY >= maxThreads && remY == 0) || (itersZ >= maxThreads && remZ == 0)) + return maxThreads; + + int threadsX = 0, threadsY = 0, threadsZ = 0; + + // 
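// Illustrative worked example: the recursive threads_() heuristic above keeps shrinking
// the candidate thread count until elements % threads is either zero or at least a third
// of the thread count, so no worker is left nearly idle:
//   threads_(8, 24) -> 8   (24 % 8 == 0, perfectly balanced)
//   threads_(8, 20) -> 8   (remainder 4 >= 8 / 3)
//   threads_(8,  9) -> 7   (remainder 1 is too small; retry with 7: 9 % 7 == 2 >= 7 / 3)
//   threads_(8,  5) -> 5   (fewer elements than threads: one element per thread)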
now we look into possible number of + threadsX = threads_(maxThreads, itersX); + threadsY = threads_(maxThreads, itersY); + threadsZ = threads_(maxThreads, itersZ); + + // we want to split as close to outer loop as possible, so checking it out first + if (threadsX >= threadsY && threadsX >= threadsZ) + return threadsX; + else if (threadsY >= threadsX && threadsY >= threadsZ) + return threadsY; + else if (threadsZ >= threadsX && threadsZ >= threadsY) + return threadsZ; + + return 1; + } + + int ThreadsHelper::pickLoop3d(int numThreads, uint64_t itersX, uint64_t itersY, uint64_t itersZ) { + auto remX = itersX % numThreads; + auto remY = itersY % numThreads; + auto remZ = itersZ % numThreads; + + auto splitX = itersX / numThreads; + auto splitY = itersY / numThreads; + auto splitZ = itersZ / numThreads; + + // if there's no remainder left in some dimension - we're picking that dimension, because it'll be the most balanced work distribution + if (remX == 0) + return 1; + else if (remY == 0) + return 2; + else if (remZ == 0) // TODO: we don't want too smal splits over last dimension? or we do? + return 3; + + if (itersX > numThreads) + return 1; + else if (itersY > numThreads) + return 2; + else if (itersZ > numThreads) + return 3; + + return 1; + } + + int Threads::parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, uint32_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_for got start > stop"); + + auto delta = (stop - start); + + if (numThreads > delta) + numThreads = delta; + + if (numThreads == 0) + return 0; + + // shortcut + if (numThreads == 1) { + function(0, start, stop, increment); + return 1; + } + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + // if we got our threads - we'll run our jobs here + auto span = delta / numThreads; + + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = start_ + span; + + // last thread will process tail + if (e == numThreads - 1) + stop_ = stop; + + // putting the task into the queue for a given thread + ticket->enqueue(e, numThreads, function, start_, stop_, increment); + } + + // block and wait till all threads finished the job + ticket->waitAndRelease(); + + // we tell that parallelism request succeeded + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, start, stop, increment); + + // we tell that parallelism request declined + return 1; + } + } + + int Threads::parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, uint32_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_for got start > stop"); + + auto delta = (stop - start); + + // in some cases we just fire func as is + if (delta == 0 || numThreads == 1) { + function(0, start, stop, increment); + return 1; + } + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it in parallel_tad. 
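// Illustrative worked example: how parallel_tad() above partitions the range before
// enqueueing. With start=0, stop=103, increment=1 and 4 acquired threads, span = 103 / 4 = 25
// and the last thread absorbs the remainder:
//   thread 0 -> [ 0, 25)    thread 1 -> [25, 50)
//   thread 2 -> [50, 75)    thread 3 -> [75, 103)
// If tryAcquire() returns nullptr, the whole range runs inline as function(0, 0, 103, 1).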
+ numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + return parallel_tad(function, start, stop, increment, numThreads); + } + + int Threads::parallel_for(FUNC_2D function, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, uint64_t numThreads, bool debug) { + if (startX > stopX) + throw std::runtime_error("Threads::parallel_for got startX > stopX"); + + if (startY > stopY) + throw std::runtime_error("Threads::parallel_for got startY > stopY"); + + // number of elements per loop + auto delta_x = (stopX - startX); + auto delta_y = (stopY - startY); + + // number of iterations per loop + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + + // total number of iterations + auto iters_t = itersX * itersY; + + // we are checking the case of number of requested threads was smaller + numThreads = ThreadsHelper::numberOfThreads2d(numThreads, itersX, itersY); + + // basic shortcut for no-threading cases + if (numThreads == 1) { + function(0, startX, stopX, incX, startY, stopY, incY); + return 1; + } + + // We have couple of scenarios: + // either we split workload along 1st loop, or 2nd + auto splitLoop = ThreadsHelper::pickLoop2d(numThreads, itersX, itersY); + + // for debug mode we execute things inplace, without any threads + if (debug) { + for (int e = 0; e < numThreads; e++) { + auto span = Span2::build(splitLoop, e, numThreads, startX, stopX, incX, startY, stopY, incY); + + function(e, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY()); + } + + // but we still mimic multithreaded execution + return numThreads; + } else { + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + + for (int e = 0; e < numThreads; e++) { + auto threadId = numThreads - e - 1; + auto span = Span2::build(splitLoop, threadId, numThreads, startX, stopX, incX, startY, stopY, incY); + + ticket->enqueue(e, numThreads, function, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY()); + } + + // block until all threads finish their job + ticket->waitAndRelease(); + + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, startX, stopX, incX, startY, stopY, incY); + + // we tell that parallelism request declined + return 1; + } + }; + } + + + int Threads::parallel_for(FUNC_3D function, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ, uint64_t numThreads) { + if (startX > stopX) + throw std::runtime_error("Threads::parallel_for got startX > stopX"); + + if (startY > stopY) + throw std::runtime_error("Threads::parallel_for got startY > stopY"); + + if (startZ > stopZ) + throw std::runtime_error("Threads::parallel_for got startZ > stopZ"); + + auto delta_x = stopX - startX; + auto delta_y = stopY - startY; + auto delta_z = stopZ - startZ; + + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + auto itersZ = delta_z / incZ; + + numThreads = 1; //ThreadsHelper::numberOfThreads3d(numThreads, itersX, itersY, itersZ); + if (numThreads == 1) { + // loop is too small - executing function as is + function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + return 1; + } + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + auto splitLoop = ThreadsHelper::pickLoop3d(numThreads, itersX, itersY, itersZ); + + for (int e = 0; e < 
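// Illustrative sketch: a typical call into the 2D frontend above. pickLoop2d() decides
// which of the two loops to split and each thread receives a Span2, so only one of the
// two ranges is narrowed per thread. Assumes FUNC_2D accepts a std::function-compatible lambda.
#include <execution/Threads.h>

static void sketch_parallel_for_2d(float *matrix, int64_t rows, int64_t cols) {
    samediff::Threads::parallel_for(
        [matrix, cols](uint64_t /*thread_id*/,
                       int64_t startX, int64_t stopX, int64_t incX,
                       int64_t startY, int64_t stopY, int64_t incY) {
            for (auto r = startX; r < stopX; r += incX)
                for (auto c = startY; c < stopY; c += incY)
                    matrix[r * cols + c] += 1.0f;
        },
        0, rows, 1,    // outer (x) loop
        0, cols, 1);   // inner (y) loop
}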
numThreads; e++) { + auto thread_id = numThreads - e - 1; + auto span = Span3::build(splitLoop, thread_id, numThreads, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + + ticket->enqueue(e, numThreads, function, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY(), span.startZ(), span.stopZ(), span.incZ()); + } + + // block until we're done + ticket->waitAndRelease(); + + // we tell that parallelism request succeeded + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + + // we tell that parallelism request declined + return 1; + } + + } + + int Threads::parallel_do(FUNC_DO function, uint64_t numThreads) { + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket != nullptr) { + + // submit tasks one by one + for (uint64_t e = 0; e < numThreads - 1; e++) + ticket->enqueue(e, numThreads, function); + + function(numThreads - 1, numThreads); + + ticket->waitAndRelease(); + + return numThreads; + } else { + // if there's no threads available - we'll execute function sequentially one by one + for (uint64_t e = 0; e < numThreads; e++) + function(e, numThreads); + + return numThreads; + } + + + return numThreads; + } + + int64_t Threads::parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment, uint64_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_long got start > stop"); + + auto delta = (stop - start); + if (delta == 0 || numThreads == 1) + return function(0, start, stop, increment); + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it + numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + if (numThreads == 1) + return function(0, start, stop, increment); + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket == nullptr) + return function(0, start, stop, increment); + + // create temporary array + int64_t intermediatery[256]; + auto span = delta / numThreads; + + // execute threads in parallel + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = span * (e + 1) + start; + + if (e == numThreads - 1) + intermediatery[e] = function(e, start_, stop, increment); + else + ticket->enqueue(e, numThreads, &intermediatery[e], function, start_, stop_, increment); + } + + ticket->waitAndRelease(); + + // aggregate results in single thread + for (uint64_t e = 1; e < numThreads; e++) + intermediatery[0] = aggregator(intermediatery[0], intermediatery[e]); + + // return accumulated result + return intermediatery[0]; + } + + double Threads::parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment, uint64_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_long got start > stop"); + + auto delta = (stop - start); + if (delta == 0 || numThreads == 1) + return function(0, start, stop, increment); + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it + numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + if (numThreads == 1) + return function(0, start, stop, increment); + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket == nullptr) + return function(0, start, stop, 
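// Illustrative sketch: parallel_long() above runs a per-thread partial reduction and then
// folds the partials with the aggregator on the calling thread. A sum over [0, n),
// assuming FUNC_RL / FUNC_AL accept std::function-compatible lambdas:
#include <execution/Threads.h>

static int64_t sketch_parallel_sum(const int64_t *values, int64_t n) {
    return samediff::Threads::parallel_long(
        [values](uint64_t /*thread_id*/, int64_t start, int64_t stop, int64_t inc) -> int64_t {
            int64_t local = 0;                    // per-thread partial sum
            for (auto i = start; i < stop; i += inc)
                local += values[i];
            return local;
        },
        [](int64_t a, int64_t b) -> int64_t {     // aggregator folds two partials
            return a + b;
        },
        0, n);
}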
increment); + + // create temporary array + double intermediatery[256]; + auto span = delta / numThreads; + + // execute threads in parallel + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = span * (e + 1) + start; + + if (e == numThreads - 1) + intermediatery[e] = function(e, start_, stop, increment); + else + ticket->enqueue(e, numThreads, &intermediatery[e], function, start_, stop_, increment); + } + + ticket->waitAndRelease(); + + // aggregate results in single thread + for (uint64_t e = 1; e < numThreads; e++) + intermediatery[0] = aggregator(intermediatery[0], intermediatery[e]); + + // return accumulated result + return intermediatery[0]; + } + +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/Ticket.cpp b/libnd4j/include/execution/impl/Ticket.cpp new file mode 100644 index 000000000..5bf911fd0 --- /dev/null +++ b/libnd4j/include/execution/impl/Ticket.cpp @@ -0,0 +1,94 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include +#include + +namespace samediff { + Ticket::Ticket(const std::vector*> &queues) { + _acquired = true; + _queues = queues; + } + + Ticket::Ticket() { + _acquired = true; + _interfaces.resize(nd4j::Environment::getInstance()->maxThreads()); + } + + bool Ticket::acquired() { + return _acquired; + } + + void Ticket::enqueue(int thread_id, samediff::CallableWithArguments *callable) { + _queues[thread_id]->put(callable); + _callables.emplace_back(callable); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_DO func) { + _interfaces[thread_id]->fill(thread_id, num_threads, func); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, lpt, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, double *dpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, dpt, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_2D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y) { + _interfaces[thread_id]->fill(thread_id, num_threads, std::move(func), start_x, stop_x, inc_x, start_y, stop_y, inc_y); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, 
int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z) { + _interfaces[thread_id]->fill(thread_id, num_threads, func, start_x, stop_x, inc_x, start_y, stop_y, inc_y, start_z, stop_z, inc_z); + } + + void Ticket::acquiredThreads(uint32_t threads) { + _acquiredThreads = threads; + } + + void Ticket::waitAndRelease() { + for (uint32_t e = 0; e < this->_acquiredThreads; e++) { + // block until finished + _interfaces[e]->waitForCompletion(); + + // mark available + _interfaces[e]->markAvailable(); + + // increment availability counter + ThreadPool::getInstance()->release(); + } + + // return this ticket back to the pool + ThreadPool::getInstance()->release(this); + } + + + void Ticket::attach(uint32_t thread_id, samediff::CallableInterface *interface) { + _interfaces[thread_id] = interface; + } +} \ No newline at end of file diff --git a/libnd4j/include/graph/Node.h b/libnd4j/include/graph/Node.h index 3eac03e07..b57998e38 100644 --- a/libnd4j/include/graph/Node.h +++ b/libnd4j/include/graph/Node.h @@ -232,6 +232,7 @@ namespace nd4j { } static nd4j::ops::DeclarableOp* buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar); + static void deleteOpByType(OpType opType, void *op); }; } } diff --git a/libnd4j/include/graph/impl/Graph.cpp b/libnd4j/include/graph/impl/Graph.cpp index f4514efdb..2acedcea3 100644 --- a/libnd4j/include/graph/impl/Graph.cpp +++ b/libnd4j/include/graph/impl/Graph.cpp @@ -19,6 +19,7 @@ // #include +#include #include #include #include @@ -154,7 +155,7 @@ namespace nd4j { Nd4jLong *newShape = nullptr; // if that's scalar output - we don't care about previous node - if (node->getDimensions()->size() == 0 || (node->getDimensions()->size() == 1 && node->getDimensions()->at(0) == MAX_INT)) { + if (node->getDimensions()->size() == 0 || (node->getDimensions()->size() == 1 && node->getDimensions()->at(0) == nd4j::DataTypeUtils::max())) { newShape = new Nd4jLong[8]; newShape[0] = 2; diff --git a/libnd4j/include/graph/impl/Node.cpp b/libnd4j/include/graph/impl/Node.cpp index d365ddd6a..795d9b7f0 100644 --- a/libnd4j/include/graph/impl/Node.cpp +++ b/libnd4j/include/graph/impl/Node.cpp @@ -682,8 +682,9 @@ namespace nd4j { if (_protoContext != nullptr) delete _protoContext; - if (_isDeductable && _customOp != nullptr) - delete _customOp; + if (_isDeductable && _customOp != nullptr) { + Node::deleteOpByType(_opType, _customOp); + } } int nd4j::graph::Node::getRewindNode() { @@ -710,6 +711,70 @@ namespace nd4j { return false; } + void nd4j::graph::Node::deleteOpByType(OpType opType, void *op) { + switch (opType) { + case OpType_PAIRWISE: + delete reinterpret_cast(op); + break; + case OpType_PAIRWISE_BOOL: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_STRICT: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_SAME: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_FLOAT: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_BOOL: + delete reinterpret_cast(op); + break; + case OpType_SCALAR: + delete reinterpret_cast(op); + break; + case OpType_SCALAR_BOOL: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_3: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_SAME: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_FLOAT: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_LONG: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_BOOL: + delete reinterpret_cast(op); + break; + case OpType_INDEX_REDUCE: + 
delete reinterpret_cast(op); + break; + case OpType_SUMMARYSTATS: + delete reinterpret_cast(op); + break; + case OpType_RANDOM: + delete reinterpret_cast(op); + break; + case OpType_BROADCAST: + delete reinterpret_cast(op); + break; + case OpType_BROADCAST_BOOL: + delete reinterpret_cast(op); + break; + case OpType_CUSTOM: + delete reinterpret_cast(op); + break; + default: + throw std::runtime_error("Bad opType passed in"); + } + } + nd4j::ops::DeclarableOp* nd4j::graph::Node::buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar) { switch (opType) { case OpType_PAIRWISE: diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h index 392ed3edf..fb1582056 100644 --- a/libnd4j/include/helpers/Loops.h +++ b/libnd4j/include/helpers/Loops.h @@ -31,6 +31,7 @@ #include #include #include +#include namespace nd4j { @@ -40,43 +41,43 @@ namespace nd4j { public: template - static FORCEINLINE void loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, E* extraParams); + static FORCEINLINE void loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, E* extraParams, int64_t start, int64_t stop); }; template class ReductionFloatLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop); }; template class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); }; template class ND4J_EXPORT ReductionLongLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t 
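// Illustrative note on deleteOpByType() above: _customOp is held as a void*, and deleting
// through a void* would never invoke the op's destructor, so each case casts back to the
// concrete nd4j::ops class allocated for that OpType before deleting. Schematically
// (ConcreteLegacyOp is a hypothetical stand-in for the class matching each OpType):
//
//     case OpType_SOMETHING:
//         delete reinterpret_cast<ConcreteLegacyOp *>(op);
//         break;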
stop); }; template class ND4J_EXPORT ReductionSameLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); }; @@ -96,8 +97,8 @@ namespace nd4j { public: - template - static FORCEINLINE void loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, E* extraParams); + template + static FORCEINLINE void loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, E* extraParams, uint64_t threadId, uint64_t numThreads); }; template @@ -105,20 +106,20 @@ namespace nd4j { public: template - static FORCEINLINE void loopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static FORCEINLINE void loopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); template - static FORCEINLINE void loopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static FORCEINLINE void loopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop); - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); - static void wrapperAll(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static void wrapperAll(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static void innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static void innerloopReduce3All(X* x, Nd4jLong* 
xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop); }; @@ -265,7 +266,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, void nd4j::ReductionLoops::loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, - E* extraParams) { + E* extraParams, int64_t start, int64_t stop) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXZ(xShapeInfo, zShapeInfo, tadShapeInfo); @@ -319,263 +320,170 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadLen; ++i0) - start = OpType::update(start, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint 
i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (uint i3 = 0; i3 < tadShape[3]; ++i3) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2] + i3*tadStride[3]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (uint i3 = 0; i3 < tadShape[3]; ++i3) for (uint i4 = 0; i4 < tadShape[4]; ++i4) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2] + i3*tadStride[3] + i4*tadStride[4] ], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::X_EWSNONZERO: { uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - } + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::Z_EWSNONZERO: { uint 
castTadShapeInfo[MAX_RANK]; const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) { auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; - - //*********************************************// - // default: { - // uint castTadShapeInfo[MAX_RANK]; - // uint castZShapeInfo[MAX_RANK]; - // const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - // const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - - // PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - // for (uint i = 0; i < zLen; i++) { - // auto tad = x + tadOffsets[i]; - // auto start = OpType::startingValue(tad); - - // for (uint j = 0; j < tadLen; j++) { - // auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - // start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); - // } - - // auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - // z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - // } - // } + break; //*********************************************// default: { - - Nd4jLong* innertadOffsets = new Nd4jLong[tadLen]; + auto innertadOffsets = new Nd4jLong[tadLen]; shape::calcOffsets(tadShapeInfo, innertadOffsets); uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - } + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; - delete []innertadOffsets; + delete[] innertadOffsets; } - - //*********************************************// - // default: { - - // Nd4jLong* innertadOffsets = new Nd4jLong[tadLen]; - // shape::calcOffsets(tadShapeInfo, innertadOffsets); - - // const int zRankMinusOne = shape::rank(zShapeInfo) - 1; - - // Nd4jLong* offsetPerDimZ = new Nd4jLong[zRankMinusOne]; - // int* idxZ = new int[zRankMinusOne]; - - // memset(idxZ, 0, sizeof(Nd4jLong) * zRankMinusOne); - - // const Nd4jLong* shapeZ = shape::shapeOf(zShapeInfo); - // const Nd4jLong* strideZ = shape::stride(zShapeInfo); - - // PRAGMA_OMP_SIMD - // for (int k = 0; k < zRankMinusOne; ++k) - // offsetPerDimZ[k] = (shapeZ[k] - 1) * strideZ[k]; - - // int dimZ = zRankMinusOne, lZ = 1; - // Nd4jLong initZ = 0, zOffset = 0, e = 1; - - // // first iteration - // auto tad = x + tadOffsets[0]; - // auto start = 
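//*********************************************//
// A minimal illustrative sketch, not taken from this patch: after this change the
// reduction loops only process the half-open TAD range [start, stop) they are handed;
// the splitting itself is expected to go through the samediff::Threads API added in this
// PR, mirroring the driver pattern used for the index-reduction loops further below.
// The function name sumPerTad is hypothetical and the include path is an assumption.

// #include <execution/Threads.h>   // assumed location of samediff::Threads / PRAGMA_THREADS_FOR

static void sumPerTad(const float* x, const Nd4jLong* tadOffsets, Nd4jLong tadLen,
                      float* z, Nd4jLong numTads) {

    // the functor receives its own [start, stop) range plus an increment
    auto func = PRAGMA_THREADS_FOR {
        for (auto i = start; i < stop; i += increment) {
            auto tad = x + tadOffsets[i];
            float s = 0.f;

            for (Nd4jLong j = 0; j < tadLen; j++)
                s += tad[j];

            z[i] = s;
        }
    };

    // parallel_tad picks the number of workers and hands each one a sub-range of TADs
    samediff::Threads::parallel_tad(func, 0, numTads);
}
//*********************************************//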
OpType::startingValue(tad); - // for (uint j = 0; j < tadLen; j++) - // start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - // z[0] = OpType::postProcess(start, OpType::startingValue(x), extraParams); - - // // rest iterations - // while (dimZ >= 0) { - - // if(shapeZ[dimZ] == 1) { --dimZ; continue; } // ignore dimensions equal to unity - // if(dimZ == zRankMinusOne) { // last dimension - // if(lZ < shapeZ[dimZ]) { zOffset += strideZ[dimZ]; ++lZ;} - // else { lZ = 1; --dimZ; continue; } - // } - // else if(idxZ[dimZ] < shapeZ[dimZ] - 1) { initZ += strideZ[dimZ]; zOffset = initZ; ++idxZ[dimZ]; dimZ = zRankMinusOne; } - // else { initZ -= offsetPerDimZ[dimZ]; idxZ[dimZ--] = 0; continue;} - - // start = OpType::startingValue(tad); - // tad = x + tadOffsets[e++]; - - // for (uint j = 0; j < tadLen; j++) - // start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - - // z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - // } - - // delete []innertadOffsets; - // } } } @@ -583,10 +491,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, ////////////////////////////////////////////////////////////////////////////// template - template + template void nd4j::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, - E* extraParams) { + E* extraParams, uint64_t threadId, uint64_t numThreads) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); @@ -596,265 +504,176 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const Nd4jLong len = shape::length(xShapeInfo); - OmpLaunchHelper threadsInfo(len, doParallel ? -1 : 1); + if (len == 0) + return; switch (kindOfLoop) { //*********************************************// case LoopKind::EWS1: { + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - - const auto xi = x + threadOffset; - const auto zi = z + threadOffset; - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i] = OpType::op(xi[i], extraParams); + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], extraParams); } - } break; //*********************************************// case LoopKind::EWSNONZERO: { - const uint xEws = shape::elementWiseStride(xShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); + const uint xEws = shape::elementWiseStride(xShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); - const auto xi = x + threadOffset * xEws; - auto zi = z + threadOffset * zEws; - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], extraParams); + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], extraParams); } - } break; //*********************************************// case 
LoopKind::Z_EWSNONZERO: { - const uint zEws = shape::elementWiseStride(zShapeInfo); - uint castXShapeInfo[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); + uint castXShapeInfo[MAX_RANK]; + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - - auto zi = z + threadOffset * zEws; + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); if (zEws > 1) { - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); - zi[i * zEws] = OpType::op(x[xOffset], extraParams); + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i * zEws] = OpType::op(x[xOffset], extraParams); } } else { - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); - zi[i] = OpType::op(x[xOffset], extraParams); + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i] = OpType::op(x[xOffset], extraParams); } } } - } break; //*********************************************// case LoopKind::RANK1: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(threadsInfo._numThreads) - for (uint i0 = 0; i0 < len; ++i0) - z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); - } + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); + } break; //*********************************************// case LoopKind::RANK2: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); - //PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(threadsInfo._numThreads) - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i0 = 0; i0 < uXShape0; ++i0) { + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); - auto z0 = i0 * zStride[0]; - auto x0 = i0 * xStride[0]; - for (uint i1 = 0; i1 < uXShape1; ++i1) - z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) { + auto z0 = i0 * zStride[0]; + auto x0 = i0 * xStride[0]; + + for (uint i1 = span.startY(); i1 < span.stopY(); ++i1) + z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); + } } - } break; //*********************************************// case LoopKind::RANK3: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 2) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < 
uXShape1; ++i1) { + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); - auto z0 = i0 * zStride[0] + i1 * zStride[1]; - auto x0 = i0 * xStride[0] + i1 * xStride[1]; - for (uint i2 = 0; i2 < uXShape2; ++i2) - z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); - } - } + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) { + auto z0 = i0 * zStride[0] + i1 * zStride[1]; + auto x0 = i0 * xStride[0] + i1 * xStride[1]; + + for (uint i2 = 0; i2 < uXShape2; ++i2) + z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); + } + } break; //*********************************************// case LoopKind::RANK4: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); + auto uXShape3 = static_cast(xShape[3]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 2) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < uXShape1; ++i1) - for (uint i2 = 0; i2 < uXShape2; ++i2) { + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; + auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - for (uint i3 = 0; i3 < uXShape3; ++i3) - z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); - } - } + for (uint i3 = 0; i3 < uXShape3; ++i3) + z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); + } + } break; //*********************************************// case LoopKind::RANK5: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); - auto uXShape4 = static_cast(xShape[4]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); + auto uXShape3 = static_cast(xShape[3]); + auto uXShape4 = static_cast(xShape[4]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 3) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < uXShape1; ++i1) - for (uint i2 = 0; i2 < uXShape2; ++i2) { + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - for (uint i3 = 0; i3 < uXShape3; ++i3) { + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto z0 = i0 * zStride[0] 
+ i1 * zStride[1] + i2 * zStride[2]; + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - auto z1 = z0 + i3 * zStride[3]; - auto x1 = x0 + i3 * xStride[3]; + for (uint i3 = 0; i3 < uXShape3; ++i3) { - for (uint i4 = 0; i4 < uXShape4; ++i4) - z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); + auto z1 = z0 + i3 * zStride[3]; + auto x1 = x0 + i3 * xStride[3]; + for (uint i4 = 0; i4 < uXShape4; ++i4) + z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); + + } } - } - } + + } break; //*********************************************// default: { - uint xShapeInfoCast[MAX_RANK]; - uint zShapeInfoCast[MAX_RANK]; + uint xShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; - bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = threadsInfo.getThreadOffset(threadNum); - auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (auto i = span.startX(); i < span.stopX(); i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], extraParams); } } - } - // default: { - - // const int xRankMinusOne = shape::rank(xShapeInfo) - 1; - // const int zRankMinusOne = shape::rank(zShapeInfo) - 1; - - // printf("%i %i \n", xRankMinusOne, zRankMinusOne); - - // uint* xIdx = new uint[xRankMinusOne + 1]; - // uint* zIdx = new uint[zRankMinusOne + 1]; - - // Nd4jLong* xOffsetPerDim = new Nd4jLong[xRankMinusOne]; - // Nd4jLong* zOffsetPerDim = new Nd4jLong[zRankMinusOne]; - - // memset(xIdx, 0, sizeof(uint) * xRankMinusOne); - // memset(zIdx, 0, sizeof(uint) * zRankMinusOne); - - // xIdx[xRankMinusOne] = zIdx[zRankMinusOne] = 1; - - // const Nd4jLong* xShape = shape::shapeOf(xShapeInfo); - // const Nd4jLong* zShape = shape::shapeOf(zShapeInfo); - // const Nd4jLong* xStride = shape::stride(xShapeInfo); - // const Nd4jLong* zStride = shape::stride(zShapeInfo); - - // PRAGMA_OMP_SIMD - // for (int k = 0; k < xRankMinusOne; ++k) - // xOffsetPerDim[k] = (xShape[k] - 1) * xStride[k]; - // PRAGMA_OMP_SIMD - // for (int k = 0; k < zRankMinusOne; ++k) - // zOffsetPerDim[k] = (zShape[k] - 1) * zStride[k]; - - // Nd4jLong xInit = 0, zInit = 0, xOffset = 0, zOffset = 0; - // int jX = xRankMinusOne, jZ = zRankMinusOne; - - // // first iteration - // z[0] = OpType::op(x[0], extraParams); - - // // rest iterations - // for (uint i = 1; i < len; i++) { - - // while(true) { - // if(xShape[jX] == 1) { --jX; continue; } - // if(jX == xRankMinusOne) { - // if(xIdx[jX] < xShape[jX]) { xOffset += xStride[jX]; ++xIdx[jX]; break; } - // else { xIdx[jX] = 1; --jX; continue; } - // } - // else if(xIdx[jX] < xShape[jX] - 1) { xInit += xStride[jX]; xOffset = xInit; ++xIdx[jX]; jX = xRankMinusOne; break; } - // else { xInit -= xOffsetPerDim[jX]; 
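//*********************************************//
// A minimal illustrative sketch, not taken from this patch: loopTransform now receives an
// explicit (threadId, numThreads) pair and carves out its own slice of the iteration space
// with samediff::Span / Span2, as in the EWS1 and RANK2 branches above, instead of relying
// on OpenMP scheduling. The helper names copyEws1 and copy2d are hypothetical, and the 2-D
// variant assumes a dense c-order buffer.

template <typename T>
static void copyEws1(const T* x, T* z, Nd4jLong len, uint64_t threadId, uint64_t numThreads) {
    // Span::build splits [0, len) with stride 1 into numThreads contiguous chunks
    auto span = samediff::Span::build(threadId, numThreads, 0, len, 1);

    for (auto i = span.startX(); i < span.stopX(); i++)
        z[i] = x[i];
}

template <typename T>
static void copy2d(const T* x, T* z, uint rows, uint cols, uint64_t threadId, uint64_t numThreads) {
    // pickLoop2d decides whether the outer or the inner dimension should be split,
    // Span2 then yields this thread's [startX, stopX) x [startY, stopY) tile
    auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, rows, cols);
    auto span = samediff::Span2::build(loop, threadId, numThreads, 0, rows, 1, 0, cols, 1);

    for (auto i0 = span.startX(); i0 < span.stopX(); i0++)
        for (auto i1 = span.startY(); i1 < span.stopY(); i1++)
            z[i0 * cols + i1] = x[i0 * cols + i1];
}
//*********************************************//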
xIdx[jX--] = 0; continue; } - // } - - // while(true) { - // if(zShape[jZ] == 1) { --jZ; continue; } - // if(jZ == zRankMinusOne) { - // if(zIdx[jZ] < zShape[jZ]) { zOffset += zStride[jZ]; ++zIdx[jZ]; break; } - // else { zIdx[jZ] = 1; --jZ; continue; } - // } - // else if(zIdx[jZ] < zShape[jZ] - 1) { zInit += zStride[jZ]; zOffset = zInit; ++zIdx[jZ]; jZ = zRankMinusOne; break; } - // else { zInit -= zOffsetPerDim[jZ]; zIdx[jZ--] = 0; continue; } - // } - // z[zOffset] = OpType::op(x[xOffset], extraParams); - // } - - // delete []xIdx; - // delete []zIdx; - // delete []xOffsetPerDim; - // delete []zOffsetPerDim; - // } } } @@ -866,12 +685,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, - Z* extraParameters) { + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ Z param0(OpType::startingValue(x)), param1(OpType::startingValue(x)), param2(extraParameters ? extraParameters[0] : OpType::startingValue(x)); - Z extraParams[3] = {param0, param1, param2}; const Nd4jLong xLen = shape::length(xShapeInfo); const Nd4jLong yLen = shape::length(yShapeInfo); @@ -921,139 +739,128 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadLen; ++i0) { const auto xTadOffset = i0 * xTadStride[0]; const auto yTadOffset = i0 * yTadStride[0]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (uint i2 = 0; i2 < tadShape[2]; ++i2) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1061,29 +868,27 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i3 = 0; i3 < tadShape[3]; ++i3) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1092,68 +897,62 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i4 = 0; i4 < tadShape[4]; ++i4) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// default: { - uint castXTadShapeInfo[MAX_RANK]; const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } else { - uint castYTadShapeInfo[MAX_RANK]; const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } } } @@ -1167,12 +966,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, - Z* extraParameters) { + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ Z param0(OpType::startingValue(x)), param1(OpType::startingValue(x)), param2(extraParameters ? extraParameters[0] : OpType::startingValue(x)); - Z extraParams[3] = {param0, param1, param2}; const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXYZ(xTadShapeInfo, yTadShapeInfo, zShapeInfo); @@ -1195,159 +993,146 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads*numYTads); switch (kindOfLoop) { - //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[zInd] = OpType::postProcess(start, tadLen, extraParams); + z[zInd] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + s = OpType::update(s, 
OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadLen; ++i0) { const auto xTadOffset = i0 * xTadStride[0]; const auto yTadOffset = i0 * yTadStride[0]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * 
numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (uint i2 = 0; i2 < tadShape[2]; ++i2) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1355,32 +1140,30 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i3 = 0; i3 < tadShape[3]; ++i3) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1389,7 +1172,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i4 = 0; i4 < tadShape[4]; ++i4) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - start = OpType::update(start, 
OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } @@ -1397,66 +1180,61 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, } z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// default: { - uint castXTadShapeInfo[MAX_RANK]; const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) { const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } else { - uint castYTadShapeInfo[MAX_RANK]; const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) { const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } } } diff --git a/libnd4j/include/helpers/TAD.h b/libnd4j/include/helpers/TAD.h index 9888bb1fd..fb52e639c 100644 --- a/libnd4j/include/helpers/TAD.h +++ b/libnd4j/include/helpers/TAD.h @@ -721,7 +721,7 @@ namespace shape { INLINEDEF void TAD::createOffsets() { this->tadOffsets = new Nd4jLong[this->numTads]; uint nT = this->numTads; - PRAGMA_OMP_PARALLEL_FOR_SIMD + for(uint i = 0; i 
< nT; i++) this->tadOffsets[i] = this->tadOffset(i); } diff --git a/libnd4j/include/helpers/benchmark/MatrixBenchmark.h b/libnd4j/include/helpers/benchmark/MatrixBenchmark.h index fe64b364f..7c1330648 100644 --- a/libnd4j/include/helpers/benchmark/MatrixBenchmark.h +++ b/libnd4j/include/helpers/benchmark/MatrixBenchmark.h @@ -19,7 +19,6 @@ // #include "../OpBenchmark.h" -#include #include #ifndef DEV_TESTS_MATRIXBENCHMARK_H diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp index fbf2fbc20..fca40d564 100644 --- a/libnd4j/include/helpers/cpu/MmulHelper.cpp +++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -74,26 +75,28 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c // } // } - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2)) - for(uint row = 0; row < M; ++row) { - for(uint col = 0; col < N; ++col) { - - T3* c = flagC ? (C + row + col * ldc) : (C + row * ldc + col); - T3 val = 0; + auto func = PRAGMA_THREADS_FOR_2D { ; + for (auto row = start_x; row < stop_x; row += inc_x) { + for (auto col = start_y; col < stop_y; col += inc_y) { + T3 *c = flagC ? (C + row + col * ldc) : (C + row * ldc + col); + T3 val = 0; - PRAGMA_OMP_SIMD - for(uint i = 0; i < K; ++i) { - T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda); - T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i); - val += alphaZ * a * b; + PRAGMA_OMP_SIMD + for (uint i = 0; i < K; ++i) { + T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda); + T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i); + val += alphaZ * a * b; + } + + if (betaZ) + *c = val + betaZ * *c; + else + *c = val; } - - if(betaZ) - *c = val + betaZ * *c; - else - *c = val; - } - } + } + }; + + samediff::Threads::parallel_for(func, 0, M, 1, 0, N, 1); } ////////////////////////////////////////////////////////////////////////////// @@ -108,24 +111,27 @@ static void usualGemv(const char aOrder, const int M, const int N, const double const bool flagA = aOrder == 'f'; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) - for(int row = 0; row < M; ++row) { - - T3* y = Y + row * incy; - T3 val = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto row = start; row < stop; row += increment) { - PRAGMA_OMP_SIMD - for(int i = 0; i < N; ++i) { - T3 a = flagA ? *(A + row + i * lda) : *(A + row * lda + i); - T3 x = *(X + i * incx); - val += alphaZ * a * x; + T3 *y = Y + row * incy; + T3 val = 0; + + PRAGMA_OMP_SIMD + for (int i = 0; i < N; ++i) { + T3 a = flagA ? 
*(A + row + i * lda) : *(A + row * lda + i); + T3 x = *(X + i * incx); + val += alphaZ * a * x; + } + + if (betaZ) + *y = val + betaZ * *y; + else + *y = val; } - - if(betaZ) - *y = val + betaZ * *y; - else - *y = val; - } + }; + + samediff::Threads::parallel_for(func, 0, M); } ////////////////////////////////////////////////////////////////////////////// @@ -141,7 +147,7 @@ static void usualDot(const Nd4jLong length, const double alpha, const void* vX, T3 sum = 0; PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(OMP_SUMT:sum)) for(int i = 0; i < length; ++i) - sum = sum + X[i * incx] * Y[i * incy]; + sum += X[i * incx] * Y[i * incy]; *Z = alphaZ * sum + betaZ * *Z; } diff --git a/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp b/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp index 5f8789077..c4c2fa995 100644 --- a/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp +++ b/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp @@ -19,6 +19,7 @@ // #include +#include using namespace simdOps; diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp index 22ff3e6b1..4bd456da2 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp @@ -44,62 +44,67 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const Nd4jLong* tadShape = shape::shapeOf(const_cast(tadShapeInfo)); const Nd4jLong* tadStride = shape::stride(const_cast(tadShapeInfo)); - int tadsPerThread = zLen / TAD_THRESHOLD; - int numThreads = nd4j::math::nd4j_max(1, tadsPerThread); - numThreads = nd4j::math::nd4j_min(numThreads, omp_get_max_threads()); - switch (kindOfLoop) { //*********************************************// case nd4j::LoopKind::EWS1: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i] = (Z) indexValue.index; } + }; - z[i] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; //*********************************************// case nd4j::LoopKind::EWSNONZERO: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j * tadEws], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j * tadEws], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i * zEws] = (Z) indexValue.index; } + }; - z[i * zEws] = (Z) 
indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; //*********************************************// case nd4j::LoopKind::RANK1: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadLen; ++i0) { - functions::indexreduce::IndexValue comp(tad[i0 * tadStride[0]], i0); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadLen; ++i0) { + functions::indexreduce::IndexValue comp(tad[i0 * tadStride[0]], i0); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i] = (Z) indexValue.index; } + }; - z[i] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -108,22 +113,25 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[2]; shape::updateStrides(2, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1]; - const auto tadIndex = i0 * newStride[0] + i1; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1]; + const auto tadIndex = i0 * newStride[0] + i1; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -132,24 +140,27 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[3]; shape::updateStrides(3, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]; + const auto 
tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -158,26 +169,29 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[4]; shape::updateStrides(4, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + for (uint i3 = 0; i3 < tadShape[3]; ++i3) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]; + const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -186,28 +200,31 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[5]; shape::updateStrides(5, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - for (uint i4 = 0; i4 < tadShape[4]; ++i4) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3 * newStride[3] + i4; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + for (uint i3 = 0; i3 < tadShape[3]; ++i3) { + for (uint i4 = 0; i4 < tadShape[4]; ++i4) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]; + const auto tadIndex = i0 * newStride[0] 
+ i1 * newStride[1] + i2 * newStride[2] + i3 * newStride[3] + i4; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -216,19 +233,22 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j * tadEws], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j * tadEws], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = (Z) indexValue.index; } + }; - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -237,19 +257,22 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, uint castTadShapeInfo[MAX_RANK]; const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - functions::indexreduce::IndexValue comp(tad[tadOffset], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); + functions::indexreduce::IndexValue comp(tad[tadOffset], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i * zEws] = (Z) indexValue.index; } + }; - z[i * zEws] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -260,20 +283,23 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - 
functions::indexreduce::IndexValue comp(tad[tadOffset], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); + functions::indexreduce::IndexValue comp(tad[tadOffset], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = (Z) indexValue.index; } + }; - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } } } diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp index 895afccfd..b8405553e 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp @@ -28,24 +28,32 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); +#endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); +#endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); +#endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, 
PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_0); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp index d8c24e096..44ccea08c 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp @@ -28,24 +28,32 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); +#endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); +#endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); +#endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, 
Y* extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_1); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp index 4ecc0e370..ec261a7ea 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp @@ -28,24 +28,32 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); +#endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); +#endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); +#endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* 
xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_2); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp index 218c335ca..3b1efadc9 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp @@ -28,24 +28,32 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); +#endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); +#endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); +#endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y 
*z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_3); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp index 4a223a0f2..0709e5f3c 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp @@ -19,3 +19,4 @@ // #include +#include diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp index 35ae99afb..151bc6a82 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp @@ -26,16 +26,20 @@ namespace nd4j { template template - void ReductionBoolLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionBoolLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionBoolLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, - X *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams), REDUCE_BOOL_OPS); + X *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_BOOL_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionBoolLoops, , LIBND4J_TYPES, BOOL_TYPES); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp index c7b1f6ff8..af8b0b451 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp @@ -28,16 +28,19 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* 
z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { - - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_0); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp index 76c1141bf..137ffc011 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp @@ -28,16 +28,19 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { - - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_1); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp index 7288816ad..79b11b419 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp @@ -28,16 +28,19 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, 
extraParams, start, stop); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { - - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_2); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp index 251624076..ddedd6c18 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp @@ -28,16 +28,19 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { - - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_3); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp index a6dd992c6..2e7708497 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp @@ -33,16 +33,19 @@ namespace nd4j { template template - void ReductionLongLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z *z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionLongLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z *z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionLongLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, X *extraParams) { - - 
DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_LONG_OPS); + Nd4jLong *tadOffsets, X *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_LONG_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionLongLoops, , LIBND4J_TYPES, LONG_TYPES); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp index 623d97e79..08a67ec59 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp @@ -26,20 +26,24 @@ namespace nd4j { template template - void ReductionSameLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionSameLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionSameLoops::wrapper(const int opNum, X *vx, Nd4jLong *xShapeInfo, X *vz, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, - X *vextraParams) { + X *vextraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams), REDUCE_SAME_OPS); + DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_SAME_OPS); +#endif } BUILD_SINGLE_TEMPLATE(template class ReductionSameLoops, , LIBND4J_TYPES); diff --git a/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu b/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu index 152e74652..8f67f0004 100644 --- a/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu +++ b/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu @@ -24,6 +24,7 @@ #include #include #include +#include // #include // #include diff --git a/libnd4j/include/helpers/impl/AttentionHelper.cpp b/libnd4j/include/helpers/impl/AttentionHelper.cpp index 4e7393a8e..3cfee1c08 100644 --- a/libnd4j/include/helpers/impl/AttentionHelper.cpp +++ b/libnd4j/include/helpers/impl/AttentionHelper.cpp @@ -34,16 +34,16 @@ namespace nd4j { auto numHeads = projectionMatrix->sizeAt(0); auto projectedSize = projectionMatrix->sizeAt(1); - auto inputPerm = input->permute({1, 0, 2}); - auto inputPrep = inputPerm.reshape('c', {input->sizeAt(1), (miniBatchSize * seqLength)}); - auto projectionPrep = projectionMatrix->reshape('c', {numHeads * projectionMatrix->sizeAt(1), projectionMatrix->sizeAt(2)}); + auto inputPerm = input->permute({1, 0, 2}); //[batch, nIn, timeSteps] -> [nIn, batch, timeSteps] + auto inputPrep = inputPerm.reshape('c', {input->sizeAt(1), (miniBatchSize * seqLength)}); //[nIn, batch*timeSteps] + auto projectionPrep = projectionMatrix->reshape('c', {numHeads * projectionMatrix->sizeAt(1), projectionMatrix->sizeAt(2)}); //[nHeads, hS, nIn] -> 
[nHeads*hS, nIn] - NDArray projected('c', {numHeads * projectionMatrix->sizeAt(1), (miniBatchSize * seqLength)}, input->dataType(), context); + NDArray projected('c', {numHeads * projectionMatrix->sizeAt(1), (miniBatchSize * seqLength)}, input->dataType(), context); //[nHeads*hS, batch*timeSteps] nd4j::ops::matmul mmul; mmul.execute({&projectionPrep, &inputPrep}, {&projected}, {}, {}, {}); projected.reshapei({numHeads, projectedSize, miniBatchSize, seqLength}); - projected.permutei({2, 0, 1, 3}); + projected.permutei({2, 0, 1, 3}); //[minibatch, numHeads, projectedSize, seqLength] return projected; } diff --git a/libnd4j/include/helpers/impl/BlasHelper.cpp b/libnd4j/include/helpers/impl/BlasHelper.cpp index 61b542697..bf52fe2c6 100644 --- a/libnd4j/include/helpers/impl/BlasHelper.cpp +++ b/libnd4j/include/helpers/impl/BlasHelper.cpp @@ -74,7 +74,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMV() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemv; @@ -83,7 +83,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMV() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemv; @@ -132,14 +132,14 @@ namespace nd4j { bool BlasHelper::hasGEMV(const nd4j::DataType dtype) { if(dtype == DataType::FLOAT32) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemv; #endif } if(dtype == DataType::DOUBLE) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemv; @@ -150,7 +150,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMM() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemm; @@ -159,7 +159,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMM() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemm; @@ -208,14 +208,14 @@ namespace nd4j { bool BlasHelper:: hasGEMM(const nd4j::DataType dtype) { if(dtype == DataType::FLOAT32) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemm; #endif } if(dtype == DataType::DOUBLE) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemm; @@ -276,14 +276,14 @@ namespace nd4j { } CblasSgemv BlasHelper::sgemv() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__)|| defined(HAVE_OPENBLAS) return (CblasSgemv)&cblas_sgemv; #else return this->cblasSgemv; #endif } CblasDgemv BlasHelper::dgemv() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasDgemv)&cblas_dgemv; #else return this->cblasDgemv; @@ -291,7 +291,7 @@ namespace nd4j { } CblasSgemm BlasHelper::sgemm() { -#if defined(__EXTERNAL_BLAS__) || 
defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasSgemm)&cblas_sgemm; #else return this->cblasSgemm; @@ -299,7 +299,7 @@ namespace nd4j { } CblasDgemm BlasHelper::dgemm() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasDgemm)&cblas_dgemm; #else return this->cblasDgemm; diff --git a/libnd4j/include/helpers/impl/DebugHelper.cpp b/libnd4j/include/helpers/impl/DebugHelper.cpp index f1ba8a755..704c463e6 100644 --- a/libnd4j/include/helpers/impl/DebugHelper.cpp +++ b/libnd4j/include/helpers/impl/DebugHelper.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { DebugInfo DebugHelper::debugStatistics(NDArray const* input) { @@ -88,11 +89,18 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) reduction(+:_nanCount,_infCount,_m } *info = {_minValue, _maxValue, _meanValue / input->lengthOf(), _stdDevValue, _zeroCount, _positiveCount, _negativeCount, _infCount, _nanCount}; _stdDevValue = 0; //math::nd4j_sqrt(info->_stdDevValue / (input->lengthOf() - 1)); -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule (static) reduction(+:_stdDevValue)) - for (Nd4jLong e = 0; e < input->lengthOf(); e++) { - double current = input->e(e); - _stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue; - } + + auto func = PRAGMA_REDUCE_DOUBLE { + auto _stdDevValue = 0.0; + for (auto e = start; e < stop; e++) { + double current = input->e(e); + _stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue; + } + + return _stdDevValue; + }; + _stdDevValue = samediff::Threads::parallel_double(func, LAMBDA_AD { return _old + _new; }, 0, input->lengthOf()); + info->_stdDevValue = math::nd4j_sqrt(_stdDevValue / input->lengthOf()); } diff --git a/libnd4j/include/helpers/impl/GradCheck.cpp b/libnd4j/include/helpers/impl/GradCheck.cpp index a3ae7d1ac..8b24e5f16 100644 --- a/libnd4j/include/helpers/impl/GradCheck.cpp +++ b/libnd4j/include/helpers/impl/GradCheck.cpp @@ -33,13 +33,11 @@ void GradCheck::fillGradArrays(const LossFunc loss, const std::vector& switch(loss) { case MEAN: - PRAGMA_OMP_PARALLEL_FOR_IF(numInGradArrs > 1) for(int i = 0; i < numInGradArrs; ++i) *gradArrs[i] = 1. 
/ gradArrs[i]->lengthOf(); break; case SUM: - PRAGMA_OMP_PARALLEL_FOR_IF(numInGradArrs > 1) for(int i = 0; i < numInGradArrs; ++i) *gradArrs[i] = 1.; break; diff --git a/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp b/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp index a4b9c4000..80e456e29 100644 --- a/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp +++ b/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp @@ -45,7 +45,7 @@ OmpLaunchHelper::OmpLaunchHelper(const Nd4jLong N, float desiredNumThreads) { else desiredNumThreads = nd4j::math::nd4j_min(omp_get_max_threads(), desiredNumThreads); #else - desiredNumThreads = 1; + desiredNumThreads = nd4j::Environment::getInstance()->maxThreads(); #endif _numThreads = nd4j::math::nd4j_min(N / maxItersPerThread, desiredNumThreads); } @@ -75,7 +75,7 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { #ifdef _OPENMP return betterThreads(N, omp_get_max_threads()); #else - return 1; + return betterThreads(N, nd4j::Environment::getInstance()->maxThreads());; #endif } @@ -92,7 +92,7 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { #ifdef _OPENMP auto maxThreads = omp_get_max_threads(); #else - auto maxThreads = 1; + auto maxThreads = nd4j::Environment::getInstance()->maxThreads(); #endif // if there's only 1 thread allowed - nothing to do here diff --git a/libnd4j/include/loops/aggregates.h b/libnd4j/include/loops/aggregates.h deleted file mode 100644 index 8fbdefcaf..000000000 --- a/libnd4j/include/loops/aggregates.h +++ /dev/null @@ -1,66 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// - -#ifndef LIBND4J_AGGREGATES_H -#define LIBND4J_AGGREGATES_H - -#include -#include -#include - -namespace functions { -namespace aggregate { - - template - class AggregatedFunction { - - public: -#ifdef __CUDACC__ - template - __device__ static void execCuda(X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments); - - __device__ static void execCuda(int opNum, X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments); - - __device__ static void aggregateBatch(int numAggregates, int opNum, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments); - - __host__ static void aggregateBatchKernelGeneric(dim3& launchDims, cudaStream_t *stream, int opNum, int numAggregates, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments); - - __host__ static void aggregateKernelGeneric(dim3& launchDims, cudaStream_t *stream, int opNum, void **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, void *realArguments, int numRealArguments); - -#endif - - template - inline static void exec(X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments) { - OpClass::executeAggregate(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); - } - - inline static void exec(int opNum, X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), AGGREGATE_OPS); - } - }; -} -} - -#ifdef __CUDACC__ - - -#endif - -#endif //LIBND4J_AGGREGATES_H diff --git a/libnd4j/include/loops/broadcasting.h b/libnd4j/include/loops/broadcasting.h index cc0331549..a38e79c3f 100755 --- a/libnd4j/include/loops/broadcasting.h +++ b/libnd4j/include/loops/broadcasting.h @@ -91,7 +91,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void execInverse(int opNum, void *x, @@ -105,7 +105,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void exec(int opNum, void *x, @@ -119,7 +121,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, 
Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * CPU execution @@ -144,7 +148,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -158,7 +164,10 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/broadcasting_bool.h b/libnd4j/include/loops/broadcasting_bool.h index a3098abbb..3b0958be1 100644 --- a/libnd4j/include/loops/broadcasting_bool.h +++ b/libnd4j/include/loops/broadcasting_bool.h @@ -89,7 +89,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void exec(int opNum, void *x, @@ -103,7 +103,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void execInverse(int opNum, void *x, @@ -117,7 +119,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * CPU execution @@ -142,7 +146,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -156,7 +162,10 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/broadcasting_int.h b/libnd4j/include/loops/broadcasting_int.h index 84bc0f949..92e4ca7dd 100644 --- a/libnd4j/include/loops/broadcasting_int.h +++ b/libnd4j/include/loops/broadcasting_int.h @@ -89,7 +89,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void exec(int opNum, void *x, @@ -103,7 +103,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void execInverse(int opNum, void *x, @@ -117,7 +119,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * CPU execution @@ -142,7 +146,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -156,7 +162,10 @@ namespace functions { Nd4jLong 
*tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 3bd619827..37dbf833f 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TTT(execInverse, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_OPS); + zTadOffset, start, stop), BROADCAST_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_OPS); + zTadOffset, start, stop), BROADCAST_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -131,10 +138,6 @@ namespace functions { auto lenZ = shape::length(zTadShapeInfo); auto lenY = shape::length(yShapeInfo); - int tadsPerThread = tads / TAD_THRESHOLD; - int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); - auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); @@ -142,19 +145,17 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + for (auto i = start; i < stop; i++) { + auto oX = x + tadOffsets[i]; + auto oZ = z + zTadOffset[i]; - PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) - oZ[f] = OpType::op(oX[f], y[f]); + PRAGMA_OMP_SIMD + for (unsigned int f = 0; f < tadLength; f++) + oZ[f] = OpType::op(oX[f], y[f]); } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO){ - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -164,13 +165,10 @@ namespace functions { } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -182,70 +180,61 @@ namespace functions { } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint 
tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -253,17 +242,15 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], 
y[yOffset]); } } @@ -285,7 +272,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -319,7 +308,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -328,8 +317,7 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if(kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (unsigned int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -339,24 +327,20 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(x[f * xEws], oY[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oY = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -365,73 +349,63 @@ namespace functions { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < 
tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; @@ -439,20 +413,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } - } + }; } } } diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.cpp b/libnd4j/include/loops/cpu/broadcasting_bool.cpp index bca423e3e..7a3eb1e31 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_BOOL_OPS); + zTadOffset, start, stop), BROADCAST_BOOL_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TT(execInverse, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_BOOL_OPS); + zTadOffset, start, stop), BROADCAST_BOOL_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong 
*xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -133,7 +140,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); @@ -142,10 +149,9 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) @@ -153,101 +159,86 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; 
i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } - } + }; + } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -255,20 +246,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } - } + }; } } @@ -286,7 +275,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -320,7 +311,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -329,8 +320,7 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -340,8 +330,7 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - 
PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -355,14 +344,10 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); @@ -377,15 +362,13 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } @@ -398,15 +381,13 @@ namespace functions { bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } @@ -419,16 +400,14 @@ namespace functions { bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -442,9 +421,7 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; diff --git a/libnd4j/include/loops/cpu/broadcasting_int.cpp 
b/libnd4j/include/loops/cpu/broadcasting_int.cpp index 375d7577a..9dcce7545 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_INT_OPS); + zTadOffset, start, stop), BROADCAST_INT_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_T(execInverse, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_INT_OPS); + zTadOffset, start, stop), BROADCAST_INT_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -133,7 +140,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); @@ -142,112 +149,95 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], y[f]); - } + }; } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint 
tadShapeInfoZCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -255,20 +245,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], 
y[yOffset]); } - } + }; } } @@ -286,7 +274,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -320,7 +310,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -329,46 +319,39 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(x[f], oY[f]); - } + }; } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (uint f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(x[f * xEws], oY[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo)) { @@ -377,64 +360,54 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; - PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - 
+ for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } - } + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; @@ -442,9 +415,7 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; @@ -455,7 +426,7 @@ namespace functions { auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } - } + }; } } diff --git a/libnd4j/include/loops/cpu/indexreduce.cpp b/libnd4j/include/loops/cpu/indexreduce.cpp index 23286ecd9..df3fd64a9 100644 --- a/libnd4j/include/loops/cpu/indexreduce.cpp +++ b/libnd4j/include/loops/cpu/indexreduce.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "../legacy_ops.h" using namespace simdOps; @@ -44,8 +45,7 @@ void IndexReduce::exec(const int opNum, void *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - -DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS); } //////////////////////////////////////////////////////////////////////// @@ -64,42 +64,41 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + IndexValue intermediatery[64]; + for (int e = 0; e < maxThreads; e++) + intermediatery[e].index = -1; if (xEws == 1) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingIndexValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); + auto func = 
PRAGMA_THREADS_FOR {
+                    intermediatery[thread_id] = OpType::startingIndexValue(x);
-                    auto ulen = info.getItersPerThread(threadNum);
-
-                    for (Nd4jLong i = 0; i < ulen; i++) {
-                        IndexValue curr(x[i + threadOffset], i + threadOffset);
-                        local = OpType::update(local, curr, extraParams);
+                    for (auto i = start; i < stop; i += increment) {
+                        IndexValue curr(x[i], i);
+                        intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
                     }
+                };
+
+                maxThreads = samediff::Threads::parallel_for(func, 0, len, 1, maxThreads);
+
+                for (int e = 0; e < maxThreads; e++)
+                    startingIndex = OpType::update(startingIndex, intermediatery[e], extraParams);
-                PRAGMA_OMP_CRITICAL
-                startingIndex = OpType::update(startingIndex, local, extraParams);
-                }
             } else {
-                PRAGMA_OMP_PARALLEL_THREADS(info._numThreads)
-                {
-                    auto local = OpType::startingIndexValue(x);
-                    auto threadNum = omp_get_thread_num();
-                    auto threadOffset = info.getThreadOffset(threadNum);
+                auto func = PRAGMA_THREADS_FOR {
+                    intermediatery[thread_id] = OpType::startingIndexValue(x);
-                    auto ulen = info.getItersPerThread(threadNum);
-
-                    for (Nd4jLong i = 0; i < ulen; i++) {
-                        auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, canCastX);
-                        IndexValue curr(x[offset], threadOffset + i);
-                        local = OpType::update(local, curr, extraParams);
+                    for (auto i = start; i < stop; i += increment) {
+                        auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
+                        IndexValue curr(x[offset], i);
+                        intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
                     }
+                };
-                PRAGMA_OMP_CRITICAL
-                startingIndex = OpType::update(startingIndex, local, extraParams);
-                }
+                maxThreads = samediff::Threads::parallel_for(func, 0, len, 1, maxThreads);
+
+                for (int e = 0; e < maxThreads; e++)
+                    startingIndex = OpType::update(startingIndex, intermediatery[e], extraParams);
             }
             return startingIndex.index;
 }
@@ -124,9 +123,10 @@ void IndexReduce::exec(void *vx, Nd4jLong *xShapeInfo,
             if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY)
                 return;
             const auto indexValue = OpType::startingIndexValue(x);
-            PRAGMA_OMP_PARALLEL_FOR_IF(zLen > nd4j::Environment::getInstance()->elementwiseThreshold())
+
             for (uint i = 0; i < zLen; i++)
-                z[i] = (Z) indexValue.index;;
+                z[i] = (Z) indexValue.index;
+            return;
 }
diff --git a/libnd4j/include/loops/cpu/pairwise.hpp b/libnd4j/include/loops/cpu/pairwise.hpp
index 9dfa129aa..1fc85e5d8 100644
--- a/libnd4j/include/loops/cpu/pairwise.hpp
+++ b/libnd4j/include/loops/cpu/pairwise.hpp
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 using namespace simdOps;
@@ -42,7 +43,9 @@ namespace functions {
                          void *z, Nd4jLong zEws,
                          void *extraParams,
-                         Nd4jLong n) {
+                         Nd4jLong n,
+                         const uint64_t start,
+                         const uint64_t stop) {
             DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x,
                                                xEws,
                                                y,
@@ -50,7 +53,7 @@ namespace functions {
                                                z,
                                                zEws,
                                                extraParams,
-                                               n), PAIRWISE_TRANSFORM_OPS);
+                                               n, start, stop), PAIRWISE_TRANSFORM_OPS);
         };
@@ -61,48 +64,24 @@ namespace functions {
                                           void *vy, Nd4jLong yEws,
                                           void *vz, Nd4jLong zEws,
                                           void *vextraParams,
-                                          const Nd4jLong n) {
+                                          const Nd4jLong n,
+                                          const uint64_t start,
+                                          const uint64_t stop) {
             auto x = reinterpret_cast(vx);
             auto y = reinterpret_cast(vy);
             auto z = reinterpret_cast(vz);
             auto extraParams = reinterpret_cast(vextraParams);
-            nd4j::OmpLaunchHelper info(n);
-
             if (xEws == 1 && yEws == 1 && zEws == 1) {
-
-                PRAGMA_OMP_PARALLEL_THREADS(info._numThreads)
-                {
-                    auto threadNum = omp_get_thread_num();
-                    auto threadOffset = 
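// ---------------------------------------------------------------------------
// Illustrative sketch, not from this patch: how the samediff::Threads API used
// in the indexreduce hunk above drives a start/stop kernel. Assumptions are
// inferred only from this diff: PRAGMA_THREADS_FOR builds a functor that
// receives thread_id, start, stop and increment, and
// samediff::Threads::parallel_for(func, start, stop, increment, maxThreads)
// splits the range across threads and returns how many threads actually ran.
// The buffer/partial/total names below are hypothetical.
//
//   #include <execution/Threads.h>
//
//   float buffer[4096];        // data to reduce (assume it is filled elsewhere)
//   float partial[64];         // one slot per worker, like intermediatery[64] above
//   float total = 0.f;
//   int   maxThreads = 64;
//
//   auto func = PRAGMA_THREADS_FOR {
//       partial[thread_id] = 0.f;                          // thread-local accumulator
//       for (auto i = start; i < stop; i += increment)     // this thread's chunk of [0, 4096)
//           partial[thread_id] += buffer[i];
//   };
//
//   // run the functor over the full range; cap the merge loop by the thread count used
//   auto used = samediff::Threads::parallel_for(func, 0, 4096, 1, maxThreads);
//   for (int e = 0; e < used; e++)
//       total += partial[e];
// ---------------------------------------------------------------------------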
info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -115,14 +94,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - extraParams), + extraParams, start, stop), PAIRWISE_TRANSFORM_OPS); }; @@ -136,7 +117,9 @@ namespace functions { Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -148,7 +131,6 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); if (shape::isScalar(yShapeInfo)) { @@ -156,38 +138,22 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ 
-198,96 +164,63 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); } } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + 
auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -295,20 +228,13 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/pairwise2.hpp b/libnd4j/include/loops/cpu/pairwise2.hpp deleted file mode 100644 index 17acd35b7..000000000 --- a/libnd4j/include/loops/cpu/pairwise2.hpp +++ /dev/null @@ -1,106 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by remote on 2018-09-20. -// - -#include -#include -#include -#include -#include -#include -#include - -using namespace simdOps; - -namespace functions { - namespace pairwise_transforms { - - template - void PairWiseTransform::exec( - const int opNum, - void *x, - Nd4jLong xEws, - void *y, - Nd4jLong yEws, - void *z, - Nd4jLong zEws, - void *extraParams, - Nd4jLong n) { - DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, - xEws, - y, - yEws, - z, - zEws, - extraParams, - n), PAIRWISE_TRANSFORM_OPS); - }; - - - - template - template - void PairWiseTransform::exec(void *vx, Nd4jLong xEws, - void *vy, Nd4jLong yEws, - void *vz, Nd4jLong zEws, - void *vextraParams, - const Nd4jLong n) { - - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); - - nd4j::OmpLaunchHelper info(n); - - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } - } - else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } - } - } - } -} diff --git a/libnd4j/include/loops/cpu/pairwise_bool.cpp b/libnd4j/include/loops/cpu/pairwise_bool.cpp index 8feabb98a..2259c37b0 100644 --- a/libnd4j/include/loops/cpu/pairwise_bool.cpp +++ b/libnd4j/include/loops/cpu/pairwise_bool.cpp @@ -22,6 +22,7 @@ #include #include #include +#include using namespace simdOps; @@ -38,7 +39,9 @@ namespace functions { void *z, Nd4jLong zEws, void *extraParams, - Nd4jLong n) { + Nd4jLong n, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xEws, y, @@ -46,7 +49,7 @@ namespace functions { z, zEws, extraParams, - n), PAIRWISE_BOOL_OPS); + n, start, stop), PAIRWISE_BOOL_OPS); }; @@ -60,46 +63,24 @@ namespace functions { void *vz, Nd4jLong zEws, void *vextraParams, - const Nd4jLong n) { + const Nd4jLong n, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(n); - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - 
PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -112,14 +93,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - extraParams), + extraParams, start, stop), PAIRWISE_BOOL_OPS); }; @@ -129,7 +112,9 @@ namespace functions { void PairWiseBoolTransform::exec(void *vx, Nd4jLong* xShapeInfo, void *vy, Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -141,8 +126,6 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); - if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; @@ -150,37 +133,22 @@ namespace functions { if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ -189,96 +157,62 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == 
nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { - if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const 
bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -286,20 +220,13 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/pairwise_int.cpp b/libnd4j/include/loops/cpu/pairwise_int.cpp index 63b9dc8c8..673951d6a 100644 --- a/libnd4j/include/loops/cpu/pairwise_int.cpp +++ b/libnd4j/include/loops/cpu/pairwise_int.cpp @@ -22,6 +22,7 @@ #include #include #include +#include using namespace simdOps; @@ -38,7 +39,9 @@ namespace functions { void *z, Nd4jLong zEws, void *extraParams, - Nd4jLong n) { + Nd4jLong n, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xEws, y, @@ -46,7 +49,7 @@ namespace functions { z, zEws, extraParams, - n), PAIRWISE_INT_OPS); + n, start, stop), PAIRWISE_INT_OPS); }; @@ -60,46 +63,24 @@ namespace functions { void *vz, Nd4jLong zEws, void *vextraParams, - const Nd4jLong n) { + const Nd4jLong n, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(n); - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - 
PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -112,14 +93,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - extraParams), + extraParams, start, stop), PAIRWISE_INT_OPS); }; @@ -129,7 +112,9 @@ namespace functions { void PairWiseIntTransform::exec(void *vx, Nd4jLong* xShapeInfo, void *vy, Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -141,46 +126,28 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); - if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ -189,96 +156,63 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || 
kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], 
extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -286,20 +220,13 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/random.cpp b/libnd4j/include/loops/cpu/random.cpp index 5abc1447a..d4c808719 100644 --- a/libnd4j/include/loops/cpu/random.cpp +++ b/libnd4j/include/loops/cpu/random.cpp @@ -52,28 +52,22 @@ namespace functions { auto length = shape::length(zShapeInfo); -// nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - nd4j::OmpLaunchHelper info(length); - if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + for (auto i = start; i < 
stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { @@ -82,19 +76,16 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -103,19 +94,16 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { @@ -124,19 +112,16 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], 
y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else { @@ -147,20 +132,17 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], y[yOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } }; @@ -184,41 +166,34 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - nd4j::OmpLaunchHelper info(length); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else { uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } } @@ -232,25 +207,21 @@ namespace functions { auto length = shape::length(zShapeInfo); - //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); uint 
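// Assumption-laden sketch: PRAGMA_THREADS_FOR is defined in execution/Threads.h, which is
// added elsewhere in this PR and is not visible in these hunks. Judging by the loop bodies
// above, it produces a callable receiving (thread_id, start, stop, increment). The emulation
// below (parallel_for_emul is a hypothetical stand-in, not samediff::Threads::parallel_for)
// only illustrates that calling convention and the returned "threads actually used" count.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

using ThreadsFor = std::function<void(uint64_t thread_id, int64_t start,
                                      int64_t stop, int64_t increment)>;

static int parallel_for_emul(const ThreadsFor& func, int64_t begin, int64_t end,
                             int64_t increment, int maxThreads) {
    const int64_t span = end - begin;
    int numThreads = std::max<int>(1, static_cast<int>(std::min<int64_t>(maxThreads, span > 0 ? span : 1)));
    const int64_t chunk = (span + numThreads - 1) / numThreads;

    std::vector<std::thread> workers;
    for (int t = 0; t < numThreads; t++) {
        const int64_t start = begin + t * chunk;
        const int64_t stop  = std::min<int64_t>(end, start + chunk);
        if (start >= stop) { numThreads = t; break; }
        workers.emplace_back(func, static_cast<uint64_t>(t), start, stop, increment);
    }
    for (auto& w : workers) w.join();
    return numThreads;            // like the real call above, report how many threads ran
}

// usage mirroring the hunks above (x, y, z, op are placeholders):
//   auto func = [&](uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) {
//       for (auto i = start; i < stop; i += increment) z[i] = op(x[i], y[i]);
//   };
//   parallel_for_emul(func, 0, length, 1, maxThreads);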
zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[offset] = OpClass::op(i+threadOffset, length, rng, extraArguments); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[offset] = OpClass::op(i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } template diff --git a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp index 246d18ac4..882b1740e 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp @@ -55,7 +55,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -65,25 +65,14 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); - - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + z[0] = OpType::postProcess(startingValue, length, extraParams); } } @@ -102,23 +91,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; 
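// Sketch, with assumptions: in the random hunks the op now receives the absolute element
// index i (previously reconstructed as i + threadOffset), so the produced values should not
// depend on how parallel_for splits the range -- provided, as the code suggests, that
// nd4j::graph::RandomGenerator derives each value from (state, index) rather than from
// mutable per-thread state. A tiny stand-alone analogue of index-keyed generation using a
// splitmix64-style mix (not the generator libnd4j actually uses):
#include <cstdint>
#include <vector>

static uint64_t mix64(uint64_t v) {            // stateless: same (seed, index) -> same value
    v += 0x9E3779B97F4A7C15ULL;
    v = (v ^ (v >> 30)) * 0xBF58476D1CE4E5B9ULL;
    v = (v ^ (v >> 27)) * 0x94D049BB133111EBULL;
    return v ^ (v >> 31);
}

static void fill_random(std::vector<double>& z, uint64_t seed,
                        uint64_t start, uint64_t stop) {
    for (auto i = start; i < stop; i++)         // any [start, stop) split yields identical z
        z[i] = (mix64(seed ^ i) >> 11) * (1.0 / 9007199254740992.0);   // 53-bit mantissa -> [0, 1)
}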
e < omp_get_max_threads(); e++) - start = OpType::update(start, intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -150,8 +130,8 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), REDUCE_BOOL_OPS); + Nd4jLong *tadOffset, int64_t start, int64_t stop) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset, start, stop), REDUCE_BOOL_OPS); } template @@ -164,7 +144,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -176,7 +156,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; return; @@ -205,9 +185,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionBoolLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionBoolLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -227,49 +207,33 @@ namespace functions { template template Z _CUDA_H ReduceBoolFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); - if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) { - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - } - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); + auto func = PRAGMA_THREADS_FOR { + if (xEws == 1) { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = 
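// Sketch (not the libnd4j implementation): the dimensional exec() overloads above now carry
// an extra [start, stop) pair straight through to loopReduce, so the range presumably indexes
// output elements / TADs and the surrounding executioner splits those across threads.
// A stand-alone illustration of that shape; reduce_tads(), tadLength and the sum op are
// hypothetical stand-ins for the templated OpType machinery.
#include <cstdint>

static void reduce_tads(const float* x, const int64_t* tadOffsets, int64_t tadLength,
                        float* z, int64_t start, int64_t stop) {
    for (auto r = start; r < stop; r++) {       // one output value per TAD in [start, stop)
        const float* tad = x + tadOffsets[r];
        float acc = 0.f;                        // stand-in for OpType::startingValue
        for (int64_t i = 0; i < tadLength; i++)
            acc += tad[i];                      // stand-in for OpType::update(OpType::op(...))
        z[r] = acc;                             // stand-in for OpType::postProcess
    }
}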
omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); + // return result + return OpType::postProcess(intermediate[0], length, extraParams); } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp index a94a19b25..112656852 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp @@ -59,9 +59,10 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; + return; } @@ -69,25 +70,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -105,23 +110,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - 
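// Sketch of the accumulate-then-merge pattern used by the rewritten execScalar above: one
// intermediate slot per thread (capped at 64, mirroring nd4j_min(64, ...->maxThreads())),
// each worker folds its [start, stop) slice into its own slot, and only the slots that were
// actually used are merged serially afterwards. Self-contained emulation with std::thread;
// reduce_scalar() and the sum op are stand-ins, not libnd4j code.
#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>

static double reduce_scalar(const float* x, uint64_t length) {
    int maxThreads = std::min<int>(64, std::max(1u, std::thread::hardware_concurrency()));
    double intermediate[64];
    for (int e = 0; e < maxThreads; e++)
        intermediate[e] = 0.0;                       // stand-in for OpType::startingValue

    const uint64_t chunk = (length + maxThreads - 1) / maxThreads;
    std::vector<std::thread> workers;
    int used = 0;                                    // plays the role of the returned maxThreads
    for (int t = 0; t < maxThreads; t++) {
        const uint64_t start = t * chunk;
        const uint64_t stop  = std::min<uint64_t>(length, start + chunk);
        if (start >= stop) break;
        used++;
        workers.emplace_back([&, t, start, stop] {
            for (auto i = start; i < stop; i++)
                intermediate[t] += x[i];             // stand-in for OpType::update(OpType::op(...))
        });
    }
    for (auto& w : workers) w.join();

    for (int e = 1; e < used; e++)                   // merge step, exactly as in the hunk above
        intermediate[0] += intermediate[e];
    return intermediate[0];                          // stand-in for OpType::postProcess
}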
PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < omp_get_max_threads(); e++) - start = OpType::update(start, intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -153,7 +149,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, @@ -162,7 +158,7 @@ namespace functions { dimension, dimensionLength, tadShapeInfo, - tadOffset), + tadOffset, start, stop), REDUCE_FLOAT_OPS); } @@ -176,7 +172,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -188,7 +184,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = std::is_same>::value ? nd4j::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(x)); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; return; @@ -222,9 +218,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionFloatLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionFloatLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -245,49 +241,34 @@ namespace functions { template Z _CUDA_H ReduceFloatFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); - int nt = info._numThreads; + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); - if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, 
extraParams); + auto func = PRAGMA_THREADS_FOR { + if (xEws == 1) { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReduceFloatFunction, , LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp index 1a148805e..76dc209f6 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp @@ -55,7 +55,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -65,25 +65,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = 
OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -103,23 +107,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < omp_get_max_threads(); e++) - start = OpType::update(start, intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -152,8 +147,8 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), REDUCE_LONG_OPS); + Nd4jLong *tadOffset, int64_t start, int64_t stop) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset, start, stop), REDUCE_LONG_OPS); } template @@ -166,7 +161,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -178,7 +173,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; return; @@ -212,9 +207,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionLongLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLongLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -235,48 +230,34 @@ namespace functions { template Z _CUDA_H ReduceLongFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - 
nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReduceLongFunction, , LIBND4J_TYPES, LONG_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp index 0dfff5e73..cbd7e6e12 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp @@ -57,7 +57,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -67,25 +67,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + X intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = 
OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, length, extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -103,26 +107,15 @@ namespace functions { if (xEws >= 1) { return execScalar(x, xEws, length, extraParams); - } - else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + } else { + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); - - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -154,7 +147,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, extraParams, @@ -163,7 +156,7 @@ namespace functions { dimension, dimensionLength, tadShapeInfo, - tadOffset), + tadOffset, start, stop), REDUCE_SAME_OPS); } @@ -177,7 +170,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -189,7 +182,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(zLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < zLength; i++) z[i] = startingVal; return; @@ -223,9 +216,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionSameLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionSameLoops::template innerloopReduce(x, xShapeInfo, z, 
zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -246,48 +239,34 @@ namespace functions { template X _CUDA_H ReduceSameFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + X intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT ReduceSameFunction, , LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce3.cpp b/libnd4j/include/loops/cpu/reduce3.cpp index fd09dc0e1..dbe93620a 100644 --- a/libnd4j/include/loops/cpu/reduce3.cpp +++ b/libnd4j/include/loops/cpu/reduce3.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -51,72 +52,82 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; + return; } Z extraParamsVals[3] = {(Z) 0.0f, (Z) 0.0f, (Z) 0.0f}; - // it's possible case for EqualsWithEps op - if (extraParams != nullptr) - extraParamsVals[2] = extraParams[0]; uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); Z startingVal = 
OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - nd4j::OmpLaunchHelper t(length, maxThreads); - Z intermediate[256]; - Z extraParamsLocal[3 * 256]; + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; + Z extraParamsLocal[3 * 64]; PRAGMA_OMP_SIMD for (int e = 0; e < maxThreads; e++) intermediate[e] = startingVal; - memset(extraParamsLocal, 0, 3 * 256 * sizeof(Z)); - if (extraParams != nullptr) + memset(extraParamsLocal, 0, 3 * 64 * sizeof(Z)); + if (extraParams != nullptr) { PRAGMA_OMP_SIMD - for (int e = 0; e < maxThreads; e++) - extraParamsLocal[3 * e + 2] = extraParams[0]; + // mostly for future reference + for (int e = 0; e < maxThreads; e++) { + extraParamsLocal[3 * e] = extraParams[0]; + extraParamsLocal[3 * e + 1] = extraParams[1]; + extraParamsLocal[3 * e + 2] = extraParams[2]; + } + } nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, yShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[i], y[i], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } else { uint yShapeInfoCast[MAX_RANK]; const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], 
extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } // merge step for (int e = 0; e < maxThreads; e++) OpType::aggregateExtraParams(extraParamsVals, extraParamsLocal + 3 * e); + for (int e = 0; e < maxThreads; e++) startingVal = OpType::update(startingVal, intermediate[e], extraParamsVals); + // writing out result z[0] = OpType::postProcess(startingVal, length, extraParamsVals); } @@ -139,7 +150,7 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength) { + int *dimension, int dimensionLength, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -151,9 +162,9 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, return; } #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #endif } @@ -165,16 +176,16 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #endif } @@ -188,7 +199,7 @@ void Reduce3:: execAll(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, - Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { + Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -196,9 +207,9 @@ void Reduce3:: execAll(void *vx, Nd4jLong *xShapeInfo, auto extraParams = reinterpret_cast(vextraParams); #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams); + nd4j::Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, 
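// Sketch of the per-thread extra-params slabs used by the reduce3 execScalar above: three
// values per thread (extraParamsLocal[3 * 64]), with each worker touching only
// extraParamsLocal + 3 * thread_id, and the slabs folded after the join. The fold below is a
// plain element-wise sum standing in for OpType::aggregateExtraParams, which these hunks do
// not define; the seeding copies all three incoming values per slab, as the new code does.
#include <cstring>

static void seed_slabs(double* extraParamsLocal /*[3 * 64]*/, int maxThreads,
                       const double* extraParams /*[3] or nullptr*/) {
    std::memset(extraParamsLocal, 0, 3 * 64 * sizeof(double));
    if (extraParams != nullptr)
        for (int e = 0; e < maxThreads; e++)
            for (int k = 0; k < 3; k++)
                extraParamsLocal[3 * e + k] = extraParams[k];
}

static void fold_slabs(double* extraParamsVals /*[3]*/, const double* extraParamsLocal,
                       int usedThreads) {
    for (int e = 0; e < usedThreads; e++)
        for (int k = 0; k < 3; k++)
            extraParamsVals[k] += extraParamsLocal[3 * e + k];  // stand-in for aggregateExtraParams
}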
yOffsets, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); #endif } @@ -209,9 +220,9 @@ void Reduce3::exec( const int opNum, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength) { + int *dimension, int dimensionLength, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, start, stop), REDUCE3_OPS); } @@ -223,9 +234,9 @@ void Reduce3::exec( const int opNum, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx,xShapeInfo,extraParamsVals,vy, yShapeInfo,vz,zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx,xShapeInfo,extraParamsVals,vy, yShapeInfo,vz,zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), REDUCE3_OPS); } @@ -238,9 +249,9 @@ void Reduce3::execAll(const int opNum, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, - Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { + Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(execAll, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(execAll, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, start, stop), REDUCE3_OPS); } diff --git a/libnd4j/include/loops/cpu/scalar.hpp b/libnd4j/include/loops/cpu/scalar.hpp index 79e53e4a2..071913e22 100644 --- a/libnd4j/include/loops/cpu/scalar.hpp +++ b/libnd4j/include/loops/cpu/scalar.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" using namespace simdOps; @@ -39,7 +40,8 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -63,29 +65,27 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } else { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for 
(auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -98,9 +98,10 @@ void ScalarTransform::transform(int opNum, void *scalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_OPS); } //////////////////////////////////////////////////////////////////////// @@ -110,9 +111,10 @@ void ScalarTransform::transform(const int opNum, void *z, Nd4jLong zStride, void *scalar, void *extraParams, - const Nd4jLong n, bool allowParallelism) { + const uint64_t n, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xStride, z, zStride, scalar, extraParams, n, allowParallelism), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xStride, z, zStride, scalar, extraParams, n, start, stop), SCALAR_OPS); } //////////////////////////////////////////////////////////////////////// @@ -121,9 +123,10 @@ void ScalarTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams, bool allowParallelism) { + void *extraParams, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, allowParallelism), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_OPS); } //////////////////////////////////////////////////////////////////////// @@ -132,7 +135,8 @@ template void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams, bool allowParallelism) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -146,48 +150,30 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len, allowParallelism); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); } else { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len, allowParallelism ? 
-1 : 1); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } } @@ -199,44 +185,22 @@ void ScalarTransform::transform(void *vx, Nd4jLong xEws, void *vz, Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len, bool allowParallelism) { + const uint64_t len, const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len, allowParallelism ? 
-1 : 1); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/scalar_bool.cpp b/libnd4j/include/loops/cpu/scalar_bool.cpp index b37bdd6ef..d6dce445b 100644 --- a/libnd4j/include/loops/cpu/scalar_bool.cpp +++ b/libnd4j/include/loops/cpu/scalar_bool.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" @@ -39,7 +40,8 @@ namespace functions { void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -64,29 +66,27 @@ namespace functions { return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } - else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + else { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -103,8 +103,8 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffsets) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_BOOL_OPS); + Nd4jLong *zTadOffsets, const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_BOOL_OPS); } @@ -116,8 +116,9 @@ namespace functions { Nd4jLong zEws, void *scalar, void *extraParams, - const Nd4jLong n) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xEws, 
z, zEws, scalar, extraParams, n), SCALAR_BOOL_OPS); + const uint64_t n, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n, start, stop), SCALAR_BOOL_OPS); } template @@ -127,8 +128,9 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams), SCALAR_BOOL_OPS); + void *extraParams, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_BOOL_OPS); } template @@ -138,7 +140,8 @@ namespace functions { void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -149,53 +152,33 @@ namespace functions { auto zEws = shape::elementWiseStride(zShapeInfo); auto len = shape::length(xShapeInfo); - // nd4j_logger("Launching scalar: xOrder: %i; zOrder: %i; xEWS: %i\n", xOrder, zOrder, xEws); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); return; } uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } @@ -208,44 +191,23 @@ namespace functions { Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len) { + const uint64_t len, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; auto 
extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/scalar_int.cpp b/libnd4j/include/loops/cpu/scalar_int.cpp index 9e73e2756..5f2308418 100644 --- a/libnd4j/include/loops/cpu/scalar_int.cpp +++ b/libnd4j/include/loops/cpu/scalar_int.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" @@ -39,7 +40,8 @@ namespace functions { void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -64,29 +66,27 @@ namespace functions { return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } - else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + else { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -103,8 +103,10 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffsets) { - DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_INT_OPS); + Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { + + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_INT_OPS); } @@ -116,8 +118,9 @@ namespace functions { Nd4jLong zEws, void *scalar, void *extraParams, - const Nd4jLong n) { - 
DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n), SCALAR_INT_OPS); + const uint64_t n, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n, start, stop), SCALAR_INT_OPS); } template @@ -127,8 +130,9 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams) { - DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams), SCALAR_INT_OPS); + void *extraParams, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_INT_OPS); } template @@ -138,7 +142,8 @@ namespace functions { void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -149,53 +154,33 @@ namespace functions { auto zEws = shape::elementWiseStride(zShapeInfo); auto len = shape::length(xShapeInfo); - // nd4j_logger("Launching scalar: xOrder: %i; zOrder: %i; xEWS: %i\n", xOrder, zOrder, xEws); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); return; } uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } @@ -208,44 +193,23 @@ namespace functions { Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len) { + const uint64_t len, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = 
reinterpret_cast(vscalar)[0]; auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index 1f5a7c339..a8f766f6a 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -90,8 +91,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; const bool canCast = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (Nd4jLong i = 0; i < length; i++) { - + for (uint64_t i = 0; i < length; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); SummaryStatsData curr; @@ -123,7 +123,7 @@ namespace functions { return; SummaryStatsData comp; comp.initWithValue(x[0]); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = OpType::getValue(biasCorrected, comp); return; @@ -157,35 +157,37 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; const bool canCast = tadEWS == 1 && tadOrder == 'c' ? 
false : nd4j::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR - for (int r = 0; r < resultLength; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { - auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; - auto tx = x + tadOffsetForBlock; - SummaryStatsData comp; - comp.initWithValue(tx[0]); + auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; + auto tx = x + tadOffsetForBlock; + SummaryStatsData comp; + comp.initWithValue(tx[0]); - if (tadEWS == 1 && tadOrder == 'c') { - for (int i = 1; i < tadLength; i ++) { - SummaryStatsData indexVal2; - indexVal2.initWithValue(tx[i]); + if (tadEWS == 1 && tadOrder == 'c') { + for (int i = 1; i < tadLength; i++) { + SummaryStatsData indexVal2; + indexVal2.initWithValue(tx[i]); - comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + } + } else { + for (int i = 1; i < tadLength; i++) { + auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); + + SummaryStatsData indexVal2; + indexVal2.initWithValue(tx[xOffset]); + + comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + } } + + z[r] = OpType::getValue(biasCorrected, comp); } - else { - for (int i = 1; i < tadLength; i ++) { - auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); + }; - SummaryStatsData indexVal2; - indexVal2.initWithValue(tx[xOffset]); - - comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); - } - } - - z[r] = OpType::getValue(biasCorrected, comp); - } + samediff::Threads::parallel_tad(func, 0, resultLength, 1); } diff --git a/libnd4j/include/loops/cpu/transform/transform_any.cpp b/libnd4j/include/loops/cpu/transform/transform_any.cpp index 5727c096d..5b3c4a0f8 100644 --- a/libnd4j/include/loops/cpu/transform/transform_any.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_any.cpp @@ -37,9 +37,8 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, bool allowParallelism) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets, allowParallelism), TRANSFORM_ANY_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_ANY_OPS); } ///////////////////////////////////////////////////////////////////// @@ -47,22 +46,13 @@ template template void _CUDA_H TransformAny::exec(void *vx, Nd4jLong *xShapeInfo, void *vz,Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo,Nd4jLong *tadOffsets, bool allowParallelism) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - if (allowParallelism) - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); - else - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } diff --git a/libnd4j/include/loops/cpu/transform/transform_bool.cpp b/libnd4j/include/loops/cpu/transform/transform_bool.cpp 
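// ---------------------------------------------------------------------------
// Reviewer note, not part of the patch: the summarystatsreduce.cpp hunk above
// shows the recurring rewrite in this PR. The per-TAD loop body is wrapped into
// a callable with PRAGMA_THREADS_FOR and handed to
// samediff::Threads::parallel_tad(), which partitions [0, resultLength) across
// the new ThreadPool instead of relying on an OpenMP pragma. A minimal sketch
// of that pattern follows, assuming the macro exposes thread_id, start, stop
// and increment to the lambda (as the call sites in this patch suggest); the
// header path and the per-TAD worker processTad() are assumptions for
// illustration only.

#include <execution/Threads.h>          // header providing samediff::Threads (path assumed)

static void processTad(int64_t r);      // hypothetical: the former serial loop body

static void reduceAllTads(int64_t numTads) {
    auto func = PRAGMA_THREADS_FOR {
        // each worker receives its own [start, stop) slice of the TAD range
        for (auto r = start; r < stop; r += increment)
            processTad(r);
    };
    // parallel_tad(func, from, to, step) splits the range across the pool and
    // returns once the workers are done, mirroring the removed parallel-for pragma
    samediff::Threads::parallel_tad(func, 0, numTads, 1);
}
// ---------------------------------------------------------------------------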
index 3560c85fe..fdfde93f5 100644 --- a/libnd4j/include/loops/cpu/transform/transform_bool.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_bool.cpp @@ -37,9 +37,8 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_BOOL_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_BOOL_OPS); } template @@ -49,20 +48,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); + auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformBool, , LIBND4J_TYPES, BOOL_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_float.cpp b/libnd4j/include/loops/cpu/transform/transform_float.cpp index 922a76265..8e164a90f 100644 --- a/libnd4j/include/loops/cpu/transform/transform_float.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_float.cpp @@ -36,9 +36,8 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_FLOAT_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_FLOAT_OPS); } template @@ -48,20 +47,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformFloat, , LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_same.cpp b/libnd4j/include/loops/cpu/transform/transform_same.cpp index f821d73bc..67f7762f0 100644 --- a/libnd4j/include/loops/cpu/transform/transform_same.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_same.cpp @@ -36,10 +36,8 @@ namespace functions { Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_SAME_OPS); + void 
*extraParams, uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_SAME_OPS); } template @@ -47,18 +45,14 @@ namespace functions { void _CUDA_H TransformSame::exec(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vextraParams, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_strict.cpp b/libnd4j/include/loops/cpu/transform/transform_strict.cpp index e600d2fb8..29964e3e0 100644 --- a/libnd4j/include/loops/cpu/transform/transform_strict.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_strict.cpp @@ -36,10 +36,8 @@ namespace functions { Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_STRICT_OPS); + void *extraParams, uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_STRICT_OPS); } template @@ -49,20 +47,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformStrict, , FLOAT_TYPES); diff --git a/libnd4j/include/loops/cuda/aggregates.cu b/libnd4j/include/loops/cuda/aggregates.cu deleted file mode 100644 index 9ced20e51..000000000 --- a/libnd4j/include/loops/cuda/aggregates.cu +++ /dev/null @@ -1,145 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
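// ---------------------------------------------------------------------------
// Reviewer note, not part of the patch: the transform_{any,bool,float,same,
// strict} hunks above change the kernel contract instead of passing a range.
// Each exec() now receives (threadId, numThreads) and
// TransformLoops::loopTransform() slices the work itself, so the caller that
// launches these kernels owns the parallel fan-out. A self-contained
// illustration of that contract using plain std::thread; the kernel body and
// the interleaved slicing scheme are hypothetical, chosen only to show how a
// (threadId, numThreads) pair can drive the split.

#include <cstdint>
#include <thread>
#include <vector>

static void execKernel(float *z, const float *x, int64_t len,
                       uint64_t threadId, uint64_t numThreads) {
    // the kernel, not the caller, decides how its share of [0, len) is chosen;
    // here: every numThreads-th element starting at threadId
    for (int64_t i = static_cast<int64_t>(threadId); i < len;
         i += static_cast<int64_t>(numThreads))
        z[i] = x[i] + 1.0f;
}

static void launch(float *z, const float *x, int64_t len, uint64_t numThreads) {
    std::vector<std::thread> workers;
    for (uint64_t t = 0; t < numThreads; t++)
        workers.emplace_back(execKernel, z, x, len, t, numThreads);
    for (auto &w : workers)
        w.join();
}
// ---------------------------------------------------------------------------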
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// @author Yurii Shyrma, created on 27.11.2018 -// - -#include "../aggregates.h" - -namespace functions { -namespace aggregate { - -/////////////////////////////////////////////////////////////////////// -template -template -__device__ void AggregatedFunction::execCuda(X **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - X *realArguments, int numRealArguments) { - - OpClass::executeAggregateCuda(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__device__ void AggregatedFunction::execCuda(int opNum, - X **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - X *realArguments, int numRealArguments) { - - DISPATCH_BY_OPNUM_T(execCuda, PARAMS(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), AGGREGATE_OPS); -} - -/////////////////////////////////////////////////////////////////////// -template -__global__ static void execAggregateKernel(int opNum, - void **varguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - void *vrealArguments, int numRealArguments) { - - auto arguments = reinterpret_cast(varguments); - auto realArguments = reinterpret_cast(vrealArguments); - functions::aggregate::AggregatedFunction::execCuda(opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__host__ void AggregatedFunction::aggregateKernelGeneric(dim3& launchDims, cudaStream_t *stream, - int opNum, - void **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - void *realArguments, int numRealArguments) { - - execAggregateKernel<<>>(opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); - nd4j::DebugHelper::checkErrorCode(stream, "aggregateKernelGeneric(...) 
failed"); -} - -/////////////////////////////////////////////////////////////////////// -template -__device__ void AggregatedFunction::aggregateBatch(int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - nd4j::PointersHelper helper(ptrToArguments, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals); - - // TODO: we probably should lift this restriction - __shared__ int *intArrays[32]; - - __shared__ X **arguments; - __shared__ Nd4jLong **shapes; - __shared__ int *idxArg; - __shared__ X *realArg; - - for(int r = blockIdx.x; r < numAggregates; r += gridDim.x) { - if (threadIdx.x == 0) { - arguments = helper.getArguments(r); - shapes = helper.getShapeArguments(r); - idxArg = helper.getIndexArguments(r); - realArg = helper.getRealArguments(r); - } - - // we fill intArrays param in parallel within block - if (threadIdx.x < 32 && threadIdx.x < maxIntArrays) { - intArrays[threadIdx.x] = helper.getIntArrayArguments(r, threadIdx.x); - } - __syncthreads(); - - functions::aggregate::AggregatedFunction::execCuda(opNum, arguments, helper.getNumArguments(r), shapes, helper.getNumShapeArguments(r), idxArg, helper.getNumIndexArguments(r), intArrays, helper.getNumIntArrayArguments(r), realArg, helper.getNumRealArguments(r)); - } -} - -/////////////////////////////////////////////////////////////////////// -template -__global__ static void execAggregateBatch(int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - functions::aggregate::AggregatedFunction::aggregateBatch(opNum, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__host__ void AggregatedFunction::aggregateBatchKernelGeneric(dim3& launchDims, cudaStream_t *stream, - int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - execAggregateBatch<<>>(opNum, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments); - nd4j::DebugHelper::checkErrorCode(stream, "aggregateBatchKernel(...) 
failed"); -} - - - - - -BUILD_SINGLE_TEMPLATE(template class AggregatedFunction, , FLOAT_TYPES); -} -} diff --git a/libnd4j/include/loops/cuda/broadcasting.cu b/libnd4j/include/loops/cuda/broadcasting.cu index 8028db2ba..8846e5473 100644 --- a/libnd4j/include/loops/cuda/broadcasting.cu +++ b/libnd4j/include/loops/cuda/broadcasting.cu @@ -32,84 +32,6 @@ namespace functions { namespace broadcast { - template - void Broadcast::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - // - } - template - void Broadcast::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - /** - * CPU execution - * @param x the input - * @param xShapeInfo the x shape information - * @param y the y data - * @param yShapeInfo the y shape information - * @param result the result - * @param resultShapeInfo the result shape information - * @param dimension the dimension to broadcast along long - * @param dimensionLength the length of the dimension buffer - */ - template - template - void Broadcast::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - // - } - - - template - template - void Broadcast::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu index aaec44690..af354a2e2 100644 --- a/libnd4j/include/loops/cuda/broadcasting_bool.cu +++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu @@ -224,76 +224,6 @@ namespace functions { } - template - void BroadcastBool::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - void BroadcastBool::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastBool::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastBool::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - 
Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu index fc613a438..f183c009e 100644 --- a/libnd4j/include/loops/cuda/broadcasting_int.cu +++ b/libnd4j/include/loops/cuda/broadcasting_int.cu @@ -217,75 +217,6 @@ namespace functions { } } - - template - void BroadcastInt::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - void BroadcastInt::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastInt::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastInt::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT BroadcastInt, , INTEGER_TYPES); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu index 8a560e416..1bd5d10cb 100644 --- a/libnd4j/include/loops/cuda/indexreduce.cu +++ b/libnd4j/include/loops/cuda/indexreduce.cu @@ -359,32 +359,6 @@ namespace functions { } } - - - - template - Nd4jLong IndexReduce::execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams) { - return 0; - } - - template - void IndexReduce::exec(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - - } - - template - template - Nd4jLong IndexReduce:: execScalar(void *x, Nd4jLong *xShapeInfo, void *extraParams) { - return 0; - } - - template - template - _CUDA_H void IndexReduce::exec(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES, INDEXING_TYPES); } } diff --git a/libnd4j/include/loops/cuda/pairwise.cu b/libnd4j/include/loops/cuda/pairwise.cu index 17f8537e5..4833d32d0 100644 --- a/libnd4j/include/loops/cuda/pairwise.cu +++ b/libnd4j/include/loops/cuda/pairwise.cu @@ -22,58 +22,6 @@ namespace functions { namespace pairwise_transforms { - template - void PairWiseTransform::exec( - const int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *z, - Nd4jLong *zShapeInfo, - void *extraParams) { - } - - template - void PairWiseTransform::exec( - 
const int opNum, - void *x, - Nd4jLong xStride, - void *y, - Nd4jLong yStride, - void *z, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong len) { - - } - - - template - template - void PairWiseTransform:: exec( - void *vx, - Nd4jLong* xShapeInfo, - void *vy, - Nd4jLong* yShapeInfo, - void *vresult, - Nd4jLong* zShapeInfo, - void *vextraParams) { - - } - - template - template - void PairWiseTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong len) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/pairwise_bool.cu b/libnd4j/include/loops/cuda/pairwise_bool.cu index 414aadd30..05adbbce4 100644 --- a/libnd4j/include/loops/cuda/pairwise_bool.cu +++ b/libnd4j/include/loops/cuda/pairwise_bool.cu @@ -110,63 +110,6 @@ void PairWiseBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_ DISPATCH_BY_OPNUM_TT(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_BOOL_OPS); } - - template - void PairWiseBoolTransform::exec( - const int opNum, - void *dx, - Nd4jLong *xShapeBuffer, - void *y, - Nd4jLong *yShapeBuffer, - void *result, - Nd4jLong *resultShapeBuffer, - void *extraParams) { - - } - - template - void PairWiseBoolTransform::exec( - const int opNum, - void *dx, - Nd4jLong xStride, - void *y, - Nd4jLong yStride, - void *result, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong n) { - - } - - - template - template - void PairWiseBoolTransform::exec( - void *vx, - Nd4jLong* xShapeBuffer, - void *vy, - Nd4jLong* yShapeBuffer, - void *vresult, - Nd4jLong* resultShapeBuffer, - void *vextraParams) { - - } - - template - template - void PairWiseBoolTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong n) { - - } - - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT PairWiseBoolTransform, , LIBND4J_TYPES, BOOL_TYPES); } } diff --git a/libnd4j/include/loops/cuda/pairwise_int.cu b/libnd4j/include/loops/cuda/pairwise_int.cu index 2bedb4a82..85dce56f2 100644 --- a/libnd4j/include/loops/cuda/pairwise_int.cu +++ b/libnd4j/include/loops/cuda/pairwise_int.cu @@ -109,63 +109,6 @@ void PairWiseIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t * DISPATCH_BY_OPNUM_T(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_INT_OPS); } - - template - void PairWiseIntTransform::exec( - const int opNum, - void *dx, - Nd4jLong *xShapeBuffer, - void *y, - Nd4jLong *yShapeBuffer, - void *result, - Nd4jLong *resultShapeBuffer, - void *extraParams) { - - } - - template - void PairWiseIntTransform::exec( - const int opNum, - void *dx, - Nd4jLong xStride, - void *y, - Nd4jLong yStride, - void *result, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong n) { - - } - - - template - template - void PairWiseIntTransform::exec( - void *vx, - Nd4jLong* xShapeBuffer, - void *vy, - Nd4jLong* yShapeBuffer, - void *vresult, - Nd4jLong* resultShapeBuffer, - void *vextraParams) { - - } - - template - template - void PairWiseIntTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong n) { - - } - - - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT PairWiseIntTransform, , INTEGER_TYPES); } } diff --git 
a/libnd4j/include/loops/cuda/random.cu b/libnd4j/include/loops/cuda/random.cu index 3bf06ae91..47ced2769 100644 --- a/libnd4j/include/loops/cuda/random.cu +++ b/libnd4j/include/loops/cuda/random.cu @@ -442,39 +442,6 @@ namespace functions { DEBUG_KERNEL(stream, opNum); } - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT RandomFunction, , FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/cuda/reduce3.chpp b/libnd4j/include/loops/cuda/reduce3.chpp index fa1ab2e17..ac1d1adc3 100644 --- a/libnd4j/include/loops/cuda/reduce3.chpp +++ b/libnd4j/include/loops/cuda/reduce3.chpp @@ -132,7 +132,7 @@ __device__ void Reduce3::execScalarCuda( void *vx, Nd4jLong *xShapeInfo, extraZ[1] = (Z) 0.0f; if (extraParams != nullptr) - extraZ[2] = *(static_cast(extraParams)); + extraZ[2] = static_cast(extraParams)[2]; else extraZ[2] = (Z) 0.0f; } diff --git a/libnd4j/include/loops/cuda/reduce3.cu b/libnd4j/include/loops/cuda/reduce3.cu index 1ad94beee..4f0e0457c 100644 --- a/libnd4j/include/loops/cuda/reduce3.cu +++ b/libnd4j/include/loops/cuda/reduce3.cu @@ -27,56 +27,7 @@ namespace functions { namespace reduce3 { - template - template - void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo) { - } - - - template - void Reduce3::execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParamsVals, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo) { - - } - - - template - template - void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength) { - - } - - - template - template - void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - - template - template - void Reduce3::execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { - - } - - - template - void Reduce3::exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength) { - - } - - - template - void Reduce3::exec(const int opNum, 
void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - - template - void Reduce3::execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/scalar_bool.cu b/libnd4j/include/loops/cuda/scalar_bool.cu index 37939b9b9..bb498c3a9 100644 --- a/libnd4j/include/loops/cuda/scalar_bool.cu +++ b/libnd4j/include/loops/cuda/scalar_bool.cu @@ -231,41 +231,6 @@ void ScalarBoolTransform::executeCudaAlongDimension(dim3& launchDims, cudaS } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ScalarBoolTransform, , LIBND4J_TYPES, BOOL_TYPES); - - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarBoolTransform::transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarBoolTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - template - void ScalarBoolTransform::transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } } } diff --git a/libnd4j/include/loops/cuda/scalar_int.cu b/libnd4j/include/loops/cuda/scalar_int.cu index 44c73fcb4..f25beca82 100644 --- a/libnd4j/include/loops/cuda/scalar_int.cu +++ b/libnd4j/include/loops/cuda/scalar_int.cu @@ -230,40 +230,6 @@ void ScalarIntTransform::executeCudaAlongDimension(dim3& launchDims, cudaStre BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT ScalarIntTransform, , INTEGER_TYPES); - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarIntTransform::transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarIntTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - template - void ScalarIntTransform::transform(const 
int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } } } diff --git a/libnd4j/include/loops/cuda/summarystatsreduce.cu b/libnd4j/include/loops/cuda/summarystatsreduce.cu index 4867f5de1..e505929e6 100644 --- a/libnd4j/include/loops/cuda/summarystatsreduce.cu +++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu @@ -414,73 +414,6 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa } - template - Y SummaryStatsReduce::execScalar(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams) { - return 0; - } - - template - void SummaryStatsReduce::execScalar(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer) { - - } - - template - void SummaryStatsReduce::exec(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, int dimensionLength) { - - } - - template - template - Y SummaryStatsReduce::execScalar(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams) { - return 0; - } - - template - template - void SummaryStatsReduce::execScalar(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer) { - // - } - - - template - template - void SummaryStatsReduce::exec(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, - int dimensionLength) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT SummaryStatsReduce, , LIBND4J_TYPES, FLOAT_TYPES); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/transform/transform_any.cu b/libnd4j/include/loops/cuda/transform/transform_any.cu index 18b53cea7..5ca6f0067 100644 --- a/libnd4j/include/loops/cuda/transform/transform_any.cu +++ b/libnd4j/include/loops/cuda/transform/transform_any.cu @@ -114,17 +114,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformAny(...) failed"); } - template - void TransformAny::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { - - } - - template - template - void TransformAny::exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { - - } - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformAny, , LIBND4J_TYPES, LIBND4J_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_bool.cu b/libnd4j/include/loops/cuda/transform/transform_bool.cu index e88a4274b..0f56020b0 100644 --- a/libnd4j/include/loops/cuda/transform/transform_bool.cu +++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu @@ -120,17 +120,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformBool(...) 
failed"); } - template - void TransformBool::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformBool::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformBool, , LIBND4J_TYPES, BOOL_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_float.cu b/libnd4j/include/loops/cuda/transform/transform_float.cu index 44ddb0246..49d6ab26f 100644 --- a/libnd4j/include/loops/cuda/transform/transform_float.cu +++ b/libnd4j/include/loops/cuda/transform/transform_float.cu @@ -142,18 +142,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformFloat(...) failed"); } - template - void TransformFloat::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformFloat::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformFloat, , LIBND4J_TYPES, FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_same.cu b/libnd4j/include/loops/cuda/transform/transform_same.cu index e59381fba..4c587111b 100644 --- a/libnd4j/include/loops/cuda/transform/transform_same.cu +++ b/libnd4j/include/loops/cuda/transform/transform_same.cu @@ -118,17 +118,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformSame(...) failed"); } - template - void TransformSame::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformSame::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_strict.cu b/libnd4j/include/loops/cuda/transform/transform_strict.cu index 0befdf35f..1136ef695 100644 --- a/libnd4j/include/loops/cuda/transform/transform_strict.cu +++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu @@ -119,17 +119,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformStrict(...) 
failed"); } - template - void TransformStrict::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformStrict::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformStrict, , FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp index dc85b9554..5a4a9db41 100644 --- a/libnd4j/include/loops/impl/type_conversions.cpp +++ b/libnd4j/include/loops/impl/type_conversions.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -79,10 +80,13 @@ namespace nd4j { auto amin = nd4j::math::nd4j_abs(min); // now we actually apply quantization - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < N; e++) { - rz[e] = static_cast(nd4j::math::nd4j_round(1.0f * x[e] / nd4j::math::nd4j_max(amax, amin) * max_byte)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + rz[e] = static_cast(nd4j::math::nd4j_round(1.0f * x[e] / nd4j::math::nd4j_max(amax, amin) * max_byte)); + } + }; + + samediff::Threads::parallel_for(func, 0, N); } template @@ -172,12 +176,15 @@ PRAGMA_OMP_ATOMIC_ARGS(write) // we use 3 as offset, since first 12 bytes are occupied with header int flimit = limit + 4; - PRAGMA_OMP_PARALLEL_FOR_IF(flimit > Environment::getInstance()->elementwiseThreshold()) - for (int e = 4; e < flimit; e++) { - int el = x[e]; - int ael = nd4j::math::nd4j_abs(el) - 1; - z[ael] += el > 0 ? threshold : -threshold; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + int el = x[e]; + int ael = nd4j::math::nd4j_abs(el) - 1; + z[ael] += el > 0 ? 
threshold : -threshold; + } + }; + + samediff::Threads::parallel_for(func, 4, flimit); } /** @@ -194,19 +201,12 @@ PRAGMA_OMP_ATOMIC_ARGS(write) auto x = reinterpret_cast(dx); auto z = reinterpret_cast(dz); - if (N < nd4j::Environment::getInstance()->elementwiseThreshold()) { - for (int i = 0; i < N; i++) { - // FIXME: get rid of through-float though + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { z[i] = static_cast(static_cast(x[i])); } - } else { - - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < N; i++) { - // FIXME: get rid of through-float though - z[i] = static_cast(static_cast(x[i])); - } - } + }; + samediff::Threads::parallel_for(func, 0, N); }; template void TypeCast::convertFromThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); diff --git a/libnd4j/include/loops/indexreduce.h b/libnd4j/include/loops/indexreduce.h index 792ed16a9..ad4472dec 100755 --- a/libnd4j/include/loops/indexreduce.h +++ b/libnd4j/include/loops/indexreduce.h @@ -37,10 +37,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include @@ -70,7 +66,7 @@ namespace functions { static _CUDA_H void executeIndexReduceScalar(dim3 launchDims, cudaStream_t *stream, const int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets); static _CUDA_H void executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets); -#endif +#else static Nd4jLong execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams); @@ -81,6 +77,7 @@ namespace functions { template static _CUDA_H void exec(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset); +#endif }; } } diff --git a/libnd4j/include/loops/legacy_ops.h b/libnd4j/include/loops/legacy_ops.h index 0e5200321..92fd58d7a 100644 --- a/libnd4j/include/loops/legacy_ops.h +++ b/libnd4j/include/loops/legacy_ops.h @@ -92,8 +92,6 @@ (5, TimesOneMinus), \ (6, Cube), \ (7, OneMinus), \ - (8, Col2Im), \ - (9, Im2col),\ (11, Reciprocal), \ (12, Square), \ (13, CompareAndSetTransform) ,\ @@ -101,7 +99,6 @@ (17, Ceiling), \ (18, Floor), \ (19, ClipByValue) ,\ - (20, Reverse), \ (21, Copy) #define TRANSFORM_ANY_OPS \ diff --git a/libnd4j/include/loops/pairwise_bool.h b/libnd4j/include/loops/pairwise_bool.h index 0ff4ebdee..f7a65c3f5 100644 --- a/libnd4j/include/loops/pairwise_bool.h +++ b/libnd4j/include/loops/pairwise_bool.h @@ -40,11 +40,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -68,8 +63,7 @@ namespace functions { static __host__ void executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, void *extraParams); -#endif - public: +#else static void exec( const int opNum, @@ -79,7 +73,9 @@ namespace functions { Nd4jLong *yShapeBuffer, void *result, Nd4jLong 
*resultShapeBuffer, - void *extraParams); + void *extraParams, + const uint64_t start, + const uint64_t stop); static void exec( const int opNum, @@ -90,7 +86,9 @@ namespace functions { void *result, Nd4jLong resultStride, void *extraParams, - Nd4jLong n); + Nd4jLong n, + const uint64_t start, + const uint64_t stop); template @@ -101,7 +99,9 @@ namespace functions { Nd4jLong* yShapeBuffer, void *vresult, Nd4jLong* resultShapeBuffer, - void *vextraParams); + void *vextraParams, + const uint64_t start, + const uint64_t stop); template static void exec(void *vx, @@ -111,7 +111,10 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong n); + const Nd4jLong n, + const uint64_t start, + const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/pairwise_int.h b/libnd4j/include/loops/pairwise_int.h index 14d273285..aa6437d17 100644 --- a/libnd4j/include/loops/pairwise_int.h +++ b/libnd4j/include/loops/pairwise_int.h @@ -40,10 +40,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -68,8 +64,7 @@ namespace functions { static __host__ void executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, void *extraParams); -#endif - public: +#else static void exec( const int opNum, @@ -79,7 +74,9 @@ namespace functions { Nd4jLong *yShapeBuffer, void *result, Nd4jLong *resultShapeBuffer, - void *extraParams); + void *extraParams, + const uint64_t start, + const uint64_t stop); static void exec( const int opNum, @@ -90,7 +87,9 @@ namespace functions { void *result, Nd4jLong resultStride, void *extraParams, - Nd4jLong n); + Nd4jLong n, + const uint64_t start, + const uint64_t stop); template @@ -101,7 +100,9 @@ namespace functions { Nd4jLong* yShapeBuffer, void *vresult, Nd4jLong* resultShapeBuffer, - void *vextraParams); + void *vextraParams, + const uint64_t start, + const uint64_t stop); template static void exec(void *vx, @@ -111,7 +112,10 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong n); + const Nd4jLong n, + const uint64_t start, + const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/pairwise_transform.h b/libnd4j/include/loops/pairwise_transform.h index 4fe3eb0cc..0109b309f 100755 --- a/libnd4j/include/loops/pairwise_transform.h +++ b/libnd4j/include/loops/pairwise_transform.h @@ -41,12 +41,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - - namespace functions { namespace pairwise_transforms { @@ -76,7 +70,9 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams); + void *extraParams, + uint64_t start, + uint64_t stop); static void exec( const int opNum, @@ -87,7 +83,9 @@ namespace functions { void *z, Nd4jLong resultStride, void *extraParams, - Nd4jLong len); + Nd4jLong len, + uint64_t start, + uint64_t stop); template @@ -98,7 +96,9 @@ namespace functions { Nd4jLong* yShapeInfo, void *vresult, Nd4jLong* zShapeInfo, - void *vextraParams); + void *vextraParams, + uint64_t start, + uint64_t stop); template static void exec(void *vx, @@ -108,7 +108,9 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong len); + Nd4jLong len, + uint64_t start, + uint64_t stop); }; } } diff --git a/libnd4j/include/loops/random.h 
b/libnd4j/include/loops/random.h index 620187b82..5048e5ce0 100644 --- a/libnd4j/include/loops/random.h +++ b/libnd4j/include/loops/random.h @@ -52,7 +52,7 @@ namespace functions { static _CUDA_H void executeCudaSingle(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static _CUDA_H void executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static _CUDA_H void executeCudaTriple(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); -#endif +#else template static void execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); @@ -66,6 +66,7 @@ namespace functions { static void execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static void execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static void execTransform(int opNum, Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); +#endif }; } } diff --git a/libnd4j/include/loops/reduce3.h b/libnd4j/include/loops/reduce3.h index 781a17bb7..178bac7c2 100755 --- a/libnd4j/include/loops/reduce3.h +++ b/libnd4j/include/loops/reduce3.h @@ -44,10 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -114,7 +110,7 @@ class Reduce3 { -#endif +#else template static void execScalar(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo); @@ -124,25 +120,25 @@ class Reduce3 { template - static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength); + static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int64_t start, int64_t stop); template - static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop); template - static void execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets); + static void execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop); - static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void 
*extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength); + static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int64_t start, int64_t stop); - static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop); - static void execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets); - + static void execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop); +#endif }; diff --git a/libnd4j/include/loops/reduce_bool.h b/libnd4j/include/loops/reduce_bool.h index 89df1330f..540a6041d 100644 --- a/libnd4j/include/loops/reduce_bool.h +++ b/libnd4j/include/loops/reduce_bool.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,10 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -77,7 +72,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -121,7 +116,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -145,7 +140,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -178,8 +173,10 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; + #ifdef __CUDACC__ /** * diff --git a/libnd4j/include/loops/reduce_float.h b/libnd4j/include/loops/reduce_float.h index 9856e1d8e..ff2c0e668 100644 --- a/libnd4j/include/loops/reduce_float.h +++ b/libnd4j/include/loops/reduce_float.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,10 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define 
omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -79,7 +74,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShape, Nd4jLong *hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -123,7 +118,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -147,7 +142,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -180,8 +175,10 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; + #ifdef __CUDACC__ /** * diff --git a/libnd4j/include/loops/reduce_long.h b/libnd4j/include/loops/reduce_long.h index 193160074..a5d2a9498 100644 --- a/libnd4j/include/loops/reduce_long.h +++ b/libnd4j/include/loops/reduce_long.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,11 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" //an op for the kernel @@ -78,7 +72,7 @@ namespace functions { static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -122,7 +116,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -146,7 +140,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -179,6 +173,7 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/reduce_same.h b/libnd4j/include/loops/reduce_same.h index c7f5f9173..e828ecf46 100644 --- a/libnd4j/include/loops/reduce_same.h +++ b/libnd4j/include/loops/reduce_same.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,11 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" //an op for the kernel @@ -80,7 +74,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); 
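    // A minimal usage sketch of the recurring CPU-side change in the reduce_*, pairwise_* and
    // scalar_* headers above and below: the added "start/stop" (int64_t or uint64_t) parameters
    // hand each worker a half-open slice of the element/TAD range, so the new samediff thread
    // pool, rather than OpenMP, decides how work is split. The PRAGMA_THREADS_FOR lambda shape
    // and samediff::Threads::parallel_for call are taken from other hunks in this patch
    // (openmp_pragmas.h and type_conversions.cpp); "process" and "length" are placeholders,
    // not names from the library. Callers are presumably expected to forward the thread's
    // [start, stop) slice straight into the sliced exec/transform overloads:
    //
    //   auto func = PRAGMA_THREADS_FOR {
    //       // this lambda body runs once per worker with its own [start, stop) slice
    //       for (auto e = start; e < stop; e += increment) {
    //           process(e);   // placeholder for the per-element or per-TAD work
    //       }
    //   };
    //   samediff::Threads::parallel_for(func, 0, length);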
static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -124,7 +118,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -148,7 +142,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -181,6 +175,8 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); + +#endif }; #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/scalar.h b/libnd4j/include/loops/scalar.h index b2ee46dba..0f32dedf3 100755 --- a/libnd4j/include/loops/scalar.h +++ b/libnd4j/include/loops/scalar.h @@ -70,15 +70,15 @@ namespace functions { __host__ static void executeCudaAlongDimension(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalars, void *extraParams, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, bool allowParallelism); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong len, bool allowParallelism); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t len, const uint64_t start, const uint64_t stop); @@ -101,7 +101,7 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, bool 
allowParallelism); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -117,7 +117,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong len, bool allowParallelism); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t len, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/scalar_bool.h b/libnd4j/include/loops/scalar_bool.h index ddc039d89..a5931ddfb 100644 --- a/libnd4j/include/loops/scalar_bool.h +++ b/libnd4j/include/loops/scalar_bool.h @@ -86,15 +86,15 @@ namespace functions { /* #include "cuda/scalar_temp.cu" */ -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); @@ -117,7 +117,7 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -133,7 +133,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/scalar_int.h 
b/libnd4j/include/loops/scalar_int.h index f873d5419..509d7574f 100644 --- a/libnd4j/include/loops/scalar_int.h +++ b/libnd4j/include/loops/scalar_int.h @@ -83,18 +83,15 @@ namespace functions { static void executeCudaAlongDimension(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalars, void *extraParams, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); -/* -#include "cuda/scalar_temp.cu" -*/ -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); @@ -117,7 +114,7 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -133,7 +130,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/summarystatsreduce.h b/libnd4j/include/loops/summarystatsreduce.h index 915293904..afaee9c47 100755 --- a/libnd4j/include/loops/summarystatsreduce.h +++ b/libnd4j/include/loops/summarystatsreduce.h @@ -286,7 +286,7 @@ namespace functions { static _CUDA_H void execSummaryStatsReduceScalar(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong 
*hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer); static _CUDA_H void execSummaryStatsReduce(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer); static _CUDA_H void execSummaryStatsReduce(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer); -#endif +#else static Z execScalar(int opNum, bool biasCorrected, @@ -335,7 +335,7 @@ namespace functions { Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength); - +#endif }; } } diff --git a/libnd4j/include/loops/transform_any.h b/libnd4j/include/loops/transform_any.h index ab9ad47c4..d97e3e90e 100644 --- a/libnd4j/include/loops/transform_any.h +++ b/libnd4j/include/loops/transform_any.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -69,12 +64,12 @@ class TransformAny { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } diff --git a/libnd4j/include/loops/transform_bool.h b/libnd4j/include/loops/transform_bool.h index ee416ea87..4c87ae58c 100644 --- a/libnd4j/include/loops/transform_bool.h +++ b/libnd4j/include/loops/transform_bool.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -78,12 +73,12 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, 
void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_float.h b/libnd4j/include/loops/transform_float.h index 66547ee79..ae28e069f 100644 --- a/libnd4j/include/loops/transform_float.h +++ b/libnd4j/include/loops/transform_float.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -102,11 +97,12 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_same.h b/libnd4j/include/loops/transform_same.h index ef646a1b6..ae5b498e6 100644 --- a/libnd4j/include/loops/transform_same.h +++ b/libnd4j/include/loops/transform_same.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -79,12 +74,13 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_strict.h b/libnd4j/include/loops/transform_strict.h index 
fe520743e..96917ebc1 100644 --- a/libnd4j/include/loops/transform_strict.h +++ b/libnd4j/include/loops/transform_strict.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -79,12 +74,16 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + + + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); + +#endif }; } } diff --git a/libnd4j/include/msvc.h b/libnd4j/include/msvc.h new file mode 100644 index 000000000..c884736f3 --- /dev/null +++ b/libnd4j/include/msvc.h @@ -0,0 +1,39 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_MSVC_H +#define SAMEDIFF_MSVC_H + +#if defined(_MSC_VER) + +#pragma warning( disable : 4244 ) +#pragma warning( disable : 4267 ) +#pragma warning( disable : 4251 ) +#pragma warning( disable : 4101 ) +#pragma warning( disable : 4305 ) +#pragma warning( disable : 4309 ) +#pragma warning( disable : 4333 ) +#pragma warning( disable : 4146 ) +#pragma warning( disable : 4018 ) +#pragma warning( disable : 4297 ) + +#endif + +#endif //DEV_TESTS_MSVC_H diff --git a/libnd4j/include/op_boilerplate.h b/libnd4j/include/op_boilerplate.h index 4f70d9bf2..102a1776a 100644 --- a/libnd4j/include/op_boilerplate.h +++ b/libnd4j/include/op_boilerplate.h @@ -1461,7 +1461,7 @@ #ifdef _RELEASE -#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } +#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT))); } #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { auto erc_##VARIABLE = cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; #else @@ -1528,6 +1528,7 @@ #elif _MSC_VER #define FORCEINLINE __forceinline #elif __GNUC__ +#define INLINE_LOOPS #define FORCEINLINE __attribute__((always_inline)) inline #elif __CUDACC__ #define FORCEINLINE __forceinline__ inline diff --git a/libnd4j/include/openmp_pragmas.h b/libnd4j/include/openmp_pragmas.h index f1d4a8f67..667f54521 100644 --- a/libnd4j/include/openmp_pragmas.h +++ b/libnd4j/include/openmp_pragmas.h @@ -23,7 +23,7 @@ #if defined(_MSC_VER) -#define OMP_STRINGIFY(args) +#define OMP_STRINGIFY(args) #args #define OMP_IF(args) #define OMP_SCHEDULE(args) #define OMP_MAXT @@ -32,7 +32,7 @@ #define PRAGMA_OMP_ATOMIC #define PRAGMA_OMP_ATOMIC_ARGS(args) #define PRAGMA_OMP_CRITICAL -#define PRAGMA_OMP_SIMD +#define PRAGMA_OMP_SIMD __pragma(omp simd) #define PRAGMA_OMP_SIMD_ARGS(args) #define PRAGMA_OMP_SIMD_SUM(args) #define PRAGMA_OMP_SIMD_MAX(args) @@ -61,6 +61,7 @@ #else + #define OMP_STRINGIFY(args) #args #define OMP_IF(args) if(args) #define OMP_SCHEDULE(args) schedule(args) @@ -99,4 +100,39 @@ #endif +// reductions +#define FUNC_RL std::function +#define FUNC_AL std::function + +// aggregation functions +#define FUNC_RD std::function +#define FUNC_AD std::function + +// parallel block +#define FUNC_DO std::function + +// parallel_for block +#define FUNC_1D std::function +#define FUNC_2D std::function +#define FUNC_3D std::function + +// aggregation lambda +#define LAMBDA_AL [&](int64_t _old, int64_t _new) -> int64_t +#define LAMBDA_AD [&](double _old, double _new) -> double + +#define LAMBDA_SUML LAMBDA_AL {return _old + _new; } +#define LAMBDA_SUMD LAMBDA_AD 
{return _old + _new; } + +// reduction lambda +#define PRAGMA_REDUCE_LONG [&] (uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) mutable -> int64_t +#define PRAGMA_REDUCE_DOUBLE [&] (uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) mutable -> double + +// paralllel block lambda +#define PRAGMA_THREADS_DO [&](uint64_t thread_id, uint64_t numThreads) -> void + +// paralllel_for lambdas +#define PRAGMA_THREADS_FOR [&](uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) -> void +#define PRAGMA_THREADS_FOR_2D [&](uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y) -> void +#define PRAGMA_THREADS_FOR_3D [&](uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z) -> void + #endif //DEV_TESTS_OPENMP_PRAGMAS_H diff --git a/libnd4j/include/ops/aggregate_ops.h b/libnd4j/include/ops/aggregate_ops.h deleted file mode 100644 index a10a2912e..000000000 --- a/libnd4j/include/ops/aggregate_ops.h +++ /dev/null @@ -1,996 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// -#ifndef LIBND4J_AGGREGATE_OPS_H -#define LIBND4J_AGGREGATE_OPS_H - -#include -#include - -#define HS_MAX_EXP 6.0f - -#ifdef __CUDACC__ -#define aggregate_def __device__ inline static -#else -#include -#define aggregate_def inline static -#endif -/* - * - * - * Aggregate Ops are special things suited for CUDA mostly. They are meant to be executed within single block ONLY. - * So, when batched, they should provide proper parallelism levels on poorly parallel tasks otherwise. 
- * - * On CPU aggregate ops are trying to minimize OpenMP multi-threading use, only SIMD is enforced - * - * - */ -namespace aggregateOps { - - template - class GEMM { - public: -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - // no-op - } -#endif - -#ifndef __CUDACC__ - static CBLAS_ORDER convertOrder(int from) { - switch(from) { - //'c' - case 99: - return CblasRowMajor; - //'C' - case 67: return CblasRowMajor; - //'f' - case 102: return CblasColMajor; - //'F' - case 70: return CblasColMajor; - default: return CblasColMajor; - - } - } - - - static CBLAS_TRANSPOSE convertTranspose(int from) { - switch(from) { - //'t' - case 116: return CblasTrans; - //'T' - case 84: return CblasTrans; - //'n' - case 110: return CblasNoTrans; - //'N' - case 78: return CblasNoTrans; - //'c' - case 99: return CblasConjTrans; - //'C' - case 67: return CblasConjTrans; - default: return CblasNoTrans; - } - } -#endif - -#ifndef __CUDACC__ - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int M = indexArguments[0]; - int N = indexArguments[1]; - int K = indexArguments[2]; - int lda = indexArguments[3]; - int ldb = indexArguments[4]; - int ldc = indexArguments[5]; - int TransA = indexArguments[6]; - int TransB = indexArguments[7]; - int Order = indexArguments[8]; - - T alpha = realArguments[0]; - T beta = realArguments[1]; - - T *A = arguments[0]; - T *B = arguments[1]; - T *C = arguments[2]; - - nd4j::blas::GEMM::op(convertOrder(Order), convertTranspose(TransA), convertTranspose(TransB),M,N,K,(T) alpha,A,lda,B,ldb,(T) beta,C,ldc); - } -#else - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - // stub for nvcc - } -#endif - }; - - /** - * We don't include this class into ops directly, since it won't be ever used directly, - * Only as part of SkipGram or CBOW - */ - template - class HierarchicSoftmax { - private: - - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int expLength = indexArguments[1]; - int code = indexArguments[2]; - int isInference = indexArguments[3]; - - T *syn0 = arguments[0]; // we pass row pointer here - T *syn1 = arguments[1]; // we pass row pointer here - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - - T dot(0.0f); - T g(0.0f); - T f(0.0f); - T alpha = realArguments[0]; - - //nd4j_printf("Vector length: [%i]; expLength: [%i]; Code: [%i]; Inf: [%i]\n", vectorLength, expLength, code, isInference); - - -// shape::printArray(syn0, vectorLength, "syn0"); -// shape::printArray(syn1, vectorLength, "syn1"); -// shape::printArray(neu1e, vectorLength, "neu1e"); - - // dot - for (int x = 0; x < vectorLength; x++) { - dot += syn0[x] * syn1[x]; - } - - // gradient - if (dot < (T) - HS_MAX_EXP || dot >= (T) HS_MAX_EXP) { - return; - } - - int idx = 
static_cast((dot + HS_MAX_EXP) * ((T) expLength / HS_MAX_EXP / 2.0f)); - - if (idx >= expLength || idx < 0) { - return; - } - - f = expTable[idx]; - g = (static_cast(1.0f) - static_cast(code) - f) * alpha; - - //nd4j_printf("dot: [%f]; idx: [%i]; f: [%f]; g: [%f]\n", (float) dot, idx, (float) f, (float) g); - - // axpy1 - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - neu1e[x] = g * syn1[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn1[x] = g * syn0[x] + syn1[x]; - } - } - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - /* - We know that syn0 & syn1 are 2D matrices, so we can just use offsets here - */ - __shared__ int vectorLength; - __shared__ int expLength; - __shared__ int code; - __shared__ int isInference; - - T *syn0 = arguments[0]; - T *syn1 = arguments[1]; - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - __shared__ T dot; - __shared__ T g; - __shared__ T f; - __shared__ T alpha; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - expLength = indexArguments[1]; - code = indexArguments[2]; - isInference = indexArguments[3]; - - dot = (T) 0.0f; - - alpha = realArguments[0]; - } - __syncthreads(); - - - // TODO: it would be great to implement dot without atomicAdd call. like aggregateParticles, or something like that - // dot - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - T prod = syn0[x] * syn1[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - - - // gradient - __syncthreads(); - - if (dot < - (T) HS_MAX_EXP || dot >= (T) HS_MAX_EXP) - return; - - int idx = (int) ((dot + HS_MAX_EXP) * ((T) expLength / (T) HS_MAX_EXP / 2.0)); - - if (idx >= expLength) - return; - - - if (threadIdx.x == 0) { - // gradient calculation - f = expTable[idx]; - g = ((T) 1.0f - (T) code - f) * alpha; - } - __syncthreads(); - - // axpy1 - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - neu1e[x] = g * syn1[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - syn1[x] = g * syn0[x] + syn1[x]; - } - } -#endif - }; - - /** - * We don't include this class into ops directly, since it won't be ever used directly, - * Only as part of SkipGram or CBOW - */ - template - class NegativeSampling { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int expLength = indexArguments[1]; - int code = indexArguments[2]; - int isInference = indexArguments[3]; - - T *syn0 = arguments[0]; // we pass row pointer here - T *syn1Neg = arguments[1]; // we pass row pointer here - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - T dot = (T) 0.0f; - T g = (T) 0.0f; - T alpha = realArguments[0]; - - // dot - for (int x = 0; x < vectorLength; x++) { - dot += syn0[x] * syn1Neg[x]; - } - - if (dot > HS_MAX_EXP) - g = (code - 1) * alpha; - else if (dot < (T) - HS_MAX_EXP) - g = (code - 0) * alpha; - else { - int idx = (int) ((dot + (T) HS_MAX_EXP) * ((T) expLength / HS_MAX_EXP / 2.0)); - if (idx >= expLength) - return; - - if (idx < 0) - return; - - g = ((T) 
code - expTable[idx]) * alpha; - } - - // axpy1 - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - neu1e[x] = g * syn1Neg[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn1Neg[x] = g * syn0[x] + syn1Neg[x]; - } - } - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - /* - We know that syn0 & syn1 are 2D matrices, so we can just use offsets here - */ - __shared__ int vectorLength; - __shared__ int expLength; - __shared__ int code; - __shared__ int isInference; - - T *syn0 = arguments[0]; - T *syn1Neg = arguments[1]; - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - __shared__ T dot; - __shared__ T g; - __shared__ T alpha; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - expLength = indexArguments[1]; - code = indexArguments[2]; - isInference = indexArguments[3]; - - dot = (T) 0.0f; - - alpha = realArguments[0]; - } - __syncthreads(); - - - // TODO: it would be great to implement dot without atomicAdd call. like aggregateParticles, or something like that - // dot - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - T prod = syn0[x] * syn1Neg[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - - - // gradient - __syncthreads(); - - - int idx = (int) ((dot + (T) HS_MAX_EXP) * ((T) expLength / (T) HS_MAX_EXP / 2.0)); - if (idx >= expLength && dot <= (T) HS_MAX_EXP && dot >= (T) -HS_MAX_EXP) - return; - - - if (threadIdx.x == 0) { - // gradient calculation - if (dot > (T) HS_MAX_EXP) - g = (code - 1) * alpha; - else if (dot < (T) - HS_MAX_EXP) - g = (code - 0) * alpha; - else { - - - g = ((T) code - expTable[idx]) * alpha; - } - - // printf("dot: [%f]; g: [%f]\n", dot, g); - } - __syncthreads(); - - // printf("before syn1Neg[%i]: [%f], dot: [%f]; g: [%f]; vectorLength: [%i]\n", threadIdx.x, syn1Neg[threadIdx.x], dot, g, vectorLength); - - // axpy1 - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - neu1e[x] = g * syn1Neg[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - syn1Neg[x] = g * syn0[x] + syn1Neg[x]; - } - - // printf("after syn1Neg[%i]: [%f]\n", threadIdx.x, syn1Neg[threadIdx.x]); - - } -#endif - }; - - template - class Dot { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - T *vecZ = arguments[2]; - - T dot = (T) 0.0f; - - int vectorLength = indexArguments[0]; - - PRAGMA_OMP_SIMD_SUM(dot) - for (int x = 0; x < vectorLength; x++) { - dot += vecX[x] * vecY[x]; - } - - vecZ[0] = dot; - }; - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - T *vecZ = arguments[2]; - - int vectorLength = indexArguments[0]; - - __shared__ T dot; - if (threadIdx.x == 0) - dot = (T) 0.0f; - __syncthreads(); - - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) 
{ - T prod = vecX[x] * vecY[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - __syncthreads(); - - if (threadIdx.x == 0) - vecZ[0] = dot; - } -#endif - }; - - template - class Axpy { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - - T alpha = realArguments[0]; - - int vectorLength = indexArguments[0]; - - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - vecY[x] = alpha * vecX[x] + vecY[x]; - } - }; - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - - T alpha = realArguments[0]; - - int vectorLength = indexArguments[0]; - - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - vecY[x] = alpha * vecX[x] + vecY[x]; - } - __syncthreads(); - } -#endif - }; - - - template - class SkipGram { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int syn0Row = indexArguments[0]; - int vectorLength = indexArguments[1]; - int hsRounds = indexArguments[2]; - int ngRounds = indexArguments[3]; - int expLength = indexArguments[4]; - int vocabSize = indexArguments[5]; - int ngStarter = indexArguments[6]; - int negTableLength = indexArguments[7]; - int isInference = indexArguments[8]; - - - auto neu1e = new T[vectorLength]; - std::memset(neu1e, 0, sizeof(T) * vectorLength); - - T *args[4]; - int idxArgs[4]; - - args[1] = arguments[1]; // syn1 - args[2] = arguments[2]; // expTable - args[3] = neu1e; - - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - T *syn0 = isInference == 1 ? 
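// A minimal, self-contained sketch of the gradient step used by the HierarchicSoftmax /
// NegativeSampling aggregates above: the sigmoid is read from a precomputed exp table,
// the gradient coefficient g is formed from the code bit and the learning rate, and two
// axpy updates propagate it into the error accumulator and the output row. The helper
// names (buildExpTable, hsStep, hsMaxExp) are illustrative, not library API.
#include <cmath>
#include <vector>

static std::vector<float> buildExpTable(int expLength, float hsMaxExp) {
    std::vector<float> table(expLength);
    for (int i = 0; i < expLength; i++) {
        float e = std::exp((i / (float) expLength * 2.0f - 1.0f) * hsMaxExp);
        table[i] = e / (e + 1.0f);                                 // sigmoid sampled on [-hsMaxExp, hsMaxExp]
    }
    return table;
}

static void hsStep(const float *syn0, float *syn1, float *neu1e, int vectorLength,
                   const std::vector<float> &expTable, float hsMaxExp,
                   int code, float alpha, bool isInference) {
    float dot = 0.0f;
    for (int x = 0; x < vectorLength; x++)
        dot += syn0[x] * syn1[x];

    float g;
    if (dot >= hsMaxExp)        g = (code - 1) * alpha;            // sigmoid saturated at 1
    else if (dot <= -hsMaxExp)  g = (code - 0) * alpha;            // sigmoid saturated at 0
    else {
        int idx = (int) ((dot + hsMaxExp) * (expTable.size() / hsMaxExp / 2.0f));
        g = (code - expTable[idx]) * alpha;
    }

    for (int x = 0; x < vectorLength; x++)                         // axpy1: accumulate error
        neu1e[x] += g * syn1[x];

    if (!isInference)
        for (int x = 0; x < vectorLength; x++)                     // axpy2: update output row
            syn1[x] += g * syn0[x];
}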
inferenceVector : arguments[0] + (syn0Row * vectorLength); - - args[0] = syn0;// syn0 - - int *idxSyn1 = intArrays[0]; - int *codes = intArrays[1]; - - //nd4j_printf("syn0Row: [%i]; vecLen: [%i]; hsRounds: [%i]; ngRounds: [%i]; expLength: [%i]; vocabSize: [%i]; ngStarter: [%i]; negTableLength: [%i]; isInf: [%i]\n", syn0Row, vectorLength, hsRounds, ngRounds, expLength, vocabSize, ngStarter, negTableLength, isInference); - - auto next_random = static_cast(realArguments[1]); - - if (hsRounds > 0) { - for (int r = 0; r < hsRounds; r++) { - args[1] = arguments[1] + (idxSyn1[r] * vectorLength); // syn1 row - idxArgs[2] = codes[r]; // code for row - - //nd4j_printf("idx syn1: [%i]; code: [%i]\n", idxSyn1[r], idxArgs[2]); - - HierarchicSoftmax::executeAggregate(args, 4, nullptr, 0, idxArgs, 5, nullptr, 0, realArguments, 1); - } - } - - - - int target = ngStarter; - if (ngRounds > 0) { - for (int r = 0; r < ngRounds + 1; r++) { - if (r == 0) { - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - if (target == ngStarter) - continue; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - - NegativeSampling::executeAggregate(args, 4, nullptr, 0, idxArgs, 5, nullptr, 0, realArguments, 1); - } - } - - //nd4j_printf("applying...\n",""); - - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn0[x] += neu1e[x]; - } - } else { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - inferenceVector[x] += neu1e[x]; - } - } - - delete[] neu1e; - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - __shared__ int syn0Row; - __shared__ int vectorLength; - __shared__ int hsRounds; - __shared__ int ngRounds; - __shared__ int expLength; - __shared__ int vocabSize; - __shared__ int ngStarter; - __shared__ int negTableLength; - __shared__ int isInference; - - __shared__ T *neu1e; - - __shared__ T *args[4]; - __shared__ int idxArgs[4]; - - - __shared__ unsigned long long next_random; - - __shared__ T *negTable; - T *syn1Neg = arguments[3]; - __shared__ T *inferenceVector; - - if (threadIdx.x == 0) { - extern __shared__ unsigned char shmem[]; - neu1e = (T *) shmem; - - syn0Row = indexArguments[0]; - vectorLength = indexArguments[1]; - hsRounds = indexArguments[2]; - ngRounds = indexArguments[3]; - expLength = indexArguments[4]; - vocabSize = indexArguments[5]; - ngStarter = indexArguments[6]; - negTableLength = indexArguments[7]; - isInference = indexArguments[8]; - - inferenceVector = arguments[5]; - - next_random = (unsigned long long) realArguments[1]; - - args[0] = isInference == 1 ? inferenceVector : arguments[0] + (syn0Row * vectorLength); // syn0 - args[1] = arguments[1]; // syn1 - args[2] = arguments[2]; // expTable - args[3] = neu1e; - - negTable = arguments[4]; - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - } - __syncthreads(); - - T *syn0 = isInference ? 
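// A minimal sketch of the negative-sampling target draw used in the SkipGram aggregate
// above: a linear congruential generator (the classic word2vec constants 25214903917
// and 11) indexes a precomputed unigram table, with a fallback when the drawn id falls
// outside the vocabulary. drawNegative and its parameters are illustrative; note the
// aggregate itself skips the round via `continue` when the draw equals the positive
// target instead of redrawing as done here.
static int drawNegative(const int *negTable, int negTableLength,
                        int vocabSize, int positiveTarget,
                        unsigned long long &nextRandom) {
    int target;
    do {
        nextRandom = nextRandom * 25214903917ULL + 11ULL;
        target = negTable[(nextRandom >> 16) % negTableLength];
        if (target <= 0 || target >= vocabSize)
            target = (int) (nextRandom % (vocabSize - 1)) + 1;     // keep id in [1, vocabSize)
    } while (target == positiveTarget);
    return target;
}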
inferenceVector : arguments[0] + (syn0Row * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i+=blockDim.x) { - neu1e[i] = (T) 0.0f; - } - - int *idxSyn1 = intArrays[0]; - int *codes = intArrays[1]; - - - for (int r = 0; r < hsRounds; r++) { - if (threadIdx.x == 0) { - args[1] = arguments[1] + (idxSyn1[r] * vectorLength);// syn1 row - idxArgs[2] = codes[r]; // code for row - } - __syncthreads(); - - HierarchicSoftmax::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 1); - } - __syncthreads(); - - - __shared__ int target; - if (ngRounds > 0) - for (int r = 0; r < ngRounds + 1; r++) { - if (threadIdx.x == 0) { - if (r == 0) { - // this line isn't a mistake - target = ngStarter; - - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long)25214903917 + 11 + blockIdx.x; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); - } - __syncthreads(); - - // we put it here, to make sure all threads pick up continue call - if (r != 0 && target == ngStarter) - continue; - - NegativeSampling::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 1); - } - - - - // final axpy with 1.0f as alpha - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+= blockDim.x) { - syn0[x] += neu1e[x]; - } - else - for (int x = threadIdx.x; x < vectorLength; x+= blockDim.x) { - inferenceVector[x] += neu1e[x]; - } - } -#endif - }; - - template - class CBOW { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, - T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int hsRounds = indexArguments[1]; - int ngRounds = indexArguments[2]; - int expLength = indexArguments[3]; - int vocabSize = indexArguments[4]; - int ngStarter = indexArguments[5]; - int negTableLength = indexArguments[6]; - int idxSyn0Length = indexArguments[7]; - //int initialIdx = indexArguments[8]; - int numLabels = indexArguments[9]; - int trainWords = indexArguments[10]; - int isInference = indexArguments[11]; - - - int *idxSyn0 = intArrays[0]; - int *idxSyn1 = intArrays[1]; - int *codes = intArrays[2]; - - - T *neu1 = new T[vectorLength]; - T *neu1e = new T[vectorLength]; - std::memset(neu1, 0, sizeof(T) * vectorLength); - std::memset(neu1e, 0, sizeof(T) * vectorLength); - - T *syn0 = arguments[0]; - T *syn1 = arguments[1]; - T *expTable = arguments[2]; - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - T *args[4]; - int idxArgs[4]; - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - - unsigned long long next_random = (unsigned long long) realArguments[1]; - - // building neu1 for current window - for (int c = 0; c < idxSyn0Length; c++) { - T *syn0word = syn0 + (idxSyn0[c] * vectorLength); - - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] += syn0word[i]; - } - } - - // for inference we use additional inference vector - if (isInference) { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] += inferenceVector[i]; - } - } - - - // average neu1 - if (idxSyn0Length > 0) { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] /= 
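// A minimal sketch of how the CBOW aggregate above builds its averaged context vector
// neu1: sum the syn0 rows of the context words, optionally add the inference vector,
// then divide by the number of contributors. buildNeu1 and its arguments are
// illustrative, not library API.
#include <cstring>

static void buildNeu1(const float *syn0, const int *contextIdx, int contextLen,
                      const float *inferenceVector, bool isInference,
                      float *neu1, int vectorLength) {
    std::memset(neu1, 0, sizeof(float) * vectorLength);

    for (int c = 0; c < contextLen; c++) {
        const float *row = syn0 + contextIdx[c] * vectorLength;
        for (int i = 0; i < vectorLength; i++)
            neu1[i] += row[i];
    }

    if (isInference)
        for (int i = 0; i < vectorLength; i++)
            neu1[i] += inferenceVector[i];

    if (contextLen > 0) {
        float denom = (float) (contextLen + (isInference ? 1 : 0));
        for (int i = 0; i < vectorLength; i++)
            neu1[i] /= denom;                                      // average over all contributors
    }
}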
idxSyn0Length + isInference; - } - } - - args[0] = neu1; - args[2] = expTable; - args[3] = neu1e; - - if (hsRounds > 0) - for (int i = 0; i < hsRounds; i++) { - args[1] = syn1 + (idxSyn1[i] * vectorLength); - idxArgs[2] = codes[i]; - - HierarchicSoftmax::executeAggregate((T **)args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - int target = ngStarter; - if (ngRounds > 0) - for (int i = 0; i < ngRounds + 1; i++) { - if (i == 0) { - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - if (target == ngStarter) - continue; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - - //printf("Negative round: target: [%i]; code: [%i]; neu1e[0]: [%f]\n", target, idxArgs[4], neu1e[0]); - - NegativeSampling::executeAggregate((T **)args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - - // if we don't train words - we skip start of idxSyn0 - int starter = trainWords == 1 ? 0 : idxSyn0Length - numLabels; - - // propagate neu1e -> syn0 - if (!isInference) { - for (int c = starter; c < idxSyn0Length; c++) { - T *syn0word = arguments[0] + (idxSyn0[c] * vectorLength); - - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - syn0word[i] += neu1e[i]; - } - } - } else { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - inferenceVector[i] += neu1e[i]; - } - } - - - - delete[] neu1; - delete[] neu1e; - } - - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, - T *realArguments, int numRealArguments) { - __shared__ int vectorLength; - __shared__ int hsRounds; - __shared__ int ngRounds; - __shared__ int expLength; - __shared__ int vocabSize; - __shared__ int ngStarter; - __shared__ int negTableLength; - __shared__ int idxSyn0Length; - __shared__ int initialIdx; - __shared__ int numLabels; - __shared__ int trainWords; - __shared__ int isInference; - - int *idxSyn0 = intArrays[0]; - int *idxSyn1 = intArrays[1]; - int *codes = intArrays[2]; - - __shared__ T *neu1; - __shared__ T *neu1e; - - __shared__ T *args[5]; - __shared__ int idxArgs[4]; - - T *syn0 = arguments[0]; - T *syn1 = arguments[1]; - //T *expTable = arguments[2]; - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - hsRounds = indexArguments[1]; - ngRounds = indexArguments[2]; - expLength = indexArguments[3]; - vocabSize = indexArguments[4]; - ngStarter = indexArguments[5]; - negTableLength = indexArguments[6]; - idxSyn0Length = indexArguments[7]; - initialIdx = indexArguments[8]; - numLabels = indexArguments[9]; - trainWords = indexArguments[10]; - isInference = indexArguments[11]; - - extern __shared__ unsigned char shmem[]; - neu1 = (T *) shmem; - neu1e = neu1 + vectorLength; - - args[0] = neu1; - args[2] = arguments[2]; //expTable - args[3] = neu1e; - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - } - __syncthreads(); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] = (T) 0.0f; - neu1e[i] = (T) 0.0f; - } - - unsigned long long next_random = (unsigned long long) realArguments[1]; - for 
(int c = 0; c < idxSyn0Length; c++) { - T *syn0word = syn0 + (idxSyn0[c] * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] += syn0word[i]; - } - } - - if (isInference) - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] += inferenceVector[i]; - } - - // average neu1 - if (idxSyn0Length > 0) { - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] /= idxSyn0Length + + isInference; - } - } - __syncthreads(); - - - - if (hsRounds > 0) - for (int i = 0; i < hsRounds; i++) { - if (threadIdx.x == 0) { - args[1] = syn1 + (idxSyn1[i] * vectorLength); - idxArgs[2] = codes[i]; - } - __syncthreads(); - - HierarchicSoftmax::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - __shared__ int target; - if (ngRounds > 0) - for (int i = 0; i < ngRounds + 1; i++) { - if (threadIdx.x == 0) { - if (i == 0) { - target = ngStarter; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - idxArgs[2] = i == 0 ? 1 : 0; - } - __syncthreads(); - - if (i != 0 && target == ngStarter) - continue; - - - NegativeSampling::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - - //printf("Negative round: target: [%i]; code: [%i]; neu1[%i]: [%f]; neu1e[%i]: [%f]\n", target, idxArgs[2], threadIdx.x, neu1[threadIdx.x], threadIdx.x, neu1e[threadIdx.x]); - } - - - // if we don't train words - we skip start of idxSyn0 - int starter = trainWords == 1 ? 0 : idxSyn0Length - numLabels; - - if (!isInference) - for (int c = starter; c < idxSyn0Length; c++) { - T *syn0word = arguments[0] + (idxSyn0[c] * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - syn0word[i] += neu1e[i]; - } - } - else { - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - inferenceVector[i] += neu1e[i]; - } - } - - } -#endif - }; - -} - -#endif //LIBND4J_AGGREGATE_OPS_H diff --git a/libnd4j/include/ops/declarable/BooleanOp.h b/libnd4j/include/ops/declarable/BooleanOp.h index b341ce394..b741c61c4 100644 --- a/libnd4j/include/ops/declarable/BooleanOp.h +++ b/libnd4j/include/ops/declarable/BooleanOp.h @@ -35,7 +35,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: BooleanOp(const char *name, int numInputs, bool scalar); - ~BooleanOp(); bool evaluate(std::initializer_list args); bool evaluate(std::vector& args); diff --git a/libnd4j/include/ops/declarable/BroadcastableOp.h b/libnd4j/include/ops/declarable/BroadcastableOp.h index bc2cddc59..39435195b 100644 --- a/libnd4j/include/ops/declarable/BroadcastableOp.h +++ b/libnd4j/include/ops/declarable/BroadcastableOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: BroadcastableOp(const char *name, int numTArgs, int numIArgs); - ~BroadcastableOp(); ShapeList *calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context& block) override; }; diff --git a/libnd4j/include/ops/declarable/DeclarableCustomOp.h b/libnd4j/include/ops/declarable/DeclarableCustomOp.h index 38cc20e71..49d3735d4 100644 --- a/libnd4j/include/ops/declarable/DeclarableCustomOp.h +++ b/libnd4j/include/ops/declarable/DeclarableCustomOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus 
validateAndExecute(Context& block) override = 0; public: DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); - ~DeclarableCustomOp(); ShapeList* calculateOutputShape(ShapeList* inputShapes, nd4j::graph::Context& block) override = 0; }; diff --git a/libnd4j/include/ops/declarable/DeclarableListOp.h b/libnd4j/include/ops/declarable/DeclarableListOp.h index 6fa4fe086..2d6115027 100644 --- a/libnd4j/include/ops/declarable/DeclarableListOp.h +++ b/libnd4j/include/ops/declarable/DeclarableListOp.h @@ -34,13 +34,12 @@ namespace nd4j { protected: Nd4jStatus validateAndExecute(Context& block) override = 0; - nd4j::NDArray* getZ(Context& block, int inputId); + nd4j::NDArray* getZ(Context& block, int inputId) ; void setupResult(NDArray* array, Context& block); void setupResultList(NDArrayList* arrayList, Context& block); public: DeclarableListOp(int numInputs, int numOutputs, const char* opName, int tArgs, int iArgs); - ~DeclarableListOp(); Nd4jStatus execute(Context* block) override; diff --git a/libnd4j/include/ops/declarable/DeclarableOp.h b/libnd4j/include/ops/declarable/DeclarableOp.h index f8c96d400..5da74860b 100644 --- a/libnd4j/include/ops/declarable/DeclarableOp.h +++ b/libnd4j/include/ops/declarable/DeclarableOp.h @@ -126,7 +126,7 @@ namespace nd4j { DeclarableOp(const char *name, bool isLogical); // default testructor - ~DeclarableOp(); + virtual ~DeclarableOp(); // this method returns OpDescriptor, describing this Op instance OpDescriptor *getOpDescriptor(); diff --git a/libnd4j/include/ops/declarable/DeclarableReductionOp.h b/libnd4j/include/ops/declarable/DeclarableReductionOp.h index 4a75c5daf..5306f60eb 100644 --- a/libnd4j/include/ops/declarable/DeclarableReductionOp.h +++ b/libnd4j/include/ops/declarable/DeclarableReductionOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: DeclarableReductionOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); - ~DeclarableReductionOp(); ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; }; diff --git a/libnd4j/include/ops/declarable/LegacyOp.h b/libnd4j/include/ops/declarable/LegacyOp.h index 951f60165..a7c7ad055 100644 --- a/libnd4j/include/ops/declarable/LegacyOp.h +++ b/libnd4j/include/ops/declarable/LegacyOp.h @@ -45,6 +45,7 @@ namespace nd4j { public: LegacyOp(int numInputs); LegacyOp(int numInputs, int opNum); + ~LegacyOp() = default; // All Op classes provide own specific implementation for this method ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override = 0; diff --git a/libnd4j/include/ops/declarable/LogicOp.h b/libnd4j/include/ops/declarable/LogicOp.h index 026afe634..70fa3a6ff 100644 --- a/libnd4j/include/ops/declarable/LogicOp.h +++ b/libnd4j/include/ops/declarable/LogicOp.h @@ -37,7 +37,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(nd4j::graph::Context& block) override; public: LogicOp(const char *name); - ~LogicOp() = default; ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; }; diff --git a/libnd4j/include/ops/declarable/OpTuple.h b/libnd4j/include/ops/declarable/OpTuple.h index e0296dd9c..fc0fd594a 100644 --- a/libnd4j/include/ops/declarable/OpTuple.h +++ b/libnd4j/include/ops/declarable/OpTuple.h @@ -29,7 +29,7 @@ namespace nd4j { namespace ops { class ND4J_EXPORT OpTuple { public: - const char * _opName; + std::string _opName; std::vector 
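// A minimal sketch of the destructor cleanup applied to the op headers above: once the
// base class declares a virtual destructor, deleting a derived op through a base pointer
// is well defined, and the empty destructors in the derived classes can simply be
// removed. Class names here are illustrative.
struct BaseOp {
    virtual ~BaseOp() = default;      // virtual: derived destructors run on delete via base*
};

struct ConcreteOp : BaseOp {
    // no destructor declared: the implicitly generated one is enough
};

void destroyThroughBase() {
    BaseOp *op = new ConcreteOp();
    delete op;                        // calls ~ConcreteOp() then ~BaseOp()
}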
_inputs; std::vector _outputs; std::vector _tArgs; diff --git a/libnd4j/include/ops/declarable/generic/blas/axpy.cpp b/libnd4j/include/ops/declarable/generic/blas/axpy.cpp index 986b93019..1b949eb35 100644 --- a/libnd4j/include/ops/declarable/generic/blas/axpy.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/axpy.cpp @@ -30,9 +30,10 @@ namespace nd4j { auto y = INPUT_VARIABLE(1); auto z = OUTPUT_VARIABLE(0); - REQUIRE_TRUE(x->isSameShape(y),0, "Axpy: both arguments should have the same shape") + REQUIRE_TRUE(x->isSameShape(y),0, "Axpy: both arguments should have the same shape"); + REQUIRE_TRUE(x->dataType() == y->dataType() && x->dataType() == z->dataType(), 0, "Axpy: all arguments must have the same data type"); - double a = (double) 1.0f; + double a = 1.0; if (block.width() > 2) { auto alpha = INPUT_VARIABLE(2); @@ -41,15 +42,6 @@ namespace nd4j { a = T_ARG(0); } - /* - auto lambda = LAMBDA_TT(_y, _x, a) { - return a * _x + _y; - }; - - y->applyPairwiseLambda(x, lambda, z); - */ - - // FIXME: set proper extras here ExtraArguments arguments({a}); y->applyPairwiseTransform(pairwise::Axpy, x, z, &arguments); @@ -59,9 +51,9 @@ namespace nd4j { DECLARE_TYPES(axpy) { getOpDescriptor() - ->setAllowedInputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}); } } } diff --git a/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp b/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp index 6897f7f77..ad7b7fee2 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp @@ -30,14 +30,6 @@ namespace nd4j { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - // TODO: once we add support for multiple dtypes - uncommend this - /* - int it = INT_ARG(0); - DataType newType = DataTypeUtils::fromInt(it); - - input->cast(output, newType); - */ - if(input->isEmpty()){ REQUIRE_TRUE(output->isEmpty(), 0, "If input is empty, output array must also be empty"); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp index 5641bab43..8b6bd24bc 100644 --- a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
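// A minimal reference for what the axpy op above computes once its shape and dtype
// checks pass: z[i] = a * x[i] + y[i] over same-shaped, same-typed buffers. The
// templated helper below is illustrative only.
template <typename T>
static void axpyReference(const T *x, const T *y, T *z, long long length, double a) {
    for (long long i = 0; i < length; i++)
        z[i] = static_cast<T>(a) * x[i] + y[i];
}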
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -88,8 +89,27 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0); // formula: output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta + // auto v = input->varianceAlongDimension(variance::SummaryStatsVariance, false, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + // auto m = input->reduceAlongDimension(nd4j::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + helpers::batchnorm(input, mean, variance, gamma, beta, output, axes, epsilon); + // NDArray stdInv = *v + epsilon; + // stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + // stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + // if(applyScale) + // stdInv *= *gamma; + + // // empty array with same shape as input + // input->applyBroadcast(nd4j::broadcast::Subtract, axes, m, output); + // output->applyBroadcast(nd4j::broadcast::Multiply, axes, &stdInv); + + // if(applyOffset) + // output->applyBroadcast(nd4j::broadcast::Add, axes, beta); + + // delete v; + // delete m; + return Status::OK(); } @@ -113,10 +133,9 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { NDArray* input = INPUT_VARIABLE(0); NDArray* mean = INPUT_VARIABLE(1); NDArray* variance = INPUT_VARIABLE(2); - NDArray* dLdO = INPUT_VARIABLE(3); // next epsilon NDArray* gamma = nullptr; NDArray* beta = nullptr; - + NDArray* dLdO = INPUT_VARIABLE(block.width() - 1); // next epsilon NDArray* dLdI = OUTPUT_VARIABLE(0); NDArray* dLdM = OUTPUT_VARIABLE(1); @@ -129,11 +148,11 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { const float epsilon = T_ARG(0); if(applyScale) { - gamma = INPUT_VARIABLE(4); + gamma = INPUT_VARIABLE(3); dLdG = OUTPUT_VARIABLE(3); } if(applyOffset) { - beta = INPUT_VARIABLE(4 + (int)applyScale); + beta = INPUT_VARIABLE(3 + (int)applyScale); dLdB = OUTPUT_VARIABLE(3 + (int)applyScale); } @@ -172,67 +191,120 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str()); // types of all input arrays should be the same (except dLdO) - for(int i = 1; i < block.width() - 1; ++i) - if(i != 3) - REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); + for(int i = 1; i < block.width() - 2; ++i) + REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); // ***** calculations ***** // - // formula for forward step: output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta + // notations: + // f = g * (gamma * ((x - m) / (v + eps)^0.5) + beta) -> means dLdO * ff_output + // g = dLdO + // stdInv = 1 / (v + eps)^0.5 + // N - batch size (product of spatial dimensions) - // consider mean and variance as constants (since we get them as inputs and don't calculate them) - // dLdI = (dLdO * gamma) / (variance + epsilon)^0.5 - // dLdV = (-0.5 * gamma * (dLdO * (x - mean))_sum) / (variance + epsilon)^1.5 - // dLdM = - (dLdO_sum * gamma) / (variance + epsilon)^0.5 - // dLdG = (dLdO * (x - mean))_sum / (variance + epsilon)^0.5 - // dLdB = 
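// A minimal per-element sketch of the batchnorm forward formula referenced in the hunk
// above: output = gamma * (input - mean) / sqrt(variance + epsilon) + beta, broadcast
// over the non-excluded axes. The scalar helper below is illustrative.
#include <cmath>

static float batchnormForwardElem(float x, float mean, float variance,
                                  float gamma, float beta, float eps) {
    float stdInv = 1.0f / std::sqrt(variance + eps);
    return gamma * (x - mean) * stdInv + beta;
}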
dLdO_sum + // derivatives: + // dLdI = dfdx + dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx) + + // dfdx = gamma*stdInv*g; + // dfdm = -gamma*stdInv*g_sum; + // dmdx = 1/N; + // dvdx = 2 * (x - m) / N + // dvdm = -2 * [(x - m)]_sum / N + // dfdv = -0.5 * [g*(x - m)]_sum * stdInv^3, drop gamma here for calc convenience + + // finally: + // dLdI = gamma * ( stdInv * (g - g_sum/N) + (2/N) * dfdv * (dvdm/2 + (x - m)) ) + + // dLdG = (g * (x - m))_sum * stdInv + // dLdB = g_sum + + // variance = input->varianceAlongDimension(variance::SummaryStatsVariance, false, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + // mean = input->reduceAlongDimension(nd4j::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); const auto excludedAxes = ShapeUtils::evalDimsToExclude(inRank, axes); - - NDArray temp1 = *variance + epsilon; - temp1.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) - auto temp2 = temp1.transform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 - if(applyScale) - temp2 *= *gamma; // gamma / (variance + epsilon)^0.5 - - NDArray temp3(input); // empty array with same shape as input - input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &temp3); // input - mean - temp3 *= *dLdO; // (input - mean) * dLdO - const bool keepUnitiesInShape = inRank == mean->rankOf(); - // dLdI - dLdO->applyBroadcast(nd4j::broadcast::Multiply, axes, &temp2, dLdI); + // inverse batch size 1/N + const float Ninv = 1.f * shape::tadLength(input->getShapeInfo(), axes.data(), axes.size()) / input->lengthOf(); - // dLdM - dLdO->reduceAlongDimension(reduce::Sum, dLdM, excludedAxes, keepUnitiesInShape); // dLdO sum over excluded axes + // input - mean + NDArray xMinusMean(input); // empty array with same shape as input + input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMinusMean); + + // stdInv + NDArray stdInv = *variance + epsilon; + stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + + // dvdm (use dLdM as storage for dvdm) + xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, dLdM, excludedAxes, keepUnitiesInShape); + *dLdM *= -Ninv; + + // g_sum + auto gSum = dLdO->reduceAlongDims(nd4j::reduce::Sum, excludedAxes, keepUnitiesInShape); // dLdB if(applyOffset) - dLdB->assign(dLdM); + dLdB->assign(gSum); - // dLdM - // dLdM->applyPairwiseTransform(nd4j::pairwise::Multiply, temp2); - // dLdM->applyTransform(nd4j::transform::Neg); - *dLdM = 0; // put zeros so far + // stdInv * (g - g_sum/N) (use dLdI as storage for this expression) + gSum *= Ninv; + dLdO->applyBroadcast(nd4j::broadcast::Subtract, axes, &gSum, dLdI); + dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, &stdInv); - //dLdV - temp3.reduceAlongDimension(reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); // ((input - mean) * dLdO)_sum + // dLdV <- [g*(x - m)]_sum + (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); // dLdG - if(applyScale) { - dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, &temp2, dLdG); - // dLdV->assign(dLdG); - dLdG->applyPairwiseTransform(nd4j::pairwise::Divide, *gamma); - } - else - // dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, temp2); + *dLdV *= stdInv; + if(applyScale) + dLdG->assign(dLdV); - // dLdV - // dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, temp1); - // *dLdV *= -0.5; + // (2 / N) * dfdv (use dLdV as storage for dfdv) + *dLdV *= stdInv*stdInv; // dLdV*stdInv * stdInv^2 + *dLdV *= -Ninv; // -0.5f * 
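// A minimal 1-D reference for the gradient the rewritten batchnorm_bp computes above.
// Assuming mean and variance are the batch statistics of x over N elements (so
// sum(x - m) == 0), the chain rule written out in the comments collapses to the
// textbook form below. This sketch is for checking the math only; names and signature
// are illustrative.
#include <cmath>
#include <vector>

static void batchnormBackward1D(const std::vector<float> &x, const std::vector<float> &g,
                                float mean, float variance, float gamma, float eps,
                                std::vector<float> &dLdx, float &dLdGamma, float &dLdBeta) {
    const int N = (int) x.size();
    const float stdInv = 1.0f / std::sqrt(variance + eps);

    float gSum = 0.0f, gxSum = 0.0f;                               // sum(g) and sum(g * xhat)
    for (int i = 0; i < N; i++) {
        float xhat = (x[i] - mean) * stdInv;
        gSum  += g[i];
        gxSum += g[i] * xhat;
    }

    dLdBeta  = gSum;                                               // dLdB = g_sum
    dLdGamma = gxSum;                                              // dLdG = (g * (x - m))_sum * stdInv

    dLdx.resize(N);
    for (int i = 0; i < N; i++) {
        float xhat = (x[i] - mean) * stdInv;
        // dLdI = gamma * stdInv * (g - g_sum/N - xhat * (g*xhat)_sum/N)
        dLdx[i] = gamma * stdInv * (g[i] - gSum / N - xhat * gxSum / N);
    }
}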
(2 / N); + + // dfdv * (dvdm + (x - m)) (use xMinusMean as storage for this expression) + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, dLdM); + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, dLdV); + + // dLdI + *dLdI += xMinusMean; + if(applyScale) + dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, gamma); + + *dLdM = 0; // put zeros so far *dLdV = 0; // put zeros so far + // java code + // NDArray std = *variance + epsilon; + // std.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + // std.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + // NDArray xMu(input); + // input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMu); + // NDArray xHat(input); + // xMu.applyBroadcast(nd4j::broadcast::Multiply, axes, &std, &xHat); + // NDArray dxhat(input); + // dLdO->applyBroadcast(nd4j::broadcast::Multiply, axes, gamma, &dxhat); + // NDArray temp = dxhat*xMu; + // temp.reduceAlongDimension(reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); + // *dLdV *= -0.5f * std*std*std; + // NDArray* dxmu1 = dxhat.reduceAlongDimension(reduce::Sum, excludedAxes, keepUnitiesInShape); + // *dxmu1 *= -std; + // NDArray* dxmu2 = xMu.reduceAlongDimension(reduce::Sum, excludedAxes, keepUnitiesInShape); + // *dxmu2 *= *dLdV * (-2.f/N); + // NDArray dLdmu = *dxmu1 + *dxmu2; + // dLdmu *= (1.f /N); + // *dLdV *= (2.f/N); + // dxhat.applyBroadcast(nd4j::broadcast::Multiply, axes, &std); + // xMu.applyBroadcast(nd4j::broadcast::Multiply, axes, dLdV); + // dxhat += xMu; + // dxhat.applyBroadcast(nd4j::broadcast::Add, axes, &dLdmu, dLdI); + // delete dxmu1; + // delete dxmu2; + // xHat *= *dLdO; + // xHat.reduceAlongDimension(reduce::Sum, dLdG, excludedAxes, keepUnitiesInShape); + return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp b/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp index 45324300d..2123317b5 100644 --- a/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp @@ -28,13 +28,13 @@ namespace nd4j { namespace ops { CUSTOM_OP_IMPL(multi_head_dot_product_attention, 7, -1, false, 0, 2) { - auto queries = INPUT_VARIABLE(0); - auto keys = INPUT_VARIABLE(1); - auto values = INPUT_VARIABLE(2); - auto Wq = INPUT_VARIABLE(3); - auto Wk = INPUT_VARIABLE(4); - auto Wv = INPUT_VARIABLE(5); - auto Wo = INPUT_VARIABLE(6); + auto queries = INPUT_VARIABLE(0); //[batch, nIn, timeSteps] + auto keys = INPUT_VARIABLE(1); //[batch, nIn, timeSteps] + auto values = INPUT_VARIABLE(2); //[batch, nIn, timeSteps] + auto Wq = INPUT_VARIABLE(3); //[nHeads, headSize, nIn] + auto Wk = INPUT_VARIABLE(4); //[nHeads, headSize, nIn] + auto Wv = INPUT_VARIABLE(5); //[nHeads, headSize, nIn] + auto Wo = INPUT_VARIABLE(6); //[nHeads * headSize, nOut] auto mask = block.width() > 7 ? 
INPUT_VARIABLE(7) : nullptr; @@ -93,11 +93,12 @@ namespace ops { // Project queries, keys, values - auto projectedQueries = AttentionHelper::multiHeadProject(queries, Wq, block.launchContext()); - auto projectedKeys = AttentionHelper::multiHeadProject(keys, Wk, block.launchContext()); - auto projectedValues = AttentionHelper::multiHeadProject(values, Wv, block.launchContext()); + auto projectedQueries = AttentionHelper::multiHeadProject(queries, Wq, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] + auto projectedKeys = AttentionHelper::multiHeadProject(keys, Wk, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] + auto projectedValues = AttentionHelper::multiHeadProject(values, Wv, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] // Apply Attention + // attnResults = [minibatch, numHeads, projectedSize, seqLenth NDArray attnResults('c', {projectedQueries.sizeAt(0), projectedValues.sizeAt(1), projectedValues.sizeAt(2), projectedQueries.sizeAt(3)}, projectedValues.dataType(), block.launchContext()); nd4j::ops::dot_product_attention attention; attention.execute({&projectedQueries, &projectedKeys, &projectedValues, mask}, {&attnResults, weights ? OUTPUT_VARIABLE(1) : nullptr}, {}, {normalization, weights}, {}); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp index bdfdfb6c6..3fd5e2250 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp @@ -78,7 +78,7 @@ namespace nd4j { } // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64)); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp index a80194eb2..91e9d5a41 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp @@ -77,7 +77,7 @@ namespace nd4j { } // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT64)); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp b/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp index f027bfca3..eb1a01861 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp @@ -95,11 +95,9 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { seqLen->assign(time); // set each element of seqLen to be equal to time } - std::initializer_list dimsForReverse = timeMajor ? std::initializer_list{0,1} : std::initializer_list{1,0}; - // reverse x nd4j::ops::reverse_sequence reverse; - auto resultsIn = reverse.execute({x, seqLen}, {}, dimsForReverse, {}, false, x->dataType()); + auto resultsIn = timeMajor ? 
reverse.execute({x, seqLen}, {}, {0, 1}, {}, false, x->dataType()) : reverse.execute({x, seqLen}, {}, {1, 0}, {}, false, x->dataType()); REQUIRE_TRUE (resultsIn->status() == ND4J_STATUS_OK, 0, "dynamic_bidirectional_rnn: there is a problem with reverse on the sequence."); auto revInput = resultsIn->at(0); @@ -109,7 +107,7 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { hBWFinal->assign(resultsBW->at(1)); // reverse hBWtemp - auto resultsOut = reverse.execute({hBWtemp, seqLen}, {}, dimsForReverse, {}); + auto resultsOut = timeMajor ? reverse.execute({hBWtemp, seqLen}, {}, {0, 1}, {}) : reverse.execute({hBWtemp, seqLen}, {}, {1, 0}, {}); hBW->assign(resultsOut->at(0)); delete resultsOut; diff --git a/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp b/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp index fef13d44b..b3c2a93d4 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp @@ -28,7 +28,7 @@ namespace nd4j { namespace ops { CUSTOM_OP_IMPL(reverse_sequence, 2, 1, false, 0, 2) { - + auto input = INPUT_VARIABLE(0); auto seqLengths = INPUT_VARIABLE(1); auto output = OUTPUT_VARIABLE(0); @@ -39,13 +39,13 @@ CUSTOM_OP_IMPL(reverse_sequence, 2, 1, false, 0, 2) { REQUIRE_TRUE(input->rankOf() > 1, 0, "REVERSE_SEQUENSE operation: input array must have rank > 1, but got %i instead !", input->rankOf()); REQUIRE_TRUE(seqLengths->rankOf() == 1, 0, "REVERSE_SEQUENSE operation: input array seqLengths must be 1D vector, that is it must have rank == 1, but got %i instead !", seqLengths->rankOf()); REQUIRE_TRUE(seqLengths->lengthOf() == input->sizeAt(batchDim), 0, "REVERSE_SEQUENSE custom operation: the length of array seqLengths must be equal to the value of batchDim dimension of input array, but got %i and %i correspondingly !", seqLengths->lengthOf(), input->sizeAt(batchDim)); - REQUIRE_TRUE(seqDim != batchDim, 0, "REVERSE_SEQUENSE operation: input integer parameters seqDim and batchDim must be different, but they are %i and %i correspondingly !", seqDim, batchDim); + REQUIRE_TRUE(seqDim != batchDim, 0, "REVERSE_SEQUENSE operation: input integer parameters seqDim and batchDim must be different, but they both are equal to %i !", batchDim); REQUIRE_TRUE(batchDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, input->rankOf()); - REQUIRE_TRUE(seqDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, input->rankOf()); + REQUIRE_TRUE(seqDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, input->rankOf()); auto maxElem = seqLengths->reduceNumber(reduce::Max); REQUIRE_TRUE(maxElem.e(0) <= input->sizeAt(seqDim), 0, "REVERSE_SEQUENSE operation: max element in seqLengths array must be not greater than value of seqDim dimension of input array !"); - + helpers::reverseSequence(block.launchContext(), input, seqLengths, output, seqDim, batchDim); return Status::OK(); @@ -65,15 +65,15 @@ DECLARE_SHAPE_FN(reverse_sequence) { int seqDim = INT_ARG(0); int batchDim = block.numI() > 1 ? 
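// A minimal sketch of what reverse_sequence does for the common 2-D case
// (seqDim = 1, batchDim = 0): for every batch row, the first seqLengths[b] entries are
// reversed and the tail is copied unchanged. reverseSequence2D is illustrative; the real
// helper works on arbitrary ranks and dimension choices.
#include <vector>

static void reverseSequence2D(const std::vector<std::vector<float>> &input,
                              const std::vector<int> &seqLengths,
                              std::vector<std::vector<float>> &output) {
    output = input;                                                // tail beyond seqLengths[b] stays as-is
    for (size_t b = 0; b < input.size(); b++) {
        int len = seqLengths[b];
        for (int t = 0; t < len; t++)
            output[b][t] = input[b][len - 1 - t];                  // reverse the first len elements
    }
}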
INT_ARG(1) : 0; + REQUIRE_TRUE(batchDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, inShapeInfo[0]); + REQUIRE_TRUE(seqDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, inShapeInfo[0]); REQUIRE_TRUE(inShapeInfo[0] > 1, 0, "REVERSE_SEQUENSE operation: input array must have rank > 1, but got %i instead !", inShapeInfo[0]); REQUIRE_TRUE(seqLenShapeInfo[0] == 1, 0, "REVERSE_SEQUENSE operation: input array seqLengths must be 1D vector, that is it must have rank == 1, but got %i instead !", seqLenShapeInfo[0]); REQUIRE_TRUE(seqLenShapeInfo[1] == inShapeInfo[batchDim+1], 0, "REVERSE_SEQUENSE custom operation: the length of array seqLengths must be equal to the value of batchDim dimension of input array, but got %i and %i correspondingly !", seqLenShapeInfo[1], inShapeInfo[batchDim+1]); - REQUIRE_TRUE(batchDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, inShapeInfo[0]); - REQUIRE_TRUE(seqDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, inShapeInfo[0]); - + Nd4jLong* outShapeInfo = nullptr; COPY_SHAPE(inShapeInfo, outShapeInfo); - + return SHAPELIST(CONSTANT(outShapeInfo)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp index 5b6e6122e..a7123d42f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -150,26 +151,30 @@ namespace helpers { // auto shift = 0; auto rowSize = sizeof(T) * colCount; - PRAGMA_OMP_PARALLEL_FOR - for (int n = 0; n < N; n++) { - int start = rowP->e(n); - int end = rowP->e(n+1); - int shift = n * colCount; - for (int i = start; i < end; i++) { - T const* thisSlice = dataP + colP->e(i) * colCount; - T res = 1; - for (int k = 0; k < colCount; k++) { - auto tempVal = dataP[shift + k] - thisSlice[k];//thisSlice[k]; - res += tempVal * tempVal; + auto func = PRAGMA_THREADS_FOR { + for (auto n = start; n < stop; n += increment) { + int s = rowP->e(n); + int end = rowP->e(n + 1); + int shift = n * colCount; + for (int i = s; i < end; i++) { + T const *thisSlice = dataP + colP->e(i) * colCount; + T res = 1; + + for (int k = 0; k < colCount; k++) { + auto tempVal = dataP[shift + k] - thisSlice[k];//thisSlice[k]; + res += tempVal * tempVal; + } + + res = vals[i] / res; + for (int k = 0; k < colCount; k++) + outputP[shift + k] += ((dataP[shift + k] - thisSlice[k]) * res); } - - res = vals[i] / res; - for (int k = 0; k < colCount; k++) - outputP[shift + k] += ((dataP[shift + k] - thisSlice[k]) * res); + //shift += colCount; } - //shift += colCount; - } + }; + + samediff::Threads::parallel_tad(func, 0, N); } void barnes_edge_forces(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray* output, NDArray const& data) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index bd29094ec..ba0f36eb5 100644 --- 
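// The hunk above shows the recurring refactor of this patch: an OpenMP
// PRAGMA_OMP_PARALLEL_FOR loop becomes a functor built with PRAGMA_THREADS_FOR (which,
// as used throughout the diff, supplies start/stop/increment bounds) and is handed to
// samediff::Threads::parallel_tad or parallel_for. The sketch below illustrates the
// pattern with a plain lambda and std::thread; it is not the ThreadPool implementation
// added by this PR, and the chunking policy is assumed.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

using RangeFunc = std::function<void(int64_t start, int64_t stop, int64_t increment)>;

static void parallelForSketch(const RangeFunc &func, int64_t start, int64_t stop,
                              int64_t increment = 1, int numThreads = 4) {
    int64_t numElements = (stop - start + increment - 1) / increment;
    int64_t perThread   = (numElements + numThreads - 1) / numThreads;   // ceil split, aligned to increment
    std::vector<std::thread> workers;
    for (int t = 0; t < numThreads; t++) {
        int64_t s = start + t * perThread * increment;
        int64_t e = std::min<int64_t>(stop, s + perThread * increment);
        if (s >= e) break;
        workers.emplace_back([&func, s, e, increment] { func(s, e, increment); });
    }
    for (auto &w : workers) w.join();
}

// usage: each worker gets a contiguous sub-range, mirroring the converted loops above
// parallelForSketch([&](int64_t start, int64_t stop, int64_t increment) {
//     for (auto n = start; n < stop; n += increment) { /* per-row work */ }
// }, 0, N);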
a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -44,11 +45,9 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, if (inEWS == 1 && outEWS == 1) { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i]); - PRAGMA_OMP_SIMD_SUM(sum) for (int i = 0; i < length; i++) { outBuff[i] = nd4j::math::nd4j_exp(inBuff[i] - max); sum += outBuff[i]; @@ -60,11 +59,9 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, } else { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i * inEWS]); - PRAGMA_OMP_SIMD_SUM(sum) for (int i = 0; i < length; i++) { T r = nd4j::math::nd4j_exp(inBuff[i * inEWS] - max); outBuff[i * outEWS] = r; @@ -89,19 +86,17 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, T sum = 0.; int length = shape::length(inShapeInfo); -PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max)) for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); max = nd4j::math::nd4j_max(max, inBuff[offset]); } -PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum)) for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] = nd4j::math::nd4j_exp(inBuff[offset] - max); sum += outBuff[offset]; } -PRAGMA_OMP_SIMD + for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] /= sum; @@ -151,7 +146,6 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr auto length = shape::length(inShapeInfo); if (inEWS == 1) { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i]); @@ -212,7 +206,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra } else if(input.isSameShapeStrict(&output)) { - TadPack tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), {dimension}); + TadPack tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimension); Nd4jLong* tadShapeInfo = tadPack.primaryShapeInfo(); Nd4jLong* tadOffsets = tadPack.primaryOffsets(); const uint numOfSubArrs = tadPack.numberOfTads(); @@ -220,27 +214,30 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra if(shape::elementWiseStride(tadShapeInfo) == 1){ - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfSubArrs; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { - T* inBuff = input.bufferAsT() + tadOffsets[i]; - T* outBuff = output.bufferAsT() + tadOffsets[i]; + T *inBuff = input.bufferAsT() + tadOffsets[i]; + T *outBuff = output.bufferAsT() + tadOffsets[i]; - T max = -DataTypeUtils::max(); - T sum = 0; + T max = -DataTypeUtils::max(); + T sum = 0; - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; } + }; - 
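// A minimal sketch of the numerically stable softmax the vector path above implements:
// subtract the running maximum before exponentiating, then normalize by the sum.
// softmaxVector is illustrative; the real code also handles strided (ews != 1) buffers.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

static void softmaxVector(const std::vector<float> &in, std::vector<float> &out) {
    float max = -std::numeric_limits<float>::max();
    for (float v : in) max = std::max(max, v);                     // guard against overflow in exp

    float sum = 0.0f;
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); i++) {
        out[i] = std::exp(in[i] - max);
        sum += out[i];
    }
    for (float &v : out) v /= sum;
}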
for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; - } + samediff::Threads::parallel_tad(func,0, numOfSubArrs); } else { @@ -250,29 +247,30 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra auto offsets = new Nd4jLong[tadLen]; shape::calcOffsets(tadShapeInfo, offsets); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfSubArrs; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inBuff = input.bufferAsT() + tadOffsets[i]; + auto outBuff = output.bufferAsT() + tadOffsets[i]; - T* inBuff = input.bufferAsT() + tadOffsets[i]; - T* outBuff = output.bufferAsT() + tadOffsets[i]; + T max = -DataTypeUtils::max(); + T sum = 0.f; - T max = -DataTypeUtils::max(); - T sum = 0.f; + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); + outBuff[offsets[j]] = temp; + sum += temp; + } - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); - outBuff[offsets[j]] = temp; - sum += temp; + for (uint j = 0; j < tadLen; ++j) + outBuff[offsets[j]] /= sum; } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); - for (uint j = 0; j < tadLen; ++j) - outBuff[offsets[j]] /= sum; - } delete []offsets; } } @@ -299,16 +297,19 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a const Nd4jLong* inputShapeInfo = input.getShapeInfo(); const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo(); - PRAGMA_OMP_PARALLEL_FOR_IF(inputLen > Environment::getInstance()->elementwiseThreshold()) - for(Nd4jLong i = 0; i < inputLen; ++i) { - // FIXME: double! - double x = input.e(i); - if(x < 0.0) { - // FIXME: double - output.p(i, (x * alpha.e(shape::subArrayIndex(i, inputShapeInfo, alphaShapeInfo)))); - } else - output.p(i, x); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + // FIXME: double! 
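// A minimal sketch of the PReLU rule applied element-wise by the prelu helper converted
// here: negative inputs are scaled by the (broadcast) alpha value, non-negative inputs
// pass through unchanged. preluElem is illustrative.
static double preluElem(double x, double alpha) {
    return x < 0.0 ? x * alpha : x;
}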
+ double x = input.e(i); + if (x < 0.0) { + // FIXME: double + output.p(i, (x * alpha.e(shape::subArrayIndex(i, inputShapeInfo, alphaShapeInfo)))); + } else + output.p(i, x); + } + }; + + samediff::Threads::parallel_for(func, 0, inputLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index 0e6e1f777..a36330fbe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -20,6 +20,7 @@ #include +#include namespace nd4j { namespace ops { @@ -62,12 +63,15 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, if(inOutAreSame) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint h = start_z; h < stop_z; h += inc_z) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] += static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oH, 1); } else { @@ -76,12 +80,15 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2]; - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint h = start_z; h < stop_z; h += inc_z) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + h * xStrideH + w * xStrideW] + static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oH, 1); } } else if(output.rankOf() == 5) { @@ -98,13 +105,16 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, if(inOutAreSame) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint d = 0; d < oD ; ++d) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint d = start_z; d < stop_z; d += inc_z) + for (uint h = 0; h < oH; ++h) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + d * zStrideD + h * zStrideH + w * zStrideW] += static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oD, 1); } else { @@ -114,13 +124,16 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const Nd4jLong xStrideH = isNCHW ? 
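// A minimal sketch of the strided bias add the 4-D addBias branch above parallelizes
// over (batch, channel, height): the output index is rebuilt from per-axis strides, so
// the same loop serves both NCHW and NHWC layouts. addBias4D and its stride arguments
// are illustrative.
static void addBias4D(float *z, const float *bias,
                      int bS, int C, int oH, int oW,
                      long long zStrideB, long long zStrideC,
                      long long zStrideH, long long zStrideW,
                      long long biasStrideC) {
    for (int b = 0; b < bS; b++)
        for (int c = 0; c < C; c++)
            for (int h = 0; h < oH; h++)
                for (int w = 0; w < oW; w++)
                    z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW]
                        += bias[c * biasStrideC];
}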
input.stridesOf()[3] : input.stridesOf()[2]; const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[4] : input.stridesOf()[3]; - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint d = 0; d < oD ; ++d) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + d*xStrideD + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint d = start_z; d < stop_z; d += inc_z) + for (uint h = 0; h < oH; ++h) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + d * zStrideD + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + d * xStrideD + h * xStrideH + w * xStrideW] + static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oD, 1); } } else { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp index 5484d822d..ae76f0289 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp @@ -21,6 +21,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -38,50 +39,55 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr if(dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < input->lengthOf(); i += 3) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T h, s, v; - T h, s, v; + rgbToHsv(x[i], x[i + 1], x[i + 2], h, s, v); - rgbToHsv(x[i], x[i+1], x[i+2], h, s, v); + h += delta * 360; + if (h > 360) + h -= 360; + else if (h < 0) + h += 360; - h += delta * 360; - if(h > 360) - h -= 360; - else if(h < 0) - h += 360; + hsvToRgb(h, s, v, z[i], z[i + 1], z[i + 2]); + } + }; - hsvToRgb(h, s, v, z[i], z[i+1], z[i+2]); - } + samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; const Nd4jLong zDimCstride = output->stridesOf()[dimC]; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < numOfTads; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { - const T* xTad = x + packX.platformOffsets()[i]; - T* zTad = z + packZ.platformOffsets()[i]; + const T *xTad = x + packX.platformOffsets()[i]; + T *zTad = z + packZ.platformOffsets()[i]; - T h, s, v; + T h, s, v; - rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); - h += delta * 360; - if(h > 360) - h -= 360; - else if(h < 0) - h += 360; + h += delta * 360; + if (h > 360) + h -= 360; + else if (h < 0) + h += 360; - hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); + 
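// A minimal sketch of the hue shift at the core of adjust_hue above: the hue channel is
// rotated by delta * 360 degrees and wrapped back into [0, 360) before converting back
// to RGB. wrapHue is illustrative; the RGB<->HSV conversions themselves live in the
// library's helpers.
static float wrapHue(float h, float delta) {
    h += delta * 360.0f;
    if (h > 360.0f)      h -= 360.0f;
    else if (h < 0.0f)   h += 360.0f;
    return h;
}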
hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); - } + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp index 9a5141a82..d4b0de398 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -39,50 +40,51 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA if(dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < input->lengthOf(); i += 3) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T h, s, v; - T h, s, v; + rgbToHsv(x[i], x[i + 1], x[i + 2], h, s, v); - rgbToHsv(x[i], x[i+1], x[i+2], h, s, v); + s *= factor; + if (s > 1.f) + s = 1.f; + else if (s < 0.f) + s = 0.f; - s *= factor; - if(s > 1.f) - s = 1.f; - else if(s < 0.f) - s = 0.f; + hsvToRgb(h, s, v, z[i], z[i + 1], z[i + 2]); + } + }; - hsvToRgb(h, s, v, z[i], z[i+1], z[i+2]); - } - } - else { - - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); + } else { + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; const Nd4jLong zDimCstride = output->stridesOf()[dimC]; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < numOfTads; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + const T *xTad = x + packX.platformOffsets()[i]; + T *zTad = z + packZ.platformOffsets()[i]; - const T* xTad = x + packX.platformOffsets()[i]; - T* zTad = z + packZ.platformOffsets()[i]; + T h, s, v; - T h, s, v; + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); - rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + s *= factor; + if (s > 1.f) + s = 1.f; + else if (s < 0.f) + s = 0.f; - s *= factor; - if(s > 1.f) - s = 1.f; - else if(s < 0.f) - s = 0.f; + hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); + } + }; - hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); - - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp index ffd75e435..b408da720 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -92,25 +93,28 @@ void bgemm_(const std::vector& vA, const std::vector& vB, st int vaSize = vA.size(); - PRAGMA_OMP_PARALLEL_FOR - for (int p = 0; p < vaSize; ++p) { - auto A = reinterpret_cast(vA.at(p)->buffer()); - auto B = reinterpret_cast(vB.at(p)->buffer()); - auto C = reinterpret_cast(vC.at(p)->buffer()); - auto alpha = 
alphas->e(p); - auto beta = betas->e(p); - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - T c_mnp = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto p = start; p < stop; p += increment) { + auto A = reinterpret_cast(vA.at(p)->buffer()); + auto B = reinterpret_cast(vB.at(p)->buffer()); + auto C = reinterpret_cast(vC.at(p)->buffer()); + auto alpha = alphas->e(p); + auto beta = betas->e(p); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + T c_mnp = 0; - PRAGMA_OMP_SIMD - for (int k = 0; k < K; ++k) - c_mnp += A[tA == CblasNoTrans ? (m + k * lda) : (m * lda + k)] * B[tB == CblasNoTrans ? (k + n * ldb) : (k * ldb + n)]; + PRAGMA_OMP_SIMD + for (int k = 0; k < K; ++k) + c_mnp += A[tA == CblasNoTrans ? (m + k * lda) : (m * lda + k)] * B[tB == CblasNoTrans ? (k + n * ldb) : (k * ldb + n)]; - C[m + n * ldc] = alpha * c_mnp + beta * C[m + n * ldc]; + C[m + n * ldc] = alpha * c_mnp + beta * C[m + n * ldc]; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, vaSize); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index a0847f704..7a0d8b97b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -71,9 +72,8 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* if(beta != nullptr) { const T* betaBuff = beta->bufferAsT(); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - const auto threadNum = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + const auto threadNum = thread_id; Nd4jLong* inOffsets = new Nd4jLong[step]; Nd4jLong* memBuff = new Nd4jLong[2 * inShapeInfo[0]]; @@ -98,17 +98,17 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* } delete []inOffsets; delete []memBuff; - } + }; + + samediff::Threads::parallel_do(func, info._numThreads); } else { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - const auto threadNum = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + const auto threadNum = thread_id; Nd4jLong* inOffsets = new Nd4jLong[step]; Nd4jLong* memBuff = new Nd4jLong[2 * inShapeInfo[0]]; for (int j = 0; j < lenSmall; ++j) { - const bool isOwner = j < info._numThreads ? 
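
In the batchnorm hunk the OpenMP parallel region with omp_get_thread_num() becomes a PRAGMA_THREADS_DO functor that receives a thread_id, plus a samediff::Threads::parallel_do(func, info._numThreads) call. The sketch below is not part of the patch; it assumes parallel_do simply launches the same callable once per requested thread and joins, with work claimed inside the callable.

    #include <cstdint>
    #include <functional>
    #include <thread>
    #include <vector>

    // Illustrative stand-in for samediff::Threads::parallel_do(func, numThreads): the same
    // callable runs on every thread and receives its thread_id plus the thread count.
    static void parallel_do_sketch(const std::function<void(uint64_t, uint64_t)>& body,
                                   uint64_t numThreads) {
        std::vector<std::thread> workers;
        for (uint64_t t = 0; t < numThreads; ++t)
            workers.emplace_back(body, t, numThreads);   // body(thread_id, numThreads)
        for (auto& w : workers)
            w.join();
    }

Each worker then keeps only the TADs it owns; the isOwner expression in the hunk is effectively a thread_id == j % numThreads check, with the j < numThreads branch as the trivial special case.
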
threadNum == j : threadNum == j % info._numThreads; if (!isOwner) continue; @@ -128,7 +128,9 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* } delete []inOffsets; delete []memBuff; - } + }; + + samediff::Threads::parallel_do(func, info._numThreads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp index bba3e8acb..ddd1ad892 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -84,7 +85,7 @@ static T continuedFraction(const T a, const T b, const T x) { return f; } - return 1.f / 0.f; // no convergence, more iterations is required + return std::numeric_limits::infinity(); // no convergence, more iterations is required } /////////////////////////////////////////////////////////////////// @@ -121,9 +122,12 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < xLen; ++i) - output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); + }; + + samediff::Threads::parallel_for(func, 0, xLen); } /////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp index b4a54ad7a..5aad38da8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -56,64 +57,77 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); - T *col, *im; - int imRow, imCol; // if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { if (false) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *im; + int imRow, imCol; - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; + imRow = (-pH + kRow * dH) + colH * sH; + imCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; + col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * colStride4 + colW * colStride5; + im = imBuff + b 
* imStride0 + c * imStride1 + imRow * imStride2 + imCol * imStride3; + + if (static_cast(imRow) < static_cast(iH) && + static_cast(imCol) < static_cast(iW)) + *im += *col; + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(firstprivate(imRow, imCol)) - for (int b = 0; b < bS; ++b) { - T* im0 = imBuff + b*imStride0; - T* col4 = colBuff + b*colStride0; - for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { - T* col5 = col4; - for (int colW = 0; colW < oW; ++colW, col5 += colStride5) { - T* col1 = col5; - T* im1 = im0; - for (int c = 0; c < iC; ++c, col1 += colStride1, im1 += imStride1) { - int imRow = (-pH + colH*sH); - T* col2 = col1; - T* im2 = im1 + imRow*imStride2; - for (int kRow = 0; kRow < kH; ++kRow, col2 += colStride2, imRow += dH, im2 += dH*imStride2) { - int imCol =-pW + colW*sW; - T* col3 = col2; - T* im3 = im2 + imCol*imStride3; - for (int kCol = 0; kCol < kW; ++kCol, col3 += colStride3, imCol += dW, im3 += dW*imStride3) { + auto func = PRAGMA_THREADS_FOR { + T *col, *im; - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im3 += *col3; + for (uint b = start; b < stop; b += increment) { + T *im0 = imBuff + b * imStride0; + T *col4 = colBuff + b * colStride0; + for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { + T *col5 = col4; + for (int colW = 0; colW < oW; ++colW, col5 += colStride5) { + T *col1 = col5; + T *im1 = im0; + for (int c = 0; c < iC; ++c, col1 += colStride1, im1 += imStride1) { + int imRow = (-pH + colH * sH); + T *col2 = col1; + T *im2 = im1 + imRow * imStride2; + for (int kRow = 0; + kRow < kH; ++kRow, col2 += colStride2, imRow += dH, im2 += dH * imStride2) { + int imCol = -pW + colW * sW; + T *col3 = col2; + T *im3 = im2 + imCol * imStride3; + for (int kCol = 0; + kCol < kW; ++kCol, col3 += colStride3, imCol += dW, im3 += dW * imStride3) { + + if (static_cast(imRow) < static_cast(iH) && + static_cast(imCol) < static_cast(iW)) + *im3 += *col3; + } } } } - } + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, bS); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp index 50a11f767..5f7fbf694 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp @@ -15,6 +15,7 @@ ******************************************************************************/ #include +#include namespace nd4j { namespace ops { @@ -26,26 +27,38 @@ namespace nd4j { int elementsPerThread = length / ELEMENT_THRESHOLD; int num_threads = nd4j::math::nd4j_max(1, elementsPerThread); num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - Nd4jLong sum = 0; + Nd4jLong sumt = 0; if(isStrictlyIncreasing) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) - for (Nd4jLong i = 0; i < length - 1; i++) { - auto val0 = input->t(i); - auto val1 = input->t(i + 1); - sum += val0 >= val1 ? -1 : 0; - } + //PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong sum = 0; + for (auto i = start; i < stop; i++) { + auto val0 = input->t(i); + auto val1 = input->t(i + 1); + sum += val0 >= val1 ? 
-1 : 0; + } + return sum; + }; + sumt = samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, length - 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) - for (Nd4jLong i = 0; i < length - 1; i++) { - auto val0 = input->t(i); - auto val1 = input->t(i + 1); - sum += val0 > val1 ? -1 : 0; - } + //PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong sum = 0; + for (auto i = start; i < stop; i++) { + auto val0 = input->t(i); + auto val1 = input->t(i + 1); + sum += val0 > val1 ? -1 : 0; + } + + return sum; + }; + sumt = samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, length - 1); } - output = (sum > -1); + nd4j_printf("Sum: %lld\n", sumt) + + output = (sumt > -1); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp index 859330a9d..e2d24c591 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { @@ -30,13 +31,16 @@ namespace helpers { std::unique_ptr arrs(output->allTensorsAlongDimension({1})); int lLen = labels->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(lLen > Environment::getInstance()->elementwiseThreshold()) - for (int j = 0; j < lLen; ++j){ - auto label = labels->e(j); - auto pred = predictions->e(j); - T value = (weights == nullptr ? (T)1.0f : weights->e(j)); - (*arrs->at(label)).p(pred, value); - } + auto func = PRAGMA_THREADS_FOR { + for (int j = start; j < stop; j += increment) { + auto label = labels->e(j); + auto pred = predictions->e(j); + T value = (weights == nullptr ? (T) 1.0f : weights->e(j)); + (*arrs->at(label)).p(pred, value); + } + }; + + samediff::Threads::parallel_for(func, 0, lLen); } void confusionFunctor(nd4j::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index 93d00220e..0829bcbe6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -62,32 +63,34 @@ namespace nd4j { T* colBuff = columns.bufferAsT(); T* volBuff = const_cast(volume).bufferAsT(); - T *col, *vol; - int volDep, volRow, volCol; - if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) + if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, vol, volDep, volRow, volCol) collapse(2)) - for (int b = 0; b < bS; ++b) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_3D { + T *col, *vol; + int volDep, volRow, volCol; - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) 
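
The compare_elem hunk turns the OpenMP sum reduction into a PRAGMA_REDUCE_LONG functor plus samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, length - 1): each thread reduces its own slice to a partial Nd4jLong, and the partials are folded with the combiner (LAMBDA_SUML is taken here to be a plain addition lambda). The sketch below is not part of the patch; names and the even-split policy are local assumptions.

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <thread>
    #include <vector>

    // Illustrative stand-in for samediff::Threads::parallel_long: per-thread partial
    // reductions over [start, stop), folded with a user-supplied combiner.
    static int64_t parallel_long_sketch(const std::function<int64_t(int64_t, int64_t, int64_t)>& body,
                                        const std::function<int64_t(int64_t, int64_t)>& combine,
                                        int64_t start, int64_t stop,
                                        unsigned numThreads = std::thread::hardware_concurrency()) {
        const int64_t span = stop - start;
        if (span <= 0)
            return 0;
        numThreads = std::max<unsigned>(1, (unsigned) std::min<int64_t>(numThreads, span));

        std::vector<int64_t> partials(numThreads);
        std::vector<std::thread> workers;
        const int64_t chunk = span / numThreads;
        for (unsigned t = 0; t < numThreads; ++t) {
            const int64_t s = start + t * chunk;
            const int64_t e = (t == numThreads - 1) ? stop : s + chunk;
            workers.emplace_back([&, t, s, e] { partials[t] = body(s, e, 1); });
        }
        for (auto& w : workers)
            w.join();

        int64_t result = partials[0];
        for (unsigned t = 1; t < numThreads; ++t)
            result = combine(result, partials[t]);
        return result;
    }

The monotonicity check itself is unchanged: every adjacent pair that violates the required ordering contributes -1 to the sum, so the final output is sum > -1, i.e. true only when no violation was found.
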
{ + for (int kDep = start_z; kDep < stop_z; kDep += inc_z) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colD = 0; colD < oD; ++colD) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) - *col = static_cast(0.); - else { - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *col = *vol; + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + + if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) + *col = static_cast(0.); + else { + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *col = *vol; + } } } } @@ -96,31 +99,36 @@ namespace nd4j { } } } - } + }; - else + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, kD, 1); - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(vol, col, volDep, volRow, volCol)) - for (int b = 0; b < bS; b++) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + } else { - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *vol; + int volDep, volRow, volCol; + for (int b = start_x; b < stop_x; b++) { + for (int colD = start_y; colD < stop_y; colD++) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) - *col = static_cast(0.); - else { - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *col = *vol; + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + + if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) + *col = static_cast(0.f); + else { + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *col = *vol; + } } } } @@ -129,7 +137,11 @@ namespace nd4j { } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oD, 
1); + //func(0, 0, bS, 1, 0, oD, 1); + } } ////////////////////////////////////////////////////////////////////////// @@ -168,29 +180,31 @@ namespace nd4j { T* volBuff = volume.bufferAsT(); T* colBuff = const_cast(columns).bufferAsT(); - T* col, *vol; - int volDep, volRow, volCol; - if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) + if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, vol, volDep, volRow, volCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR { + T* col, *vol; + int volDep, volRow, volCol; - volDep = -pD + kDep * dD + colD * sD; - volRow = -pH + kRow * dH + colH * sH; - volCol = -pW + kCol * dW + colW * sW; + for (int b = start; b < stop; b++) { + for (int c = 0; c < iC; c++) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colD = 0; colD < oD; ++colD) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *vol += *col; + volDep = -pD + kDep * dD + colD * sD; + volRow = -pH + kRow * dH + colH * sH; + volCol = -pW + kCol * dW + colW * sW; + + if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *vol += *col; + } } } } @@ -199,28 +213,34 @@ namespace nd4j { } } } - } + }; - else + samediff::Threads::parallel_tad(func, 0, bS); - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(vol, col, volDep, volRow, volCol)) - for (int b = 0; b < bS; b++) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + } else { - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + auto func = PRAGMA_THREADS_FOR { + T* col, *vol; + int volDep, volRow, volCol; - if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; - 
vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *vol += *col; + for (int b = start; b < stop; b++) { + for (int colD = 0; colD < oD; colD++) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; + + if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *vol += *col; + } } } } @@ -229,7 +249,10 @@ namespace nd4j { } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, bS); + } } @@ -568,22 +591,24 @@ namespace nd4j { const Nd4jLong zStride2 = output.stridesOf()[dimIH]; const Nd4jLong zStride3 = output.stridesOf()[dimIH + 1]; - uint xCoord2, xCoord3; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4) private(xCoord2, xCoord3)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint h = 0; h < oH ; ++h) { - for(uint w = 0; w < oW ; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + uint xCoord2, xCoord3; + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint h = start_z; h < stop_z; h += inc_z) { + for (uint w = 0; w < oW; ++w) { + xCoord2 = h / factorH; + xCoord3 = w / factorW; - xCoord2 = h / factorH; - xCoord3 = w / factorW; - - z[b*zStride0 + c*zStride1 + h*zStride2 + w*zStride3] = x[b*xStride0 + c*xStride1 + xCoord2*xStride2 + xCoord3*xStride3]; + z[b * zStride0 + c * zStride1 + h * zStride2 + w * zStride3] = x[b * xStride0 + c * xStride1 + xCoord2 * xStride2 + xCoord3 * xStride3]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oH, 1); } ////////////////////////////////////////////////////////////////////////// @@ -616,25 +641,31 @@ namespace nd4j { const Nd4jLong zStride3 = output.stridesOf()[dimID + 1]; const Nd4jLong zStride4 = output.stridesOf()[dimID + 2]; - uint xCoord2, xCoord3, xCoord4; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5) private(xCoord2, xCoord3, xCoord4)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint d = 0; d < oD ; ++d) { - for(uint h = 0; h < oH ; ++h) { - for(uint w = 0; w < oW ; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + uint xCoord2, xCoord3, xCoord4; - xCoord2 = d / factorD; - xCoord3 = h / factorH; - xCoord4 = w / factorW; + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint d = start_z; d < stop_z; d += inc_z) { + for (uint h = 0; h < oH; ++h) { + for (uint w = 0; w < oW; ++w) { - z[b*zStride0 + c*zStride1 + d*zStride2 + h*zStride3 + w*zStride4] = x[b*xStride0 + c*xStride1 + xCoord2*xStride2 + xCoord3*xStride3 + xCoord4*xStride4]; + xCoord2 = d / factorD; + xCoord3 = h / factorH; + xCoord4 = w / factorW; + + z[b * zStride0 + c * zStride1 + d * zStride2 + h * zStride3 + w * zStride4] = x[ + b * xStride0 + c * xStride1 + xCoord2 * xStride2 + xCoord3 * xStride3 + + xCoord4 * 
xStride4]; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } ////////////////////////////////////////////////////////////////////////// @@ -668,23 +699,26 @@ namespace nd4j { const Nd4jLong zStride3 = gradI.stridesOf()[dimIH + 1]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint h = 0; h < iH; ++h) { - for(uint w = 0; w < iW; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint h = start_z; h < stop_z; h += inc_z) { + for (uint w = 0; w < iW; ++w) { - const auto zOffset = b*zStride0 + c*zStride1 + h*zStride2 + w*zStride3; + const auto zOffset = b * zStride0 + c * zStride1 + h * zStride2 + w * zStride3; - z[zOffset] = 0; + z[zOffset] = 0; - for(uint xh = h * factorH; xh < h * factorH + factorH; ++xh) - for(uint xw = w * factorW; xw < w * factorW + factorW; ++xw) - z[zOffset] += x[b*xStride0 + c*xStride1 + xh*xStride2 + xw*xStride3]; + for (uint xh = h * factorH; xh < h * factorH + factorH; ++xh) + for (uint xw = w * factorW; xw < w * factorW + factorW; ++xw) + z[zOffset] += x[b * xStride0 + c * xStride1 + xh * xStride2 + xw * xStride3]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iH, 1); } ////////////////////////////////////////////////////////////////////////// @@ -723,26 +757,29 @@ namespace nd4j { const Nd4jLong zStride4 = gradI.stridesOf()[dimID + 2]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint d = 0; d < iD; ++d) { - for(uint h = 0; h < iH; ++h) { - for(uint w = 0; w < iW; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint d = start_z; d < stop_z; d += inc_z) { + for (uint h = 0; h < iH; ++h) { + for (uint w = 0; w < iW; ++w) { - const auto zOffset = b*zStride0 + c*zStride1 + d*zStride2 + h*zStride3 + w*zStride4; + const auto zOffset = b * zStride0 + c * zStride1 + d * zStride2 + h * zStride3 + w * zStride4; - z[zOffset] = 0; + z[zOffset] = 0; - for(uint xd = d * factorD; xd < d * factorD + factorD; ++xd) - for(uint xh = h * factorH; xh < h * factorH + factorH; ++xh) - for(uint xw = w * factorW; xw < w * factorW + factorW; ++xw) - z[zOffset] += x[b*xStride0 + c*xStride1 + xd*xStride2 + xh*xStride3 + xw*xStride4]; + for (uint xd = d * factorD; xd < d * factorD + factorD; ++xd) + for (uint xh = h * factorH; xh < h * factorH + factorH; ++xh) + for (uint xw = w * factorW; xw < w * factorW + factorW; ++xw) + z[zOffset] += x[b * xStride0 + c * xStride1 + xd * xStride2 + xh * xStride3 + xw * xStride4]; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iD, 1); } ////////////////////////////////////////////////////////////////////////// @@ -779,142 +816,156 @@ namespace nd4j { const Nd4jLong iStep3 = dW*iStride3; const int kProd = kH*kW; - Nd4jLong hstart, wstart, hend, wend; - T *pIn; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + 
for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T max = -DataTypeUtils::max(); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T val = pIn[kh + kw]; - if (val > max) - max = val; - } - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = max; + T max = -DataTypeUtils::max(); + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { + T val = pIn[kh + kw]; + if (val > max) + max = val; + } + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = max; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); 
//(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T sum = static_cast(0.f); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += pIn[kh + kw]; + T sum = static_cast(0.f); - if (extraParam0 == 0) { //Exclude padding - int a = (hend-hstart)/iStep2 + ((hend-hstart) % iStep2 == 0 ? 0 : 1); - int b = (wend-wstart)/iStep3 + ((wend-wstart) % iStep3 == 0 ? 0 : 1); - sum /= static_cast(a * b); // Accounts for dilation + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += pIn[kh + kw]; + + if (extraParam0 == 0) { //Exclude padding + int a = (hend - hstart) / iStep2 + ((hend - hstart) % iStep2 == 0 ? 0 : 1); + int r = (wend - wstart) / iStep3 + ((wend - wstart) % iStep3 == 0 ? 0 : 1); + sum /= static_cast(a * r); // Accounts for dilation + } else if (extraParam0 == 1) //Include padding + sum /= kProd; + + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } - else if (extraParam0 == 1) //Include padding - sum /= kProd; - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= 
iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T sum = static_cast(0.f); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); + T sum = static_cast(0.f); - sum = nd4j::math::nd4j_pow(sum, static_cast((T)1.f) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; + sum = nd4j::math::nd4j_pow(sum, static_cast((T) 1.f) / extraParam0); + + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { nd4j_printf("ConvolutionUtils::pooling2d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); @@ -961,176 +1012,192 @@ namespace nd4j { const Nd4jLong iStep4 = dW*iStride4; const int kProd = kD*kH*kW; - Nd4jLong dstart, hstart, wstart, dend, hend, wend; - T sum, *pIn; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + 
dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = -DataTypeUtils::max(); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { - T val = pIn[kd + kh + kw]; - if (val > sum) - sum = val; - } - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + sum = -DataTypeUtils::max(); + + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { + T val = pIn[kd + kh + kw]; + if (val > sum) + sum = val; + } + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = static_cast(0.); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += pIn[kd + kh + kw]; + sum = static_cast(0.); - if (extraParam0 == 0) //Exclude padding - sum /= nd4j::math::nd4j_ceil(static_cast(dend-dstart) / static_cast(iStep2)) * nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(iStep3)) * nd4j::math::nd4j_ceil(static_cast(wend-wstart) / 
static_cast(iStep4)); //Accounts for dilation - else if (extraParam0 == 1) //Include padding - sum /= kProd; + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += pIn[kd + kh + kw]; - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + if (extraParam0 == 0) //Exclude padding + sum /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(iStep2)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(iStep3)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(iStep4)); //Accounts for dilation + else if (extraParam0 == 1) //Include padding + sum /= kProd; + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = static_cast(0.); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); + sum = static_cast(0.); - sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += 
nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } else { nd4j_printf("ConvolutionUtils::pooling3d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; + throw std::runtime_error("Incorrect poooling3d mode"); } } @@ -1182,191 +1249,230 @@ namespace nd4j { const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3; - Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW; - T sum, valO, *pIn, *pgI; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, sum, hstart, wstart, hend, wend, maxKH, maxKW)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = -DataTypeUtils::max(); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - if(sameStrides) { + sum = -DataTypeUtils::max(); + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (sameStrides) { - // we set these to default values - maxKH = hstart; - maxKW = wstart; + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T valIn = pIn[kh + kw]; - if (valIn > sum) { - sum = valIn; - maxKH = kh; - maxKW = kw; + // we set these to default values + maxKH = hstart; + maxKW = wstart; 
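
All of the pooling and pooling-BP hunks follow the same pattern: the b/c loops (and od for 3-D pooling) move into a PRAGMA_THREADS_FOR_2D / _3D functor, and only those outer dimensions are handed to Threads::parallel_for, while the spatial and kernel loops stay sequential inside each thread. The sketch below is not part of the patch; it shows one plausible 2-D splitter that parallelises the outer axis only and passes the second range through whole, ignoring increment alignment since the calls here use an increment of 1. The real tiling policy lives in execution/impl/Threads.cpp.

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <thread>
    #include <vector>

    using Func2D = std::function<void(int64_t, int64_t, int64_t,    // start_x, stop_x, inc_x
                                      int64_t, int64_t, int64_t)>;  // start_y, stop_y, inc_y

    // Illustrative stand-in for the 2-D Threads::parallel_for overload: split the outer (x)
    // range across threads, pass the inner (y) range to every tile unchanged.
    static void parallel_for_2d_sketch(const Func2D& body,
                                       int64_t sx, int64_t ex, int64_t ix,
                                       int64_t sy, int64_t ey, int64_t iy,
                                       unsigned numThreads = std::thread::hardware_concurrency()) {
        const int64_t spanX = ex - sx;
        if (spanX <= 0)
            return;
        numThreads = std::max<unsigned>(1, std::min<unsigned>(numThreads, (unsigned) spanX));
        const int64_t chunk = spanX / numThreads;

        std::vector<std::thread> workers;
        for (unsigned t = 0; t < numThreads; ++t) {
            const int64_t s = sx + t * chunk;
            const int64_t e = (t == numThreads - 1) ? ex : s + chunk;
            workers.emplace_back(body, s, e, ix, sy, ey, iy);   // one batch-slice per thread
        }
        for (auto& w : workers)
            w.join();
    }

Under that reading, parallel_for(func, 0, bS, 1, 0, iC, 1) gives each thread a contiguous batch slice together with the full channel range, so no two threads touch the same output (or gradient) element in these pooling kernels.
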
+ + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { + T valIn = pIn[kh + kw]; + if (valIn > sum) { + sum = valIn; + maxKH = kh; + maxKW = kw; + } } - } - gI[pIn - in + maxKH + maxKW] += valO; - } - else { + gI[pIn - in + maxKH + maxKW] += valO; + } else { - // we set these to default values - maxKH = hstart; - maxKW = wstart; + // we set these to default values + maxKH = hstart; + maxKW = wstart; - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - T valIn = pIn[kh * iStride2 + kw * iStride3]; - if (valIn > sum) { - sum = valIn; - maxKH = kh; - maxKW = kw; + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + T valIn = pIn[kh * iStride2 + kw * iStride3]; + if (valIn > sum) { + sum = valIn; + maxKH = kh; + maxKW = kw; + } } - } - gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO; + + gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pgI, valO, hstart, wstart, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pgI = gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pgI = gI + b * gIStride0 + c * gIStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= gIStride2; - hend *= gIStride2; - wstart *= gIStride3; - wend *= gIStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / + dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / + dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + hstart *= gIStride2; + hend *= gIStride2; + wstart *= gIStride3; + wend *= gIStride3; - if ((int) extraParam0 == 0) //Exclude padding - valO /= static_cast(nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(gIStep2))) * 
static_cast(nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(gIStep3))); //Accounts for dilation - else if ((int) extraParam0 == 1) //Include padding - valO /= kProd; + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3) - pgI[kh + kw] += valO; + if ((int) extraParam0 == 0) //Exclude padding + valO /= static_cast(nd4j::math::nd4j_ceil( + static_cast(hend - hstart) / static_cast(gIStep2))) * + static_cast(nd4j::math::nd4j_ceil( + static_cast(wend - wstart) / + static_cast(gIStep3))); //Accounts for dilation + else if ((int) extraParam0 == 1) //Include padding + valO /= kProd; + + for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3) + pgI[kh + kw] += valO; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, pgI, sum, hstart, wstart, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; - pgI = sameStrides ? gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; + pgI = sameStrides ? 
gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = static_cast(0.f); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / + dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / + dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - if(sameStrides) { + sum = static_cast(0.f); + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (sameStrides) { - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - valO *= nd4j::math::nd4j_pow(sum, ((T)1. - extraParam0) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - pgI[kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(pIn[kh + kw]); - } - else { + valO *= nd4j::math::nd4j_pow(sum, + ((T) 1. - extraParam0) / extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh * iStride2 + kw * iStride3]), extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + pgI[kh + kw] += valO * nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0 - 1.f) * + nd4j::math::nd4j_sgn(pIn[kh + kw]); + } else { - valO *= nd4j::math::nd4j_pow(sum, ((T)1. - extraParam0) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) + sum += nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh * iStride2 + kw * iStride3]), + extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += dH) { - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - const auto inVal = pIn[kh * iStride2 + kw * iStride3]; - pgI[kh * gIStride2 + kw * gIStride3] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); + valO *= nd4j::math::nd4j_pow(sum, + ((T) 1. 
- extraParam0) / extraParam0); + + for (Nd4jLong kh = hstart; kh < hend; kh += dH) { + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + const auto inVal = pIn[kh * iStride2 + kw * iStride3]; + pgI[kh * gIStride2 + kw * gIStride3] += valO * + nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs( + inVal), + extraParam0 - 1.f) * + nd4j::math::nd4j_sgn( + inVal); + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { nd4j_printf("ConvolutionUtils::pooling2dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; + throw std::runtime_error("Incorrect pooling2dBP mode"); } } @@ -1425,226 +1531,239 @@ namespace nd4j { const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3 && iStride4 == gIStride4; - Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; - T sum, valO, *pIn, *pgI; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, sum, dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = -DataTypeUtils::max(); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - if(sameStrides) { + sum = -DataTypeUtils::max(); + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (sameStrides) { - maxKD = dstart; - maxKH = hstart; - maxKW = wstart; + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { - T valIn = pIn[kd + kh + kw]; - if (valIn > sum) { - 
sum = valIn; - maxKD = kd; - maxKH = kh; - maxKW = kw; + maxKD = dstart; + maxKH = hstart; + maxKW = wstart; + + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { + T valIn = pIn[kd + kh + kw]; + if (valIn > sum) { + sum = valIn; + maxKD = kd; + maxKH = kh; + maxKW = kw; + } } - } - gI[pIn - in + maxKD + maxKH + maxKW] += valO; - } - else { + gI[pIn - in + maxKD + maxKH + maxKW] += valO; + } else { - // we set these to default values - maxKH = hstart; - maxKW = wstart; - maxKD = dstart; + // we set these to default values + maxKH = hstart; + maxKW = wstart; + maxKD = dstart; - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]; - if (valIn > sum) { - sum = valIn; - maxKD = kd; - maxKH = kh; - maxKW = kw; + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]; + if (valIn > sum) { + sum = valIn; + maxKD = kd; + maxKH = kh; + maxKW = kw; + } } - } - gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO; + + gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO; + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pgI, valO, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pgI = gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pgI = gI + b * gIStride0 + c * gIStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= gIStride2; - dend *= gIStride2; - hstart *= gIStride3; - hend *= gIStride3; - wstart *= gIStride4; - wend *= gIStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW 
* ((wend - iW + dW - 1) / dW); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + dstart *= gIStride2; + dend *= gIStride2; + hstart *= gIStride3; + hend *= gIStride3; + wstart *= gIStride4; + wend *= gIStride4; - if (extraParam0 == 0) //Exclude padding - valO /= nd4j::math::nd4j_ceil(static_cast(dend-dstart) / static_cast(gIStep2)) * nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(gIStep3)) * nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(gIStep4)); //Accounts for dilation - else if (extraParam0 == 1) //Include padding - valO /= kProd; + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4) - pgI[kd + kh + kw] += valO; + if (extraParam0 == 0) //Exclude padding + valO /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(gIStep2)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(gIStep3)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(gIStep4)); //Accounts for dilation + else if (extraParam0 == 1) //Include padding + valO /= kProd; + + for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4) + pgI[kd + kh + kw] += valO; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, pgI, valO, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; - pgI = gI + (pIn - in); + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; + pgI = gI + (pIn - in); - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = static_cast(0.); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - 
if(sameStrides) { + sum = static_cast(0.); + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (sameStrides) { - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - valO *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0 - (T)1.f) * nd4j::math::nd4j_sgn(pIn[kd + kh + kw]); - } - else { + valO *= nd4j::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]),extraParam0 - (T) 1.f) * nd4j::math::nd4j_sgn(pIn[kd + kh + kw]); + } else { + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0); - valO *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); + valO *= nd4j::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4]; - pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); - } + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4]; + pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); + } + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } else { nd4j_printf("ConvolutionUtils::pooling3dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp 
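
The pooling2dBP/pooling3dBP hunks above all follow the same conversion: the OpenMP pragma with its private(...) clause is replaced by a lambda whose loop-local temporaries are declared inside the body, and only the outer batch/channel(/depth) loops are handed to samediff::Threads::parallel_for. A minimal sketch of that pattern on a hypothetical 2D helper (scaleChannels is illustrative only and not part of this patch; it assumes PRAGMA_THREADS_FOR_2D supplies start_x/stop_x/inc_x and start_y/stop_y/inc_y, as the lambdas above suggest):

    #include <execution/Threads.h>

    // Illustrative only -- scaleChannels is not part of this patch. It assumes the
    // PRAGMA_THREADS_FOR_2D macro provides start_x/stop_x/inc_x and start_y/stop_y/inc_y,
    // matching the lambdas in the pooling hunks above.
    static void scaleChannels(float *data, int bS, int iC, int spatialLen, float factor) {
        auto func = PRAGMA_THREADS_FOR_2D {
            for (auto b = start_x; b < stop_x; b += inc_x)
                for (auto c = start_y; c < stop_y; c += inc_y)
                    for (int i = 0; i < spatialLen; ++i)
                        data[(b * iC + c) * spatialLen + i] *= factor;  // each thread owns disjoint (b, c) slices
        };

        // same call shape as pooling2dBP above: start/stop/increment per parallelized dimension
        samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
    }

Keeping the inner spatial loops serial mirrors the pooling kernels: only the outermost dimensions are split across threads, so each thread touches a disjoint slice of the gradient buffer and no synchronization is needed.
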
b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp index f61a53f30..3150c0cfd 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp @@ -38,14 +38,17 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray int tads = tadsA->size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < tads; e++) { - auto a_ = tadsA->at(e); - auto b_ = tadsB->at(e); - auto o_ = tadsO->at(e); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto a_ = tadsA->at(e); + auto b_ = tadsB->at(e); + auto o_ = tadsO->at(e); - helpers::cross(context, a_, b_, o_); - } + helpers::cross(context, a_, b_, o_); + } + }; + + samediff::Threads::parallel_tad(func, 0, tads); delete tadsA; delete tadsB; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp index 55cc57d3e..f041452ab 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -44,45 +45,51 @@ namespace helpers { if (isNHWC) { const int total_count = batch_size * output_height * output_width * output_depth; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int out_idx = 0; out_idx < total_count; out_idx++) { - const int d = out_idx % output_depth; - const int out_idx2 = out_idx / output_depth; - const int w = out_idx2 % output_width; - const int out_idx3 = out_idx2 / output_width; - const int h = out_idx3 % output_height; - const int b = out_idx3 / output_height; + auto func = PRAGMA_THREADS_FOR { + for (auto out_idx = start; out_idx < stop; out_idx += increment) { + const int d = out_idx % output_depth; + const int out_idx2 = out_idx / output_depth; + const int w = out_idx2 % output_width; + const int out_idx3 = out_idx2 / output_width; + const int h = out_idx3 % output_height; + const int b = out_idx3 / output_height; - const int in_h = h / block_size; - const int offset_h = h % block_size; - const int in_w = w / block_size; - const int offset_w = w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * output_depth; - const int in_d = d + offset_d; - const int inp_idx = in_d + input_depth * (in_w + input_width * (in_h + input_height * b)); - (output_ptr + out_idx)[0] = (input_ptr + inp_idx)[0]; - } + const int in_h = h / block_size; + const int offset_h = h % block_size; + const int in_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * output_depth; + const int in_d = d + offset_d; + const int inp_idx = in_d + input_depth * (in_w + input_width * (in_h + input_height * b)); + (output_ptr + out_idx)[0] = (input_ptr + inp_idx)[0]; + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } else { const int total_count = batch_size * input_depth_by_input_area; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int input_idx = 0; input_idx < total_count; input_idx++) { - const int n_bY_bX_oC_iY = input_idx / input_width; - const int iX = input_idx - n_bY_bX_oC_iY * input_width; + auto func = PRAGMA_THREADS_FOR { + for (int input_idx = start; input_idx < stop; input_idx += increment) { + const int n_bY_bX_oC_iY = input_idx / input_width; + const int iX = input_idx - n_bY_bX_oC_iY * input_width; - const int n_bY_bX = n_bY_bX_oC_iY / output_depth_by_input_height; - const int oC_iY = n_bY_bX_oC_iY - n_bY_bX * output_depth_by_input_height; + const int 
n_bY_bX = n_bY_bX_oC_iY / output_depth_by_input_height; + const int oC_iY = n_bY_bX_oC_iY - n_bY_bX * output_depth_by_input_height; - const int n_bY = n_bY_bX / block_size; - const int bX = n_bY_bX - n_bY * block_size; + const int n_bY = n_bY_bX / block_size; + const int bX = n_bY_bX - n_bY * block_size; - const int n = n_bY / block_size; - const int bY = n_bY - n * block_size; + const int n = n_bY / block_size; + const int bY = n_bY - n * block_size; - const int output_idx = bX + block_size * (iX + input_width * (bY + block_size * (oC_iY + n * output_depth_by_input_height))); + const int output_idx = bX + block_size * (iX + input_width * (bY + block_size * (oC_iY + n * output_depth_by_input_height))); - (output_ptr + output_idx)[0] = (input_ptr + input_idx)[0]; - } + (output_ptr + output_idx)[0] = (input_ptr + input_idx)[0]; + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp index 3a687981e..f2f2033c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp @@ -34,7 +34,6 @@ static void _diagFunctor(const NDArray* input, NDArray* output) { const int inLength = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(inLength > Environment::getInstance()->elementwiseThreshold()) for(int i = 0; i < inLength; ++i) output->p(i * (inLength + 1), (*input).e(i)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp index c75bbf131..f5c0fe71c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -52,33 +53,36 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const const uint oH = output->sizeAt(1); const uint oW = output->sizeAt(2); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint oh = 0; oh < oH; ++oh) { - for (uint ow = 0; ow < oW; ++ow) { - for (uint c = 0; c < iC; ++c) { + auto func = PRAGMA_THREADS_FOR_2D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint oh = start_y; oh < stop_y; oh += inc_y) { + for (uint ow = 0; ow < oW; ++ow) { + for (uint c = 0; c < iC; ++c) { - X max = -DataTypeUtils::max(); + X max = -DataTypeUtils::max(); - for (uint kh = 0; kh < kH; ++kh) { - const int ih = oh * sH - pH + kh * dH; - if (ih < 0 || ih >= iH) continue; + for (uint kh = 0; kh < kH; ++kh) { + const int ih = oh * sH - pH + kh * dH; + if (ih < 0 || ih >= iH) continue; - for (uint kw = 0; kw < kW; ++kw) { - const int iw = ow * sW - pW + kw * dW; - if(iw < 0 || iw >= iW) continue; + for (uint kw = 0; kw < kW; ++kw) { + const int iw = ow * sW - pW + kw * dW; + if (iw < 0 || iw >= iW) continue; - const X val = x[shape::getOffset(xShapeInfo, {b,(uint)ih,(uint)iw,c})] + y[shape::getOffset(yShapeInfo, {kh,kw,c})]; - if (val > max) - max = val; + const X val = x[shape::getOffset(xShapeInfo, {b, (uint) ih, (uint) iw, c})] + y[shape::getOffset(yShapeInfo, {kh, kw, c})]; + if (val > max) + max = val; + } } - } - z[shape::getOffset(zShapeInfo, {b,oh,ow,c})] = static_cast(max); + z[shape::getOffset(zShapeInfo, {b, oh, ow, c})] = static_cast(max); + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } void dilation2d(nd4j::LaunchContext* context, NDArray *input, NDArray 
*weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index 7b40d0fa7..9db974b36 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -33,13 +34,16 @@ namespace helpers { nd4j::graph::RandomGenerator nodeRng(3019L, seed); int inLen = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(inLen > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < inLen; ++e) { - float val = nodeRng.relativeT(e, T(0.f), T(1.f)); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + float val = nodeRng.relativeT(e, T(0.f), T(1.f)); - if (val < probValue) - output->p(e, input->e(e) / probValue); - } + if (val < probValue) + output->p(e, input->e(e) / probValue); + } + }; + + samediff::Threads::parallel_for(func, 0, inLen); } BUILD_SINGLE_TEMPLATE(template void dropoutSimple, (NDArray const* input, NDArray* output, double probValue, int seed), FLOAT_TYPES); @@ -59,7 +63,6 @@ namespace helpers { std::vector dims(reduceShape->lengthOf()); bool fit = true; - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit)) for( int i = 0; i < dims.size(); i++ ) { if (fit) { dims[i] = reduceShape->e(i); @@ -126,14 +129,17 @@ namespace helpers { //input->template applyRandom>(rng, nullptr, output, probValueArr); nd4j::graph::RandomGenerator nodeRng(3019L, seed); - PRAGMA_OMP_PARALLEL_FOR_IF(input->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); - float xVal = input->e(e); - output->p(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); + float xVal = input->e(e); + output->p(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1); + } + }; - return ND4J_STATUS_OK; + samediff::Threads::parallel_for(func, 0, input->lengthOf()); + + return Status::OK(); } template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp index 2a2b631c8..073167f18 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp @@ -18,6 +18,7 @@ // Created by george on 05.04.18. 
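
The dropoutSimple hunk just above shows the one-dimensional form of the same rewrite: PRAGMA_THREADS_FOR exposes start, stop and increment, the body walks a flat element range, and the loop is dispatched with samediff::Threads::parallel_for; helpers whose iterations each own a whole sub-array (the dynamic.cpp, extract_patches.cpp and gather.cpp hunks that follow) use samediff::Threads::parallel_tad with the same lambda shape instead. A hedged, self-contained sketch of the element-wise case (applyScalar is illustrative only and not part of this patch):

    #include <execution/Threads.h>

    // Illustrative only -- applyScalar is not part of this patch. Nd4jLong comes from the
    // project's common headers; the lambda bounds (start, stop, increment) follow the
    // dropoutSimple hunk above.
    static void applyScalar(float *buffer, Nd4jLong length, float value) {
        auto func = PRAGMA_THREADS_FOR {
            for (auto e = start; e < stop; e += increment)
                buffer[e] += value;                        // plain element-wise work
        };

        samediff::Threads::parallel_for(func, 0, length);  // element-wise range -> parallel_for
        // per-sub-array work would instead use samediff::Threads::parallel_tad(func, 0, numTads)
    }
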
// #include +#include namespace nd4j { namespace ops { @@ -61,14 +62,17 @@ namespace nd4j { } else { unsigned int outSize = outputList.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(outSize > Environment::getInstance()->tadThreshold()) - for (unsigned int i = 0; i < outSize; i++) { - outputs[i].first = outputList[i]; - outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) - if (indices->e(e) == i) - outputs[i].first->p(outputs[i].second++, input->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + outputs[i].first = outputList[i]; + outputs[i].second = 0; + for (int e = 0; e < indices->lengthOf(); ++e) + if (indices->e(e) == i) + outputs[i].first->p(outputs[i].second++, input->e(e)); + } + }; + + samediff::Threads::parallel_tad(func, 0, outSize); } } template @@ -165,14 +169,17 @@ namespace nd4j { auto output = outputList[0]; unsigned int gradsSize = inputGradientList.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(gradsSize > Environment::getInstance()->tadThreshold()) - for (unsigned int i = 0; i < gradsSize; i++) { - outputs[i].first = inputGradientList[i]; - outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) - if (indices->e(e) == i) - output->p(e, outputs[i].first->e(outputs[i].second++)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + outputs[i].first = inputGradientList[i]; + outputs[i].second = 0; + for (int e = 0; e < indices->lengthOf(); ++e) + if (indices->e(e) == i) + output->p(e, outputs[i].first->e(outputs[i].second++)); + } + }; + + samediff::Threads::parallel_tad(func, 0, gradsSize); } outputList[1]->assign(indices); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp index f450584d7..f3fe89103 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -47,37 +48,41 @@ namespace helpers { rowCast = 0; if (sizeCol * rateCol < 3) colCast = 0; - //Nd4jLong outputLastDim = output->sizeAt(3); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong batch = 0; batch < batchCount; batch++) { - auto patch = listOfMatricies->at(batch); - auto outMatrix = listOfOutputs->at(batch); - for (Nd4jLong i = 0; i < outRowDim; i++) { - for (Nd4jLong j = 0; j < outColDim; j++) { - Nd4jLong pos = 0; - //for (Nd4jLong k = 0; k < outputLastDim; k++) { - auto rowStart = i * strideRow - (theSame?rowCast:0); - auto colStart = j * strideCol - (theSame?colCast:0); - auto rowEnd = rowStart + sizeRow * rateRow; - auto colEnd = colStart + sizeCol * rateCol; - if (!theSame) { - rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, rowDim); - colEnd = math::nd4j_min(colStart + sizeCol * rateCol, colDim); - } - //auto pixel = 0LL; - for (auto row = rowStart; row < rowEnd; row += rateRow) - for (auto col = colStart; col < colEnd; col += rateCol) - for (auto pixel = 0; pixel < lastDim; pixel++) { - bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame); - if (setUp) { - outMatrix->t(i, j, pos) = patch->e(row, col, pixel); - } - pos++; - } - } - } - } + auto func = PRAGMA_THREADS_FOR { + for (auto batch = 0; batch < stop; batch += increment) { + auto patch = listOfMatricies->at(batch); + auto outMatrix = listOfOutputs->at(batch); + + for (Nd4jLong i = 0; i < outRowDim; i++) { + for (Nd4jLong j = 0; j < outColDim; j++) { + Nd4jLong pos 
= 0; + //for (Nd4jLong k = 0; k < outputLastDim; k++) { + auto rowStart = i * strideRow - (theSame ? rowCast : 0); + auto colStart = j * strideCol - (theSame ? colCast : 0); + auto rowEnd = rowStart + sizeRow * rateRow; + auto colEnd = colStart + sizeCol * rateCol; + if (!theSame) { + rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, rowDim); + colEnd = math::nd4j_min(colStart + sizeCol * rateCol, colDim); + } + //auto pixel = 0LL; + for (auto row = rowStart; row < rowEnd; row += rateRow) + for (auto col = colStart; col < colEnd; col += rateCol) + for (auto pixel = 0; pixel < lastDim; pixel++) { + bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || + (!theSame); + if (setUp) { + outMatrix->t(i, j, pos) = patch->e(row, col, pixel); + } + pos++; + } + } + } + } + }; + + samediff::Threads::parallel_tad(func, 0, batchCount); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index 1a43fb250..3fb7c290d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -56,12 +57,16 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* std::vector dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... axis+indices->rankOf()-1 const Nd4jLong numOfSubArrs = indices->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, dimsOut); - NDArray subArrIn = (*input)(indices->e(i), {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, dimsOut); + NDArray subArrIn = (*input)(indices->e(i), {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } else { @@ -72,12 +77,16 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* } else { // vector case const Nd4jLong numOfSubArrs = intArgs.size() - 1; - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, {axis}); - NDArray subArrIn = (*input)(intArgs[i+1], {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, {axis}); + NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp index 687dc0bde..9e3bdf885 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -46,7 +47,7 @@ namespace nd4j { Nd4jLong distance = 0; auto lengthOf = x.lengthOf(); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); + int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); Nd4jLong intermediate[256]; // nullify temp values @@ -54,30 +55,38 @@ namespace nd4j { intermediate[e] = 0; if (xEws == 1 && yEws == 1 && x.ordering() == 
y.ordering()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(xBuffer[e]); - auto _y = static_cast(yBuffer[e]); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(xBuffer[e]); + auto _y = static_cast(yBuffer[e]); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(xBuffer[e * xEws]); - auto _y = static_cast(yBuffer[e * yEws]); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(xBuffer[e * xEws]); + auto _y = static_cast(yBuffer[e * yEws]); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(x.e(e)); - auto _y = static_cast(y.e(e)); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(x.e(e)); + auto _y = static_cast(y.e(e)); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } // accumulate intermediate variables into output array diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp index b254788a8..04df86c36 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -40,18 +41,20 @@ namespace nd4j { auto tempResult = tempBufferB; // we divide array into 32 element chunks, and store intermediate results once - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int b = 0; b < numBlocks; b++) { - auto blockBuffer = buffer + b * numBlocks; + auto func = PRAGMA_THREADS_FOR { + for (auto b = 0; b < stop; b += increment) { + auto blockBuffer = buffer + b * numBlocks; - Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { - auto v = longBytes(blockBuffer[e]); - r = 31 * r + v; + Nd4jLong r = 1; + for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { + auto v = longBytes(blockBuffer[e]); + r = 31 * r + v; + } + + tempBuffer[b] = r; } - - tempBuffer[b] = r; - } + }; + samediff::Threads::parallel_tad(func, 0, numBlocks); // we replace pointer with intermediate one, and repeat only one chunk left int iterationCount = 0; @@ -60,18 +63,20 @@ namespace nd4j { numBlocks = lastLength / blockSize + ((lastLength % blockSize == 0) ? 
0 : 1); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int b = 0; b < numBlocks; b++) { - auto blockBuffer = tempBuffer + b * numBlocks; + auto func2 = PRAGMA_THREADS_FOR { + for (auto b = start; b < stop; b += increment) { + auto blockBuffer = tempBuffer + b * numBlocks; - Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { - auto v = longBytes(blockBuffer[e]); - r = 31 * r + v; + Nd4jLong r = 1; + for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { + auto v = longBytes(blockBuffer[e]); + r = 31 * r + v; + } + + tempResult[b] = r; } - - tempResult[b] = r; - } + }; + samediff::Threads::parallel_tad(func2, 0, numBlocks); iterationCount++; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp b/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp index 349d0381a..1ffb59824 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp @@ -42,29 +42,17 @@ void histogramFixedWidth_(const NDArray& input, const NDArray& range, NDArray& o Nd4jLong inputLength = input.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR + // FIXME: make this one parallel without CRITICAL section for(Nd4jLong i = 0; i < inputLength; ++i) { - const T value = input.e(i); if(value < secondEdge) { - - PRAGMA_OMP_CRITICAL - { - output.p(0, output.e(0) + 1); - } + output.p(0, output.e(0) + 1); } else if(value >= lastButOneEdge) { - PRAGMA_OMP_CRITICAL - { - output.p(nbins - 1, output.e(nbins - 1) + 1); - } + output.p(nbins - 1, output.e(nbins - 1) + 1); } else { Nd4jLong currInd = static_cast((value - leftEdge) / binWidth); - - PRAGMA_OMP_CRITICAL - { - output.p(currInd, output.e(currInd) + 1); - } + output.p(currInd, output.e(currInd) + 1); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp index 002c68226..7be34e6ca 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { @@ -59,64 +60,71 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra const Nd4jLong imStride2 = imStride[2]; const Nd4jLong imStride3 = imStride[3]; - T *col, *im; - int imRow, imCol; if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_2D { + for (int b = start_x; b < stop_x; b++) { + for (int c = start_y; c < stop_y; c++) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + int imRow = (-pH + kRow * dH) + colH * sH; + int imCol = (-pW + kCol * dW) + colW * sW; - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; + auto col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * 
colStride4 + colW * colStride5; - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else { - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - *col = *im; + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) + *col = zeroPadVal; + else { + auto im = imBuff + b * imStride0 + c * imStride1 + imRow * imStride2 + imCol * imStride3; + *col = *im; + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(im, col, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *im; + int imRow, imCol; - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int colH = start_y; colH < stop_y; colH += inc_y) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; + imRow = (-pH + kRow * dH) + colH * sH; + imCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else { - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - *col = *im; + col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * colStride4 + colW * colStride5; + + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) + *col = zeroPadVal; + else { + im = imBuff + b * imStride0 + c * imStride1 + imRow * imStride2 + imCol * imStride3; + *col = *im; + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index 2ac679fc5..11bc1ecaa 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -83,7 +84,7 @@ namespace helpers { return top + (bottom - top) * yVal; }; - PRAGMA_OMP_PARALLEL_FOR_SIMD + // FIXME: fix parallelism here for (Nd4jLong b = 0; b < batchSize; ++b) { for (Nd4jLong y = 0; y < outHeight; ++y) { const T *ys_input_lower_ptr = input_b_ptr + ys[y].bottomIndex * inRowSize; @@ -149,11 +150,13 @@ namespace helpers { int xsSize = xs.size(); // Scale x interpolation weights to avoid a multiplication during iteration. - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int i = 0; i < xsSize; ++i) { - xs[i].bottomIndex *= channels; - xs[i].topIndex *= channels; - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + xs[i].bottomIndex *= channels; + xs[i].topIndex *= channels; + } + }; + samediff::Threads::parallel_for(func, 0, xsSize); resizeImage(images, batchSize, inHeight, inWidth, outHeight, outWidth, channels, xs, ys, output); return ND4J_STATUS_OK; @@ -184,24 +187,22 @@ namespace helpers { double heightScale = center ? (inHeight - 1.) 
/ double(outHeight - 1.0) : (inHeight / double(outHeight)); double widthScale = center ? (inWidth - 1.) / double(outWidth - 1.0) : (inWidth / double(outWidth)); - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (int b = 0; b < batchSize; ++b) { - for (int y = 0; y < outHeight; ++y) { - Nd4jLong inY = nd4j::math::nd4j_min( - (center) ? static_cast(nd4j::math::p_round(y * heightScale)) : static_cast(nd4j::math::p_floor( - y * heightScale)), inHeight - 1); - for (int x = 0; x < outWidth; ++x) { - Nd4jLong inX = nd4j::math::nd4j_min( - (center) ? static_cast(nd4j::math::p_round(x * widthScale)) : static_cast(nd4j::math::p_floor( - x * widthScale)), inWidth - 1); - for (Nd4jLong e = 0; e < channels; e++) - output->p(b, y, x, e, images->e(b, inY, inX, e)); -// std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0)); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto y = start_y; y < stop_y; y += inc_y) { + Nd4jLong inY = nd4j::math::nd4j_min((center) ? static_cast(nd4j::math::p_round(y * heightScale)) : static_cast(nd4j::math::p_floor(y * heightScale)), inHeight - 1); + + for (int x = 0; x < outWidth; ++x) { + Nd4jLong inX = nd4j::math::nd4j_min((center) ? static_cast(nd4j::math::p_round(x * widthScale)) : static_cast(nd4j::math::p_floor(x * widthScale)),inWidth - 1); + for (Nd4jLong e = 0; e < channels; e++) + output->p(b, y, x, e, images->e(b, inY, inX, e)); + } } } - } + }; + samediff::Threads::parallel_for(func, 0, batchSize, 1, 0, outHeight, 1); - return ND4J_STATUS_OK; + return Status::OK(); } void resizeImage(NDArray const *images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, @@ -263,67 +264,73 @@ namespace helpers { T heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / (cropHeight - 1) : T(0); T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int y = 0; y < cropHeight; ++y) { - const float inY = (cropHeight > 1) - ? y1 * (imageHeight - 1) + y * heightScale - : 0.5 * (y1 + y2) * (imageHeight - 1); - if (inY < 0 || inY > imageHeight - 1) { - for (int x = 0; x < cropWidth; ++x) { - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - } - continue; - } - if (method == 0 /* bilinear */) { - const int topYIndex = nd4j::math::p_floor(inY); - const int bottomYIndex = nd4j::math::p_ceil(inY); - const float y_lerp = inY - topYIndex; + auto func = PRAGMA_THREADS_FOR { + for (int y = start; y < stop; y += increment) { + const float inY = (cropHeight > 1) + ? y1 * (imageHeight - 1) + y * heightScale + : 0.5 * (y1 + y2) * (imageHeight - 1); - for (int x = 0; x < cropWidth; ++x) { - const float in_x = (cropWidth > 1) - ? 
x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - if (in_x < 0 || in_x > imageWidth - 1) { + if (inY < 0 || inY > imageHeight - 1) { + for (int x = 0; x < cropWidth; ++x) { for (int d = 0; d < depth; ++d) { crops->p(b, y, x, d, extrapolationVal); } - continue; - } - int left_x_index = math::p_floor(in_x); - int right_x_index = math::p_ceil(in_x); - T x_lerp = in_x - left_x_index; - - for (int d = 0; d < depth; ++d) { - const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); - const float topRight(images->e(bIn, topYIndex, right_x_index, d)); - const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); - const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); - const float top = topLeft + (topRight - topLeft) * x_lerp; - const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; - crops->p(b, y, x, d, top + (bottom - top) * y_lerp); } + continue; } - } else { // method is "nearest neighbor" - for (int x = 0; x < cropWidth; ++x) { - const float inX = (cropWidth > 1) - ? x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - if (inX < 0 || inX > imageWidth - 1) { - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); + if (method == 0 /* bilinear */) { + const int topYIndex = nd4j::math::p_floor(inY); + const int bottomYIndex = nd4j::math::p_ceil(inY); + const float y_lerp = inY - topYIndex; + + for (int x = 0; x < cropWidth; ++x) { + const float in_x = (cropWidth > 1) + ? x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (in_x < 0 || in_x > imageWidth - 1) { + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + int left_x_index = math::p_floor(in_x); + int right_x_index = math::p_ceil(in_x); + T x_lerp = in_x - left_x_index; + + for (int d = 0; d < depth; ++d) { + const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); + const float topRight(images->e(bIn, topYIndex, right_x_index, d)); + const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); + const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); + const float top = topLeft + (topRight - topLeft) * x_lerp; + const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; + crops->p(b, y, x, d, top + (bottom - top) * y_lerp); } - continue; } - const int closestXIndex = roundf(inX); - const int closestYIndex = roundf(inY); - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, (F)images->e(bIn, closestYIndex, closestXIndex, d)); + } else { // method is "nearest neighbor" + for (int x = 0; x < cropWidth; ++x) { + const float inX = (cropWidth > 1) + ? 
x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (inX < 0 || inX > imageWidth - 1) { + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + const int closestXIndex = roundf(inX); + const int closestYIndex = roundf(inY); + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, (F) images->e(bIn, closestYIndex, closestXIndex, d)); + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, cropHeight); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp index f4fb98b2a..ab48ebb32 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp @@ -72,7 +72,8 @@ namespace helpers { for (int i = 0; i < numBoxes; ++i) { bool shouldSelect = numSelected < output->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(numSelected)) + + // FIXME: add parallelism here for (int j = numSelected - 1; j >= 0; --j) { if (shouldSelect) if (needToSuppressWithThreshold(*boxes, indices[i], indices[selectedIndices[j]], T(overlapThreshold))) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp index def210457..4bc9d3304 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -144,14 +145,8 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector int span = (tads / num_threads) + 8; - PRAGMA_OMP_PARALLEL_THREADS(num_threads) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int end = span * (tid + 1); - if (end > tads) end = tads; - - for (int r = start; r < end; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { auto rX = const_cast(input)->bufferAsT() + tadOffsets[r]; auto rZ = output->bufferAsT() + zOfsets[r]; @@ -198,7 +193,9 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, tads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp b/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp index 09cb2df2e..62f8316ce 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index 0d0705104..c9b833cf5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,76 +61,80 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const T* x = inBuff + inTadOffsets[i]; - T* y = outBuff + outTadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const T *x = inBuff + inTadOffsets[i]; + T *y = outBuff + outTadOffsets[i]; - T prev = 0; + T prev = 0; - // calculate squared sum of elements per each j-th element range [j 
- depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - for (uint s = begin; s < end; ++s) - prev = prev + x[s] * x[s]; - y[j] = prev; + // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + for (uint s = begin; s < end; ++s) + prev = prev + x[s] * x[s]; + y[j] = prev; + } else if (begin == 0 && last <= tadLen) + y[j] = prev + x[end - 1] * x[end - 1]; + else if (begin > 0 && last <= tadLen) + y[j] = prev + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; + else if (begin > 0 && last > tadLen) + y[j] = prev - x[begin - 1] * x[begin - 1]; + else + y[j] = prev; + + if (j != 0) + prev = y[j]; + + y[j] = x[j] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); } - else if (begin == 0 && last <= tadLen) - y[j] = prev + x[end - 1] * x[end - 1]; - else if (begin > 0 && last <= tadLen) - y[j] = prev + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; - else if (begin > 0 && last > tadLen) - y[j] = prev - x[begin - 1] * x[begin - 1]; - else - y[j] = prev; + } + }; - if(j != 0) - prev = y[j]; - - y[j] = x[j] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); - } - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const T* x = inBuff + inTadOffsets[i]; - T* y = outBuff + outTadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = 0; i < numOfTads; ++i) { + const T *x = inBuff + inTadOffsets[i]; + T *y = outBuff + outTadOffsets[i]; - T prev = 0; + T prev = 0; - // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - for (uint s = begin; s < end; ++s) - prev = prev + x[s*inTadEws] * x[s*inTadEws]; - y[j*outTadEws] = prev; + // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + for (uint s = begin; s < end; ++s) + prev = prev + x[s * inTadEws] * x[s * inTadEws]; + y[j * outTadEws] = prev; + } else if (begin == 0 && last <= tadLen) + y[j * outTadEws] = prev + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws]; + else if (begin > 0 && last <= tadLen) + y[j * outTadEws] = prev + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws] - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else if (begin > 0 && last > tadLen) + y[j * outTadEws] = prev - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else + y[j * outTadEws] = prev; + + if (j != 0) + prev = y[j * outTadEws]; + + y[j * outTadEws] = x[j * inTadEws] / nd4j::math::nd4j_pow(tbias + 
alpha * prev, tbeta); } - else if (begin == 0 && last <= tadLen) - y[j*outTadEws] = prev + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws]; - else if (begin > 0 && last <= tadLen) - y[j*outTadEws] = prev + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else if (begin > 0 && last > tadLen) - y[j*outTadEws] = prev - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else - y[j*outTadEws] = prev; + } + }; - if(j != 0) - prev = y[j*outTadEws]; - - y[j*outTadEws] = x[j*inTadEws] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); - } - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } return Status::OK(); } @@ -173,141 +178,146 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const X* x = inBuff + inTadOffsets[i]; - Y* y = gradIBuff + gradITadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const X *x = inBuff + inTadOffsets[i]; + Y *y = gradIBuff + gradITadOffsets[i]; - // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - y[0] = 0; - for (uint s = begin; s < end; ++s) - y[0] = y[0] + x[s] * x[s]; + // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + y[0] = 0; + for (uint s = begin; s < end; ++s) + y[0] = y[0] + x[s] * x[s]; + } else if (begin == 0 && last <= tadLen) + y[j] = y[j - 1] + x[end - 1] * x[end - 1]; + else if (begin > 0 && last <= tadLen) + y[j] = y[j - 1] + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; + else if (begin > 0 && last > tadLen) + y[j] = y[j - 1] - x[begin - 1] * x[begin - 1]; + else + y[j] = y[j - 1]; } - else if (begin == 0 && last <= tadLen) - y[j] = y[j - 1] + x[end - 1] * x[end - 1]; - else if (begin > 0 && last <= tadLen) - y[j] = y[j - 1] + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; - else if (begin > 0 && last > tadLen) - y[j] = y[j - 1] - x[begin - 1] * x[begin - 1]; - else - y[j] = y[j - 1]; + + Y *factor = new Y[tadLen]; + + Y prev = 0; + // second loop calculates derivatives using information gained in first loop above + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + Y init = tbias + talpha * y[j]; + + if (j == 0) { + for (uint s = begin; s < end; ++s) { + factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s], -tbeta - 1); + prev = prev + x[s] * factor[s]; + } + y[0] = prev; + } else if (begin == 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); + y[j] = prev + x[end - 1] * factor[end - 1]; + } else if (begin > 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); + y[j] = prev + x[end - 1] * 
factor[end - 1] - x[begin - 1] * factor[begin - 1]; + } else if (begin > 0 && last > tadLen) + y[j] = prev - x[begin - 1] * factor[begin - 1]; + else + y[j] = prev; + + if (j != 0) + prev = y[j]; + + y[j] = factor[j] * init - 2 * x[j] * coeff * prev; + } + + delete[]factor; } + }; - Y* factor = new Y[tadLen]; - - Y prev = 0; - // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - Y init = tbias + talpha * y[j]; - - if (j == 0) { - for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s], -tbeta - 1); - prev = prev + x[s] * factor[s]; - } - y[0] = prev; - } - else if(begin == 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); - y[j] = prev + x[end - 1] * factor[end - 1]; - } - else if (begin > 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); - y[j] = prev + x[end - 1] * factor[end - 1] - x[begin - 1] * factor[begin - 1]; - } - else if (begin > 0 && last > tadLen) - y[j] = prev - x[begin - 1] * factor[begin - 1]; - else - y[j] = prev; - - if(j != 0) - prev = y[j]; - - y[j] = factor[j] * init - 2 * x[j] * coeff * prev; - } - - delete []factor; - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const X* x = inBuff + inTadOffsets[i]; - Y* y = gradIBuff + gradITadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const X *x = inBuff + inTadOffsets[i]; + Y *y = gradIBuff + gradITadOffsets[i]; - // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - y[0] = 0; - for (uint s = begin; s < end; ++s) - y[0] = y[0] + x[s*inTadEws] * x[s*inTadEws]; + // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + y[0] = 0; + for (uint s = begin; s < end; ++s) + y[0] = y[0] + x[s * inTadEws] * x[s * inTadEws]; + } else if (begin == 0 && last <= tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws]; + else if (begin > 0 && last <= tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws] - + x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else if (begin > 0 && last > tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else + y[j * gradITadEws] = y[(j - 1) * gradITadEws]; } - else if (begin == 0 && last <= tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws]; - else if (begin > 0 && last <= tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] + x[(end - 1)*inTadEws] * 
x[(end - 1)*inTadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else if (begin > 0 && last > tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else - y[j*gradITadEws] = y[(j - 1)*gradITadEws]; + + Y *factor = new Y[tadLen]; + + Y prev = 0; + // second loop calculates derivatives using information gained in first loop above + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + Y init = tbias + talpha * y[j * gradITadEws]; + + if (j == 0) { + for (uint s = begin; s < end; ++s) { + factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s * gradITadEws], -tbeta - 1); + prev = prev + x[s * inTadEws] * factor[s]; + } + y[0] = prev; + } else if (begin == 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], + -tbeta - 1); + y[j * gradITadEws] = prev + x[(end - 1) * inTadEws] * factor[end - 1]; + } else if (begin > 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], + -tbeta - 1); + y[j * gradITadEws] = prev + x[(end - 1) * inTadEws] * factor[end - 1] - + x[(begin - 1) * inTadEws] * factor[begin - 1]; + } else if (begin > 0 && last > tadLen) + y[j * gradITadEws] = prev - x[(begin - 1) * inTadEws] * factor[begin - 1]; + else + y[j * gradITadEws] = prev; + + if (j != 0) + prev = y[j * gradITadEws]; + + y[j * gradITadEws] = factor[j] * init - 2 * x[j * inTadEws] * coeff * prev; + } + + delete[]factor; } + }; - Y* factor = new Y[tadLen]; - - Y prev = 0; - // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - Y init = tbias + talpha * y[j*gradITadEws]; - - if (j == 0) { - for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s*gradITadEws], -tbeta - 1); - prev = prev + x[s*inTadEws] * factor[s]; - } - y[0] = prev; - } - else if(begin == 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1)*gradITadEws], -tbeta - 1); - y[j*gradITadEws] = prev + x[(end - 1)*inTadEws] * factor[end - 1]; - } - else if (begin > 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1)*gradITadEws], -tbeta - 1); - y[j*gradITadEws] = prev + x[(end - 1)*inTadEws] * factor[end - 1] - x[(begin - 1)*inTadEws] * factor[begin - 1]; - } - else if (begin > 0 && last > tadLen) - y[j*gradITadEws] = prev - x[(begin - 1)*inTadEws] * factor[begin - 1]; - else - y[j*gradITadEws] = prev; - - if(j != 0) - prev = y[j*gradITadEws]; - - y[j*gradITadEws] = factor[j] * init - 2 * x[j*inTadEws] * coeff * prev; - } - - delete []factor; - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } gradI *= gradO; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 743aab40a..922fdc3a9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -34,6 +34,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -122,11 +123,14 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto cLast_ = cLast->bufferAsT(); auto h_ = 
h->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint e = 0; e < uLen; e++) { - c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); - h_[e] = nd4j::math::nd4j_tanh(c_[e]); - } + auto func = PRAGMA_THREADS_FOR { + for (uint e = start; e < stop; e += increment) { + c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); + h_[e] = nd4j::math::nd4j_tanh(c_[e]); + } + }; + + samediff::Threads::parallel_for(func, 0, uLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp index 9a2034fd0..25605d77e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp @@ -20,6 +20,7 @@ #include "ResultSet.h" #include +#include namespace nd4j { namespace ops { @@ -47,22 +48,22 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp const int xRank = input.rankOf(); const auto xLen = input.lengthOf(); - std::vector coords(xRank); // we use the same coordinates storage both for input and output since their ranks are the same + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (Nd4jLong i = 0; i < xLen; ++i) { + shape::index2coords(i, xShapeInfo, coords); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for (Nd4jLong i = 0; i < xLen; ++i) { + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords); - shape::index2coords(i, xShapeInfo, coords.data()); - - const auto xOffset = shape::getOffset(xShapeInfo, coords.data()); - const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords.data()); - - // condition to be on diagonal of innermost matrix - if(coords[xRank - 2] == coords[xRank - 1]) - z[zOffset] = y[shape::getOffset(yShapeInfo, coords.data())]; - else - z[zOffset] = zeroPad ? static_cast(0) : x[xOffset]; - } + // condition to be on diagonal of innermost matrix + if (coords[xRank - 2] == coords[xRank - 1]) + z[zOffset] = y[shape::getOffset(yShapeInfo, coords)]; + else + z[zOffset] = zeroPad ? 
static_cast(0) : x[xOffset]; + } + }; + samediff::Threads::parallel_for(func, 0, xLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp index c26637bd8..e0e487e82 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp @@ -21,6 +21,7 @@ #include "ResultSet.h" #include #include +#include namespace nd4j { namespace ops { @@ -43,10 +44,14 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { int lastDimension = nd4j::math::nd4j_min(input->sizeAt(-2), input->sizeAt(-1)); // TODO: tune this properlys int lO = listOut->size(); - PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < lO; ++i) - for(int j = 0; j < lastDimension; ++j) - listOut->at(i)->p(j, listDiag->at(i)->e(j, j)); + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + for (int j = 0; j < lastDimension; ++j) + listOut->at(i)->p(j, listDiag->at(i)->e(j, j)); + }; + + samediff::Threads::parallel_tad(func, 0, lO); delete listOut; delete listDiag; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp index daffa8f17..8c5332be6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -53,11 +54,14 @@ namespace helpers { std::unique_ptr rows(sortedVals.allTensorsAlongDimension(lastDims)); Nd4jLong oL = output->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < oL; e++) { - auto row = rows->at(e); - output->p(e, row->e(n)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto row = rows->at(e); + output->p(e, row->e(n)); + } + }; + + samediff::Threads::parallel_for(func, 0, oL); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index a83518899..3e18d6d14 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -20,6 +20,7 @@ #include #include +#include #include "../one_hot.h" namespace nd4j { @@ -47,41 +48,47 @@ namespace nd4j { Z one = static_cast(on); if (tadEws >= 1) { - PRAGMA_OMP_PARALLEL_FOR - for (unsigned int e = 0; e < numTads; e++) { - auto cO = output + tadPack.primaryOffsets()[e]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = 0; e < stop; e += increment) { + auto cO = output + tadPack.primaryOffsets()[e]; - auto idx = static_cast(indices[e]); - if (idx < 0 || idx >= tLen) { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[t * tadEws] = zero; - } - } else { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[t * tadEws] = idx == t ? one : zero; + auto idx = static_cast(indices[e]); + if (idx < 0 || idx >= tLen) { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[t * tadEws] = zero; + } + } else { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[t * tadEws] = idx == t ? 
one : zero; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } else { - PRAGMA_OMP_PARALLEL_FOR - for (unsigned int e = 0; e < numTads; e++) { - auto cO = output + tadPack.primaryOffsets()[e]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto cO = output + tadPack.primaryOffsets()[e]; - auto idx = static_cast(indices[e]); - if (idx < 0 || idx >= tLen) { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero; - } - } else { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero; + auto idx = static_cast(indices[e]); + if (idx < 0 || idx >= tLen) { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero; + } + } else { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp b/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp index 6ebbb784b..5c1f3c28d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp @@ -66,7 +66,7 @@ static void _percentile(const NDArray& input, NDArray& output, std::vector& position = len - position - 1; // FIXME: our sort impl should be used instead, so this operation might be implemented as generic - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(flattenedArr)) + // FIXME: parallelism ! for(int i=0; isize(); ++i) { T* buff = reinterpret_cast(flattenedArr.getBuffer()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp index 6290de6ad..cb97ffe1e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -39,7 +40,6 @@ static FORCEINLINE T getFactorial(const int n) { T result = (T)1.f; - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(prodT : result) for(int i = 2; i <= n; ++i) result *= i; @@ -74,9 +74,12 @@ static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const ND NDArray& result = output; int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < x.lengthOf(); ++i) - result.p(i, polyGammaScalar(context, n.e(i), x.e(i))); + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + result.p(i, polyGammaScalar(context, n.e(i), x.e(i))); + }; + samediff::Threads::parallel_for(func, 0, x.lengthOf()); // return result; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp index 33ba9575d..bb0e7e24e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp @@ -20,6 +20,7 @@ #include +#include namespace nd4j { namespace ops { @@ -37,10 +38,11 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto auto s = start.e(0); auto d = delta.e(0); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < len; ++i) - buff[i] = s + i * d; - + auto func = 
PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + buff[i] = s + i * d; + }; + samediff::Threads::parallel_for(func, 0, len); } void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 83deeca88..9f424606d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { @@ -52,36 +53,36 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * // two step phase here if (inArr == outArr) { if (inEWS == 1) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { - auto idx = sLength - e; - swap(inArr, e, idx); -// T tmp = inArr[e]; -// inArr[e] = inArr[idx]; -// inArr[idx] = tmp; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto idx = sLength - e; + swap(inArr, e, idx); + } + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } else if (inEWS > 1) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { - auto idx1 = (sLength - e) * inEWS; - Nd4jLong idx2 = e * inEWS; -// T tmp = inArr[idx2]; -// inArr[idx2] = inArr[idx1]; -// inArr[idx1] = tmp; - swap(inArr, idx1, idx2); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto idx1 = (sLength - e) * inEWS; + Nd4jLong idx2 = e * inEWS; + swap(inArr, idx1, idx2); + } + }; + + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); + swap(outArr, inOffset, outOffset); + } + }; - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); - //outArr[outOffset] = inArr[inOffset]; - swap(outArr, inOffset, outOffset); - } + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } } else { @@ -91,47 +92,57 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) - outArr[sLength - e] = inArr[e]; + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong e = start; e < stop; e += increment) + outArr[sLength - e] = inArr[e]; + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) - outArr[e] = inArr[e]; + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[e] = inArr[e]; + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) - outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; + }; + 
samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) - outArr[e * outEWS] = inArr[e * inEWS]; + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[e * outEWS] = inArr[e * inEWS]; + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) { - - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); - outArr[outOffset] = inArr[inOffset]; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); + outArr[outOffset] = inArr[inOffset]; + } + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) { - - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = shape::getIndexOffset(e, outShapeBuffer); - outArr[outOffset] = inArr[inOffset]; - } + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(e, outShapeBuffer); + outArr[outOffset] = inArr[inOffset]; + } + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } } @@ -140,7 +151,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * /////////////////////////////////////////////////////////////////// template -static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ +static void reverseSequence_(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ int posOfNonUnityDim = -1; if(input->isVector() || shape::isLikeVector(input->getShapeInfo(), posOfNonUnityDim)) { @@ -184,7 +195,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input } void reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim) { - BUILD_SINGLE_SELECTOR(input->dataType(), _reverseSequence, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR(input->dataType(), reverseSequence_, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); } ////////////////////////////////////////////////////////////////////////// @@ -208,7 +219,7 @@ void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* outpu delete listIn; } -BUILD_SINGLE_TEMPLATE(template void _reverseSequence, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void reverseSequence_, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void reverseArray, (nd4j::LaunchContext * context, void *inArr, Nd4jLong *inShapeBuffer, void *outArr, Nd4jLong *outShapeBuffer, int 
numOfElemsToReverse), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 5b4c44874..5422d04c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -20,6 +20,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -53,21 +54,22 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop const uint iC = xShapeInfo[4]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint h = cropBottom; h < iH - cropTop; ++h) { - for (uint w = cropLeft; w < iW - cropRight; ++w) { - for (uint c = 0; c < iC; ++c) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint h = start_y; h < stop_y; h += inc_y) { + for (uint w = start_z; w < stop_z; w += inc_z) { + for (uint c = 0; c < iC; ++c) { + const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; + const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; - const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; - - const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; - - z[zOffset] = x[xOffset]; + z[zOffset] = x[xOffset]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, cropBottom, iH - cropTop, 1, cropLeft, iW - cropRight, 1); } BUILD_SINGLE_TEMPLATE(template void batchToSpace_, (const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight), LIBND4J_TYPES); @@ -109,23 +111,24 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& const int rank = input.rankOf(); const Nd4jLong zLen = output.lengthOf(); - std::vector coords(rank); - // loop through input array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { - for (Nd4jLong i = 0; i < zLen; ++i) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + // evaluate spatial coordinates for x + for (uint j = 1; j <= numOfSpatialDims; ++j) + coords[j] += crop.e(j - 1, 0); // add crop left - // evaluate spatial coordinates for x - for(uint j = 1; j <= numOfSpatialDims; ++j) - coords[j] += crop.e(j - 1, 0); // add crop left + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + } + }; - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void batchToSpaceND_, (const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); @@ -212,24 +215,26 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB const uint iC = zShapeInfo[4]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint h = 0; h < oH; ++h) { - for (uint w = 0; w < oW; ++w) { - for (uint c = 
0; c < iC; ++c) { + auto func = PRAGMA_THREADS_FOR_2D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint h = start_y; h < stop_y; h += inc_y) { + for (uint w = 0; w < oW; ++w) { + for (uint c = 0; c < iC; ++c) { - const Nd4jLong zOffset = b * zShapeInfo[5] + h * zShapeInfo[6] + w * zShapeInfo[7] + c * zShapeInfo[8]; + const Nd4jLong zOffset = b * zShapeInfo[5] + h * zShapeInfo[6] + w * zShapeInfo[7] + c * zShapeInfo[8]; - if(h >= padBottom && h < oH - padTop && w >= padLeft && w < oW - padRight) { - const Nd4jLong xOffset = b * xShapeInfo[5] + (h - padBottom) * xShapeInfo[6] + (w - padLeft) * xShapeInfo[7] + c * xShapeInfo[8]; - z[zOffset] = x[xOffset]; + if (h >= padBottom && h < oH - padTop && w >= padLeft && w < oW - padRight) { + const Nd4jLong xOffset = b * xShapeInfo[5] + (h - padBottom) * xShapeInfo[6] + (w - padLeft) * xShapeInfo[7] + c * xShapeInfo[8]; + z[zOffset] = x[xOffset]; + } else + z[zOffset] = 0.f; } - else - z[zOffset] = 0.f; } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } BUILD_SINGLE_TEMPLATE(template void spaceToBatch_, (const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight), LIBND4J_TYPES); @@ -292,36 +297,37 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra const int rank = input.rankOf(); const Nd4jLong zLen = output.lengthOf(); - std::vector coords(rank); - // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + bool within = true; - bool within = true; + for (uint j = 1; j <= numOfSpatialDims; ++j) { - for(uint j = 1; j <= numOfSpatialDims; ++j) { + const auto padLeft = padding.e(j - 1, 0); + const auto padRight = padding.e(j - 1, 1); - const auto padLeft = padding.e(j - 1, 0); - const auto padRight = padding.e(j - 1, 1); + within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight); - within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight); + if (!within) + break; - if(!within) - break; + coords[j] -= padLeft; // get coordinates for x + } - coords[j] -= padLeft; // get coordinates for x + if (within) + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + else + z[zOffset] = 0.f; } + }; - if(within) - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - else - z[zOffset] = 0.f; - } + samediff::Threads::parallel_tad(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void spaceToBatchND_, (const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp index af9a74b68..fd285ed9c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -46,47 +47,53 @@ namespace helpers { if (isNHWC) { const int total_count = batch_size * input_height * input_width * input_depth; - 
PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int inp_idx = 0; inp_idx < total_count; inp_idx++){ - // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) - const int d = inp_idx % input_depth; - const int inp_idx2 = inp_idx / input_depth; - const int w = inp_idx2 % input_width; - const int inp_idx3 = inp_idx2 / input_width; - const int h = inp_idx3 % input_height; - const int b = inp_idx3 / input_height; + auto func = PRAGMA_THREADS_FOR { + for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) + const int d = inp_idx % input_depth; + const int inp_idx2 = inp_idx / input_depth; + const int w = inp_idx2 % input_width; + const int inp_idx3 = inp_idx2 / input_width; + const int h = inp_idx3 % input_height; + const int b = inp_idx3 / input_height; - const int out_h = h / block_size; - const int offset_h = h % block_size; - const int out_w = w / block_size; - const int offset_w = w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * input_depth; - const int out_d = d + offset_d; - - const int out_idx = out_d + output_depth * (out_w + output_width * (out_h + output_height * b)); - *(output_ptr + out_idx) = *(input_ptr + inp_idx); - } + const int out_h = h / block_size; + const int offset_h = h % block_size; + const int out_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * input_depth; + const int out_d = d + offset_d; + + const int out_idx = out_d + output_depth * (out_w + output_width * (out_h + output_height * b)); + *(output_ptr + out_idx) = *(input_ptr + inp_idx); + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } else { const int total_count = batch_size * output_depth_by_output_area; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int inp_idx = 0; inp_idx < total_count; inp_idx++) { - const int n_iC_oY_bY_oX = inp_idx / block_size; - const int bX = inp_idx - n_iC_oY_bY_oX * block_size; + auto func = PRAGMA_THREADS_FOR { + for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + const int n_iC_oY_bY_oX = inp_idx / block_size; + const int bX = inp_idx - n_iC_oY_bY_oX * block_size; - const int n_iC_oY_bY = n_iC_oY_bY_oX / output_width; - const int oX = n_iC_oY_bY_oX - n_iC_oY_bY * output_width; + const int n_iC_oY_bY = n_iC_oY_bY_oX / output_width; + const int oX = n_iC_oY_bY_oX - n_iC_oY_bY * output_width; - const int n_iC_oY = n_iC_oY_bY / block_size; - const int bY = n_iC_oY_bY - n_iC_oY * block_size; + const int n_iC_oY = n_iC_oY_bY / block_size; + const int bY = n_iC_oY_bY - n_iC_oY * block_size; - const int n = n_iC_oY / input_depth_by_output_height; - const int iC_oY = n_iC_oY - n * input_depth_by_output_height; + const int n = n_iC_oY / input_depth_by_output_height; + const int iC_oY = n_iC_oY - n * input_depth_by_output_height; - const int output_idx = oX + (((n * block_size + bY) * block_size + bX) * input_depth_by_output_height + iC_oY) * output_width; - - *(output_ptr + output_idx) = *(input_ptr + inp_idx); - } + const int output_idx = oX + (((n * block_size + bY) * block_size + bX) * input_depth_by_output_height + iC_oY) * output_width; + + *(output_ptr + output_idx) = *(input_ptr + inp_idx); + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index 0b16ac989..99605e7cc 100644 --- 
a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -34,16 +35,16 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind const Nd4jLong indLen = indices.lengthOf(); if(outRank == 1) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + Nd4jLong idx = indices.e(i); + NDArray out = output({idx, idx + 1}); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + out.applyPairwiseTransform(op, updates.e(i), nullptr); + } + }; - Nd4jLong idx = indices.e(i); - NDArray out = output({idx, idx+1}); - - out.applyPairwiseTransform(op, updates.e(i), nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } else { // outRank > 1 @@ -54,17 +55,16 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) std::vector dimsToExcludeUpd(sizeOfDims); std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); - shape::printIntArray(dimsToExcludeUpd.data(),dimsToExcludeUpd.size()); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray outSubArr = output(indices.e(i), std::vector({0})); + NDArray updSubArr = updates(i, dimsToExcludeUpd); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) // causes known openMP asan bug ! -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); + } + }; - NDArray outSubArr = output(indices.e(i), std::vector({0})); - NDArray updSubArr = updates(i, dimsToExcludeUpd); - - outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } } @@ -77,40 +77,41 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i const Nd4jLong indLastDim = indices.sizeAt(-1); if(outRank == 1) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + Nd4jLong idx = indices.e(i); + NDArray out = output({idx, idx + 1}); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + out.applyPairwiseTransform(op, updates.e(i), nullptr); + } + }; - Nd4jLong idx = indices.e(i); - NDArray out = output({idx, idx+1}); - - out.applyPairwiseTransform(op, updates.e(i), nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } else { - std::vector dimsToExcludeInd = ShapeUtils::evalDimsToExclude(indRank, {indRank-1}); std::vector dimsToExcludeUpd(indRank - 1); std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); - std::vector idxRangeOut(2*outRank, 0); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided) firstprivate(idxRangeOut)) - for(Nd4jLong i = 0; i < indLen/indLastDim; ++i) { + auto func = PRAGMA_THREADS_FOR { + std::vector idxRangeOut(2*outRank, 0); - NDArray indSubArr = indices(i, dimsToExcludeInd); + for (auto i = start; i < stop; i += increment) { + NDArray indSubArr = indices(i, dimsToExcludeInd); - for(Nd4jLong j = 0; j < indLastDim; ++j) { - idxRangeOut[2*j] = indSubArr.e(j); - idxRangeOut[2*j + 1] = idxRangeOut[2*j] + 1; + for (Nd4jLong j = 0; j < indLastDim; ++j) { + idxRangeOut[2 * j] = indSubArr.e(j); + idxRangeOut[2 * j + 1] = idxRangeOut[2 * j] + 1; + } + + NDArray outSubArr = output(idxRangeOut); + NDArray updSubArr = updates(i, dimsToExcludeUpd); + + outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); } + }; - NDArray outSubArr = output(idxRangeOut); - NDArray updSubArr = updates(i, dimsToExcludeUpd); - - outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen / indLastDim, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } } @@ -125,20 +126,24 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(updates.rankOf(), {-1}); if(!calcGrad) { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) - for(Nd4jLong i = 0; i < indicesLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto subArr = updates(i, dimsToExclude); + output.p(i, subArr.e(indices.e(i))); + } + }; - auto subArr = updates(i, dimsToExclude); - output.p(i, subArr.e(indices.e(i))); - } + samediff::Threads::parallel_for(func, 0, indicesLen); } else { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) - for(Nd4jLong i = 0; i < indicesLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto subArr = updates(i, dimsToExclude); + auto ind = indices.e(i); + subArr.p(ind, subArr.e(ind) - 1.); + } + }; - auto subArr = updates(i, dimsToExclude); - auto ind = indices.e(i); - subArr.p(ind, subArr.e(ind) - 1.); - } + samediff::Threads::parallel_for(func, 0, indicesLen); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index e13cfb177..2884107f3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -21,6 +21,9 @@ #include #include +#include +#include + namespace nd4j { namespace ops { namespace helpers { @@ -167,10 +170,13 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < meanT->lengthOf(); e++) { - meanV->p(e, meanV->e(e) + listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + meanV->p(e, meanV->e(e) + listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, meanT->lengthOf()); + count++; } else { @@ -221,10 +227,12 @@ namespace 
helpers { for (int i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < sumT->lengthOf(); e++) { - sumT->p(e, sumT->e(e) + listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + sumT->p(e, sumT->e(e) + listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, sumT->lengthOf()); } else { idx = indices->e(i); @@ -270,10 +278,12 @@ namespace helpers { sumT->assign(listOfTensors->at(0)); for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < sumT->lengthOf(); e++) { - sumT->p(e, sumT->e(e) * listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + sumT->p(e, sumT->e(e) * listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, sumT->lengthOf()); } else { idx = indices->e(i); @@ -463,7 +473,8 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { double sumValue = input->e(fi->second.at(0)); int loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sumValue) + + // FIXME: parallelism here? for (size_t idx = 1; idx < loop_size; ++idx) { sumValue += input->e(fi->second.at(idx)); } @@ -477,11 +488,12 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); + // FIXME: parallelism here? for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors->at(fi->first); outputT->assign(listOfTensors->at(fi->second.at(0))); Nd4jLong loopSize = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR + for (Nd4jLong idx = 1; idx < loopSize; ++idx) { auto current = listOfTensors->at(fi->second.at(idx)); *outputT += *current; @@ -501,7 +513,8 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { double sumValue = input->e(fi->second.at(0)); Nd4jLong loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:sumValue) + + // FIXME: parallelism here? for (Nd4jLong idx = 1; idx < loop_size; ++idx) { sumValue += input->e(fi->second.at(idx)); } @@ -518,7 +531,8 @@ namespace helpers { auto outputT = listOfOutTensors->at(fi->first); outputT->assign(listOfTensors->at(fi->second.at(0))); Nd4jLong loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR + + // FIXME: parallelism here? 
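                // Hedged sketch for the FIXME above (assumption, not a drop-in change): parallelising the
                // accumulation below directly over idx would race on *outputT, since every iteration does a
                // read-modify-write of the same array. A samediff-based variant would instead split the
                // element range of outputT, mirroring the PRAGMA_THREADS_FOR pattern used elsewhere in this
                // patch; the e<T>()/p()/lengthOf() accessors and the scalar type T are assumed from how the
                // surrounding code in this file uses them.
                //auto acc = PRAGMA_THREADS_FOR {
                //    for (auto e = start; e < stop; e += increment) {
                //        T sum = outputT->e<T>(e);                              // element 0 was assigned above
                //        for (Nd4jLong idx = 1; idx < loop_size; ++idx)
                //            sum += listOfTensors->at(fi->second.at(idx))->e<T>(e);
                //        outputT->p(e, sum);                                    // each output element owned by one thread
                //    }
                //};
                //samediff::Threads::parallel_for(acc, 0, outputT->lengthOf());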
for (Nd4jLong idx = 1; idx < loop_size; ++idx) { auto current = listOfTensors->at(fi->second.at(idx)); *(outputT) += *current; @@ -619,12 +633,15 @@ namespace helpers { segmentMaxFunctor_(input, indices, tempRes); if (input->isVector()) { Nd4jLong loop_size = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < loop_size; ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) <= T(1.e-6)) - output->p(e, gradOut->e(classNum)); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) <= T(1.e-6)) + output->p(e, gradOut->e(classNum)); + } + }; + samediff::Threads::parallel_for(func, 0, loop_size); } else { std::vector restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -637,18 +654,21 @@ namespace helpers { //int numOfClasses = tempRes->sizeAt(0); // number of classes //std::vector> outputs(numOfClasses); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - for (Nd4jLong e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) - currentOut->p(e, currentGradOut->e(e)); + for (uint64_t e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) + currentOut->p(e, currentGradOut->e(e)); + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -664,12 +684,14 @@ namespace helpers { std::unique_ptr tempRes(gradOut->dup()); segmentMinFunctor(context, input, indices, tempRes.get()); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) < 1.e-5) - output->p(e, gradOut->e(classNum)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) < 1.e-5) + output->p(e, gradOut->e(classNum)); + } + }; + samediff::Threads::parallel_for(func, 0, input->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -684,17 +706,22 @@ namespace helpers { output->assign(0.); int pos = 0; - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) < 1.e-5) - currentOut->p(e, currentGradOut->e(e)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = 
listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + + for (int e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) < + 1.e-5) + currentOut->p(e, currentGradOut->e(e)); + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices->lengthOf()); } return ND4J_STATUS_OK; } @@ -730,17 +757,20 @@ namespace helpers { //std::vector> outputs(numOfClasses); int pos = 0; - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + //auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { - currentOut->p(e, currentGradOut->e(e) / classCount[classNum]); + for (int e = 0; e < current->lengthOf(); e++) { + currentOut->p(e, currentGradOut->e(e) / classCount.at(classNum)); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } return ND4J_STATUS_OK; } @@ -762,16 +792,20 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - currentOut->assign(currentGradOut); - } + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + + currentOut->assign(currentGradOut); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } int segmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { @@ -794,16 +828,19 @@ namespace helpers { //int numOfClasses = tempRes->sizeAt(0); // number of classes //std::vector> outputs(numOfClasses); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - NDArray* currentFFOut = listOfBPTensors->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + auto currentFFOut = listOfBPTensors->at(classNum); - currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); - } + currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -861,12 +898,15 @@ namespace helpers { 
unsortedSegmentMinFunctor(context, input, indices, numOfClasses, tempRes); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->t(classNum) - input->t(e)) < 1.e-6) - output->t(e) = gradOut->t(classNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->t(classNum) - input->t(e)) < 1.e-6) + output->t(e) = gradOut->t(classNum); + } + }; + + samediff::Threads::parallel_for(func, 0, input->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -876,21 +916,21 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - //int numOfClasses = tempRes->sizeAt(0); // number of classes - //std::vector> outputs(numOfClasses); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - - for (int e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->t(e) - current->t(e)) < 1.e-6) - currentOut->t(e) = currentGradOut->t(e); + for (int e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->t(e) - current->t(e)) < 1.e-6) + currentOut->t(e) = currentGradOut->t(e); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -955,17 +995,19 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - //NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - currentOut->assign(currentGradOut); - } + currentOut->assign(currentGradOut); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } int unsortedSegmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { @@ -973,11 +1015,14 @@ namespace helpers { unsortedSegmentProdFunctor(context, input, indices, numOfClasses, tempRes); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - output->p(e, gradOut->e(classNum) * tempRes->e(classNum)/ input->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + output->p(e, 
gradOut->e(classNum) * tempRes->e(classNum) / input->e(e)); + } + }; + + samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -987,19 +1032,22 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - auto currentFFOut = listOfBPTensors->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + auto currentFFOut = listOfBPTensors->at(classNum); - currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); - } + currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } delete tempRes; - return ND4J_STATUS_OK; + return Status::OK(); } // template @@ -1016,11 +1064,14 @@ namespace helpers { // if input is a vector: (as if in doc sample) if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); - } + //auto func = PRAGMA_THREADS_FOR { + for (auto e = 0; e < indices->lengthOf(); e++) { + auto classNum = indices->e(e); + output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -1029,22 +1080,22 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - //int numOfClasses = tempRes->sizeAt(0); // number of classes - //std::vector> outputs(numOfClasses); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - - for (int e = 0; e < current->lengthOf(); e++) { - currentOut->p(e, currentGradOut->e(e) / nd4j::math::nd4j_sqrt(classCount[classNum])); + for (int e = 0; e < current->lengthOf(); e++) { + currentOut->p(e, currentGradOut->e(e) / nd4j::math::nd4j_sqrt(classCount[classNum])); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp index 03f61d453..bf3463afe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -26,11 +27,14 @@ namespace helpers { template static void sequenceMask_(NDArray* input, NDArray* output, int maxIndex) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (Nd4jLong i = 0; i < maxIndex; i++) - for(Nd4jLong k = 0; k < input->lengthOf(); k++) - if (i < input->t(k)) - output->t(k * maxIndex + i) = B(true); //, T(1.0f)); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto i = start_x; i < stop_x; i += inc_x) + for (auto k = start_y; k < stop_y; k += inc_y) + if (i < input->t(k)) + output->t(k * maxIndex + i) = B(true); //, T(1.0f)); + }; + + samediff::Threads::parallel_for(func, 0, maxIndex, 1, 0, input->lengthOf(), 1); } void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index b0fd449c7..59c257c28 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -20,6 +20,7 @@ #include #include +#include #define HS_MAX_EXP 6.0f @@ -350,8 +351,6 @@ namespace nd4j { const auto negTable = reinterpret_cast(vnegTable); const auto infVector = reinterpret_cast(vinfVector); - T sneu1e[600]; - //const auto numThreads = omp_get_max_threads(); const auto idxShift = indices.isEmpty() ? 0 : indices.sizeAt(1); const auto hsRounds = codes.isEmpty() ? 0 : codes.sizeAt(1); @@ -362,64 +361,71 @@ namespace nd4j { auto bIndices = indices.bufferAsT(); auto bCodes = codes.bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(numThreads) private(sneu1e)) - for (int t = 0; t < numTargets; t++) { - T* neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; - memset(neu1e, 0, vectorLength * sizeof(T)); + auto func = PRAGMA_THREADS_FOR { + T sneu1e[600]; - auto target = bTarget[t]; - auto alpha = lr.e(t); - unsigned long long randomValue = nextRandom.e(t); + for (auto t = start; t < stop; t += increment) { + T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; + memset(neu1e, 0, vectorLength * sizeof(T)); - auto syn0row = reinterpret_cast(s0.bufferWithOffset(target * vectorLength)); + auto target = bTarget[t]; + auto alpha = lr.e(t); + unsigned long long randomValue = nextRandom.e(t); - if (hsRounds > 0) { - int irow = 0; - auto cShift = t * idxShift; + auto syn0row = reinterpret_cast(s0.bufferWithOffset(target * vectorLength)); - for (int e = 0; e < hsRounds; e++) { - irow = bIndices[e + cShift]; - if (irow < 0 || irow >= vocabSize) - continue; + if (hsRounds > 0) { + int irow = 0; + auto cShift = t * idxShift; - auto syn1row = s1.bufferWithOffset(irow * vectorLength); - auto code = bCodes[e + cShift]; + for (int e = 0; e < hsRounds; e++) { + irow = bIndices[e + cShift]; + if (irow < 0 || irow >= vocabSize) + continue; + + auto syn1row = s1.bufferWithOffset(irow * vectorLength); + auto code = bCodes[e + cShift]; //nd4j_printf("syn0: [%i]; syn1: [%i]; code: [%i]\n", target, irow, code); - hSoftmax_(syn0row, syn1row, expTable, neu1e, alpha, vectorLength, code, expLength, false); - } - } - - - if (nsRounds > 0) { - int irow = negStarters.e(t); - int nsStarter = irow; - for (int r = 0; r < nsRounds + 1; r++) { - if (r == 0) { - // target is known in advance - } else { - randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); - irow = idx >= negLength ? 
-1 : static_cast(negTable[idx]); - - if (irow < 0 || irow >= vocabSize) - irow = randomValue % (vocabSize - 1) + 1; - - if (irow == nsStarter) - continue; + hSoftmax_(syn0row, syn1row, expTable, neu1e, alpha, vectorLength, code, + expLength, false); } - - nSampling_(syn0row, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); } + + + if (nsRounds > 0) { + int irow = negStarters.e(t); + int nsStarter = irow; + for (int r = 0; r < nsRounds + 1; r++) { + if (r == 0) { + // target is known in advance + } else { + randomValue = randomValue * (unsigned long long) 25214903917 + 11; + auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + + if (irow < 0 || irow >= vocabSize) + irow = randomValue % (vocabSize - 1) + 1; + + if (irow == nsStarter) + continue; + } + + nSampling_(syn0row, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); + } + } + + for (int e = 0; e < vectorLength; e++) + syn0row[e] += neu1e[e]; + + // optionally release temp arrays + if (vectorLength > 600) + delete[] neu1e; } + }; - for (int e = 0; e < vectorLength; e++) - syn0row[e] += neu1e[e]; - - // optionally release temp arrays - if (vectorLength > 600) - delete[] neu1e; - } + samediff::Threads::parallel_tad(func, 0, numTargets, 1, numThreads); } BUILD_SINGLE_TEMPLATE(template void skipgramBatchExec_, (NDArray &s0, NDArray &s1, NDArray &s1n, void *vexpTable, void *vnegTable, void *vinfVector, NDArray &targets, NDArray &negStarters, NDArray &indices, NDArray &codes, NDArray &lr, NDArray &nextRandom, const int nsRounds, const int vocabSize, const int vectorLength, const int expLength, const int negLength, const bool preciseMode, const int numThreads), FLOAT_TYPES); @@ -434,9 +440,6 @@ namespace nd4j { const auto negTable = reinterpret_cast(vnegTable); const auto infVector = reinterpret_cast(vinfVector); - T sneu1[600]; - T sneu1e[600]; - //const auto numThreads = omp_get_max_threads(); const auto idxShift = indices.isEmpty() ? 0 : indices.sizeAt(1); const auto hsRounds = codes.isEmpty() ? 0 : codes.sizeAt(1); @@ -450,122 +453,131 @@ namespace nd4j { const auto bStarters = negStarters.bufferAsT(); const auto numIndices = indices.isEmpty() ? 0 : indices.sizeAt(1); - PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(numThreads) private(sneu1, sneu1e)) - for (int e = 0; e < numTargets; e++){ - T* neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; - T* neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; + auto func = PRAGMA_THREADS_FOR { + T sneu1[600]; + T sneu1e[600]; - // optionally we nullify temp arrays after successful (and on first) cycle - memset(neu1, 0, sizeof(T) * vectorLength); - memset(neu1e, 0, sizeof(T) * vectorLength); + for (int e = start; e < stop; e += increment) { + T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; + T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; - auto alpha = lr.e(e); - auto numLabels = nLabels.isEmpty() ? 0 : nLabels.e(e); + // optionally we nullify temp arrays after successful (and on first) cycle + memset(neu1, 0, sizeof(T) * vectorLength); + memset(neu1e, 0, sizeof(T) * vectorLength); - int actualContext = 0; + auto alpha = lr.e(e); + auto numLabels = nLabels.isEmpty() ? 
0 : nLabels.e(e); - // building neu1 for current window - for (int c = 0; c < contextWidth; c++) { - // getting next context word - auto cContext = bContext[c + (e * contextWidth)]; + int actualContext = 0; - // skipping padded values - if (cContext < 0) - continue; + // building neu1 for current window + for (int c = 0; c < contextWidth; c++) { + // getting next context word + auto cContext = bContext[c + (e * contextWidth)]; - if (cContext >= vocabSize) - throw std::runtime_error("ContextID can't be >= vocab size"); - - T *syn0word = syn0 + (cContext * vectorLength); - - for (int i = 0; i < vectorLength; i++) - neu1[i] += syn0word[i]; - - actualContext++; - } - - if (infVector != nullptr) - actualContext++; - - if (actualContext > 1) { - for (int i = 0; i < vectorLength; i++) - neu1[i] /= actualContext; - } - - // hierarchic softmax step - if (!indices.isEmpty()) { - for (int i = 0; i < numIndices; i++) { - const int cIndex = bIndices[(e * numIndices) + i]; - const int cCode = bCodes[(e * numIndices) + i]; - - // we're skipping padded values - if (cIndex < 0) + // skipping padded values + if (cContext < 0) continue; - if (cIndex >= vocabSize) - throw std::runtime_error("Index can't be > vocab size"); + if (cContext >= vocabSize) + throw std::runtime_error("ContextID can't be >= vocab size"); - hSoftmax_(neu1, syn1 + (cIndex * vectorLength), expTable, neu1e, alpha, vectorLength, cCode, expLength, false); + T *syn0word = syn0 + (cContext * vectorLength); + + for (int i = 0; i < vectorLength; i++) + neu1[i] += syn0word[i]; + + actualContext++; } - } - // negative sampling step - if (!negStarters.isEmpty() && nsRounds > 0) { - int irow = bStarters[e]; - const int nsStarter = irow; - unsigned long long randomValue = nextRandom.e(e); + if (infVector != nullptr) + actualContext++; - for (int r = 0; r < nsRounds + 1; r++) { - // we're skipping rng on 0 step - if (r != 0) { - randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); - irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + if (actualContext > 1) { + for (int i = 0; i < vectorLength; i++) + neu1[i] /= actualContext; + } - if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; - if (irow == nsStarter) + // hierarchic softmax step + if (!indices.isEmpty()) { + for (int i = 0; i < numIndices; i++) { + const int cIndex = bIndices[(e * numIndices) + i]; + const int cCode = bCodes[(e * numIndices) + i]; + + // we're skipping padded values + if (cIndex < 0) continue; - nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); - } else { - nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 
1 : 0, expLength, infVector != nullptr); - } + if (cIndex >= vocabSize) + throw std::runtime_error("Index can't be > vocab size"); - //nd4j_printf("Thread <%i>: syn0: [%i]; s1n: [%i];\n", omp_get_thread_num(), 0, irow); + hSoftmax_(neu1, syn1 + (cIndex * vectorLength), expTable, neu1e, alpha, vectorLength, + cCode, expLength, false); + } + } + + // negative sampling step + if (!negStarters.isEmpty() && nsRounds > 0) { + int irow = bStarters[e]; + const int nsStarter = irow; + unsigned long long randomValue = nextRandom.e(e); + + for (int r = 0; r < nsRounds + 1; r++) { + // we're skipping rng on 0 step + if (r != 0) { + randomValue = randomValue * (unsigned long long) 25214903917 + 11; + auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + + if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; + if (irow == nsStarter) + continue; + + nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); + } else { + nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); + } + + //nd4j_printf("Thread <%i>: syn0: [%i]; s1n: [%i];\n", omp_get_thread_num(), 0, irow); + } + } + + + // if we're skipping labels + int starter = trainWords == 1 ? 0 : contextWidth - numLabels; + + // applying previously averaged results + for (int c = starter; c < contextWidth; c++) { + // getting context + auto cContext = bContext[c + (e * contextWidth)]; + auto cLock = bLocker[c + (e * contextWidth)]; + + // skipping padded values + if (cContext < 0 || cLock == 1) + continue; + + if (cContext >= vocabSize) + throw std::runtime_error("ContextID can't be > vocab size"); + + // one word from context + T *syn0word = syn0 + (cContext * vectorLength); + + for (int i = 0; i < vectorLength; i++) + syn0word[i] += neu1e[i]; + + } + + // optionally release temp arrays + if (vectorLength > 600) { + delete[] neu1; + delete[] neu1e; } } + }; - - // if we're skipping labels - int starter = trainWords == 1 ? 
0 : contextWidth - numLabels; - - // applying previously averaged results - for (int c = starter; c < contextWidth; c++) { - // getting context - auto cContext = bContext[c + (e * contextWidth)]; - auto cLock = bLocker[c + (e * contextWidth)]; - - // skipping padded values - if (cContext < 0 || cLock == 1) - continue; - - if (cContext >= vocabSize) - throw std::runtime_error("ContextID can't be > vocab size"); - - // one word from context - T *syn0word = syn0 + (cContext * vectorLength); - - for (int i = 0; i < vectorLength; i++) - syn0word[i] += neu1e[i]; - - } - - // optionally release temp arrays - if (vectorLength > 600) { - delete[] neu1; - delete[] neu1e; - } - } + samediff::Threads::parallel_tad(func, 0, numTargets, 1, numThreads); } BUILD_SINGLE_TEMPLATE(template void cbowBatchExec_, (NDArray &s0, NDArray &s1, NDArray &s1n, void *vexpTable, void *vnegTable, void *vinfVector, NDArray &context, NDArray &lockedWords, NDArray &targets, NDArray &negStarters, NDArray &indices, NDArray &codes, NDArray &lr, NDArray &nextRandom, NDArray &nLabels, const int nsRounds, const int vocabSize, const int vectorLength, const int expLength, const int negLength, const bool trainWords, const int numThreads), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp index a80e65999..1fea14824 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -141,47 +142,49 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray T* pHt = ht->bufferAsT(); T* pCt = ct->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong col = 0; col < ncols; ++col) { + auto func = PRAGMA_THREADS_FOR { + for (auto col = start; col < stop; col += increment) { + const auto colNum = col % d2; + bool flip = colNum >= K; + T maskVal = mask ? *(pMask + col) : T(1); + T cur = *(pInit + col); + T bF = *(pBias + colNum); + T bR = *(pBias + colNum + d2); + T *pWiVal = pWi + 3 * col; + T *pIVal = pI + col; + T *pHtVal = pHt + col; + T *pCtVal = pCt + col; - const auto colNum = col % d2; - bool flip = colNum >= K; - T maskVal = mask ? *(pMask + col) : T(1); - T cur = *(pInit + col); - T bF = *(pBias + colNum); - T bR = *(pBias + colNum + d2); - T* pWiVal = pWi + 3*col; - T* pIVal = pI + col; - T* pHtVal = pHt + col; - T* pCtVal = pCt + col; + if (flip) { + const auto step = (time - 1) * ncols; + pIVal += step; + pHtVal += step; + pCtVal += step; + pWiVal += (time - 1) * ncolsWi; + } - if (flip) { - const auto step = (time - 1) * ncols; - pIVal += step; - pHtVal += step; - pCtVal += step; - pWiVal += (time - 1) * ncolsWi; + auto ncolsRev = flip ? -ncols : ncols; + auto ncolsWiRev = flip ? -ncolsWi : ncolsWi; + + for (Nd4jLong t = 0; t < time; ++t) { + // evaluate sigmoids + T ft = (1.) / (1. + nd4j::math::nd4j_exp(-(pWiVal[1] + bF))); + T rt = (1.) / (1. + nd4j::math::nd4j_exp(-(pWiVal[2] + bR))); + + cur = (cur - *pWiVal) * ft + *pWiVal; + *pCtVal = cur; + T val = nd4j::math::nd4j_tanh(cur); + *pHtVal = (val * maskVal - *pIVal) * rt + *pIVal; + + pIVal += ncolsRev; + pWiVal += ncolsWiRev; + pCtVal += ncolsRev; + pHtVal += ncolsRev; + } } + }; - auto ncolsRev = flip ? -ncols : ncols; - auto ncolsWiRev = flip ? -ncolsWi : ncolsWi; - - for (Nd4jLong t = 0; t < time; ++t) { - // evaluate sigmoids - T ft = (1.)/(1. + nd4j::math::nd4j_exp(-(pWiVal[1] + bF))); - T rt = (1.)/(1. 
+ nd4j::math::nd4j_exp(-(pWiVal[2] + bR))); - - cur = (cur - *pWiVal)*ft + *pWiVal; - *pCtVal = cur; - T val = nd4j::math::nd4j_tanh(cur); - *pHtVal = (val*maskVal - *pIVal)*rt + *pIVal; - - pIVal += ncolsRev; - pWiVal += ncolsWiRev; - pCtVal += ncolsRev; - pHtVal += ncolsRev; - } - } + samediff::Threads::parallel_tad(func, 0, ncols); } ////////////////////////////////////////////////////////////////////////// @@ -232,72 +235,75 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr T* pGradBias = gradBias.bufferAsT(); T* pGradInit = gradC0->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong col = 0; col < ncols; ++col) { - T gbF = 0.f; - T gbR = 0.f; - const auto colNum = col % d2; - const bool flip = colNum >= K; - T maskVal = mask ? *(pMask + col) : T(1.); - T cur = *(pInGradCt + col); - T bF = *(pBias + colNum); - T bR = *(pBias + colNum + d2); - T* pWiVal = pWi + 3*col; - T* pInputVal = pInput + col; - T* pStateVal = pState + col; - T* pInGradHtVal = pInGradHt + col; - T* pGradWiVal = pGradWi + 3*col; - T* pGradInputVal = pGradInput + col; + auto func = PRAGMA_THREADS_FOR { + for (auto col = start; col < stop; col += increment) { + T gbF = 0.f; + T gbR = 0.f; + const auto colNum = col % d2; + const bool flip = colNum >= K; + T maskVal = mask ? *(pMask + col) : T(1.); + T cur = *(pInGradCt + col); + T bF = *(pBias + colNum); + T bR = *(pBias + colNum + d2); + T *pWiVal = pWi + 3 * col; + T *pInputVal = pInput + col; + T *pStateVal = pState + col; + T *pInGradHtVal = pInGradHt + col; + T *pGradWiVal = pGradWi + 3 * col; + T *pGradInputVal = pGradInput + col; - if (!flip) { - const auto stepI = (time - 1) * ncols; - const auto stepW = (time - 1) * ncolsWi; - pInputVal += stepI; - pStateVal += stepI; - pInGradHtVal += stepI; - pGradInputVal += stepI; - pWiVal += stepW; - pGradWiVal += stepW; + if (!flip) { + const auto stepI = (time - 1) * ncols; + const auto stepW = (time - 1) * ncolsWi; + pInputVal += stepI; + pStateVal += stepI; + pInGradHtVal += stepI; + pGradInputVal += stepI; + pWiVal += stepW; + pGradWiVal += stepW; + } + + Nd4jLong ncolsRev = flip ? -ncols : ncols; + Nd4jLong ncolsWiRev = flip ? -ncolsWi : ncolsWi; + + for (Nd4jLong t = 0; t < time; ++t) { + // evaluate sigmoids + T ft = ((T) 1.) / ((T) 1. + nd4j::math::nd4j_exp(-(*(pWiVal + 1) + bF))); + T rt = ((T) 1.) / ((T) 1. + nd4j::math::nd4j_exp(-(*(pWiVal + 2) + bR))); + + T val = nd4j::math::nd4j_tanh(*pStateVal); + T prevVal = (t < time - 1) ? (*(pStateVal - ncolsRev)) : (*(pInit + col)); + // grad wrt input + *pGradInputVal = *pInGradHtVal - (*pInGradHtVal) * rt; + // grad wrt rt, wiR and bR + T grt = (*pInGradHtVal) * (val * maskVal - *pInputVal) * (rt - rt * rt); + *(pGradWiVal + 2) = grt; + gbR += grt; + // grad wrt state + T gradSateVal = (*pInGradHtVal) * maskVal * (rt - rt * val * val) + cur; + // grad wrt wi0 + *pGradWiVal = gradSateVal - gradSateVal * ft; + // grad wrt ft, wi1, and bF + T gft = gradSateVal * (prevVal - *pWiVal) * (ft - ft * ft); + *(pGradWiVal + 1) = gft; + gbF += gft; + // grad wrt c_previous + cur = gradSateVal * ft; + + pInputVal -= ncolsRev; + pWiVal -= ncolsWiRev; + pStateVal -= ncolsRev; + pGradWiVal -= ncolsWiRev; + pGradInputVal -= ncolsRev; + pInGradHtVal -= ncolsRev; + } + *(pGradBias + col) = gbF; + *(pGradBias + col + ncols) = gbR; + *(pGradInit + col) = cur; } + }; - Nd4jLong ncolsRev = flip ? -ncols : ncols; - Nd4jLong ncolsWiRev = flip ? -ncolsWi : ncolsWi; - - for (Nd4jLong t = 0; t < time; ++t) { - // evaluate sigmoids - T ft = ((T)1.)/((T)1. 
+ nd4j::math::nd4j_exp(-(*(pWiVal + 1) + bF))); - T rt = ((T)1.)/((T)1. + nd4j::math::nd4j_exp(-(*(pWiVal + 2) + bR))); - - T val = nd4j::math::nd4j_tanh(*pStateVal); - T prevVal = (t < time-1) ? (*(pStateVal - ncolsRev)) : (*(pInit + col)); - // grad wrt input - *pGradInputVal = *pInGradHtVal - (*pInGradHtVal)*rt ; - // grad wrt rt, wiR and bR - T grt = (*pInGradHtVal) * (val*maskVal - *pInputVal) * (rt - rt*rt); - *(pGradWiVal + 2) = grt; - gbR += grt; - // grad wrt state - T gradSateVal = (*pInGradHtVal) * maskVal * (rt - rt*val*val) + cur; - // grad wrt wi0 - *pGradWiVal = gradSateVal - gradSateVal*ft; - // grad wrt ft, wi1, and bF - T gft = gradSateVal * (prevVal - *pWiVal) * (ft - ft*ft); - *(pGradWiVal + 1) = gft; - gbF += gft; - // grad wrt c_previous - cur = gradSateVal * ft; - - pInputVal -= ncolsRev; - pWiVal -= ncolsWiRev; - pStateVal -= ncolsRev; - pGradWiVal -= ncolsWiRev; - pGradInputVal -= ncolsRev; - pInGradHtVal -= ncolsRev; - } - *(pGradBias + col) = gbF; - *(pGradBias + col + ncols) = gbR; - *(pGradInit + col) = cur; - } + samediff::Threads::parallel_tad(func, 0, ncols); // gradB gradBias.reduceAlongDimension(reduce::Sum, gradB, {0}); // [4*K] diff --git a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp index 55de117a5..b974a236b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { @@ -35,9 +36,12 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c if(inArrs[0]->rankOf() == 0) { int inSize = inArrs.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(inSize > Environment::getInstance()->tadThreshold()) - for(int i=0; i < inSize; ++i) - outArr->p(i, inArrs[i]->t(0)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + outArr->p(i, inArrs[i]->t(0)); + }; + + samediff::Threads::parallel_for(func, 0, inSize); } else { @@ -45,9 +49,11 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c auto list = outArr->allTensorsAlongDimension(dimsToExclude); // list.size() == block.width() int listSize = list->size(); - PRAGMA_OMP_PARALLEL_FOR_IF(listSize > Environment::getInstance()->tadThreshold()) - for(int i=0; iat(i)->assign(inArrs[i]); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + list->at(i)->assign(inArrs[i]); + }; + samediff::Threads::parallel_tad(func, 0, listSize); delete list; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index f05647589..e38232928 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -148,19 +149,21 @@ namespace helpers { int status = topKFunctor(context, input, values, indices.get(), k, true); result->assign(0); if (status == ND4J_STATUS_OK) { - bool condition = target->lengthOf() > Environment::getInstance()->tadThreshold(); - PRAGMA_OMP_PARALLEL_FOR_IF(condition) - for (int e = 0; e < target->lengthOf(); e++) { - bool found = false; - for (int j = 0; j < k; j++) { - if (target->e(e) == indices->e(e * k + j)) { - found = true; - break; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + bool found = false; + for (int j = 0; j < k; j++) { + if (target->e(e) == indices->e(e * k + j)) { + found = 
true; + break; + } } + if (found) + result->p(e, true); } - if (found) - result->p(e, true); - } + }; + + samediff::Threads::parallel_tad(func, 0, target->lengthOf()); } return status; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 9e04ed4df..ea2fb348a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -42,11 +42,13 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N const_cast(input).fillAsTriangular(0, diagonal, dOdI.sizeAt(-1), 'b', &dOdI); int dLen = dOdI.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(dLen > Environment::getInstance()->elementwiseThreshold()) - for(Nd4jLong i = 0; i < dLen; ++i) { - if(dOdI.t(i) != static_cast(0.f)) - dOdI.t(i) = static_cast(1.f); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + if (dOdI.t(i) != static_cast(0.f)) + dOdI.t(i) = static_cast(1.f); + } + }; + samediff::Threads::parallel_for(func, 0, dLen); // FIXME: !!! gradI.assign(dOdI * gradO); // chain rule: dLoss/dI = dO/dI * dLoss/dO @@ -59,14 +61,14 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N ////////////////////////////////////////////////////////////////////////// template static void trace_(const NDArray& input, NDArray& output) { - const int inRank = input.rankOf(); - auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); - PRAGMA_OMP_PARALLEL_FOR_IF(setOfSubArrs->size() > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < setOfSubArrs->size(); ++i) - output.p(i, setOfSubArrs->at(i)->getTrace()); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + output.p(i, setOfSubArrs->at(i)->getTrace()); + }; + samediff::Threads::parallel_for(func, 0, setOfSubArrs->size()); delete setOfSubArrs; } @@ -107,7 +109,8 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::graph::RandomGenerato std::vector indices(firstDim); std::iota(indices.begin(), indices.end(), 0); output.p(Nd4jLong(0), input.e(0)); - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + + // FIXME: parallelism!! 
for(int i = firstDim-1; i > 0; --i) { int r = rng.relativeInt(i) % i; output.t(i) = input.t(indices[r]); @@ -184,54 +187,61 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray const auto zLen = output.lengthOf(); - std::vector coords(rank); // we use the same coordinates storage both for input and output since their ranks are the same - if(mode == 0) { // CONSTANT case const T padVal = padValue.e(0); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for(uint i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + bool within = true; + for (int j = rankMinusOne; j >= 0; --j) { + if (xShape[j] == zShape[j]) continue; + const auto left = paddings.e(j, 0); + if (coords[j] < left || coords[j] >= left + xShape[j]) { + within = false; + break; + } + else { coords[j] = coords[j] - left; } + } - bool within = true; - for(int j = rankMinusOne; j >= 0; --j) { - if(xShape[j] == zShape[j]) continue; - const auto left = paddings.e(j, 0); - if(coords[j] < left || coords[j] >= left + xShape[j]) {within = false; break;} - else {coords[j] = coords[j] - left;} + if (within) + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + else + z[zOffset] = padVal; } + }; - if(within) - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - else - z[zOffset] = padVal; - } + samediff::Threads::parallel_tad(func, 0, zLen); } else { // REFLECT and SYMMETRIC cases const Nd4jLong shift1 = mode == 1 ? 0 : 1; // REFLECT : SYMMETRIC const Nd4jLong shift2 = mode == 1 ? 
2 : 1; // REFLECT : SYMMETRIC - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for(uint i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + for (int j = rankMinusOne; j >= 0; --j) { - for(int j = rankMinusOne; j >= 0; --j) { + if (xShape[j] == zShape[j]) continue; + coords[j] = coords[j] - paddings.e(j, 0); // are ready to fill middle (within input dimension range) + if (coords[j] < 0) coords[j] = -coords[j] - shift1; // means fill from left + else if (coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right + } - if(xShape[j] == zShape[j]) continue; - coords[j] = coords[j] - paddings.e(j, 0); // are ready to fill middle (within input dimension range) - if(coords[j] < 0) coords[j] = -coords[j] - shift1; // means fill from left - else if(coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right + const auto xOffset = shape::getOffset(input.getShapeInfo(), coords); + z[zOffset] = x[xOffset]; } + }; - const auto xOffset = shape::getOffset(input.getShapeInfo(), coords.data()); - z[zOffset] = x[xOffset]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } } @@ -558,50 +568,49 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { const int yLastDim = indices.sizeAt(-1); - std::vector coords(maxRank); + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK * 3]; + for (auto i = start; i < stop; i += increment) { + Nd4jLong *zCoordStart, *xCoordStart; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + if (yLastDim == xRank) { + zCoordStart = coords; + xCoordStart = coords; + } else if (zRank >= xRank) { + zCoordStart = coords; + xCoordStart = coords + zRank - xRank; + } else { + zCoordStart = coords + xRank - zRank; + xCoordStart = coords; + } - Nd4jLong *zCoordStart, *xCoordStart; + shape::index2coords(i, output.getShapeInfo(), zCoordStart); - if(yLastDim == xRank) { - zCoordStart = coords.data(); - xCoordStart = coords.data(); - } - else if(zRank >= xRank) { - zCoordStart = coords.data(); - xCoordStart = coords.data() + zRank - xRank; - } - else { - zCoordStart = coords.data() + xRank - zRank; - xCoordStart = coords.data(); + const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart); + + // last y coordinate + uint coordToRestore; + if (yLastDim != xRank) + coordToRestore = static_cast(zCoordStart[yRank - 1]); + + zCoordStart[yRank - 1] = 0; + const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart); + + //restore z coordinate + if (yLastDim != xRank) + zCoordStart[yRank - 1] = coordToRestore; + + // construct coordinates for x + for (uint j = 0; j < yLastDim; ++j) + xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride + + const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); + + z[zOffset] = x[xOffset]; } + }; - shape::index2coords(i, output.getShapeInfo(), zCoordStart); - - const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart); - - // last y coordinate - uint coordToRestore; - if(yLastDim != xRank) - coordToRestore 
= static_cast(zCoordStart[yRank - 1]); - - zCoordStart[yRank - 1] = 0; - const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart); - - //restore z coordinate - if(yLastDim != xRank) - zCoordStart[yRank - 1] = coordToRestore; - - // construct coordinates for x - for(uint j = 0; j < yLastDim; ++j) - xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride - - const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); - - z[zOffset] = x[xOffset]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } //////////////////////////////////////////////////////////////////////// @@ -644,21 +653,28 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con } else if (input->rankOf() == 1 && indices->isVector()) { // special case - PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->tadThreshold()) - for (int e = 0; e < indices->lengthOf(); e++) - output->p(e, input->e(indices->e(e))); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + output->p(e, input->e(indices->e(e))); + }; + + samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { std::vector dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... indices->rankOf()-1 const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, dimsOut); - NDArray subArrIn = (*input)(indices->e(i), {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, dimsOut); + NDArray subArrIn = (*input)(indices->e(i), {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } else { @@ -673,12 +689,16 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con } else { // vector case const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, {axis}); - NDArray subArrIn = (*input)(intArgs[i+1], {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, {axis}); + NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } } @@ -693,9 +713,12 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { const int rank = output.rankOf(); auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); - PRAGMA_OMP_PARALLEL_FOR_IF(arrs->size() > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < arrs->size(); ++i) - arrs->at(i)->setIdentity(); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + arrs->at(i)->setIdentity(); + }; + + samediff::Threads::parallel_tad(func, 0, arrs->size()); delete arrs; } @@ -719,41 +742,43 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat for (; e < intArgs->size(); e++) indices.push_back((*intArgs)[e]); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong i = 0; i < indices.size(); ++i) { + auto func = 
PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inSubArr = input(indices[i], dimsToExclude, true); + auto updSubArr = updates(i, dimsToExclude, true); - auto inSubArr = input(indices[i], dimsToExclude, true); - auto updSubArr = updates(i, dimsToExclude, true); - - if (inSubArr.lengthOf() != updSubArr.lengthOf()) - continue; - - switch (opCode) { - case 0: - inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); - break; - case 1: - inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); - break; - case 2: - inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); - break; - case 3: - inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); - break; - case 4: - inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); - break; - case 5: - inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); - break; - case 6: - inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); - break; - default: + if (inSubArr.lengthOf() != updSubArr.lengthOf()) continue; + + switch (opCode) { + case 0: + inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); + break; + case 1: + inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); + break; + case 2: + inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); + break; + case 3: + inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); + break; + case 4: + inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); + break; + case 5: + inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); + break; + case 6: + inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); + break; + default: + continue; + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices.size()); } @@ -766,11 +791,14 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input switch (opId) { case 6: { // copy - PRAGMA_OMP_PARALLEL_FOR_IF(len > Environment::getInstance()->elementwiseThreshold()) - for(uint i = 0; i < len; ++i) { - auto inSubArr = input(i, dimensions); - inSubArr.p(indices.t(i), updates.e(i)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inSubArr = input(i, dimensions); + inSubArr.p(indices.t(i), updates.e(i)); + } + }; + + samediff::Threads::parallel_for(func, 0, len); } break; @@ -786,70 +814,79 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T max = -DataTypeUtils::max(); - Nd4jLong idx = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T max = -DataTypeUtils::max(); + Nd4jLong idx = 0; - for (int i = 0; i < numArgs; i++){ - - T v = inArrs[i]->e(e); - if (v > max) { - max = v; - idx = i; + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + if (v > max) { + max = v; + idx = i; + } } + output.p(e, idx); } - output.p(e, idx); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& 
output) { + BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES); } - void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// template static void mergeMax_(const std::vector& inArrs, NDArray& output) { - const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T max = -DataTypeUtils::max(); - for (int i = 0; i < numArgs; i++) { - T v = inArrs[i]->e(e); - if (v > max) - max = v; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T max = -DataTypeUtils::max(); + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + if (v > max) + max = v; + } + output.p(e, max); } - output.p(e, max); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (inArrs, output), LIBND4J_TYPES); } - void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// template static void mergeAvg_(const std::vector& inArrs, NDArray& output) { - const Nd4jLong numArgs = inArrs.size(); const T factor = 1.f / numArgs; auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T sum = 0.; - for (int i = 0; i < numArgs; i++) { - T v = inArrs[i]->e(e); - sum += v; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T sum = 0.; + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + sum += v; + } + output.p(e, sum * factor); } - output.p(e, sum * factor); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (inArrs, output), LIBND4J_TYPES); } - void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// @@ -859,16 +896,17 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T sum = (T) 0.f; + for (int i = 0; i < numArgs; i++) + sum += inArrs[i]->e(e); - T sum = (T) 0.f; + output.p(e, sum); + } + }; - for (int i = 0; i < numArgs; i++) - sum += inArrs[i]->e(e); - - output.p(e, sum); - } + samediff::Threads::parallel_for(func, 0, x->lengthOf()); } void mergeAdd(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (inArrs, output), 
LIBND4J_TYPES); @@ -895,14 +933,15 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < listOfInSubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + const T iNormActual = norm2.e(i); + if (iNormActual > normClip) + *listOfInSubArrs->at(i) *= normClip / iNormActual; + } + }; + samediff::Threads::parallel_tad(func, 0, listOfInSubArrs->size()); - const T iNormActual = norm2.e(i); - - if (iNormActual > normClip) - *listOfInSubArrs->at(i) *= normClip / iNormActual; - } delete listOfInSubArrs; } } @@ -920,18 +959,19 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < listOfInSubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inputSubArr = listOfInSubArrs->at(i); + auto outputSubArr = listOfOutSubArrs->at(i); + outputSubArr->assign(inputSubArr); - auto inputSubArr = listOfInSubArrs->at(i); - auto outputSubArr = listOfOutSubArrs->at(i); - outputSubArr->assign(inputSubArr); + const T iNormActual = norm2.e(i); - const T iNormActual = norm2.e(i); - - if (iNormActual > clipNorm.e(0)) - *outputSubArr *= clipNorm / iNormActual; - } + if (iNormActual > clipNorm.e(0)) + *outputSubArr *= clipNorm / iNormActual; + } + }; + samediff::Threads::parallel_tad(func, 0, listOfInSubArrs->size()); delete listOfInSubArrs; delete listOfOutSubArrs; @@ -1028,31 +1068,29 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g auto cn = clipNorm.e(0); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < gradISubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T N = norm2.e(i); - T N = norm2.e(i); + auto gradOSubArr = gradOSubArrs->at(i); + auto gradISubArr = gradISubArrs->at(i); - auto gradOSubArr = gradOSubArrs->at(i); - auto gradISubArr = gradISubArrs->at(i); + if (N > cn) { + auto inputSubArr = inputSubArrs->at(i); + const T sumOfProd = (*inputSubArr * *gradOSubArr).reduceNumber(reduce::Sum).e(0); // reduce to scalar + const T factor1 = static_cast(1.f) / N; + const T factor3 = factor1 / (N * N); // 1 / (N*N*N) - if (N > cn) { + auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) { + return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd); + }; - auto inputSubArr = inputSubArrs->at(i); - - const T sumOfProd = (*inputSubArr * *gradOSubArr).reduceNumber(reduce::Sum).e(0); // reduce to scalar - const T factor1 = static_cast(1.f) / N; - const T factor3 = factor1 / (N * N) ; // 1 / (N*N*N) - - auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) { - return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd); - }; - - inputSubArr->applyPairwiseLambda(gradOSubArr, lambda, gradISubArr); + inputSubArr->applyPairwiseLambda(gradOSubArr, lambda, gradISubArr); + } else + gradISubArr->assign(gradOSubArr); } - else - gradISubArr->assign(gradOSubArr); - } + }; + samediff::Threads::parallel_tad(func, 0, gradISubArrs->size()); delete gradISubArrs; delete gradOSubArrs; @@ -1165,34 +1203,35 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o } else { - std::vector inIdx(rank), outIdx(rank); + auto func = 
PRAGMA_THREADS_FOR { + Nd4jLong inIdx[MAX_RANK]; + Nd4jLong outIdx[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), outIdx); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(inIdx, outIdx)) - for(int i = 0; i < outLen; ++i) { + for (int j = 0; j < rank; ++j) { + const Nd4jLong inLen = input.sizeAt(j); + const auto leftSide = paddings.e(j, 0); + const auto leftSideCorrected = leftSide - reflBorder; + const Nd4jLong len = 2 * (inLen - 1) + leftSide + reflBorder; - shape::index2coords(i, output.getShapeInfo(), outIdx.data()); + if (outIdx[j] < leftSide) // left side + inIdx[j] = leftSideCorrected - outIdx[j]; - for(int j = 0; j < rank; ++j) { + else if (outIdx[j] >= leftSide && outIdx[j] < leftSide + inLen) // middle + inIdx[j] = outIdx[j] - leftSide; - const Nd4jLong inLen = input.sizeAt(j); - const auto leftSide = paddings.e(j, 0); - const auto leftSideCorrected = leftSide - reflBorder; - const Nd4jLong len = 2*(inLen-1) + leftSide + reflBorder; + else // right side + inIdx[j] = len - outIdx[j]; + } - if(outIdx[j] < leftSide) // left side - inIdx[j] = leftSideCorrected - outIdx[j]; - - else if(outIdx[j] >= leftSide && outIdx[j] < leftSide + inLen) // middle - inIdx[j] = outIdx[j] - leftSide; - - else // right side - inIdx[j] = len - outIdx[j]; + auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx); + auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx); + reinterpret_cast(output.buffer())[outOffset] = reinterpret_cast(input.getBuffer())[inOffset]; } + }; - auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx.data()); - auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx.data()); - reinterpret_cast(output.buffer())[outOffset] = reinterpret_cast(input.getBuffer())[inOffset]; - } + samediff::Threads::parallel_for(func, 0, outLen); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp index a365d8135..5d4ed9f2e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -62,9 +63,12 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray //auto result = NDArray(&x, false, context); int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < xLen; ++i) - z.p(i, zetaScalar(x.e(i), q.e(i))); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + z.p(i, zetaScalar(x.e(i), q.e(i))); + }; + + samediff::Threads::parallel_for(func, 0, xLen); } void zeta(nd4j::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z) { diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 27caedd0c..d087a4849 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -66,14 +67,17 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND int tads = tadsA->size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < tads; e++) { - auto a_ = tadsA->at(e); - auto b_ = tadsB->at(e); - auto o_ = tadsO->at(e); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto a_ = tadsA->at(e); + auto b_ = tadsB->at(e); + auto o_ = 
tadsO->at(e); - helpers::cross(context, a_, b_, o_); - } + helpers::cross(context, a_, b_, o_); + } + }; + + samediff::Threads::parallel_tad(func, 0, tads); delete tadsA; delete tadsB; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc deleted file mode 100644 index 63e406cc6..000000000 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc +++ /dev/null @@ -1,138 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by raver119 on 30.11.17. -// - -#include - -namespace nd4j { -namespace ops { -namespace helpers { - -// [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] -template -void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { - - auto imBuff = output.bufferAsT(); - auto colBuff = input.bufferAsT(); - auto imShapeBuffer = output.getShapeInfo(); - auto colShapeBuffer = input.getShapeInfo(); - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int kH = colShape[2]; - const int kW = colShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - // initial zeroing of image content - const auto imEWS = shape::elementWiseStride(imShapeBuffer); - if(imEWS == 1) { - memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); - } - else if (imEWS > 1) { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) - for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS) - imBuff[i] = static_cast(0.f); - } - else { - const auto len = shape::length(imShapeBuffer); -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) - for (int i = 0; i < len; i++) - imBuff[shape::getIndexOffset(i, imShapeBuffer)] = static_cast(0.f); - } - - T *col, *im; - int imRow, imCol; - - if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) - for (int 
b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - else { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } -} - - -void col2im(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { - BUILD_SINGLE_SELECTOR(input.dataType(), col2im_, (context, input, output, sH, sW, pH, pW, iH, iW, dH, dW), LIBND4J_TYPES); -} - -BUILD_SINGLE_TEMPLATE(template void col2im_, (nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW), LIBND4J_TYPES); - -} -} -} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc deleted file mode 100644 index 67f5650bd..000000000 --- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc +++ /dev/null @@ -1,129 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author Yurii Shyrma (iuriish@yahoo.com), created on 19.09.2018 -// - -#include - - -namespace nd4j { -namespace ops { -namespace helpers { - -// input [bS, iC, iH, iW] is convoluted to output [bS, iC, kH, kW, oH, oW] -template -static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { - - auto imBuff = static_cast(input.getBuffer()); - auto colBuff = static_cast(output.getBuffer()); - auto imShapeBuffer = input.getShapeInfo(); - auto colShapeBuffer = output.getShapeInfo(); - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const T zeroPadVal = arrZeroPadVal.e(0); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int iH = imShape[2]; - const int iW = imShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - T *col, *im; - int imRow, imCol; - - if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - else { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } -} - - -void im2col(nd4j::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const 
int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { - BUILD_SINGLE_SELECTOR(im.dataType(), im2col_, (context, im, col, kH, kW, sH, sW, pH, pW, dH, dW, arrZeroPadVal), LIBND4J_TYPES); -} - -BUILD_SINGLE_TEMPLATE(template void im2col_, (nd4j::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal), LIBND4J_TYPES); - - -} -} -} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu index c2dd4919d..753c8ae64 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu @@ -19,6 +19,7 @@ // #include +#include #include #include diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu index 017180b38..3a09f9a80 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu @@ -20,6 +20,7 @@ #include #include +#include #include namespace nd4j { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu index 8db1f66d4..fa97a3de2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu index 0a707ffb3..8a9986e23 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu @@ -644,7 +644,6 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr // apply Fisher-Yates shuffle if(isInplace) { - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->elementwiseThreshold()) for(int i = firstDim - 1; i > 0; --i) { int r = rng.relativeInt(i) % i; @@ -658,7 +657,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr std::vector indices(firstDim); std::iota(indices.begin(), indices.end(), 0); bool isZeroShuffled = false; - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + for(int i = firstDim - 1; i > 0; --i) { int r = rng.relativeInt(i) % i; subArrsListOut->at(i)->assign(subArrsListIn->at(indices[r])); diff --git a/libnd4j/include/ops/declarable/helpers/helpers.h b/libnd4j/include/ops/declarable/helpers/helpers.h index 0914d2d49..f2e19063e 100644 --- a/libnd4j/include/ops/declarable/helpers/helpers.h +++ b/libnd4j/include/ops/declarable/helpers/helpers.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #endif // CUDACC diff --git a/libnd4j/include/ops/declarable/helpers/impl/choose.cpp b/libnd4j/include/ops/declarable/helpers/impl/choose.cpp index 47ca64d3b..4fb32e2f8 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/choose.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/choose.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp index 5a73e0a00..8ef63101e 100644 --- 
a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -67,12 +68,14 @@ namespace helpers { } } - PRAGMA_OMP_PARALLEL_FOR_IF(values->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (int e = 0; e < values->lengthOf(); e++) { - values->p(e, static_cast(valuesVector[e])); - if (counts != nullptr) - counts->p(e, countsMap[valuesVector[e]]); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + values->p(e, static_cast(valuesVector[e])); + if (counts != nullptr) + counts->p(e, countsMap[valuesVector[e]]); + } + }; + samediff::Threads::parallel_for(func, 0, values->lengthOf()); for (int e = 0; e < indices->lengthOf(); e++) { auto posI = std::find(valuesVector.begin(), valuesVector.end(), input->e(e)); diff --git a/libnd4j/include/ops/declarable/helpers/matmul.h b/libnd4j/include/ops/declarable/helpers/matmul.h index 8d253cabf..2e7cce13f 100644 --- a/libnd4j/include/ops/declarable/helpers/matmul.h +++ b/libnd4j/include/ops/declarable/helpers/matmul.h @@ -22,7 +22,6 @@ #define LIBND4J_HELPERS_MATMUL_H #include -#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/impl/BooleanOp.cpp b/libnd4j/include/ops/declarable/impl/BooleanOp.cpp index 579fdf394..436cddda3 100644 --- a/libnd4j/include/ops/declarable/impl/BooleanOp.cpp +++ b/libnd4j/include/ops/declarable/impl/BooleanOp.cpp @@ -29,10 +29,6 @@ namespace nd4j { // } - BooleanOp::~BooleanOp() { - // - } - /** * Output shape of any BooleanOp is ALWAYS scalar */ diff --git a/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp b/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp index 71c722bca..7d696c8ef 100644 --- a/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp @@ -29,10 +29,6 @@ namespace nd4j { // } - BroadcastableOp::~BroadcastableOp() { - // no-op - } - ShapeList *BroadcastableOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { auto shapeList = SHAPELIST(); auto x = inputShape->at(0); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp index 691a3154d..1fd57c867 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp @@ -26,9 +26,5 @@ namespace nd4j { DeclarableCustomOp::DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs) : nd4j::ops::DeclarableOp(numInputs, numOutputs, opName, allowsInplace, tArgs, iArgs) { // } - - DeclarableCustomOp::~DeclarableCustomOp() { - // - } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp index 7cb28e76d..624d6dbef 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp @@ -26,10 +26,6 @@ namespace nd4j { namespace ops { - DeclarableListOp::~DeclarableListOp() { - // - } - DeclarableListOp::DeclarableListOp(int numInputs, int numOutputs, const char* opName, int tArgs, int iArgs) : DeclarableOp::DeclarableOp(numInputs, numOutputs, opName, false, tArgs, iArgs) { // This kind of operations work with sets: NDArrayList 
this->getOpDescriptor()->setInputType(InputType_NUMERIC_SET); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp index ef3b04d30..98a60b28b 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -30,11 +31,6 @@ namespace nd4j { // } - DeclarableReductionOp::~DeclarableReductionOp() { - // - } - - nd4j::ShapeList* DeclarableReductionOp::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { // int numDims = INT_ARG(0); std::vector dims; @@ -55,7 +51,7 @@ namespace nd4j { std::sort(dims.begin(), dims.end()); // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { auto newShape = ConstantShapeHelper::getInstance()->scalarShapeInfo(block.dataType()); return SHAPELIST(newShape); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp index 2b83b200a..684f09262 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -39,7 +40,7 @@ namespace nd4j { ExtraArguments extras(*block.getTArguments()); PointersManager manager(block.launchContext(), "LegacyReduce3Op"); - if (x->isSameShape(y) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT))) { + if (x->isSameShape(y) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()))) { // reduce3 to scalar NativeOpExecutioner::execReduce3Scalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), @@ -97,7 +98,7 @@ namespace nd4j { Nd4jLong *zShape = nullptr; - if (shape::equalsSoft(xShape, yShape) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT))) { + if (shape::equalsSoft(xShape, yShape) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()))) { // reduce3 to scalar case ALLOCATE(zShape, block.getWorkspace(), shape::shapeInfoLength(2), Nd4jLong); zShape[0] = 2; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp index ac4bb33b7..12a25537d 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; if ((axis.empty()) || - (axis.size() == 1 && axis[0] == MAX_INT) || allAxes) { + (axis.size() == 1 && axis[0] == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceBoolScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); @@ -100,7 +101,7 @@ namespace nd4j { dims[e] = f >= 0 ? 
f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceBoolScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp index e1da0621e..2765e1b3f 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; // _axis.(block.getIArguments()->size() == 0) || - // (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) + // (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) if (block.getAxis()->empty() || allAxes) { // scalar NativeOpExecutioner::execReduceFloatScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), @@ -101,7 +102,7 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceFloatScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp index 3c83df702..836564c79 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; if ((axis.empty()) || - (axis.size() == 1 && axis[0] == MAX_INT) || allAxes) { + (axis.size() == 1 && axis[0] == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceLongScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); @@ -103,7 +104,7 @@ namespace nd4j { dims[e] = f >= 0 ? 
f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceLongScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp index 09a225b19..2340f39b0 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -98,7 +99,7 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceSameScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp index bb4dda4d4..08ebb80de 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -43,7 +44,7 @@ namespace nd4j { ExtraArguments extras(*block.getTArguments()); PointersManager manager(block.launchContext(),"LegacyStatsOp"); - if (block.getIArguments()->size() == 1 || (block.getIArguments()->size() == 2 && INT_ARG(1) == MAX_INT)) { + if (block.getIArguments()->size() == 1 || (block.getIArguments()->size() == 2 && INT_ARG(1) == nd4j::DataTypeUtils::max())) { // scalar NativeOpExecutioner::execSummaryStatsScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), biasCorrected); @@ -92,7 +93,7 @@ namespace nd4j { auto inShape = inputShape->at(0); Nd4jLong *newShape; - if (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT)) { + if (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max())) { // in this case we just return scalar ALLOCATE(newShape, block.getWorkspace(), shape::shapeInfoLength(2), Nd4jLong); newShape[0] = 2; diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp index 13e1cfe11..27f836a0e 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -55,7 +56,7 @@ static void batchnormMKLDNN(const NDArray* x, const NDArray* mean, const NDArray mkldnn::memory::data_type type = mkldnn::memory::data_type::f32; // indicate whether gamma or/and beta are given - auto flags = mkldnn::normalization_flags::use_global_stats; + auto flags = mkldnn::normalization_flags::use_global_stats; // don't calculate the mean and variance for each mini-batch if (weights != nullptr) flags |= mkldnn::normalization_flags::use_scale_shift; @@ -182,7 +183,7 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const mkldnn::memory::data_type type = mkldnn::memory::data_type::f32; // indicate whether gamma or/and beta are given - auto flags = mkldnn::normalization_flags::use_global_stats; + auto flags = mkldnn::normalization_flags::use_global_stats; // don't calculate the mean and variance for each mini-batch if (weights != nullptr) flags |= mkldnn::normalization_flags::use_scale_shift; @@ -308,6 +309,70 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const stream.wait(); // shape::printArray(dLdI_mkl_mem.map_data(),8); + + // notations: + // f = g * (gamma * ((x - m) / (v + eps)^0.5) + beta) -> means dLdO * ff_output + // g = dLdO + // stdInv = 1 / (v + eps)^0.5 + // N - batch size (product of spatial dimensions) + + // formula for full derivative with respect to input (x) + // dLdI = dfdx + dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx) + + // !!! MKL CALCULATES ONLY FIRST TERM dfdx, SO WE SHOULD CALCULATE TERM (dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx)) BY OURSELF !!! + + // dfdm = -gamma*stdInv*g_sum; + // dmdx = 1/N; + // dvdx = 2 * (x - m) / N + // dvdm = -2 * [(x - m)]_sum / N + // dfdv = -0.5 * [g*(x - m)]_sum * stdInv^3, drop gamma here for calc convenience + + // finally: + // dLdI = dfdm / N + (2/N) * dfdv * (dvdm/2 + (x - m)) + // dLdI = gamma * ( stdInv * -g_sum/N + (2/N) * dfdv * (dvdm/2 + (x - m)) ) + + std::vector axes = {1}; + const auto excludedAxes = ShapeUtils::evalDimsToExclude(x->rankOf(), axes); + + // inversed batch size 1 / N + const auto Ninv = 1.f * mean->lengthOf() / x->lengthOf(); + + // x - mean + NDArray xMinusMean(x); // empty array with same shape as x + const_cast(x)->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMinusMean); + + // stdInv + NDArray stdInv = *variance + epsilon; + stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + + // dfdm / N + auto dfdm = dLdO->reduceAlongDims(nd4j::reduce::Sum, excludedAxes); + dfdm *= stdInv; + dfdm *= -Ninv; + + // dvdm / 2 + NDArray dvdm(mean); // empty array with same shape as mean + xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, &dvdm, excludedAxes); + dvdm *= -Ninv; + + // (2/N)*dfdv + NDArray dfdv(variance); // empty array with same shape as variance + (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, &dfdv, excludedAxes); + dfdv *= stdInv*stdInv*stdInv; + dfdv *= -Ninv; + + // dvdm/2 + (x - m) + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, &dvdm); + // dfdv * (dvdm/2 + (x - m)) + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, &dfdv); + // add dfdm / N + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, &dfdm); + // * gamma + auto gamma = (*weights)({0,1, 0,0}); + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, &gamma); + + *dLdI += xMinusMean; } 
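// ---- editor's note: illustrative sketch, not part of this patch ----
// The block above adds the term MKL-DNN leaves out of the batchnorm gradient,
// following the comments dfdm/N, dvdm/2 and (2/N)*dfdv. As a plain-loop reference
// for that formula, here is a hypothetical scalar version for one channel with
// N values; all names (x, g, mean, var, gamma, eps, dLdI) are placeholders.
#include <cmath>

static void batchnormBpExtraTermReference(const float* x, const float* g, float mean,
                                          float var, float gamma, float eps,
                                          int N, float* dLdI) {
    const float stdInv = 1.0f / std::sqrt(var + eps);

    // per-channel sums that appear in the closed-form expressions above
    float gSum = 0.f, xmSum = 0.f, gxmSum = 0.f;
    for (int i = 0; i < N; ++i) {
        gSum   += g[i];
        xmSum  += x[i] - mean;
        gxmSum += g[i] * (x[i] - mean);
    }

    const float dfdmOverN  = -stdInv * gSum / N;                      // dfdm / N
    const float dvdmHalf   = -xmSum / N;                              // dvdm / 2
    const float dfdvScaled = -gxmSum * stdInv * stdInv * stdInv / N;  // (2/N) * dfdv

    // dLdI += gamma * ( dfdm/N + (2/N)*dfdv * (dvdm/2 + (x - m)) )
    for (int i = 0; i < N; ++i)
        dLdI[i] += gamma * (dfdmOverN + dfdvScaled * (dvdmHalf + (x[i] - mean)));
}
// ---- end of editor's note ----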
PLATFORM_IMPL(batchnorm) { @@ -371,10 +436,21 @@ PLATFORM_IMPL(batchnorm) { (*weights)({1,2, 0,0}).assign(0); } + if(axes[0] == inRank - 1 && inRank > 2) { // if nhwc or ndhwc + std::vector permut = inRank == 4 ? std::vector({0,3,1,2}) : std::vector({0,4,1,2,3}); + input = new NDArray(input->permute(permut)); + output = new NDArray(output->permute(permut)); + } + batchnormMKLDNN(input, mean, variance, weights, epsilon, output); delete weights; + if(axes[0] == inRank - 1 && inRank > 2) { + delete input; + delete output; + } + return Status::OK(); } @@ -418,7 +494,7 @@ PLATFORM_CHECK(batchnorm) { const int inRank = input->rankOf(); - return block.isUseMKLDNN() && axes.size() == 1 && axes[0] == 1 && (inRank == 2 || inRank == 4 || inRank == 5) && + return block.isUseMKLDNN() && axes.size() == 1 && (axes[0] == 1 || axes[0] == inRank - 1) && (inRank == 2 || inRank == 4 || inRank == 5) && (inputType == DataType::FLOAT32 && meanType == DataType::FLOAT32 && varType == DataType::FLOAT32 && gammaType == DataType::FLOAT32 && betaType == DataType::FLOAT32 && outType == DataType::FLOAT32); } @@ -558,29 +634,29 @@ PLATFORM_CHECK(batchnorm) { ////////////////////////////////////////////////////////////////////////// PLATFORM_IMPL(batchnorm_bp) { - NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw - NDArray* mean = INPUT_VARIABLE(1); // [c] - NDArray* variance = INPUT_VARIABLE(2); // [c] - NDArray* dLdO = INPUT_VARIABLE(3); // same as input - NDArray* gamma = nullptr; // [c] - NDArray* beta = nullptr; // [c] + NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw + NDArray* mean = INPUT_VARIABLE(1); // [c] + NDArray* variance = INPUT_VARIABLE(2); // [c] + NDArray* gamma = nullptr; // [c] + NDArray* beta = nullptr; // [c] + NDArray* dLdO = INPUT_VARIABLE(block.width() - 1); // same as input - NDArray* dLdI = OUTPUT_VARIABLE(0); // same as input - NDArray* dLdM = OUTPUT_VARIABLE(1); // [c] - NDArray* dLdV = OUTPUT_VARIABLE(2); // [c] - NDArray* dLdG = nullptr; // [c] - NDArray* dLdB = nullptr; // [c] + NDArray* dLdI = OUTPUT_VARIABLE(0); // same as input + NDArray* dLdM = OUTPUT_VARIABLE(1); // [c] + NDArray* dLdV = OUTPUT_VARIABLE(2); // [c] + NDArray* dLdG = nullptr; // [c] + NDArray* dLdB = nullptr; // [c] const bool applyScale = (bool)INT_ARG(0); const bool applyOffset = (bool)INT_ARG(1); const float epsilon = T_ARG(0); if(applyScale) { - gamma = INPUT_VARIABLE(4); + gamma = INPUT_VARIABLE(3); dLdG = OUTPUT_VARIABLE(3); } if(applyOffset) { - beta = INPUT_VARIABLE(4 + (int)applyScale); + beta = INPUT_VARIABLE(3 + (int)applyScale); dLdB = OUTPUT_VARIABLE(3 + (int)applyScale); } @@ -606,7 +682,7 @@ PLATFORM_IMPL(batchnorm_bp) { if(beta != nullptr) REQUIRE_TRUE(beta->rankOf() == 1 && beta->sizeAt(0) == input->sizeAt(axes[0]), 0, "BATCHNORM_BP_MKLDNN op: wrong shape of beta array, expected is [%lld], but got %s instead !", input->sizeAt(axes[0]), ShapeUtils::shapeAsString(beta).c_str()); - // types of all input arrays should be the same (except dLdO) + // types of all input arrays should be the same for(int i = 1; i < block.width() - 1; ++i) REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP_MKLDNN op: types of all input arrays should be the same !"); @@ -626,11 +702,19 @@ PLATFORM_IMPL(batchnorm_bp) { (*weights)({1,2, 0,0}).assign(0); } - *dLdM = 0; - *dLdV = 0; + + if(axes[0] == inRank - 1 && inRank > 2) { // if nhwc or ndhwc + std::vector permut = inRank == 4 ? 
std::vector({0,3,1,2}) : std::vector({0,4,1,2,3}); + input = new NDArray(input->permute(permut)); + dLdO = new NDArray(dLdO->permute(permut)); + dLdI = new NDArray(dLdI->permute(permut)); + } batchnormBackPropMKLDNN(input, mean, variance, dLdO, weights, epsilon, dLdI, dLdW); + *dLdM = 0; + *dLdV = 0; + if(applyScale || applyOffset) { if(applyScale) dLdG->assign((*dLdW)({0,1, 0,0})); @@ -641,6 +725,12 @@ PLATFORM_IMPL(batchnorm_bp) { delete dLdW; } + if(axes[0] == inRank - 1 && inRank > 2) { + delete input; + delete dLdO; + delete dLdI; + } + return Status::OK(); } @@ -696,7 +786,7 @@ PLATFORM_CHECK(batchnorm_bp) { const int inRank = input->rankOf(); - return block.isUseMKLDNN() && axes.size() == 1 && axes[0] == 1 && (inRank == 2 || inRank == 4 || inRank == 5) && + return block.isUseMKLDNN() && axes.size() == 1 && (axes[0] == 1 || axes[0] == inRank - 1) && (inRank == 2 || inRank == 4 || inRank == 5) && (inputType == DataType::FLOAT32 && meanType == DataType::FLOAT32 && varType == DataType::FLOAT32 && dLdOType == DataType::FLOAT32 && gammaType == DataType::FLOAT32 && betaType == DataType::FLOAT32 && dLdIType == DataType::FLOAT32 && dLdGType == DataType::FLOAT32 && dLdBType == DataType::FLOAT32); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp index 3c334e726..3d9a79535 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp @@ -213,6 +213,9 @@ PLATFORM_IMPL(conv3dnew_bp) { ConvolutionUtils::getSizesAndIndexesConv3d(isNDHWC, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); + if(isSameMode) // SAME + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW); + int trueoD, trueoH, trueoW; // true output depth/height/width ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, isSameMode); diff --git a/libnd4j/include/ops/impl/gemm.cpp b/libnd4j/include/ops/impl/gemm.cpp index e004dc379..74b832b4a 100644 --- a/libnd4j/include/ops/impl/gemm.cpp +++ b/libnd4j/include/ops/impl/gemm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace blas { @@ -32,15 +33,18 @@ namespace nd4j { auto source = reinterpret_cast(vsource); // handle transpose in parallel - PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) - for (int r = 0; r < rows; r++) { - for (int c = 0; c < cols; c++) { - int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); - int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + for (int c = 0; c < cols; c++) { + int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); + int xIdx = orderSource == CblasColMajor ? 
linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); - ret[zIdx] = source[xIdx]; + ret[zIdx] = source[xIdx]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, rows); return ret; } @@ -62,44 +66,49 @@ namespace nd4j { bool transBFlag = TransB == CblasTrans; if (beta == 0.0) { + Z z = 0.f; int length = M*N; if (length <= Environment::getInstance()->elementwiseThreshold()) { - PRAGMA_OMP_SIMD for (int r = 0; r < length; r++) - C[r] = static_cast(0.0f); + C[r] = z; } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int r = 0; r < length; r++) - C[r] = static_cast(0.0f); + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) + C[r] = z; + }; + samediff::Threads::parallel_for(func, 0, length); } } - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (int r = 0; r < M; r++) { - for (int c = 0; c < N; c++) { - int zIdx = linearIndexF(M, N, r, c); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto r = start_x; r < stop_x; r += inc_x) { + for (auto c = start_y; c < stop_y; c += inc_y) { + int zIdx = linearIndexF(M, N, r, c); - Z dot = static_cast(0.0f); + Z dot = static_cast(0.0f); - if (alpha != 0.0) { - int bIdx; // = linearIndexF(K, N, 0, c); - int aIdx; + if (alpha != 0.0) { + int bIdx; // = linearIndexF(K, N, 0, c); + int aIdx; - for (int k = 0; k < K; k++) { - aIdx = (transAFlag ? linearIndexC(M, K, r, k) : linearIndexF(M, K, r, k)); - bIdx = (transBFlag ? linearIndexC(K, N, k, c) : linearIndexF(K,N, k, c)); - dot += static_cast(alpha) * static_cast(A[aIdx]) * static_cast(B[bIdx]);//A[aIdx]nd4j::math::nd4j_dot(aX, bX, K) * alpha; + for (int k = 0; k < K; k++) { + aIdx = (transAFlag ? linearIndexC(M, K, r, k) : linearIndexF(M, K, r, k)); + bIdx = (transBFlag ? linearIndexC(K, N, k, c) : linearIndexF(K, N, k, c)); + dot += static_cast(alpha) * static_cast(A[aIdx]) * static_cast(B[bIdx]);//A[aIdx]nd4j::math::nd4j_dot(aX, bX, K) * alpha; + } + } + + if (beta != 0.0) { + C[zIdx] = static_cast(dot + beta * C[zIdx]); + } else { + C[zIdx] = static_cast(dot); } } - - if (beta != 0.0) { - C[zIdx] = static_cast(dot + beta * C[zIdx]); - } else { - C[zIdx] = static_cast(dot); - } } - } + }; + + samediff::Threads::parallel_for(func, 0, M, 1, 0, N, 1); } @@ -120,14 +129,16 @@ namespace nd4j { auto aT = TRANS == CblasTrans ? reinterpret_cast(nd4j::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int r = 0; r < M; r++) { - int aIdx = linearIndexC(M, N, r, 0); - auto aX = aT + aIdx; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + int aIdx = linearIndexC(M, N, r, 0); + auto aX = aT + aIdx; - auto dot = nd4j::math::nd4j_dot(aX, y, lda) * alpha; - z[r] = beta == 0.0f ? dot : dot + beta * z[r]; - } + auto dot = nd4j::math::nd4j_dot(aX, y, lda) * alpha; + z[r] = beta == 0.0f ? dot : dot + beta * z[r]; + } + }; + samediff::Threads::parallel_for(func, 0, M); if (TRANS == CblasTrans) delete[] aT; diff --git a/libnd4j/include/ops/impl/specials.cpp b/libnd4j/include/ops/impl/specials.cpp index 85642d6c8..11cca1b15 100644 --- a/libnd4j/include/ops/impl/specials.cpp +++ b/libnd4j/include/ops/impl/specials.cpp @@ -63,22 +63,24 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND T* outBuff = output.bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint r = 0; r < numOfArrs; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + const Nd4jLong arrLen = inArrs[r]->lengthOf(); + const uint xEws = (arrLen == 1) ? 
1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; - const Nd4jLong arrLen = inArrs[r]->lengthOf(); - const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + T *z = outBuff + zOffset[r]; + T *x = inArrs[r]->bufferAsT(); - T *z = outBuff + zOffset[r]; - T *x = inArrs[r]->bufferAsT(); + if (outEws == 1 && xEws == 1) + for (Nd4jLong e = 0; e < arrLen; e++) + z[e] = x[e]; + else + for (Nd4jLong e = 0; e < arrLen; e++) + z[e * outEws] = x[e * xEws]; + } + }; - if(outEws == 1 && xEws == 1) - for (Nd4jLong e = 0; e < arrLen; e++) - z[e] = x[e]; - else - for (Nd4jLong e = 0; e < arrLen; e++) - z[e * outEws] = x[e * xEws]; - } + samediff::Threads::parallel_tad(func, 0, numOfArrs); return; } } @@ -96,11 +98,14 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) } - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(int i = 0; i < numOfArrs; ++i) { - auto temp = output(indices[i], true); - nd4j::TransformLoops::template loopTransform, false>(inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto temp = output(indices[i], true); + nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfArrs); } /** @@ -137,21 +142,15 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint auto z = reinterpret_cast(vz); auto x = reinterpret_cast(vx); - // aggregation step -#ifdef _OPENMP - int _threads = omp_get_max_threads(); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 0; ar < n; ar++) { - z[i] += x[ar][i]; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (auto ar = 0L; ar < n; ar++) { + z[i] += x[ar][i]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, length); } @@ -175,24 +174,18 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint z = x[0]; PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < length; i++) { + for (uint64_t i = 0; i < length; i++) { z[i] /= n; } -#ifdef _OPENNMP - int _threads = omp_get_max_threads(); //nd4j::math::nd4j_min(omp_get_max_threads() / 2, 4); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 1; ar < n; ar++) { - z[i] += x[ar][i] / n; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (Nd4jLong ar = 1; ar < n; ar++) { + z[i] += x[ar][i] / n; + } } - } + }; + samediff::Threads::parallel_for(func, 0, length); // instead of doing element-wise propagation, we just issue memcpy to propagate data for (Nd4jLong ar = 1; ar < n; ar++) { @@ -205,20 +198,14 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint memset(z, 0, length * sizeof(T)); // aggregation step -#ifdef _OPENNMP - int _threads = omp_get_max_threads(); //nd4j::math::nd4j_min(omp_get_max_threads() / 2, 4); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif 
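// ---- editor's note: illustrative sketch, not part of this patch ----
// The replacement pattern used in this hunk (and throughout the PR) is: build a
// lambda with PRAGMA_THREADS_FOR, which supplies start/stop/increment, and hand it
// to samediff::Threads. Flat element loops go through parallel_for; loops whose
// iterations each own a whole sub-array (a TAD, or one input of a concat) go
// through parallel_tad, presumably so a single iteration is never split between
// threads. A minimal hypothetical example of the element-wise form, assuming the
// libnd4j environment (execution/Threads.h, Nd4jLong); buffer/length/factor are
// placeholder names, not code from this repository:
static void scaleInPlace(float* buffer, Nd4jLong length, float factor) {
    auto func = PRAGMA_THREADS_FOR {
        for (auto i = start; i < stop; i += increment)
            buffer[i] *= factor;   // purely element-wise work, safe to split anywhere
    };
    // dispatch the range [0, length) across the thread pool
    samediff::Threads::parallel_for(func, 0, length);
}
// ---- end of editor's note ----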
- - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 0; ar < n; ar++) { - z[i] += x[ar][i] / n; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (Nd4jLong ar = 0; ar < n; ar++) { + z[i] += x[ar][i] / n; + } } - } + }; + samediff::Threads::parallel_for(func, 0, length); // instead of doing element-wise propagation, we just issue memcpy to propagate data for (Nd4jLong ar = 0; ar < n; ar++) { @@ -348,12 +335,14 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) Nd4jLong xTadLength = shape::tadLength(xShapeInfo, dimension, dimensionLength); int numTads = xLength / xTadLength; - PRAGMA_OMP_PARALLEL_FOR - for (int r = 0; r < numTads; r++) { - T *dx = x + tadOffsets[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + T *dx = x + tadOffsets[r]; - quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); - } + quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); + } + }; + samediff::Threads::parallel_tad(func, 0, numTads); } @@ -368,23 +357,25 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) float threshold = fb.f_; - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 4; e < lim; e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + for (int bitId = 0; bitId < 16; bitId++) { + bool hasBit = (x[e] & 1 << (bitId)) != 0; + bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; - for (int bitId = 0; bitId < 16; bitId++) { - bool hasBit = (x[e] & 1 << (bitId) ) != 0; - bool hasSign = (x[e] & 1 << (bitId + 16) ) != 0; - - if (hasBit) { - if (hasSign) - dz[(e - 4) * 16 + bitId] -= threshold; - else - dz[(e - 4) * 16 + bitId] += threshold; - } else if (hasSign) { - dz[(e - 4) * 16 + bitId] -= threshold / 2; + if (hasBit) { + if (hasSign) + dz[(e - 4) * 16 + bitId] -= threshold; + else + dz[(e - 4) * 16 + bitId] += threshold; + } else if (hasSign) { + dz[(e - 4) * 16 + bitId] -= threshold / 2; + } } } - } + }; + + samediff::Threads::parallel_for(func, 4, lim); } template @@ -392,17 +383,14 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto x = reinterpret_cast(dx); auto z = reinterpret_cast(dz); - if (N < nd4j::Environment::getInstance()->elementwiseThreshold()) { - for (int i = 0; i < N; i++) { - z[i] = static_cast(x[i]); - } - } else { - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < N; i++) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { z[i] = static_cast(x[i]); } - } + }; + + samediff::Threads::parallel_for(func, 0, N); }; BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); @@ -410,49 +398,49 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { auto dx = reinterpret_cast(vx); - Nd4jLong retVal = 0L; +//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong retVal = 0L; -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) - for (Nd4jLong x = 0; x < N; x += 16) { + for (auto x = start; x < stop; x += increment) { + int byte = 0; + int byteId = x / 16 + 4; - int byte = 0; - int byteId = x / 16 + 4; + for (int f = 0; f < 16; f++) { + Nd4jLong e = x + f; - for (int f = 0; f < 16; f++) { - Nd4jLong e = x + f; + if (e >= N) + continue; - if (e >= N) - continue; + T val = dx[e]; + T abs = nd4j::math::nd4j_abs(val); - T val = dx[e]; - T 
abs = nd4j::math::nd4j_abs(val); + int bitId = e % 16; - int bitId = e % 16; + if (abs >= (T) threshold) { + byte |= 1 << (bitId); + retVal++; - if (abs >= (T) threshold) { - byte |= 1 << (bitId); - - retVal++; - - - if (val < (T) 0.0f) { + if (val < (T) 0.0f) { + byte |= 1 << (bitId + 16); + dx[e] += threshold; + } else { + dx[e] -= threshold; + } + } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { byte |= 1 << (bitId + 16); - dx[e] += threshold; - } else { - dx[e] -= threshold; - } - } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { - byte |= 1 << (bitId + 16); - dx[e] += threshold / 2; + dx[e] += threshold / 2; - retVal++; + retVal++; + } } + + dz[byteId] = byte; } - dz[byteId] = byte; - } - - return retVal; + return retVal; + }; + return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); } template @@ -637,13 +625,16 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto xTadLength = shape::length(packX.primaryShapeInfo()); auto numTads = packX.numberOfTads(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong r = 0; r < numTads; r++) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; - quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } + quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } template @@ -658,13 +649,16 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto xTadLength = shape::length(packX.primaryShapeInfo()); auto numTads = packX.numberOfTads(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong r = 0; r < numTads; r++) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; - quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } + quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES); diff --git a/libnd4j/include/ops/ops.h b/libnd4j/include/ops/ops.h index 601481b21..ab4bfca90 100644 --- a/libnd4j/include/ops/ops.h +++ b/libnd4j/include/ops/ops.h @@ -77,42 +77,6 @@ #define SELU_ALPHA 1.6732632423543772848170429916717 #define SELU_LAMBDA 1.0507009873554804934193349852946 -#ifdef _OPENMP -#pragma omp declare reduction(maxTF : float,double,float16,bfloat16 : \ - omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ - initializer (omp_priv=-MAX_FLOAT) - -#pragma omp declare reduction(minTF : float,double,float16,bfloat16 : \ - omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ - initializer (omp_priv=MAX_FLOAT) - -#pragma omp declare reduction(maxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ - initializer (omp_priv=0) - -#pragma omp declare reduction(minT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ - initializer 
(omp_priv=0) - -#pragma omp declare reduction(amaxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_max(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) - -#pragma omp declare reduction(aminT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_min(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) - -#pragma omp declare reduction(asumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_abs(omp_in) + nd4j::math::nd4j_abs(omp_out))\ - initializer (omp_priv=0) - -#pragma omp declare reduction(sumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = omp_in + omp_out)\ - initializer (omp_priv=0) - -#pragma omp declare reduction(prodT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = omp_in * omp_out)\ - initializer (omp_priv=1) -#endif - namespace functions { namespace indexreduce { diff --git a/libnd4j/include/ops/special_accumulation_ops.h b/libnd4j/include/ops/special_accumulation_ops.h deleted file mode 100644 index 3f2b2ed1d..000000000 --- a/libnd4j/include/ops/special_accumulation_ops.h +++ /dev/null @@ -1,213 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// - -#ifndef LIBND4J_SPECIAL_ACCUMULATION_OPS_H -#define LIBND4J_SPECIAL_ACCUMULATION_OPS_H - -#include -#include -#include -//#include -//#include - -namespace simdOps { - - template - class LogSumExp { - public: - static const bool requiresSpecialAccumulation = true; - - constexpr static functions::ReduceType reduceType = functions::ReduceType::SUM; - - op_def static T startingValue(const T *input) { - return (T) 0.0f; - } - - op_def static Z merge(T old, T opOutput, Z *extraParams) { - return opOutput + old; - } - - op_def static T update(T old, T opOutput, Z *extraParams) { - return opOutput + old; - } - - op_def static Z op(T d1, T d2) { - return nd4j::math::nd4j_exp(d1 - d2); - } - - op_def static Z op(T d1, Z* extraParams) { - return nd4j::math::nd4j_exp(static_cast(d1) - extraParams[0]); - } - - op_def static Z postProcess(T reduction, Nd4jLong n, Z *extraParams) { - return extraParams[0] + nd4j::math::nd4j_log(reduction); - } - -#ifdef __CUDACC__ - __device__ static inline void aggregatePartials(Z *sPartials, int tid, int numItems, Z *extraParams) { - // start the shared memory loop on the next power of 2 less - // than the block size. If block size is not a power of 2, - // accumulate the intermediate sums in the remainder range. 
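// ---- editor's note: illustrative sketch, not part of this patch ----
// The deleted CUDA helper below first folds the partial sums down to the largest
// power of two, then runs the usual halving tree-reduction. A plain CPU sketch of
// the same reduction shape (using addition; values/n are hypothetical names) makes
// the index arithmetic easier to follow:
static float reducePow2Style(float* values, int n) {
    // largest power of two <= n, obtained by repeatedly clearing the lowest set bit
    int floorPow2 = n;
    while (floorPow2 & (floorPow2 - 1))
        floorPow2 &= floorPow2 - 1;

    // fold the remainder elements [floorPow2, n) onto the front of the array
    for (int i = floorPow2; i < n; i++)
        values[i - floorPow2] += values[i];

    // halving tree-reduction over the remaining power-of-two prefix
    for (int active = floorPow2 >> 1; active > 0; active >>= 1)
        for (int i = 0; i < active; i++)
            values[i] += values[i + active];

    return values[0];
}
// ---- end of editor's note ----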
- int floorPow2 = numItems; - - if (floorPow2 & (floorPow2 - 1)) { - while (floorPow2 & (floorPow2 - 1)) { - floorPow2 &= floorPow2 - 1; - } - if (tid >= floorPow2) { - sPartials[tid - floorPow2] = update(sPartials[tid - floorPow2], sPartials[tid], extraParams); - } - - __syncthreads(); - } - - - for (int activeThreads = floorPow2 >> 1; activeThreads; activeThreads >>= 1) { - if (tid < activeThreads && tid + activeThreads < numItems) { - sPartials[tid] = update(sPartials[tid], sPartials[tid + activeThreads], extraParams); - } - __syncthreads(); - } - } - - static inline __device__ void execSpecialCuda( - T *dx, - Nd4jLong *xShapeInfo, - Z *extraParams, - Z *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Z *reductionBuffer, - Nd4jLong *tadOnlyShapeInfo, - Nd4jLong *tadOffsets) { - - // we assume that RESULT already holds max values - - //shared memory space for storing intermediate results - __shared__ Z *sPartials; - - // __shared__ shape::TAD *tad; - __shared__ Nd4jLong tadLength; - __shared__ Nd4jLong numTads; - - if (threadIdx.x == 0) { - extern __shared__ unsigned char shmem[]; - sPartials = (Z *) shmem; - tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); - numTads = shape::length(xShapeInfo) / tadLength; - } - __syncthreads(); - - for (int r = blockIdx.x; r < numTads; r += gridDim.x) { - auto tadOffsetForBlock = tadOffsets[r]; - - sPartials[threadIdx.x] = startingValue(dx + tadOffsetForBlock); - - for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); - sPartials[threadIdx.x] = update(sPartials[threadIdx.x], op(dx[xOffset], result[r]), extraParams); - } - __syncthreads(); - - // aggregate. 
do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), &result[r]); - - __syncthreads(); - if (threadIdx.x == 0) - result[r] = postProcess(sPartials[threadIdx.x], tadLength, &result[r]); - } - } -#endif - - static void execSpecial(T *x, - Nd4jLong *xShapeInfo, - Z *extraParams, - Z *result, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - Nd4jLong resultLength = shape::length(resultShapeInfoBuffer); - - auto tadOnlyShapeInfo = tadShapeInfo; - auto tadOffsets = tadOffset; - - if (tadOnlyShapeInfo == nullptr || tadOffsets == nullptr) { - if (dimensionLength < 1) - return; - - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - tadOnlyShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - } - - - const Nd4jLong tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); - auto numTads = shape::length(xShapeInfo) / tadLength; - auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo); - - int tadsPerThread = resultLength / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - if (tadEWS > 0 && (numTads == 1 || shape::isVector(tadOnlyShapeInfo) || shape::isScalar(tadOnlyShapeInfo))) { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (int i = 0; i < resultLength; i++) { - - T *iter = x + tadOffsets[i]; - T start = startingValue(iter); - if (tadEWS == 1) { - for (int j = 0; j < tadLength; j++) { - start = update(start, op(iter[j], result[i]), extraParams); - - } - } - else { - for (int j = 0; j < tadLength; j++) { - start = update(start, op(iter[j * tadEWS], result[i]), extraParams); - } - } - result[i] = postProcess(start, tadLength, &result[i]); - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (int i = 0; i < resultLength; i++) { - - auto offset = tadOffsets[i]; - T start = startingValue(x + offset); - - for (int j = 0; j < tadLength; j++) { - auto xOffset = offset + shape::getIndexOffset(j, tadOnlyShapeInfo); - start = update(start, op(x[xOffset], result[i]), extraParams); - } - - result[i] = postProcess(start, tadLength, &result[i]);; - } - } - } - }; -} - -#endif //LIBND4J_SPECIAL_ACCUMULATION_OPS_H diff --git a/libnd4j/include/ops/special_ops.h b/libnd4j/include/ops/special_ops.h deleted file mode 100644 index 8f6ef6b5b..000000000 --- a/libnd4j/include/ops/special_ops.h +++ /dev/null @@ -1,2293 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __CUDACC__ -#include -#include -#include -#endif - -namespace functions { - namespace broadcast { - template - class Broadcast; - } - - namespace transform { - template - class TransformStrict; - } - - namespace scalar { - } - - namespace reduce { - template - class ReduceFloatFunction; - - template - class ReduceSameFunction; - } -} - -namespace simdOps { - - template - class Pooling2D { - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - inline __host__ __device__ -#elif defined(__GNUC__) - -#endif - static int outSize(int size, int k, int s, int p, bool coverAll) { - if (coverAll) - return (size + p * 2 - k + s - 1) / s + 1; - else - return (size + p * 2 - k) / s + 1; - } - -#ifdef __CUDACC__ - /** - * Based on: https://github.com/pjreddie/darknet/blob/master/src/im2col_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ int kH; - __shared__ int kW; - __shared__ int sH; - __shared__ int sW; - __shared__ int pH; - __shared__ int pW; - __shared__ int dH; - __shared__ int dW; - __shared__ int poolingMode; - __shared__ Z extraParam0; - - __shared__ int batchSize; - __shared__ int inChannels; - __shared__ int outH; - __shared__ int outW; - __shared__ int inH; - __shared__ int inW; - - //__shared__ int *strideIn; - //__shared__ int *strideOut; - __shared__ int strideB; - __shared__ int strideC; - __shared__ int strideY; - __shared__ int strideX; - - __shared__ int strideOB; - __shared__ int strideOC; - __shared__ int strideOY; - __shared__ int strideOX; - - __shared__ int length; - __shared__ int kHEff; - __shared__ int kWEff; - __shared__ bool fOrder; - - - if (threadIdx.x == 0) { - kH = (int)extraParams[0]; - kW = (int)extraParams[1]; - sH = (int)extraParams[2]; - sW = (int)extraParams[3]; - pH = (int)extraParams[4]; - pW = (int)extraParams[5]; - dH = (int)extraParams[6]; //Dilation, height dimension - dW = (int)extraParams[7]; //Dilation, width dimension - poolingMode = (int)extraParams[9]; - extraParam0 = extraParams[10]; - - batchSize = shape::sizeAt(xShapeBuffer, 0); - inChannels = shape::sizeAt(xShapeBuffer, 1); - outH = shape::sizeAt(zShapeBuffer, 2); - outW = shape::sizeAt(zShapeBuffer, 3); - inH = shape::sizeAt(xShapeBuffer, 2); - inW = shape::sizeAt(xShapeBuffer, 3); - - strideB = shape::stride(xShapeBuffer)[0]; - strideC = shape::stride(xShapeBuffer)[1]; - strideY = shape::stride(xShapeBuffer)[2]; - strideX = shape::stride(xShapeBuffer)[3]; - - strideOB = shape::stride(zShapeBuffer)[0]; - strideOC = shape::stride(zShapeBuffer)[1]; - strideOY = shape::stride(zShapeBuffer)[2]; - strideOX = shape::stride(zShapeBuffer)[3]; - - length = shape::length(zShapeBuffer); - - //Replace kernel H/W with *effective* kernel H/W accounting for dilatyon - kHEff = kH + (kH-1)*(dH-1); - kWEff = kW + (kW-1)*(dW-1); - - fOrder = shape::order(zShapeBuffer) == 'f'; -/* - if (blockIdx.x == 0) { - printf("kH: %i; kW: %i; sH: %i; sW: %i; pH: %i; pW: %i; dH: %i; dW: %i; poolingMode: %i; extraParam0: %f;\n", kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, (float) extraParam0); - printf("batchSize: %i; inChannels: %i; outH: %i; 
outW: %i; inH: %i; inW: %i; strideB: %i; strideC: %i; strideY: %i; strideX: %i;\n", batchSize, inChannels, outH, outW, inH, inW, strideB, strideC, strideY, strideX); - } -*/ - } - __syncthreads(); - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - for (int index = tid; index < length; index += blockDim.x * gridDim.x) { - const int pw = index % outW; - const int ph = (index / outW) % outH; - const int c = (index / outW / outH) % inChannels; - const int n = index / outW / outH / inChannels; - int hstart = sH * ph - pH; - int wstart = sW * pw - pW; - int hend = hstart + kHEff; - int wend = wstart + kWEff; - -// const int hSO = hstart; -// const int hEO = hend; - - if(hstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -hstart / (Z)dH); - hstart += f * dH; - } - if(wstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -wstart / (Z) dW); - wstart += f * dW; - } - if(hend > inH){ - int f = nd4j::math::nd4j_ceil((Z) (hend-inH) / (Z) dH); - hend -= f * dH; - } - if(wend > inW){ - int f = nd4j::math::nd4j_ceil((Z) (wend-inW) / (Z) dW); - wend -= f * dW; - } - //Accounts for dilation - int pool_size = nd4j::math::nd4j_ceil((double) (hend-hstart) / (double) dH) * nd4j::math::nd4j_ceil((double) (wend-wstart) / (double) dW); - - Z sum = poolingMode == 0 ? -nd4j::DataTypeUtils::max() : static_cast(0.f); - - T *input_slice = dx + (n * strideB + c * strideC); - if (poolingMode == 0) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - Z v = static_cast(input_slice[h * strideY + w * strideX]); - if (v > sum) - sum = v; - } - } - } else if (poolingMode == 1) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - sum += static_cast(input_slice[h * strideY + w * strideX]); - } - } - } else if (poolingMode == 2) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - sum += nd4j::math::nd4j_pow(static_cast(nd4j::math::nd4j_abs(input_slice[h * strideY + w * strideX])), extraParam0); - } - } - } - - Z res; - - if (poolingMode == 0) { - res = sum; - } else if (poolingMode == 1) { - int divide_factor = pool_size; //Case 0: exclude padding - if ((int) extraParam0 == 1) //Case 1: include padding - divide_factor = kH * kW; - - res = sum / static_cast(divide_factor); - } else if (poolingMode == 2) { - res = nd4j::math::nd4j_pow(sum, (Z) 1.0f / extraParam0); - } - - - if (!fOrder) { - result[index] = res; - } else { - result[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = res; - } -/* - if (index >= 0 && index < 400000) { - printf("index: %i; hstart: %i; hend: %i; wstart: %i; wend: %i; ph: %i; pw: %i; hstart_orig: %i; hend_orig: %i;\n", index, hstart, hend, wstart, wend, ph, pw, hSO, hEO); - } -*/ - } - - __syncthreads(); - } -#endif - - -static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outShapeBuffer, Z *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - // input is [bS, iC, iH, iW] - // output is [bS, iC, oH, oW] - - const Nd4jLong kH = (int)extraParams[0]; - const Nd4jLong kW = (int)extraParams[1]; - const Nd4jLong sH = (int)extraParams[2]; - const Nd4jLong sW = (int)extraParams[3]; - const Nd4jLong pH = (int)extraParams[4]; - const Nd4jLong pW = (int)extraParams[5]; - const Nd4jLong dH = (int)extraParams[6]; - const Nd4jLong dW = (int)extraParams[7]; - Nd4jLong poolingMode = (int)extraParams[9]; - T extraParam0 = extraParams[10]; - - if(dH == 0 || dW == 0) { - printf("Special_ops pooling2d:: dilation must not be zero, but got instead {%lld, %lld} \n", dH, 
dW); - throw ""; - } - - const Nd4jLong kHEff = kH + (kH-1)*(dH-1); - const Nd4jLong kWEff = kW + (kW-1)*(dW-1); - - const int bS = shape::sizeAt(inShapeBuffer, 0); - const int iC = shape::sizeAt(inShapeBuffer, 1); - const int iH = shape::sizeAt(inShapeBuffer, 2); - const int iW = shape::sizeAt(inShapeBuffer, 3); - const int oH = shape::sizeAt(outShapeBuffer, 2); - const int oW = shape::sizeAt(outShapeBuffer, 3); - const Nd4jLong iStride0 = shape::stride(inShapeBuffer)[0]; - const Nd4jLong iStride1 = shape::stride(inShapeBuffer)[1]; - const Nd4jLong iStride2 = shape::stride(inShapeBuffer)[2]; - const Nd4jLong iStride3 = shape::stride(inShapeBuffer)[3]; - const Nd4jLong oStride0 = shape::stride(outShapeBuffer)[0]; - const Nd4jLong oStride1 = shape::stride(outShapeBuffer)[1]; - const Nd4jLong oStride2 = shape::stride(outShapeBuffer)[2]; - const Nd4jLong oStride3 = shape::stride(outShapeBuffer)[3]; - - const Nd4jLong iStep2 = dH*iStride2; - const Nd4jLong iStep3 = dW*iStride3; - const int kProd = kH*kW; - const T iStep2Inv = 1./iStep2; - const T iStep3Inv = 1./iStep3; - - Nd4jLong hstart, wstart, hend, wend; - T sum, *pIn; - - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = -nd4j::DataTypeUtils::max(); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T val = pIn[kh + kw]; - if (val > sum) - sum = val; - } - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } -/*************************************************************************/ - else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += pIn[kh + kw]; - - if ((int) extraParam0 == 0) //Exclude 
padding - sum /= static_cast(nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(iStep2))) * static_cast(nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(iStep3))); //Accounts for dilation - else if ((int) extraParam0 == 1) //Include padding - sum /= kProd; - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } -/*************************************************************************/ - else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - - sum = nd4j::math::nd4j_pow(sum, (T) 1. / extraParam0); - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } - else { - nd4j_printf("Special_ops::pooling2d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; - } -} - - op_def static T op(T d1, Z *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - - /** - * A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - - FORCEINLINE bool is_a_ge_zero_and_a_lt_b(int a, int b) { - return static_cast(a) < static_cast(b); - } - - template - class - Im2col { - public: - static const bool requiresSpecial = true; - - static _CUDA_HD int outSize(int size, int k, int s, int p, bool coverAll) { - if (coverAll) - return (size 
+ p * 2 - k + s - 1) / s + 1; - else - return (size + p * 2 - k) / s + 1; - } - -#ifdef __CUDACC__ - /** - * Based on: https://github.com/pjreddie/darknet/blob/master/src/im2col_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - T *result, Nd4jLong *zShapeBuffer, - T *extraParams, - int *allocationPointer, T *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ - __shared__ int kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, dY, dX, kSize, samples, depth, height, width, strideex, stridech, strideh, stridew, height_col, width_col, n; - __shared__ T zeroPadVal; - __shared__ Nd4jLong *outShape, *outStride, *inShape, *inStride; - __shared__ char resultOrder; - - if (threadIdx.x == 0) { - kernelHeight = (int) extraParams[0]; - kernelWidth = (int) extraParams[1]; - strideY = (int) extraParams[2]; - strideX = (int) extraParams[3]; - padHeight = (int) extraParams[4]; - padWidth = (int) extraParams[5]; - dY = (int) extraParams[6]; //Dilation, height/y dimension - dX = (int) extraParams[7]; //Dilation, width/x dimension - kSize = kernelWidth * kernelHeight; - zeroPadVal = (T) extraParams[9]; //Value to use when value is padding. Usually 0 but not always - - outShape = shape::shapeOf(zShapeBuffer); - resultOrder = shape::order(zShapeBuffer); - outStride = shape::stride(zShapeBuffer); - - inShape = shape::shapeOf(xShapeBuffer); - inStride = shape::stride(xShapeBuffer); - - samples = (int) inShape[0]; - depth = (int) inShape[1]; - height = (int) inShape[2]; - width = (int) inShape[3]; - - - strideex = (int) inStride[0]; - stridech = (int) inStride[1]; - strideh = (int) inStride[2]; - stridew = (int) inStride[3]; - - // (height + 2 * padHeight - kernelHeight) / strideX + 1; // - // (width + 2 * padWidth - kernelWidth) / strideY + 1; // - height_col = (int) outShape[4]; - width_col = (int) outShape[5]; - - n = samples * depth * height_col * width_col; - } - __syncthreads(); - - int index = blockIdx.x * blockDim.x + threadIdx.x; - for (; index < n; index += blockDim.x*gridDim.x) { - int h_index = index / width_col; - int h_col = h_index % height_col; - int w_col = index % width_col; - - int c_im = h_index / height_col; - int c_col = c_im * kSize; - - int depth_im = c_im % depth; - int num_im = c_im / depth; - int h_offset = h_col * strideY - padHeight; - int w_offset = w_col * strideX - padWidth; - - T* data_col_ptr = result; - - int i_c = (c_col * height_col + h_col) * width_col + w_col; - data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; - - T* data_im_ptr = dx; - - data_im_ptr += num_im * strideex + depth_im * stridech + h_offset * strideh + w_offset*stridew; - - for (int i = 0; i < kernelHeight; ++i) { - for (int j = 0; j < kernelWidth; ++j) { - int h_im = h_offset + i * dY; - int w_im = w_offset + j * dX; - int i_f = 0; - int i_c_temp = i_c; - for (int dim = 5; dim >= 0; dim--) { - i_f += (i_c_temp % outShape[dim]) * outStride[dim]; - i_c_temp = i_c_temp / outShape[dim]; - } - if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width){ - result[i_f] = data_im_ptr[i * dY * strideh + j * dX * stridew]; - } else result[i_f] = zeroPadVal; - - //result[i_f] = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
data_im_ptr[i * strideh + j*stridew] : 0; - data_col_ptr += height_col * width_col; - i_c += height_col * width_col; - } - } - } - } -#endif - - - static void execSpecial( - T *imBuff, - Nd4jLong *imShapeBuffer, - T *colBuff, - Nd4jLong *colShapeBuffer, - T *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ - - // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] - - int kH = (int)extraParams[0]; - int kW = (int)extraParams[1]; - int sH = (int)extraParams[2]; - int sW = (int)extraParams[3]; - int pH = (int)extraParams[4]; - int pW = (int)extraParams[5]; - int dH = (int)extraParams[6]; //Dilation, height/y dimension - int dW = (int)extraParams[7]; //Dilation, width/x dimension - T zeroPadVal = extraParams[9]; - - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int iH = imShape[2]; - const int iW = imShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - T *col, *im; - int imRow, imCol; - - if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(im, col, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - } - - op_def static T op(T d1, T *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally 
negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - - /** - * A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - template - class Histogram { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - - - }; -#endif - - static void execSpecial( - T *dx, - Nd4jLong *xShapeBuffer, - Z *result, - Nd4jLong *zShapeBuffer, - Z *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - - - } - - - op_def static T op(T d1, Z *params) { - return d1; - } - }; - - template - class Col2Im { - - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - /** - * https://github.com/pjreddie/darknet/blob/master/src/col2im_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - X *dx, Nd4jLong *xShapeBuffer, - X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ int strideex, stridech, stridekrow, stridekcol, striderow, stridecol, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, imgHeight, imgWidth, dY, dX, samples, depth, imgH, imgW, height_col, width_col, n, kEffectiveW, kEffectiveH; - __shared__ Nd4jLong *inShape, *inStride, *outShape, *outStride; - __shared__ char resultOrder; - - if (threadIdx.x == 0) { - inShape = shape::shapeOf(xShapeBuffer); - inStride = shape::stride(xShapeBuffer); - - strideex = (int) inStride[0]; - stridech = (int) inStride[1]; - stridekrow = (int) inStride[2]; - stridekcol = (int) inStride[3]; - striderow = (int) inStride[4]; - stridecol = (int) inStride[5]; - - kernelHeight = (int) inShape[2]; - kernelWidth = (int) inShape[3]; - - strideY = (int) extraParams[0]; - strideX = (int) extraParams[1]; - padHeight = (int) extraParams[2]; - padWidth = (int) extraParams[3]; - imgHeight = (int) extraParams[4]; - imgWidth = (int) extraParams[5]; - dY = (int) extraParams[6]; //Dilation in height/y dimension - dX = (int) extraParams[7]; //Dilation in width/x dimension - - outShape = shape::shapeOf(zShapeBuffer); - resultOrder = shape::order(zShapeBuffer); - outStride = shape::stride(zShapeBuffer); - - samples = (int) outShape[0]; - depth = (int) outShape[1]; - imgH = (int) outShape[2]; - imgW = (int) outShape[3]; - - height_col = 
inShape[4];//(imgHeight + 2 * padHeight - kernelHeight) / strideX + 1; - width_col = inShape[5];//(imgWidth + 2 * padWidth - kernelWidth) / strideY + 1; - - n = samples * depth * imgHeight * imgWidth; - - //Effective kernel size, accounting for dilation - kEffectiveW = kernelWidth + (kernelWidth - 1) * (dX - 1); - kEffectiveH = kernelHeight + (kernelHeight - 1) * (dY - 1); - } - __syncthreads(); - - for (int i = (blockDim.x * blockIdx.x) + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { - X val = 0; - int w_im = i % imgWidth + padWidth; - int h_im = (i / imgWidth) % imgHeight + padHeight; - int c_im = i / (imgWidth * imgHeight); - - int num_im = c_im / depth; - int depth_im = c_im % depth; - - // compute the start and end of the output - // These are the indexes for dimensions ??? in the 6d col matrix - int w_col_start = (w_im < kEffectiveW) ? 0 : (w_im - kEffectiveW) / strideX + 1; - int w_col_end = nd4j::math::nd4j_min(w_im / strideX + 1, width_col); - - int h_col_start = (h_im < kEffectiveH) ? 0 : (h_im - kEffectiveH) / strideY + 1; - int h_col_end = nd4j::math::nd4j_min(h_im / strideY + 1, height_col); - - - //Iterate over col entries in the 6d array... these are added up - for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { - for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { - int h_k = (h_im - h_col * strideY); - int w_k = (w_im - w_col * strideX); - - if(h_k % dY == 0 && w_k % dX == 0){ - h_k /= dY; - w_k /= dX; - - int data_col_index = num_im * strideex + depth_im * stridech + h_k * stridekrow + w_k * stridekcol + h_col * striderow + w_col * stridecol; - val += dx[data_col_index]; - } - } - } - int i_f = 0; - int i_c = i; - for (int dim = 3; dim >= 0; dim--) - { - i_f += (i_c % outShape[dim]) * outStride[dim]; - i_c = i_c / outShape[dim]; - } - result[i_f] = val; - } - } -#endif - - static void execSpecial( - X *colBuff, - Nd4jLong *colShapeBuffer, - X *imBuff, - Nd4jLong *imShapeBuffer, - X *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] - - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int sH = (int)extraParams[0]; - const int sW = (int)extraParams[1]; - const int pH = (int)extraParams[2]; - const int pW = (int)extraParams[3]; - const int iH = (int)extraParams[4]; - const int iW = (int)extraParams[5]; - const int dH = (int)extraParams[6]; - const int dW = (int)extraParams[7]; - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int kH = colShape[2]; - const int kW = colShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - auto zLength = shape::length(imShapeBuffer); - - // initial zeroing of image content - memset(imBuff, 0, zLength * sizeof(X)); - - - X *col, *im; - int imRow, imCol; - - if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && 
shape::strideDescendingCAscendingF(imShapeBuffer)) { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - /** A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - - template - class Reverse { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - static inline __device__ void execSpecialCuda(X *dx, Nd4jLong *xShapeBuffer, - X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ Nd4jLong xLength; - __shared__ int xEWS; - __shared__ char xOrder; - __shared__ Nd4jLong sLength; - __shared__ X *shmem; - int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if (threadIdx.x == 0) { - xLength = shape::length(xShapeBuffer); - xEWS = shape::elementWiseStride(xShapeBuffer); - xOrder = shape::order(xShapeBuffer); - sLength = xLength - 1; - - extern 
__shared__ unsigned char shrd[]; - shmem = (X *) shrd; - } - __syncthreads(); - - - - if (dx == result) { - - if (xEWS == 1) { - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - Nd4jLong idx = sLength - e; - X tmp = dx[e]; - dx[e] = dx[idx]; - dx[idx] = tmp; - } - } else if (xEWS >= 1) { - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - Nd4jLong idx1 = (sLength - e) * xEWS; - Nd4jLong idx2 = e * xEWS; - X tmp = dx[idx2]; - dx[idx2] = dx[idx1]; - dx[idx1] = tmp; - } - } - else { - - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - - } else { - __shared__ int zEWS; - __shared__ char zOrder; - - if (threadIdx.x == 0) { - zEWS = shape::elementWiseStride(zShapeBuffer); - zOrder = shape::order(zShapeBuffer); - } - __syncthreads(); - - if (xEWS == 1 && zEWS == 1 && xOrder == zOrder) { - // loop for whole array - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - result[sLength - e] = dx[e]; - } - } else if (xEWS >= 1 && zEWS >= 1 && xOrder == zOrder) { - - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - result[(sLength - e) * zEWS] = dx[e * xEWS]; - } - } - else { - - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - } - } - -#endif - - - static void execSpecial(X *dx, Nd4jLong *xShapeBuffer, X *result, Nd4jLong *zShapeBuffer, X *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - Nd4jLong xLength = shape::length(xShapeBuffer); - int xEWS = shape::elementWiseStride(xShapeBuffer); - char xOrder = shape::order(xShapeBuffer); - Nd4jLong sLength = xLength - 1; - - // two step phase here - if (dx == result) { - if (xEWS == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - Nd4jLong idx = sLength - e; - auto tmp = dx[e]; - dx[e] = dx[idx]; - dx[idx] = tmp; - } - } else if (xEWS > 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - Nd4jLong idx1 = (sLength - e) * xEWS; - Nd4jLong idx2 = e * xEWS; - auto tmp = dx[idx2]; - dx[idx2] = dx[idx1]; - dx[idx1] = tmp; - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - - result[zOffset] = dx[xOffset]; - } - } - } else { - // single step phase here - auto zEWS = shape::elementWiseStride(zShapeBuffer); - auto zOrder = shape::order(zShapeBuffer); - - if (xEWS == 1 && zEWS == 1 && xOrder == zOrder) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - result[sLength - e] = dx[e]; - } - } else if (xEWS >= 1 && zEWS >= 1 && xOrder == zOrder) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - result[(sLength - e) * zEWS] = dx[e * xEWS]; - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, zShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - template - class SoftMax { - public: - static const bool requiresSpecial = true; - 
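Both the CUDA and the CPU paths of the SoftMax specialization that follows compute a row-wise softmax in the usual numerically stable order: find the row maximum, exponentiate the shifted values, then divide by the accumulated sum. A minimal standalone C++ sketch of that sequence, using plain float buffers instead of the shape-info/TAD machinery (all names here are illustrative and not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Numerically stable softmax over one contiguous row of `len` floats.
// Subtracting the row max first keeps exp() from overflowing for large inputs.
static void softmaxRow(const float* in, float* out, std::size_t len) {
    if (len == 0) return;

    float mx = in[0];
    for (std::size_t i = 1; i < len; ++i)        // row-wise max
        mx = std::max(mx, in[i]);

    float sum = 0.f;
    for (std::size_t i = 0; i < len; ++i) {      // exp(x - max) and running sum
        out[i] = std::exp(in[i] - mx);
        sum += out[i];
    }

    for (std::size_t i = 0; i < len; ++i)        // normalize
        out[i] /= sum;
}

The per-TAD loops in the CPU branch apply this same max/exp/sum/divide sequence to each row, either directly when the element-wise stride is 1 or through a precomputed offsets table otherwise.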
-#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto shape = shape::shapeOf(xShapeBuffer); - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - - auto length = shape::length(xShapeBuffer); - - auto stride = shape::stride(xShapeBuffer); - //compute the row wise maxes - - __shared__ Nd4jLong maxShape[2]; - - // it's always 2d here - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) { - maxResult = (X) 0.0; - maxShape[0] = shape[0]; - maxShape[1] = 1; - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - } - __syncthreads(); - - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - } -#endif - - static void execSpecial( - void *vx, - Nd4jLong *xShapeInfo, - void *vz, - Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto x = reinterpret_cast(vx); - auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeInfo)) { - - if(shape::equalsStrict(xShapeInfo, zShapeInfo)) { - if (tadShapeInfo == nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, 1); - tadShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - } - - const uint tadLen = shape::length(tadShapeInfo); - const uint numOfTads = shape::length(xShapeInfo) / tadLen; - - if(shape::elementWiseStride(tadShapeInfo) == 1) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - - X* inBuff = x + tadOffsets[i]; - X* outBuff = z + tadOffsets[i]; - - X max = -nd4j::DataTypeUtils::max(); - X sum = 0; - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); - - for (uint j = 0; j < tadLen; ++j) { - X temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; - } - } - else { - - uint xShapeInfoCast[MAX_RANK]; - bool canCast = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, xShapeInfoCast); - - auto offsets = new Nd4jLong[tadLen]; - 
shape::calcOffsets(tadShapeInfo, offsets); - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - - X* inBuff = x + tadOffsets[i]; - X* outBuff = z + tadOffsets[i]; - - X max = -nd4j::DataTypeUtils::max(); - X sum = 0.f; - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - - for (uint j = 0; j < tadLen; ++j) { - X temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); - outBuff[offsets[j]] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[offsets[j]] /= sum; - } - delete []offsets; - } - } - else { - - auto shape = shape::shapeOf(xShapeInfo); - //iterate along rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0; - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, x, xShapeInfo, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, x, xShapeInfo, maxResult, maxResultShapeBuffer, z, zShapeInfo, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, z, zShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, z, zShapeInfo, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum - functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, z, zShapeInfo, maxResult, maxResultShapeBuffer, z, zShapeInfo, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - delete[] maxResultShapeBuffer; - delete[] maxResult; - } - } - else if (shape::isVector(xShapeInfo)) { - auto max = -nd4j::DataTypeUtils::max(); - X sum = 0; - int elementWiseStride = shape::elementWiseStride(xShapeInfo); - int resultElementWiseStride = shape::elementWiseStride(zShapeInfo); - int length = shape::length(xShapeInfo); - if (elementWiseStride >= 1 && resultElementWiseStride >= 1) { - if (elementWiseStride == 1 && resultElementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, x[i]); - } - - for (int i = 0; i < length; i++) { - z[i] = nd4j::math::nd4j_exp(x[i] - max); - sum += z[i]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - z[i] /= sum; - } - } - else { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, x[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - auto r = nd4j::math::nd4j_exp(x[i * elementWiseStride] - max); - z[i * resultElementWiseStride] = r; - sum += r; - } - - for (int i = 0; i < length; i++) { - z[i * resultElementWiseStride] /= sum; - } - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - - - template - class LogSoftMax { - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto shape = shape::shapeOf(xShapeBuffer); - auto stride 
= shape::stride(xShapeBuffer); - //iterate along rows - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - if (threadIdx.x == 0) { - maxResult = (X) 0.0; - } - __syncthreads(); - //compute the row wise maxes - - Nd4jLong maxShape[2] = { shape[0], 1 }; - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - __syncthreads(); - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Log, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - - } -#endif - - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeBuffer, 2)) { - auto shape = shape::shapeOf(xShapeBuffer); - //iterate along rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0; - - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, dx, xShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, result, zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum 
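The CPU matrix branch of LogSoftMax produces log(softmax(x)) by dividing by the row sum and then applying Log. An equivalent formulation folds those two steps into x - max - log(sum(exp(x - max))); below is a small sketch of that variant for a single contiguous row (illustrative names only, not the code being removed here):

#include <algorithm>
#include <cmath>
#include <cstddef>

// log-softmax over one contiguous row, written as x - max - log(sum(exp(x - max))).
// This reaches the same result as the divide-then-Log sequence in the patch,
// while avoiding the intermediate division.
static void logSoftmaxRow(const float* in, float* out, std::size_t len) {
    if (len == 0) return;

    float mx = in[0];
    for (std::size_t i = 1; i < len; ++i)
        mx = std::max(mx, in[i]);

    float sum = 0.f;
    for (std::size_t i = 0; i < len; ++i)
        sum += std::exp(in[i] - mx);

    const float logSum = std::log(sum);
    for (std::size_t i = 0; i < len; ++i)
        out[i] = in[i] - mx - logSum;
}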
- functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - functions::transform::TransformStrict::exec(nd4j::transform::Log, result, zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - - delete[] maxResultShapeBuffer; - } - else if (shape::isVector(xShapeBuffer, 2)) { - auto max = -FLOAT_MAX_VALUE; - X sum = 0; - - auto elementWiseStride = shape::elementWiseStride(xShapeBuffer); - auto length = shape::length(xShapeBuffer); - if (elementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i]); - } - - - for (int i = 0; i < length; i++) { - result[i] = nd4j::math::nd4j_exp(dx[i] - max); - sum += result[i]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i] /= sum; - result[i] = nd4j::math::nd4j_log(result[i]); - } - } - else if (elementWiseStride > 1) { - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] = nd4j::math::nd4j_exp(dx[i * elementWiseStride] - max); - sum += result[i * elementWiseStride]; - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] /= sum; - result[i * elementWiseStride] = nd4j::math::nd4j_log(result[i * elementWiseStride]); - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - - /** - * softmax(x) - */ - template - class SoftMaxDerivative { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto shape = shape::shapeOf(xShapeBuffer); - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - __shared__ Nd4jLong resultEWS; - - auto length = shape::length(xShapeBuffer); - - if (threadIdx.x == 0) { - resultEWS = shape::elementWiseStride(zShapeBuffer); - - maxResult = (X) 0.0; - } - __syncthreads(); - - auto tride = shape::stride(xShapeBuffer); - Nd4jLong maxShape[2] = { shape[0], 1 }; - - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - __syncthreads(); - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, 
reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - if (resultEWS >= 1) { - for (int i = threadIdx.x; i < length; i += blockDim.x) { - result[i * resultEWS] = result[i * resultEWS] * ((X) 1.0 - result[i * resultEWS]); - } - } - else { - printf("Non element wise stride not supported right now\n"); - } - - } -#endif - - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeBuffer, 2)) { - auto shape = shape::shapeOf(xShapeBuffer); - - auto resultEleStide = shape::elementWiseStride(zShapeBuffer); - - //iterate along rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - auto len = shape::length(xShapeBuffer); - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0f; - - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, result, zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum - functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - if (resultEleStide >= 1) { - if (resultEleStide == 1) { - PRAGMA_OMP_SIMD - for (int i = 0; i < len; i++) { - result[i] = result[i] * (static_cast(1.0f) - result[i]); - } - - } - else { - PRAGMA_OMP_SIMD - for (int i = 0; i < len; i++) { - result[i * resultEleStide] = result[i * resultEleStide] * (static_cast(1.0f) - result[i * resultEleStide]); - } - - } - } - else { - - for (int i = 0; i < len; i++) { - Nd4jLong zOffset = shape::getIndexOffset(i, zShapeBuffer); - result[zOffset] = result[zOffset] * ((X) 1.0f - result[zOffset]); - } - } - - - delete[] maxResultShapeBuffer; - delete[] maxResult; - } - else if (shape::isVector(xShapeBuffer, 2)) { - auto max = -nd4j::DataTypeUtils::max(); - X sum = 0; - - auto elementWiseStride = shape::elementWiseStride(xShapeBuffer); - auto length = shape::length(xShapeBuffer); - if (elementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i]); - } - - for (int i = 0; i < length; i++) { - result[i] -= max; - result[i] = nd4j::math::nd4j_exp(result[i]); - sum += result[i]; - } - - for (int i = 0; i < length; i++) { - 
result[i] /= sum; - } - - for (int i = 0; i < length; i++) { - result[i] = result[i] * ((X) 1.0f - result[i]); - } - } else if (elementWiseStride >= 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] -= max; - result[i * elementWiseStride] = nd4j::math::nd4j_exp(result[i * elementWiseStride]); - sum += result[i * elementWiseStride]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] /= sum; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] = result[i * elementWiseStride] * ((X) 1.0f - result[i * elementWiseStride]); - } - } else { - printf("non-ews access on row not implemented yet"); - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - - template - class IsMax { - public: - static const bool requiresSpecial = true; - - -#ifdef __CUDACC__ - - static inline __device__ void doAllCuda( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - -// this code is safe to delete, it's never used -/* - __shared__ int maxIdx; - __shared__ int length; - if (threadIdx.x == 0) { - length = shape::length(zShapeBuffer); - } - __syncthreads(); - - functions::indexreduce::IndexReduce::template transform>( - dx, - xShapeBuffer, - extraParams, - result, - zShapeBuffer, - nullptr, - 1, - 1, allocationPointer, reductionPointer, nullptr, nullptr); - - __syncthreads(); - if (threadIdx.x == 0) - maxIdx = (int)result[0]; - __syncthreads(); - - for (int i = threadIdx.x; i < length; i += blockDim.x) - result[i] = 0; - __syncthreads(); - - if (threadIdx.x == 0) { - result[maxIdx] = 1.0; - } - */ - } -#endif - -#ifdef __CUDACC__ - inline __host__ - -#elif defined(__GNUC__) - - -#endif - static void doAll( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto length = shape::length(xShapeBuffer); - auto eleStride = shape::elementWiseStride(xShapeBuffer); - auto resultEleStride = shape::elementWiseStride(zShapeBuffer); - auto xOrder = shape::order(xShapeBuffer); - auto resultOrder = shape::order(zShapeBuffer); - - if (xOrder == resultOrder && xOrder == 'c') { - if (eleStride == 1 && resultEleStride == 1) { - if (length < ELEMENT_THRESHOLD) { - int maxIdx = 0; - auto currMax = dx[0]; - - for (int i = 0; i < length; i++) { - if (currMax < dx[i]) { - currMax = dx[i]; - maxIdx = i; - } - - result[i] = static_cast(0); - - } - - result[maxIdx] = static_cast(1); - - } - else { - int maxIdx = 0; - auto currMax = dx[0]; - - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i]) { - currMaxLocal = dx[i]; - maxIdxLocal = i; - } - result[i] = static_cast(0); - } - -PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - result[maxIdx] = static_cast(1); - } - - } - else { - if (length < ELEMENT_THRESHOLD) { - int maxIdx = 0; - auto currMax = dx[0]; - - for (int i = 0; i < length; i++) { - result[i * resultEleStride] = static_cast(0); - if 
(currMax < dx[i * eleStride]) { - currMax = dx[i * eleStride]; - maxIdx = i; - } - } - - result[maxIdx * resultEleStride] = static_cast(1); - - } - else { - int maxIdx = 0; - auto currMax = dx[0]; - - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - result[i * resultEleStride] = static_cast(0); - if (currMaxLocal < dx[i * eleStride]) { - currMaxLocal = dx[i * eleStride]; - maxIdxLocal = i; - } - } - -PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - result[maxIdx * resultEleStride] = static_cast(1); - } - - } - } - - - else { - Nd4jLong shapeIter[MAX_RANK]; - Nd4jLong coord[MAX_RANK]; - int dim; - Nd4jLong xStridesIter[MAX_RANK]; - Nd4jLong resultStridesIter[MAX_RANK]; - auto xShape = shape::shapeOf(xShapeBuffer); - auto xStride = shape::stride(xShapeBuffer); - auto resultStride = shape::stride(zShapeBuffer); - auto rank = shape::rank(xShapeBuffer); - auto originalResult = result; - if (PrepareTwoRawArrayIter(rank, - xShape, - dx, - xStride, - result, - resultStride, - &rank, - shapeIter, - &dx, - xStridesIter, - &result, - resultStridesIter) >= 0) { - auto value = dx[0]; - int idx = 0; - int maxIdx = 0; - ND4J_RAW_ITER_START(dim, rank, coord, shapeIter); { - if (dx[0] > value) { - value = dx[0]; - maxIdx = idx; - } - - idx++; - result[0] = static_cast(0); - - } - ND4J_RAW_ITER_TWO_NEXT( - dim, - rank, - coord, - shapeIter, - dx, - xStridesIter, - result, - resultStridesIter); - - //pointer to where max value would be - if (shape::order(zShapeBuffer) == 'c' || (shape::order(zShapeBuffer) == 'f' && - maxIdx * shape::stride(zShapeBuffer)[shape::rank(zShapeBuffer) - 1] >= - shape::length(zShapeBuffer))) - originalResult[maxIdx] = static_cast(1); - else - originalResult[maxIdx * shape::stride(zShapeBuffer)[shape::rank(zShapeBuffer) - 1]] = static_cast(1); - } - } - - - } - public: - - -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, int *allocationPointer, - void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - // FIXME: MAX_DIMENSION is lower then FP16 frame - if (extraParams == nullptr || (int) extraParams[0] == MAX_DIMENSION) { - doAllCuda(dx, xShapeBuffer, result, zShapeBuffer, extraParams, allocationPointer, reductionPointer); - } - } -#endif - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - //FIXME: this op should be moved to CustomOps - if (extraParams == nullptr || (int)extraParams[0] == 0 || - ((int)extraParams[0] == 1 && (int)extraParams[1] == MAX_DIMENSION)) { - doAll(dx, xShapeBuffer, result, zShapeBuffer, extraParams); - } - else if (shape::isVector(xShapeBuffer)) { - auto dimensionLength = (int)extraParams[0]; - auto dimension = new int[dimensionLength]; - auto length = shape::length(xShapeBuffer); - for (int i = 0; i < dimensionLength; i++) { - dimension[i] = (int)extraParams[i + 1]; - } - if (shape::shapeOf(xShapeBuffer)[dimension[0]] == 1) { - for (int i = 0; i < length; i++) { - result[i] = static_cast(1); 
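The IsMax branches in this region all implement the same one-hot rule: zero the output while scanning for the largest input value, then write a single 1 at that position. A compact strided sketch of the rule (standalone and illustrative; it corresponds to the single-threaded scan, not the parallel version with its critical-section merge):

#include <cstddef>

// One-hot "is max" over a strided buffer: zero everything while scanning,
// then mark the position of the largest element with 1.
static void isMaxStrided(const float* in, std::size_t inStride,
                         float* out, std::size_t outStride, std::size_t len) {
    if (len == 0) return;

    std::size_t maxIdx = 0;
    float currMax = in[0];

    for (std::size_t i = 0; i < len; ++i) {
        out[i * outStride] = 0.f;                // zero-fill as we go
        if (in[i * inStride] > currMax) {
            currMax = in[i * inStride];
            maxIdx = i;
        }
    }

    out[maxIdx * outStride] = 1.f;               // single 1 at the argmax
}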
- } - } - else { - auto eleStride = shape::elementWiseStride(xShapeBuffer); - if (eleStride == 1) { - int maxIdx = 0; - auto currMax = dx[0]; - if (length < ELEMENT_THRESHOLD) { - - for (int i = 0; i < length; i++) { - if (currMax < dx[i]) { - currMax = dx[i]; - maxIdx = i; - } - - result[i] = static_cast(0); - - } - } - else { -PRAGMA_OMP_PARALLEL -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i]) { - currMaxLocal = dx[i]; - maxIdxLocal = i; - } - - result[i] = static_cast(0); - - } - - PRAGMA_OMP_CRITICAL - { - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } - } -} - } - - result[maxIdx] = static_cast(1); - - } - - - else { - int maxIdx = 0; - auto currMax = dx[0]; - if (length < ELEMENT_THRESHOLD) { - - for (int i = 0; i < length; i++) { - if (currMax < dx[i * eleStride]) { - currMax = dx[i * eleStride]; - maxIdx = i; - } - - result[i] = static_cast(0); - } - } - else { - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i * eleStride]) { - currMaxLocal = dx[i * eleStride]; - maxIdxLocal = i; - } - - result[i] = static_cast(0); - } - -PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - } - - result[maxIdx] = static_cast(1); - } - } - - - } - else { - auto dimensionLength = (int) extraParams[0]; - auto dimension = new int[dimensionLength]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < dimensionLength; i++) { - dimension[i] = (int) extraParams[i + 1]; - } - //decompose in to several sub tads after - //moving all dimensions (in sorted order) - //to the back. - //permuted version of the x shape info for setting up the tad problem - auto tadShapeShapeInfo = tadShapeInfo; - if(tadShapeInfo==nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeBuffer, dimension, dimensionLength); - - tadShapeShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - tadShapeInfo = tadShapeShapeInfo; - } - - auto tadLength = shape::length(tadShapeInfo);//shape::tadLength(xShapeBuffer, dimension, dimensionLength); - auto tads = shape::length(xShapeBuffer) / tadLength; - - int tadsPerThread = tads / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - auto tadEWS = shape::elementWiseStride(tadShapeShapeInfo); - auto zEWS = tadEWS; - - int span = (tads / num_threads) + 8; - - PRAGMA_OMP_PARALLEL_THREADS(num_threads) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int end = span * (tid + 1); - if (end > tads) end = tads; - - for (int r = start; r < end; r++) { - if (tadEWS > 0 && zEWS > 0 && dimensionLength == 1) { - auto rX = dx + tadOffsets[r]; - auto rZ = result + tadOffsets[r]; - - auto maxValue = rX[0]; - int maxIdx = 0; - if (tadEWS == 1 && zEWS == 1) { - - for (int i = 0; i < tadLength; i++) { - if (rX[i] > maxValue) { - maxIdx = i; - maxValue = rX[i]; - } - } - - - for (int i = 0; i < tadLength; i++) { - rZ[i] = static_cast(maxIdx == i); - } - - } else { - - for (int i = 0; i < tadLength; i++) { - if (rX[i * tadEWS] > maxValue) { - maxIdx = i; - maxValue = rX[i * tadEWS]; - } - } - - for (int i = 0; i < tadLength; i++) { - rZ[i * zEWS] = static_cast(maxIdx == i); - } - } - } else { - int tadsPerThread = tads / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - 
num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - auto offset = tadOffsets[r]; - Nd4jLong shapeIter[MAX_RANK]; - Nd4jLong coord[MAX_RANK]; - int dim; - Nd4jLong xStridesIter[MAX_RANK]; - Nd4jLong resultStridesIter[MAX_RANK]; - auto xShape = shape::shapeOf(tadShapeShapeInfo); - auto xStride = shape::stride(tadShapeShapeInfo); - auto resultStride = shape::stride(tadShapeShapeInfo); - int rank = shape::rank(tadShapeShapeInfo); - auto xPointer = dx + offset; - auto resultPointer = result + offset; - auto maxValue = xPointer[0]; - - auto maxCursor = resultPointer; - Nd4jPointer maxCursorLong = reinterpret_cast(maxCursor); - if (PrepareTwoRawArrayIter(rank, - xShape, - xPointer, - xStride, - resultPointer, - resultStride, - &rank, - shapeIter, - &xPointer, - xStridesIter, - &resultPointer, - resultStridesIter) >= 0) { - ND4J_RAW_ITER_START(dim, rank, coord, shapeIter); { - if (maxValue < xPointer[0]) { - maxCursor = resultPointer; - maxCursorLong = reinterpret_cast(resultPointer); - maxValue = xPointer[0]; - } - - resultPointer[0] = static_cast(0); - } - ND4J_RAW_ITER_TWO_NEXT(dim, - rank, - coord, - shapeIter, - xPointer, - xStridesIter, - resultPointer, - resultStridesIter); - maxCursor = reinterpret_cast(maxCursorLong); - maxCursor[0] = static_cast(1);; - } - } - } - } - - delete[] dimension; - } - } - - op_def static Z op(X d1, X *params) { - return nd4j::math::softplus(d1); - } - }; -} diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index 1ae310ad4..a25aa36ec 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -25,6 +25,7 @@ #include #include #include +#include namespace randomOps { @@ -152,9 +153,9 @@ namespace randomOps { // TODO: we probably might want to skip this sum, and state that probabilities array should be real probabilities, i.e. 
should sum to 1.0 //T probSum = extraArguments[0]; - Nd4jLong xLength = shape::length(xShapeBuffer); - Nd4jLong yLength = shape::length(yShapeBuffer); - Nd4jLong zLength = shape::length(zShapeBuffer); + auto xLength = shape::length(xShapeBuffer); + auto yLength = shape::length(yShapeBuffer); + auto zLength = shape::length(zShapeBuffer); auto xEWS = shape::elementWiseStride(xShapeBuffer); auto yEWS = shape::elementWiseStride(yShapeBuffer); @@ -162,47 +163,53 @@ namespace randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < zLength; e++) { - T prob = rng->relativeT(e); - T cumProb = (T) 0.0f; - for (Nd4jLong f = 0; f < yLength; f++) { - T relProb = y[f * yEWS]; - cumProb += relProb; + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + T prob = rng->relativeT(e); + T cumProb = (T) 0.0f; + for (Nd4jLong f = 0; f < yLength; f++) { + T relProb = y[f * yEWS]; + cumProb += relProb; - if (prob <= cumProb || f == yLength - 1) { - z[e * zEWS] = x[f * xEWS]; - break; + if (prob <= cumProb || f == yLength - 1) { + z[e * zEWS] = x[f * xEWS]; + break; + } } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } else { - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong i = 0; i < zLength; i++) { + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong i = 0; i < zLength; i++) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); - T prob = rng->relativeT(i); - T cumProb = (T) 0.0f; + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); + T prob = rng->relativeT(i); + T cumProb = (T) 0.0f; - for (Nd4jLong f = 0; f < yLength; f++) { + for (Nd4jLong f = 0; f < yLength; f++) { - auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); - T relProb = y[yOffset2]; - cumProb += relProb; + auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); + T relProb = y[yOffset2]; + cumProb += relProb; - if (prob <= cumProb || f == yLength - 1) { + if (prob <= cumProb || f == yLength - 1) { - auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); - z[zOffset2] = x[xOffset2]; - break; + auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); + z[zOffset2] = x[xOffset2]; + break; + } } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } } }; @@ -308,7 +315,7 @@ namespace randomOps { int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); int span = (middle / _threads) + 8; @@ -322,25 +329,30 @@ namespace randomOps { const T epsilon = static_cast(1e-5); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < middle; e++) { - auto epm = e + middle; + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + auto epm = e + middle; - // we need to get random values - T r0 = rng->relativeT(e, epsilon, static_cast(1.0f)); - T r1 = rng->relativeT(epm, epsilon, static_cast(1.0f)); + // we need to get random values + T r0 = rng->relativeT(e, epsilon, static_cast(1.0f)); + T r1 = rng->relativeT(epm, epsilon, static_cast(1.0f)); - T realMean0 
= y == z ? mean : y[e * yEWS]; + T realMean0 = y == z ? mean : y[e * yEWS]; - auto z0 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; - z[e * zEWS] = z0; + auto z0 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * + nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; + z[e * zEWS] = z0; - if (epm < zLength) { - T realMean1 = y == z ? mean : y[epm * yEWS]; - auto z1 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; - z[epm * zEWS] = z1; + if (epm < zLength) { + T realMean1 = y == z ? mean : y[epm * yEWS]; + auto z1 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * + nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; + z[epm * zEWS] = z1; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, middle, 1, _threads); } }; @@ -422,21 +434,13 @@ namespace randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - auto span = (zLength / _threads) + 8; + T prob = extraArguments[1]; nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - auto start = span * tid; - auto end = span * (tid + 1); - if (end > zLength) end = zLength; - - T prob = extraArguments[1]; - - for (Nd4jLong e = start; e < end; e++) { + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong e = start; e < stop; e += increment) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -453,7 +457,9 @@ namespace randomOps { // if trials is set to 0, effectively we just have successful memset z[e * zEWS] = static_cast(success); } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -536,22 +542,14 @@ namespace randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - auto span = (zLength / _threads) + 8; + T prob = extraArguments[1]; //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - Nd4jLong start = span * tid; - Nd4jLong end = span * (tid + 1); - if (end > zLength) end = zLength; - - T prob = extraArguments[1]; - - for (Nd4jLong e = start; e < end; e++) { + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -568,7 +566,9 @@ namespace randomOps { // if trials is set to 0, effectively we just have successful memset z[e * zEWS] = static_cast(success); } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -685,19 +685,22 @@ namespace randomOps { Nd4jLong middle = zLength / 2 + (zLength % 2); int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); const T epsilon = 
static_cast(1e-5); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < zLength; ++e) { - if (z[e] > mean + ds || z[e] < mean - ds) { - z[e] = step(rng, mean, stddev, e, middle, z[e]); + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + if (z[e] > mean + ds || z[e] < mean - ds) { + z[e] = step(rng, mean, stddev, e, middle, z[e]); - if (z[e] > mean + ds || z[e] < mean - ds) - z[e] = mean + nd4j::DataTypeUtils::min(); + if (z[e] > mean + ds || z[e] < mean - ds) + z[e] = mean + nd4j::DataTypeUtils::min(); + } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -799,7 +802,7 @@ namespace randomOps { int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); int span = (zLength / _threads) + 8; @@ -813,16 +816,9 @@ namespace randomOps { const T stddev = extraArguments[1]; const T epsilon = static_cast(1e-5); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - Nd4jLong start = span * tid; - Nd4jLong end = span * (tid + 1); - if (end > middle) - end = middle; - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong e = start; e < end; e++) { + for (uint64_t e = start; e < stop; e += increment) { auto epm = e + middle; // we need to get random values @@ -838,7 +834,9 @@ namespace randomOps { z[epm * zEWS] = nd4j::math::nd4j_exp((nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean); } } - } + }; + + samediff::Threads::parallel_for(func, 0, middle, 1, _threads); } }; diff --git a/libnd4j/include/ops/specials.h b/libnd4j/include/ops/specials.h index 6919aa38d..d8030db0b 100644 --- a/libnd4j/include/ops/specials.h +++ b/libnd4j/include/ops/specials.h @@ -18,8 +18,8 @@ // Created by raver119 on 24.04.17. 
// -#ifndef LIBND4J_CONCAT_H -#define LIBND4J_CONCAT_H +#ifndef LIBND4J_SPECIALS_H +#define LIBND4J_SPECIALS_H #ifdef __CUDACC__ @@ -28,6 +28,7 @@ #endif #include +#include namespace nd4j { class NDArray; @@ -81,4 +82,4 @@ namespace nd4j { } -#endif //LIBND4J_CONCAT_H +#endif //LIBND4J_SPECIALS_H diff --git a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp index d35346e2b..22bb87103 100644 --- a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp +++ b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp @@ -21,8 +21,9 @@ #include #include #include +#include -#ifdef _RELEASE +#ifdef RELEASE_BUILD int wIterations = 4; int rIterations = 20; int gemmRegularUpperPow = 11; diff --git a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp index caad37867..9e179db7f 100644 --- a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp +++ b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp @@ -21,14 +21,14 @@ #include #include "performance/benchmarking/LightBenchmarkSuit.h" -#ifdef _RELEASE -#define WARMUP 3 -#define NUM_ITER 10 +#ifdef RELEASE_BUILD +#define WARMUP 5 +#define NUM_ITER 100 #else -#define WARMUP 0 -#define NUM_ITER 1 +#define WARMUP 5 +#define NUM_ITER 100 #endif @@ -592,7 +592,7 @@ namespace nd4j { } std::string LightBenchmarkSuit::runSuit() { -#ifdef _RELEASE +#ifdef RELEASE_BUILD std::vector dtypes({nd4j::DataType::FLOAT32, nd4j::DataType::HALF}); #else std::vector dtypes({nd4j::DataType::FLOAT32}); @@ -609,7 +609,7 @@ namespace nd4j { nd4j_printf("Running LightBenchmarkSuite.pairwiseBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += pairwiseBenchmark, (), LIBND4J_TYPES); - +/* nd4j_printf("Running LightBenchmarkSuite.reduceFullBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += reduceFullBenchmark, (), LIBND4J_TYPES); @@ -627,12 +627,13 @@ namespace nd4j { nd4j_printf("Running LightBenchmarkSuite.lstmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += lstmBenchmark, (), LIBND4J_TYPES); + */ } nd4j_printf("Running LightBenchmarkSuite.broadcast2d\n", ""); - result += broadcast2d(); + //result += broadcast2d(); nd4j_printf("Running LightBenchmarkSuite.mismatchedOrderAssign\n", ""); - result += mismatchedOrderAssign(); + //result += mismatchedOrderAssign(); return result; } diff --git a/libnd4j/include/pointercast.h b/libnd4j/include/pointercast.h index c6161782a..e080b33b6 100644 --- a/libnd4j/include/pointercast.h +++ b/libnd4j/include/pointercast.h @@ -21,6 +21,7 @@ #ifndef NATIVEOPERATIONS_POINTERCAST_H #define NATIVEOPERATIONS_POINTERCAST_H +#include #include typedef void* Nd4jPointer; diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h index 96f97f762..23f6b342d 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/templatemath.h @@ -44,7 +44,6 @@ #define M_PI 3.14159265358979323846 #endif - namespace nd4j { #ifdef __CUDACC__ @@ -1651,4 +1650,46 @@ inline __device__ bfloat16 nd4j_atomicDiv(bfloat16* address, bfloat16 } +#ifdef _OPENMP + +#ifndef MAX_FLOAT +#define MAX_FLOAT 1e37 +#endif + +#pragma omp declare reduction(maxTF : float,double,float16,bfloat16 : \ + omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ + initializer (omp_priv=-MAX_FLOAT) + +#pragma omp declare reduction(minTF : 
float,double,float16,bfloat16 : \ + omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ + initializer (omp_priv=MAX_FLOAT) + +#pragma omp declare reduction(maxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ + initializer (omp_priv=0) + +#pragma omp declare reduction(minT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ + initializer (omp_priv=0) + +#pragma omp declare reduction(amaxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_max(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) + +#pragma omp declare reduction(aminT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_min(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) + +#pragma omp declare reduction(asumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_abs(omp_in) + nd4j::math::nd4j_abs(omp_out))\ + initializer (omp_priv=0) + +#pragma omp declare reduction(sumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = omp_in + omp_out)\ + initializer (omp_priv=0) + +#pragma omp declare reduction(prodT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = omp_in * omp_out)\ + initializer (omp_priv=1) + +#endif + #endif /* TEMPLATEMATH_H_ */ diff --git a/libnd4j/pom.xml b/libnd4j/pom.xml index f33f8577f..3e766b944 100644 --- a/libnd4j/pom.xml +++ b/libnd4j/pom.xml @@ -185,6 +185,8 @@ bash run_tests.sh + --chip + ${libnd4j.chip} diff --git a/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp index 2cbc8513e..20469ed2d 100644 --- a/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp @@ -141,7 +141,7 @@ TEST_F(BooleanOpsTests, test_where_1) { auto z = result->at(0); - z->printIndexedBuffer("z"); + //z->printIndexedBuffer("z"); ASSERT_EQ(e, *z); diff --git a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp index c6b834a33..33a8fa10a 100644 --- a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp @@ -41,6 +41,8 @@ TEST_F(BroadcastableOpsTests, Test_Add_1) { y.linspace(1); exp.linspace(1); + //exp.printIndexedBuffer("E B"); + exp.applyBroadcast(broadcast::Add, {1}, &y); nd4j::ops::add op; @@ -50,8 +52,8 @@ TEST_F(BroadcastableOpsTests, Test_Add_1) { auto z = result->at(0); - // exp.printIndexedBuffer("E"); - // z->printIndexedBuffer("Z"); + //exp.printIndexedBuffer("E A"); + //z->printIndexedBuffer("Z"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -717,7 +719,7 @@ TEST_F(BroadcastableOpsTests, broadcast_bool_empty_2) { auto z = result->at(0); - z->printShapeInfo("z"); + // z->printShapeInfo("z"); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(e.isSameShape(z)); diff --git a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp index 0fa4d687d..9a8f09b87 100644 --- a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp 
+++ b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp @@ -54,7 +54,7 @@ TEST_F(BroadcastMultiDimTest,MultimDimTest) { tad->tadOnlyShapeInfo, //tadShapeInfo tad->tadOffsets, //tadOffset tad->tadOnlyShapeInfo, //tadShapeInfoZ - tad->tadOffsets); //tadOffsetZ + tad->tadOffsets, 0, tad->numTads); //tadOffsetZ for(int i = 0; i < 30; i++) { ASSERT_EQ(dataAssertion[i],result[i]); } diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index 6f964d0ac..8a58fe3a5 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -34,7 +34,7 @@ if (CUDA_BLAS) endif() if ("${COMPUTE}" STREQUAL "all") - list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_35,code=sm_35 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) + list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) else() list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE}) endif() @@ -43,18 +43,19 @@ endif() # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2 -D__APPLE_OS__=true") -elseif(WIN32) - if (CPU_BLAS) - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2") + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2 -D__APPLE_OS__=true") +elseif(WIN32) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=native -mtune=native -O3") + if (CPU_BLAS AND LINUX) + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2") endif() else() - - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native") endif() if (CPU_BLAS) @@ -130,6 +131,10 @@ foreach (TMP_PATH ${TEST_SOURCES}) endforeach(TMP_PATH) if (CPU_BLAS) + if (NOT BLAS_LIBRARIES) + set(BLAS_LIBRARIES "") + endif() + add_executable(runtests ${TEST_SOURCES}) target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main) elseif(CUDA_BLAS) diff --git a/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp b/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp index 2d4f9205f..60ba4733c 100644 --- a/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp @@ -160,7 +160,6 @@ TEST_F(ConditionalTests, Flat_Test_2) { auto exp = NDArrayFactory::create('c', {2, 2}, {1, 1, 1, 1}); - z->printIndexedBuffer("z"); ASSERT_TRUE(exp.equalsTo(z)); delete graph; } diff --git a/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp index 383815417..9134ef0a4 100644 --- a/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp +++ 
b/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp @@ -140,8 +140,8 @@ TEST_F(ConstantShapeHelperTests, basic_test_5) { auto arrayA = NDArrayFactory::create(1); auto arrayB = NDArrayFactory::create_('c', {128, 256}); - arrayA.printShapeInfo("A"); - arrayB->printShapeInfo("B"); + //arrayA.printShapeInfo("A"); + //arrayB->printShapeInfo("B"); ASSERT_EQ(0, arrayA.rankOf()); ASSERT_EQ(2, arrayB->rankOf()); ASSERT_NE(arrayA.dataType(), arrayB->dataType()); diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index 853f82cda..353e51ad3 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -614,182 +614,6 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_conv2d_1) { delete result2D; } - - -TEST_F(ConvolutionTests1, Test_im2col_col2im_1) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - x.syncToDevice(); - //ASSERT_TRUE(x.isActualOnDeviceSide()); - ASSERT_TRUE(x.isActualOnHostSide()); - //x.printBuffer("x", 64); - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, kY, kX, oY, oX}); - - ExtraArguments args({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args); - - nd4j::ops::im2col op; - auto result2col = op.execute({&x}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}); - - auto im2col1 = result2col->at(0); - - //im2col0.printBuffer("transformed"); - //im2col1->printBuffer("customized", 64); - - ASSERT_TRUE(im2col1->isSameShape(&im2col0)); - ASSERT_TRUE(im2col1->equalsTo(&im2col0)); - - - ExtraArguments args2({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 
1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2col; - delete result2im; -} - - -TEST_F(ConvolutionTests1, Test_im2col_col2im_2) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col0.permutei({0, 1, 4, 5, 2, 3}); - - ExtraArguments args2col({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args2col); - - nd4j::ops::im2col op; - auto result2col = op.execute({&x}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}); - - auto im2col1 = result2col->at(0); - - ASSERT_TRUE(im2col1->isSameShape(&im2col0)); - ASSERT_TRUE(im2col1->equalsTo(&im2col0)); - - - ExtraArguments args2im({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2im); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2col; - delete result2im; -} - -TEST_F(ConvolutionTests1, Test_im2col_col2im_3) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col0.permutei({0, 1, 4, 5, 2, 3}); - - auto im2col1 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col1.permutei({0, 1, 4, 5, 2, 3}); - - ExtraArguments args2col({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args2col); - - nd4j::ops::im2col op; - auto status = op.execute({&x}, {&im2col1}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}, {}); - ASSERT_EQ(Status::OK(), status); - - ASSERT_TRUE(im2col1.isSameShape(&im2col0)); - ASSERT_TRUE(im2col1.equalsTo(&im2col0)); - - - ExtraArguments args2im({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? 
(double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2im); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({&im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2im; -} - - TEST_F(ConvolutionTests1, TestDeconv_bp_1) { int bS=3, iH=4,iW=4, iC=3,oC=2, kH=1,kW=1, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1; @@ -1212,8 +1036,8 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_bp_test1) { nd4j::ops::conv3dnew_bp op; auto results = op.execute({&input, &weights, &bias, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); - auto* gradI = results->at(0); - auto* gradW = results->at(1); + auto gradI = results->at(0); + auto gradW = results->at(1); ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expGradI.isSameShape(gradI)); diff --git a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp index c018e58d0..45b35eb4e 100644 --- a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp @@ -110,7 +110,7 @@ TEST_F(DataTypesValidationTests, test_bfloat16_rand_1) { RandomGenerator gen(119, 120); RandomLauncher::fillUniform(LaunchContext::defaultContext(), gen, &x, 1, 6); - ASSERT_TRUE(x.sumNumber().e(0) > 0); + ASSERT_TRUE(x.sumNumber().e(0) != 0.f); } TEST_F(DataTypesValidationTests, test_bfloat16_rand_2) { @@ -118,7 +118,7 @@ TEST_F(DataTypesValidationTests, test_bfloat16_rand_2) { RandomGenerator gen(119, 120); RandomLauncher::fillGaussian(LaunchContext::defaultContext(), gen, &x, 0, 1); - ASSERT_TRUE(x.sumNumber().e(0) > 0); + ASSERT_TRUE(x.sumNumber().e(0) != 0.f); } TEST_F(DataTypesValidationTests, cast_1) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index 458858c57..8dd2e7a40 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -164,9 +164,7 @@ TEST_F(DeclarableOpsTests1, ApplyGradientDescent_1) { auto result = op.execute({&x, &y}, {1.}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -180,9 +178,7 @@ TEST_F(DeclarableOpsTests1, AssignBroadcastTest_1) { auto result = op.execute({&x, &y}, {}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -199,11 +195,6 @@ TEST_F(DeclarableOpsTests1, AssignBroadcastTest_2) { ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z1 = result->at(0); auto z2 = result->at(1); -// z1->printIndexedBuffer("OUTPUT"); -// z2->printIndexedBuffer("OUTPUT"); -// -// exp1.printIndexedBuffer("EXPECT"); -// exp2.printIndexedBuffer("EXPECT"); ASSERT_TRUE(z1->equalsTo(exp1)); ASSERT_TRUE(z2->equalsTo(exp2)); @@ -220,9 +211,7 @@ TEST_F(DeclarableOpsTests1, AXpY_Test_1) { auto result = op.execute({&x, 
&y}, {2.}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -265,14 +254,6 @@ TEST_F(DeclarableOpsTests1, TestTensorMmul1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // exp.printShapeInfo(); - // out->printShapeInfo(); - // exp.printBuffer(); - // out->printBuffer(); - - // PointersManager manager(x.getContext(), "scatter"); - // manager.printDevContentOnHost(out->getSpecialBuffer(), out->lengthOf()); - // manager.printDevContentOnHost(exp.getSpecialBuffer(), exp.lengthOf()); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -293,8 +274,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -315,8 +294,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -337,8 +314,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot4) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -631,8 +606,6 @@ TEST_F(DeclarableOpsTests1, ClipByValue1) { clip.execute(block); - // x->printIndexedBuffer("Result"); - // exp.printIndexedBuffer("Expect"); ASSERT_TRUE(x->equalsTo(&exp)); @@ -775,7 +748,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractMatrices1) { nd4j::ops::reversesubtract subOp; subOp.execute(block); - // x->printIndexedBuffer("Output Subtract"); + ASSERT_TRUE(x->equalsTo(&exp)); delete variableSpace; @@ -814,7 +787,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_2) { y.assign(1.f); exp.assign(-2.f); x.applyTrueBroadcast(BROADCAST(ReverseSubtract), &y, &z, true); -// x.printIndexedBuffer("ReverseSubtract Legacy"); + ASSERT_TRUE(exp.equalsTo(&z)); nd4j::ops::reversesubtract subOp; @@ -822,7 +795,6 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_2) { auto res = subOp.execute({&x, &y}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); - //res->at(0)->printIndexedBuffer("OUtput REVERSED SUB"); ASSERT_TRUE(res->at(0)->equalsTo(&exp)); delete res; @@ -862,8 +834,8 @@ TEST_F(DeclarableOpsTests1, ReverseModTest_1) { y.assign(9.f); exp.assign(1.f); y.applyTrueBroadcast(BROADCAST(Mod), &x, &z, true); - // z.printIndexedBuffer("MOD1"); ASSERT_TRUE(exp.equalsTo(&z)); + x.applyTrueBroadcast(BROADCAST(ReverseMod), &y, &exp, true); ASSERT_TRUE(exp.equalsTo(&z)); @@ -899,7 +871,6 @@ TEST_F(DeclarableOpsTests1, ReverseModTest_2) { auto res = subOp.execute({&x, &y}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); -// res->at(0)->printIndexedBuffer("OUtput REVERSED MOD2"); ASSERT_TRUE(res->at(0)->equalsTo(&exp)); delete res; @@ -1355,7 +1326,6 @@ TEST_F(DeclarableOpsTests1, DivideScalarScalar1) { div.execute(block); - //x->printBuffer("x"); ASSERT_TRUE(x->equalsTo(&exp)); delete variableSpace; @@ -1503,10 +1473,6 @@ TEST_F(DeclarableOpsTests1, Test_Cast_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - // z->printIndexedBuffer("OUtput"); - // 
yExp.printIndexedBuffer("Expect"); - // z->printShapeInfo("OUt shape"); - // yExp.printShapeInfo("Exp shape"); ASSERT_TRUE(yExp.equalsTo(z)); delete result; @@ -1515,8 +1481,6 @@ TEST_F(DeclarableOpsTests1, Test_Cast_1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestRegistrator1) { auto res = nd4j::ops::OpRegistrator::getInstance()->getAllCustomOperations(); - - // nd4j_printf("Ops: %s\n", res) } // ////////////////////////////////////////////////////////////////////// @@ -1555,7 +1519,6 @@ TEST_F(DeclarableOpsTests1, TestRegistrator1) { // //auto status = execCustomOp(nullptr, hash, inputBuffers, inputShapes, 2, outputBuffers, outputShapes, 1, nullptr, 0, nullptr, 0, false); // auto status = execCustomOp(nullptr, hash, inputBuffers, inputShapes, 2, outputBuffers, outputShapes, 1, nullptr, 0, nullptr, 0, nullptr, 0, false); // ASSERT_EQ(ND4J_STATUS_OK, status); -// // z->printIndexedBuffer("Output add"); // ASSERT_NEAR(2.0f, y->meanNumber().e(0), 1e-5); // ASSERT_NEAR(1.0f, x->meanNumber().e(0), 1e-5); // ASSERT_NEAR(3.0f, z->meanNumber().e(0), 1e-5); @@ -1636,8 +1599,6 @@ TEST_F(DeclarableOpsTests1, TestGemv1) { nd4j::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); - //z->printBuffer(); - ASSERT_TRUE(z->equalsTo(exp)); delete []xBuffer; delete []xShape; delete x; delete []yBuffer; delete []yShape; delete y; delete z; delete []expBuffer; delete exp; @@ -2020,8 +1981,6 @@ TEST_F(DeclarableOpsTests1, TestCustomShape1) { auto inshapes = new ShapeList(input->getShapeInfo()); auto shapes = test.calculateOutputShape(inshapes, *block); - //input.printShapeInfo("input"); - //shape::printShapeInfoLinear(shape); ASSERT_EQ(input->getShapeInfo()[0] , shapes->at(0)[0]); ASSERT_EQ(input->getShapeInfo()[1] * 2, shapes->at(0)[1]); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 3fd9d26c6..f0ae83168 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -130,7 +130,7 @@ TEST_F(DeclarableOpsTests10, Test_Not_1) { auto result = op.execute({&x, &y}, {}, {}, {}, false, nd4j::DataType::BOOL); ASSERT_EQ(Status::OK(), result->status()); auto res = result->at(0); - res->printBuffer("OUtput NOT"); + ASSERT_TRUE(e.equalsTo(res)); delete result; @@ -163,7 +163,7 @@ TEST_F(DeclarableOpsTests10, MirrorPad_SGO_Test_1) { auto res = op.execute({&in, &pad}, {10.0}, {0}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(res->status(), ND4J_STATUS_OK); - res->at(0)->printIndexedBuffer("Mirror pad:"); + ASSERT_TRUE(exp.equalsTo(res->at(0))); delete res; } @@ -180,9 +180,6 @@ TEST_F(DeclarableOpsTests10, Unique_SGO_Test_1) { auto res1 = res->at(0); auto res2 = res->at(1); - res1->printIndexedBuffer("Unique values"); - res2->printIndexedBuffer("Unique idxs"); - ASSERT_TRUE(exp.equalsTo(res1)); ASSERT_TRUE(expIdx.equalsTo(res2)); delete res; @@ -215,8 +212,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_02) { auto res = op.execute({&input}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); - resA->printIndexedBuffer("Where02"); - resA->printBuffer("Where02lINEAR"); + ASSERT_TRUE(exp.equalsTo(resA)); ASSERT_TRUE(exp.isSameShape(resA)); // ASSERT_TRUE(expIdx.equalsTo(res->at(1))); @@ -329,8 +325,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_5) { ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = 
res->at(0); //ASSERT_TRUE(resA->isEmpty()); - resA->printIndexedBuffer("Result A"); - //resA->printShapeInfo("ShapeA"); + ASSERT_TRUE(exp.equalsTo(resA)); ASSERT_TRUE(exp.isSameShape(resA)); // ASSERT_TRUE(expIdx.equalsTo(res->at(1))); @@ -658,8 +653,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test1) { auto z = result->at(0); auto zI = result->at(1); - z->printIndexedBuffer("TopK(5)"); - zI->printIndexedBuffer("TopKI(5)"); + ASSERT_TRUE(expUnsorted.isSameShape(z)); ASSERT_TRUE(expUnsorted.equalsTo(z)); @@ -669,8 +663,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test1) { z = result2->at(0); zI = result2->at(1); - z->printIndexedBuffer("sorted TopK(5)"); - zI->printIndexedBuffer("sorted TopKI(5)"); + ASSERT_TRUE(expSorted.isSameShape(z)); ASSERT_TRUE(expSorted.equalsTo(z)); @@ -693,8 +686,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test2) { auto z = result->at(0); auto zI = result->at(1); - z->printIndexedBuffer("TopK(5)"); - zI->printIndexedBuffer("TopKI(5)"); + ASSERT_TRUE(expUnsorted.isSameShape(z)); ASSERT_TRUE(expUnsorted.equalsTo(z)); @@ -704,8 +696,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test2) { z = result2->at(0); zI = result2->at(1); - z->printIndexedBuffer("sorted TopK(5)"); - zI->printIndexedBuffer("sorted TopKI(5)"); + ASSERT_TRUE(expSorted.isSameShape(z)); ASSERT_TRUE(expSorted.equalsTo(z)); @@ -1022,8 +1013,6 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("Output 2"); - exp.printIndexedBuffer("Expect 2"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1046,8 +1035,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("Output 3"); - exp.printIndexedBuffer("Expect 3"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1179,7 +1167,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("NTH rank3_n2"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1206,7 +1194,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("NTH rank3_n2_reverse"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1812,7 +1800,7 @@ TEST_F(DeclarableOpsTests10, LinSpace_Test1) { auto result = op.execute({&start, &finish, &num}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto res = result->at(0); - res->printIndexedBuffer("from 1 to 24"); + ASSERT_TRUE(expect.equalsTo(res)); delete result; } @@ -2084,7 +2072,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - result->printIndexedBuffer("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); ASSERT_TRUE(expected.equalsTo(result)); @@ -2108,7 +2096,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - result->printIndexedBuffer("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); ASSERT_TRUE(expected.equalsTo(result)); @@ -2156,7 +2144,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_5) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = 
results->at(0); - result->printShapeInfo("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); //ASSERT_TRUE(expected.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp index 988e5d583..d077f886d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp @@ -916,7 +916,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test1) { auto result = op.execute({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.equalsTo(result->at(0))); - result->at(0)->printBuffer("Output"); delete result; } @@ -928,7 +927,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test2) { nd4j::ops::squaredsubtract op; auto result = op.execute({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - result->at(0)->printBuffer("Output"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; } @@ -941,7 +939,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test3) { nd4j::ops::squaredsubtract_bp op; auto result = op.execute({&x, &y, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - result->at(0)->printBuffer("Output"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; } @@ -1372,7 +1369,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1394,7 +1390,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1416,7 +1411,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1869,7 +1863,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_4) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1891,7 +1884,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_5) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 subtract:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1913,7 +1905,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 subtract2:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -2189,7 +2180,6 @@ TEST_F(DeclarableOpsTests11, SafeDivideMixed_Test1) { NDArray numOfNonZero(sumDiff.getShapeInfo(), nd4j::DataType::INT64, false); numOfNonZero.assign(1); sumDiff.applyPairwiseTransform(pairwise::SafeDivide, &numOfNonZero, &sumDiff, nullptr); - sumDiff.printIndexedBuffer("Output as Is"); } ///////////////////////////////////////////////////////////////// @@ -2393,7 +2383,6 @@ TEST_F(DeclarableOpsTests11, Multiply_BP_Test1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdo = results->at(0); - dLdo->printBuffer("Output for multiply_bp op"); ASSERT_TRUE(dLdpExp.isSameShape(dLdo)); ASSERT_TRUE(dLdpExp.equalsTo(dLdo)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp index 3f868c45c..59da5edb4 100644 --- 
a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp @@ -402,8 +402,6 @@ TEST_F(DeclarableOpsTests12, TestDivideBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("DivideBP X out"); - output2.printIndexedBuffer("DivideBP Y out"); //ASSERT_TRUE(output.e(0) == 47.); } @@ -427,8 +425,6 @@ TEST_F(DeclarableOpsTests12, TestDivideBP_2) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("2DivideBP X out"); - output2.printIndexedBuffer("2DivideBP Y out"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -450,8 +446,6 @@ TEST_F(DeclarableOpsTests12, TestReverseDivideBP_1) { Nd4jStatus status = op.execute({&y, &x, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("RDivideBP X out"); - output2.printIndexedBuffer("RDivideBP Y out"); //ASSERT_TRUE(output.e(0) == 47.); } @@ -476,8 +470,6 @@ TEST_F(DeclarableOpsTests12, TestReverseDivideBP_2) { Nd4jStatus status = op.execute({&y, &x, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("2RDivideBP X out"); - output2.printIndexedBuffer("2RDivideBP Y out"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -501,7 +493,6 @@ TEST_F(DeclarableOpsTests12, TestSliceBP_1) { Nd4jStatus status = op.execute({&x, &eps}, {&output}, {}, {1,1,2,2}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output.printIndexedBuffer("SLICE_BP out"); ASSERT_TRUE(output.equalsTo(exp)); //ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -526,7 +517,6 @@ TEST_F(DeclarableOpsTests12, TestConfusionZero_1) { Nd4jStatus status = op.execute({&x, &i}, {&output}, {}, {4}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output.printIndexedBuffer("Confusion out"); ASSERT_TRUE(output.equalsTo(exp)); //ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -545,8 +535,6 @@ TEST_F(DeclarableOpsTests12, TestMaximumBP_1) { output1.assign(119); x.linspace(1.); y.linspace(12., -1.); - x.printBuffer("X"); - y.printBuffer("Y"); eps.linspace(1.); //exp1.assign(1.); //exp2.assign(-2.); @@ -554,8 +542,6 @@ TEST_F(DeclarableOpsTests12, TestMaximumBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("X max"); - output2.printIndexedBuffer("Y max"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -574,8 +560,6 @@ TEST_F(DeclarableOpsTests12, TestMinimumBP_1) { output1.assign(119); x.linspace(1.); y.linspace(12., -1.); - x.printBuffer("X"); - y.printBuffer("Y"); eps.linspace(1.); //exp1.assign(1.); //exp2.assign(-2.); @@ -583,8 +567,6 @@ TEST_F(DeclarableOpsTests12, TestMinimumBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output2.printIndexedBuffer("X min"); - output1.printIndexedBuffer("Y min"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp index 9d460f152..71ee8a04e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp @@ -533,7 +533,6 @@ TEST_F(DeclarableOpsTests13, 
adjustSaturation_1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - // result->printIndexedBuffer(); ASSERT_TRUE(exp.isSameShape(result)); ASSERT_TRUE(exp.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 2d8311828..574da8993 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -58,12 +58,7 @@ TEST_F(DeclarableOpsTests14, Test_Reshape_CF_1) { auto x = NDArrayFactory::create('f', {2, 3}, {1.0, 4.0, 2.0, 5.0, 3.0, 6.0}); auto e = NDArrayFactory::create('f', {3, 2}, {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); - x.printShapeInfo("x shape"); - x.printBuffer("x buffr"); - x.printIndexedBuffer("x indxd"); - - auto r = x.reshape('c', {3, 2}); - r.printIndexedBuffer("r pre-s"); + auto r = x.reshape('c', {3, 2});; r.streamline('f'); nd4j::ops::reshape op; @@ -92,7 +87,7 @@ TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_2) { TEST_F(DeclarableOpsTests14, Multiply_test) { for(int k=2;k<10;k++){ - nd4j_printf("k=%d\n", k); + //nd4j_printf("k=%d\n", k); NDArray x = NDArrayFactory::create('c', {k, 1}); NDArray y = NDArrayFactory::create('c', {k}); NDArray e = NDArrayFactory::create('c', {k, k}); @@ -122,7 +117,6 @@ TEST_F(DeclarableOpsTests14, Test_EvalReductionShape_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Reduced shape"); ASSERT_EQ(e, *z); delete result; @@ -416,8 +410,6 @@ TEST_F(DeclarableOpsTests14, test_empty_argmax_1) { auto z = result->at(0); - z->printShapeInfo("Z"); - ASSERT_EQ(e, *z); delete result; diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index 6eabc964a..97e7d2d91 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -250,7 +250,6 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_2) { auto result = op.execute({&x}, {}, {nd4j::DataType::HALF}, {}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); - out->printIndexedBuffer("Casted result"); ASSERT_TRUE(e.equalsTo(out)); delete result; } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp index d95e86b1c..1a459a012 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp @@ -149,5 +149,16 @@ TEST_F(DeclarableOpsTests16, test_knn_mindistance_1) { nd4j::ops::knn_mindistance op; auto result = op.execute({&input, &low, &high}, {&output}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); +} +TEST_F(DeclarableOpsTests16, test_empty_cast_1) { + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto e = NDArrayFactory::create('c', {1, 0, 2}); + + nd4j::ops::cast op; + auto result = op.execute({&x}, {}, {10}); + ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(e, *result->at(0)); + + delete result; } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp index 62172dbf2..4941e7459 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp @@ -3589,8 +3589,6 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *result = results->at(0); - 
result->printIndexedBuffer("SCEL Output"); - expected.printIndexedBuffer("SCEL Expect"); ASSERT_TRUE(expected.isSameShape(result)); ASSERT_TRUE(expected.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp index 2f56eaf2a..478a31d4a 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp @@ -479,7 +479,6 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_4) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer(); ASSERT_TRUE(exp.equalsTo(z)); @@ -1045,7 +1044,6 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_3) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Emply shape expected"); ASSERT_TRUE(z->isEmpty()); delete result; @@ -1065,9 +1063,6 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_4) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printBuffer("Strided Slice"); - z->printShapeInfo("Vector size 1 shape expected"); - exp.printShapeInfo("Expected shape"); ASSERT_TRUE(z->lengthOf() == 1); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -1482,9 +1477,6 @@ TEST_F(DeclarableOpsTests4, WeightedCrossEntropyWithLogits_2) { auto results = op.execute({&targets, &input, &weights}, {}, {}, {}, false, nd4j::DataType::DOUBLE); auto output = results->at(0); - output->printIndexedBuffer("Result is "); - expected.printIndexedBuffer("Expected is "); - ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp index 86acca29c..2e8d96f3c 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp @@ -304,7 +304,6 @@ TEST_F(DeclarableOpsTests5, hardsigmoid_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Hadrdsigmoid 2x2"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -321,7 +320,6 @@ TEST_F(DeclarableOpsTests5, hardsigmoid_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Hadrdsigmoid 2x2"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -384,7 +382,6 @@ TEST_F(DeclarableOpsTests5, histogram_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Histogram4"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -400,7 +397,6 @@ TEST_F(DeclarableOpsTests5, Identity_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Histogram3"); ASSERT_TRUE(matrix.equalsTo(z)); delete result; @@ -416,7 +412,6 @@ TEST_F(DeclarableOpsTests5, Identity_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Identity_BP"); ASSERT_TRUE(z->equalsTo(eps)); delete result; @@ -433,7 +428,6 @@ TEST_F(DeclarableOpsTests5, Log1p_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Log1p"); ASSERT_TRUE(z->equalsTo(y)); delete result; @@ -450,7 +444,6 @@ TEST_F(DeclarableOpsTests5, Test_SpaceToBatch_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - // z->printIndexedBuffer(); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -846,9 +839,6 @@ 
TEST_F(DeclarableOpsTests5, reverse_sequense_test1) {
     auto output = results->at(0);
-    exp.printIndexedBuffer("E");
-    output->printIndexedBuffer("O");
-
     ASSERT_TRUE(exp.isSameShape(output));
     ASSERT_TRUE(exp.equalsTo(output));
@@ -1314,17 +1304,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_3_unsorted) {
     auto v = result->at(0);
     auto i = result->at(1);
-//    v->printShapeInfo("shape v");
-//    expV.printShapeInfo("shape expV");
-
-//    i->printShapeInfo("shape I");
-//    expI.printShapeInfo("shape expI");
-
-    v->printIndexedBuffer("v");
-//    expV.printIndexedBuffer("expV");
-    i->printIndexedBuffer("i");
-//    expI.printIndexedBuffer("expI");
-
     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -1349,17 +1328,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_4) {
     auto v = result->at(0);
     auto i = result->at(1);
-//    v->printShapeInfo("shape v");
-//    expV.printShapeInfo("shape expV");
-
-//    i->printShapeInfo("shape I");
-//    expI.printShapeInfo("shape expI");
-
-//    v->printIndexedBuffer("v");
-//    expV.printIndexedBuffer("expV");
-//    i->printIndexedBuffer("i");
-//    expI.printIndexedBuffer("expI");
-
     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -1377,11 +1345,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_5) {
     nd4j::ops::top_k op;
     auto result = op.execute({&x}, {}, {2, 1});
-    for (Nd4jLong r = 0; r < 2; r++) {
-        for (Nd4jLong c = 0; c < 3; c++)
-            nd4j_printf("%f, ", x.e(r,c));
-        nd4j_printf("\n", "");
-    }
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     ASSERT_EQ(2, result->size());
@@ -1389,18 +1352,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_5) {
     auto v = result->at(0);
     auto i = result->at(1);
-//    x.printShapeInfo("shape of the source X");
-//    v->printShapeInfo("shape v");
-//    expV.printShapeInfo("shape expV");
-
-//    i->printShapeInfo("shape I");
-//    expI.printShapeInfo("shape expI");
-
-    v->printIndexedBuffer("v");
-    expV.printIndexedBuffer("expV");
-    i->printIndexedBuffer("i");
-    expI.printIndexedBuffer("expI");
-
     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -2025,10 +1976,6 @@ TEST_F(DeclarableOpsTests5, DynamicPartition_2) {
     for (int e = 0; e < result->size(); e++) {
         auto output = result->at(e);
-        nd4j_printf("%i: ", e);
-        output->printShapeInfo("Output shape> ");
-        exp[e].printShapeInfo("Expected shape> ");
-        output->printIndexedBuffer("Output data> ");
         ASSERT_TRUE(exp[e].isSameShape(output));
         ASSERT_TRUE(exp[e].equalsTo(output));
@@ -2126,10 +2073,6 @@ TEST_F(DeclarableOpsTests5, DynamicStitch_1) {
     auto output = result->at(0);
-    // output->printShapeInfo("Output shape> ");
-    // exp.printShapeInfo("Expected shape> ");
-    output->printIndexedBuffer("O data");
-    exp.printIndexedBuffer("E data");
     ASSERT_TRUE(exp.isSameShape(output));
     ASSERT_TRUE(exp.equalsTo(output));
@@ -2334,8 +2277,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test1) {
     ASSERT_EQ(Status::OK(), results->status());
     auto output = results->at(0);
-    output->printIndexedBuffer("CM output");
-    expected.printIndexedBuffer("CM expected");
     ASSERT_TRUE(expected.isSameShape(output));
     ASSERT_TRUE(expected.equalsTo(output));
@@ -2355,9 +2296,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test2) {
     ASSERT_EQ(Status::OK(), results->status());
     auto output = results->at(0);
-    output->printIndexedBuffer("CM2 output");
-    expected.printIndexedBuffer("CM2 expected");
-
     ASSERT_TRUE(expected.isSameShape(output));
     ASSERT_TRUE(expected.equalsTo(output));
@@ -2376,8 +2314,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test3) {
     nd4j::ops::confusion_matrix op;
     auto results = op.execute({&labels, &predictions, &weights}, {}, {3});
     auto output = results->at(0);
-    output->printIndexedBuffer("CM3");
-
     ASSERT_EQ(Status::OK(), results->status());
     ASSERT_TRUE(expected.isSameShape(output));
@@ -2397,7 +2333,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test4) {
     nd4j::ops::confusion_matrix op;
     auto results = op.execute({&labels, &predictions, &weights}, {}, {3, nd4j::DataType::DOUBLE});
     auto output = results->at(0);
-    output->printIndexedBuffer("CM4");
     ASSERT_EQ(Status::OK(), results->status());
     ASSERT_TRUE(expected.isSameShape(output));
@@ -2470,11 +2405,6 @@ TEST_F(DeclarableOpsTests5, XWPlusB_1) {
     auto output = result->at(0);
-    output->printShapeInfo("Output shape> ");
-    exp.printShapeInfo("Expected shape> ");
-    output->printIndexedBuffer("Output data> ");
-    exp.printIndexedBuffer("Expected res>");
-
     ASSERT_TRUE(exp.isSameShape(output));
     ASSERT_TRUE(exp.equalsTo(output));
@@ -2778,7 +2708,7 @@ TEST_F(DeclarableOpsTests5, L2_Loss_1) {
     ASSERT_EQ(Status::OK(), results->status());
     ASSERT_TRUE(output->isScalar());
-    output->printIndexedBuffer("L2_Loss output");
+    ASSERT_EQ(output->e(0), exp);
     delete results;
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
index 34b66c61a..79a569e0f 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
@@ -118,8 +118,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_4) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("SS OS shape");
-    z->printIndexedBuffer("SS OS out");
+
     ASSERT_TRUE(z->equalsTo(exp));
     //ASSERT_EQ(exp, *z);
@@ -127,9 +126,10 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_4) {
 }
 TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) {
+    int z = 0;
     auto matrix = NDArrayFactory::create('c', {1}, {10});
     auto b = NDArrayFactory::create_('c', {1}, {1});
-    auto e = NDArrayFactory::create_('c', {1}, {(int)0});
+    auto e = NDArrayFactory::create_('c', {1}, {z});
     auto s = NDArrayFactory::create_('c', {1}, {1});
     nd4j::ops::ones_as opOnes;
     //auto exp = NDArrayFactory::create('c', {2}, {1.0f, 2.0f});
@@ -138,7 +138,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) {
     ASSERT_EQ(onesRes->status(), Status::OK());
     auto ones = onesRes->at(0);
-    ones->printShapeInfo("Shape ones");
     *ones *= 10;
     auto onesD = ones->dup();
@@ -161,9 +160,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) {
     nd4j::ops::strided_slice op;
     auto result = op.calculateOutputShape(inputShapes, *block); //execute({ones, &b, &e, &s}, {}, {0, 1, 0, 0, 0});
     ASSERT_EQ(result->size(), 1);
-    shape::printShapeInfoLinear(result->at(0));
-    //auto z = result->at(0);
-//    z->printShapeInfo("SS OS shape");
     ASSERT_TRUE(shape::isEmpty(result->at(0)));
     //ASSERT_EQ(exp, *z);
     delete block;
@@ -189,8 +185,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_5) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -211,8 +205,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_6) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -234,8 +226,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_7) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     //ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -258,8 +248,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_1) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     //ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -282,8 +270,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_2) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     //ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -306,8 +292,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_3) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     //ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -362,8 +346,6 @@ TEST_F(DeclarableOpsTests6, Test_Order_1) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printIndexedBuffer("O Output");
-    exp.printIndexedBuffer("O Expect");
     ASSERT_TRUE(exp.equalsTo(z));
     ASSERT_NE(x.ordering(), z->ordering());
@@ -379,7 +361,6 @@ TEST_F(DeclarableOpsTests6, cumSum_1) {
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     auto z = result->at(0);
-    // z->printIndexedBuffer("CumSum1");
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -910,9 +891,7 @@ TEST_F(DeclarableOpsTests6, TestRank_1) {
     auto ress = op.execute({&x}, {}, {}, {});
     ASSERT_EQ(ND4J_STATUS_OK, ress->status());
-    ress->at(0)->printIndexedBuffer("RANK Result is ");
-    // x.printIndexedBuffer("Input is");
     ASSERT_TRUE(ress->at(0)->equalsTo(exp));
     delete ress;
 }
@@ -926,8 +905,6 @@ TEST_F(DeclarableOpsTests6, TestDropout_2) {
     auto ress = op.execute({&x}, {0.4f}, {113}, {}, false, nd4j::DataType::DOUBLE);
     ASSERT_EQ(ND4J_STATUS_OK, ress->status());
-    //x.printIndexedBuffer("Input is");
-    //ress->at(0)->printIndexedBuffer("Result is ");
     delete ress;
 }
@@ -943,8 +920,6 @@ TEST_F(DeclarableOpsTests6, TestDropout_3) {
     auto ress = op.execute({&x, &shape}, {0.4f}, {113}, {}, false, nd4j::DataType::DOUBLE);
     ASSERT_EQ(ND4J_STATUS_OK, ress->status());
-    //x.printIndexedBuffer("Input is");
-    //ress->at(0)->printIndexedBuffer("Result is ");
     delete ress;
 }
@@ -1556,8 +1531,6 @@ TEST_F(DeclarableOpsTests6, LogMatrixDeterminant_1) {
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     auto z = result->at(0);
-    z->printIndexedBuffer("Log ABS Output ");
-    exp.printIndexedBuffer("Log ABS Expected ");
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -1578,8 +1551,6 @@ TEST_F(DeclarableOpsTests6, LogDet_1) {
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     auto z = result->at(0);
-//    z->printIndexedBuffer("LogDet Output1 ");
-//    exp.printIndexedBuffer("LogDet Expected1 ");
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -1593,16 +1564,12 @@ TEST_F(DeclarableOpsTests6, LogDet_2) {
     auto x = NDArrayFactory::create('c', {1, 3, 3}, {4,12,-16,12,37,-43,-16,-43,98});
     auto exp = NDArrayFactory::create('c', {1}, { 3.5835189});
-    //x.printIndexedBuffer("Input");
     nd4j::ops::logdet op;
     auto result = op.execute({&x}, {}, {});
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     auto z = result->at(0);
-//    z->printIndexedBuffer("LogDet Output2 ");
-//    z->printShapeInfo("Shape");
-//    exp.printIndexedBuffer("LogDet Expected2 ");
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -1616,16 +1583,12 @@ TEST_F(DeclarableOpsTests6, LogDet_3) {
     auto x = NDArrayFactory::create('c', {3, 3},
{4,12,-16,12,37,-43,-16,-43,98}); auto exp = NDArrayFactory::create( 3.5835189); - //x.printIndexedBuffer("Input"); nd4j::ops::logdet op; auto result = op.execute({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("LogDet Output3 "); -// z->printShapeInfo("Shape"); -// exp.printIndexedBuffer("LogDet Expected3 "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1670,8 +1633,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1710,8 +1671,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_01) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1731,8 +1690,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_02) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index c80d75372..e9fe7264e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -66,7 +66,6 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_SCALAR_LARGE) { auto z = result->at(1); - z->printIndexedBuffer("CHOOSE test"); ASSERT_EQ(148,z->e(0)); //ASSERT_TRUE(exp.isSameShape(z)); @@ -572,8 +571,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Stitch"); - z->printShapeInfo("Stitch Shape"); + ASSERT_TRUE(z->isSameShape(exp)); ASSERT_TRUE(z->equalsTo(exp)); @@ -664,8 +662,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119_2) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Stitch"); - z->printShapeInfo("Stitch Shape"); + ASSERT_TRUE(z->isSameShape(exp)); ASSERT_TRUE(z->equalsTo(exp)); @@ -683,11 +680,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Partition_119) { ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(4, result->size()); auto z = result->at(0); -// z->printShapeInfo("Output shape info"); -// z->printIndexedBuffer("Output1"); -// result->at(1)->printIndexedBuffer("Output2"); -// result->at(2)->printIndexedBuffer("Output3"); -// result->at(3)->printIndexedBuffer("Output4"); + ASSERT_TRUE(e.isSameShape(z)); delete result; @@ -1080,7 +1073,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_1) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Segment mIN1"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1097,7 +1090,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_01) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Segment mIN01"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1113,7 +1106,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_02) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - 
out->printIndexedBuffer("Segment mIN02"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1130,8 +1123,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMinBP_1) { auto result = op.execute({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - //result->at(0)->printIndexedBuffer("Output1"); - //exp.printIndexedBuffer("Expecte"); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -1433,9 +1424,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_02) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); - exp.printIndexedBuffer("Expect Mean"); - result->at(0)->printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1451,9 +1439,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_021) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); - exp.printIndexedBuffer("Expect Mean"); - result->at(0)->printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1470,9 +1455,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_022) { auto result = op.execute({&x, &idx}, {&z}, {}, {}, {}, false, nd4j::DataType::FLOAT32); ASSERT_EQ(result, Status::OK()); - exp.printIndexedBuffer("Expect Mean"); - z.printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(z)); // delete result; @@ -1491,9 +1473,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_2) { auto result = op.execute({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 2); -// exp.printIndexedBuffer("Expect"); -// result->at(0)->printIndexedBuffer("Output"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1842,8 +1821,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentSum_1) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output Sum"); - exp.printIndexedBuffer("Expect Sum"); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -2001,8 +1978,6 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_1) { auto result = op.execute({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("UnsortedSum1"); - exp.printIndexedBuffer("Unsorted Sum1 Exp"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2019,8 +1994,6 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_2) { auto result = op.execute({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); -// exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2241,10 +2214,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_04) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2262,10 +2231,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_05) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp 
Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2279,15 +2244,10 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_06) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - x.printIndexedBuffer("INPUT INT8"); nd4j::ops::segment_prod op; auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2301,15 +2261,10 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_07) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - x.printIndexedBuffer("INPUT INT8"); nd4j::ops::segment_prod op; auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2577,12 +2532,6 @@ auto exp = NDArrayFactory::create('c', {3, 1, 2, 6}, { auto result = op.execute({&x}, {}, {2,1,3,2,2,2,0}); ASSERT_EQ(result->status(), Status::OK()); -// x.printIndexedBuffer("images"); -// nd4j_printf("input params: ksize = [1, 2, 1, 1], strides = [1, 3, 2, 1], rates = [1, 2, 2, 1]\n", ""); - result->at(0)->printBuffer("Output"); - //result->at(0)->printShapeInfo("Out Shape"); - exp.printBuffer("Expect"); - //exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.isSameShape(result->at(0))); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -3142,8 +3091,6 @@ auto exp = NDArrayFactory::create('c', {2, 2, 4, 2}, { auto result = op.execute({&x}, {}, {6}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("z"); - ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -3358,9 +3305,6 @@ auto exp = NDArrayFactory::create('c', {2, 3, 3}, { auto result = op.execute({&x}, {y}, {}, {1, 1}, {}, true, nd4j::DataType::DOUBLE); ASSERT_EQ(result, Status::OK()); - x.printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); - ASSERT_TRUE(exp.equalsTo(&x)); // delete result; @@ -3431,8 +3375,6 @@ TEST_F(DeclarableOpsTests7, TestRoll_12) { auto result = op.execute({&x, &shift, &axis}, {}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); ASSERT_TRUE(exp.equalsTo(out)); @@ -3457,9 +3399,6 @@ TEST_F(DeclarableOpsTests7, TestRoll_13) { ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); -// out->printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); - ASSERT_TRUE(exp.equalsTo(out)); delete result; @@ -4274,11 +4213,8 @@ TEST_F(DeclarableOpsTests7, TypesConversion_test4) { ASSERT_EQ(ND4J_STATUS_OK, result32->status()); ASSERT_EQ(ND4J_STATUS_OK, result64->status()); auto out1 = result32->at(0); - out1->printIndexedBuffer("OUT_F"); auto out2 = result64->at(0); - out2->printIndexedBuffer("OUT_D"); -// output->printIndexedBuffer("Toggled"); ASSERT_TRUE(exp32.equalsTo(out1)); ASSERT_TRUE(exp64.equalsTo(out2)); @@ -4369,8 +4305,6 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test5) { nd4j::ops::mirror_pad op; auto result = op.execute({&input, &paddings}, {}, {0}); auto output = result->at(0); - 
output->printBuffer("Output"); - exp.printBuffer("Expected"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -6204,8 +6138,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_BP_1) { nd4j::ops::reduce_max_bp op; auto result = op.execute({&x, &eps}, {}, {0, 1}); auto output = result->at(0); - exp.printIndexedBuffer("E"); - output->printIndexedBuffer("O"); ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_TRUE(exp.isSameShape(output)); @@ -6379,8 +6311,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_02) { auto result = op.execute({&x, &eps, &axes}, {}, {}, {false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); - output->printIndexedBuffer("Result is"); - exp.printIndexedBuffer("Expect is"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -6397,7 +6327,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_3) { nd4j::ops::reduce_norm1_bp op; auto result = op.execute({&x, &eps}, {1.f}, {0,1}); auto output = result->at(0); -// output->printIndexedBuffer("Result is"); ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_TRUE(exp.isSameShape(output)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp index 82b3d2db7..9f98ab3a1 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp @@ -55,12 +55,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test1) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {4}, {602.2222f, 727.13885f, 993.5555f, 755.8889f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -73,12 +73,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test2) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {1,1,4}, {602.2222f, 727.13885f, 993.5555f, 755.8889f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -91,12 +91,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test3) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {3}, {900.9375f, 969.8594f, 424.1875f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -108,13 +108,13 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test3) { TEST_F(DeclarableOpsTests8, reduceVariance_test4) { auto x = NDArrayFactory::create('c', {2,3,4}, 
{27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); - auto exp = NDArrayFactory::create('c', {1,3,1}, {900.9375f, 969.8594f, 424.1875f}); - + auto exp = NDArrayFactory::create('c', {1,3,1}, {900.9375f, 969.8594f, 424.1875f}); + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -127,12 +127,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test5) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create(788.6927f); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test6) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(788.6927f); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -163,12 +163,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test7) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {788.6927f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -199,12 +199,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test1) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {24.54022f, 26.96551f, 31.52072f, 27.49343f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -217,12 +217,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test2) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,4}, {24.54022f, 26.96551f, 31.52072f, 27.49343f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), 
result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -235,12 +235,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test3) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {3}, {30.01562f, 31.14257f, 20.59581f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -252,13 +252,13 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test3) { TEST_F(DeclarableOpsTests8, reduceStDev_test4) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); - auto exp = NDArrayFactory::create('c', {1,3,1}, {30.01562f, 31.14257f, 20.59581f}); - + auto exp = NDArrayFactory::create('c', {1,3,1}, {30.01562f, 31.14257f, 20.59581f}); + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -271,12 +271,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test5) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(28.08367f); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -289,12 +289,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test6) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(28.08367f); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -307,12 +307,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test7) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {28.08367f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.f}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -325,12 +325,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test8) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {26.88246f, 29.53924f, 34.52921f, 30.11755f}); 
- + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {0.f,1.f}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); // output->printBuffer("Reduced STDDEV"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -366,36 +366,36 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test1) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.45833334f, -0.375f, -0.29166666f, -0.20833333f, -0.125f, -0.041666668f, 0.041666668f, 0.125f, 0.20833333f, 0.29166666f, 0.375f, 0.45833334f}); x.linspace(1); - + nd4j::ops::reduce_variance_bp op; auto result = op.execute({&x, &gradO2}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + delete result; result = op.execute({&x, &gradO2}, {0,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; result = op.execute({&x, &gradO1}, {1,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; } @@ -409,36 +409,36 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test2) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-4.000000f, -8.000000f, -12.000000f, -16.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 4.000000f, 8.000000f, 12.000000f, 16.000000f}); x.linspace(1); - + nd4j::ops::reduce_variance_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {0}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } @@ -537,15 +537,15 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test1) { auto x = NDArrayFactory::create('c', {3,4}); auto 
gradO1 = NDArrayFactory::create('c', {1,1}, {0.5f}); auto gradO2 = NDArrayFactory::create(0.5f); - auto exp12 = NDArrayFactory::create('c', {3,4}, {-0.069337524f, -0.056730703f, -0.04412388f, -0.031517055f, -0.018910235f, -0.0063034114f, 0.0063034114f, 0.018910235f, 0.031517055f, 0.04412388f, 0.056730703f, 0.069337524f}); + auto exp12 = NDArrayFactory::create('c', {3,4}, {-0.069337524f, -0.056730703f, -0.04412388f, -0.031517055f, -0.018910235f, -0.0063034114f, 0.0063034114f, 0.018910235f, 0.031517055f, 0.04412388f, 0.056730703f, 0.069337524f}); auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.06638563f, -0.05431551f, -0.0422454f, -0.030175284f, -0.01810517f, -0.006035057f, 0.006035057f, 0.01810517f, 0.030175284f, 0.0422454f, 0.05431551f, 0.06638563f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); // output->printIndexedBuffer(); ASSERT_TRUE(exp12.isSameShape(output)); @@ -553,21 +553,21 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test1) { delete result; result = op.execute({&x, &gradO1}, {1,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {0,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); @@ -584,36 +584,36 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test2) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.5f, -1.0f, -1.5f, -2.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.5f, 1.0f, 1.5f, 2.0f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {0}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } 
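Reviewer note (not part of the patch): the exp12/exp34 constants in these reduce_stdev_bp cases can be reproduced by hand from the usual stdev gradient, gradO * (x_i - mean) / (divisor * stdev), where the divisor is N-1 for the bias-corrected estimator (this yields the exp12 values) and N for the biased one (exp34). A minimal standalone reference sketch, with a hypothetical helper name and assuming a full reduction over a flat double input:

#include <cmath>
#include <cstddef>
#include <vector>

// Reference gradient of stdev(x) w.r.t. each x_i, scaled by the upstream gradient gradO.
// divisor = N - 1 when biasCorrected (matches exp12 above), divisor = N otherwise (matches exp34).
static std::vector<double> stdevBpReference(const std::vector<double>& x, double gradO, bool biasCorrected) {
    const double n = static_cast<double>(x.size());

    double mean = 0.0;
    for (double v : x) mean += v;
    mean /= n;

    double sumSq = 0.0;
    for (double v : x) sumSq += (v - mean) * (v - mean);

    const double divisor = biasCorrected ? n - 1.0 : n;
    const double stdev = std::sqrt(sumSq / divisor);

    std::vector<double> grad(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
        grad[i] = gradO * (x[i] - mean) / (divisor * stdev);  // d stdev / d x_i = (x_i - mean) / (divisor * stdev)
    return grad;
}

// For x = 1..12 and gradO = 0.5 this gives -0.0693375, -0.0567307, ... when bias-corrected
// and -0.0663856, -0.0543155, ... otherwise, i.e. the exp12 and exp34 arrays used in the tests above.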
//////////////////////////////////////////////////////////////////////////////// @@ -669,44 +669,44 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test3) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.38729835f, -0.12909944f, 0.12909944f, 0.38729835f, -0.7745967f, -0.2581989f, 0.2581989f, 0.7745967f, -1.161895f, -0.38729835f, 0.38729835f, 1.161895f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {1}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create(120.f); //************************************// @@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { auto result = op.execute({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); //z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -722,8 +722,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_2) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create({15.f, 40.f, 65.f}); //************************************// @@ -731,7 +731,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_2) { auto result = op.execute({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -757,8 +757,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_03) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { - - 
auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create(1307674368000.f); //************************************// @@ -766,7 +766,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { auto result = op.execute({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); //z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -774,8 +774,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_2) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create({120.f, 30240.f, 360360.f}); //************************************// @@ -783,7 +783,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_2) { auto result = op.execute({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -798,9 +798,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_01) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -817,10 +817,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_02) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -837,10 +837,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_3) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -857,10 +857,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_4) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -877,10 +877,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_5) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -894,13 +894,13 @@ 
TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_6) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -914,13 +914,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_7) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,1}, {300.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -937,9 +937,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_01) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -956,10 +956,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_02) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -976,10 +976,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_3) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -996,10 +996,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_4) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1034,13 +1034,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_5) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - + nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1054,13 +1054,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_6) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - + nd4j::ops::reduce_prod op; auto result = 
op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1074,13 +1074,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_7) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {479001600.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1097,9 +1097,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_1) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1116,10 +1116,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_2) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1136,10 +1136,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_3) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1156,10 +1156,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_4) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1194,13 +1194,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(1.f); x.linspace(1); - + nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1214,13 +1214,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(1.f); x.linspace(1); - + nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + 
ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1234,13 +1234,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {1.f}); x.linspace(1); - // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); + // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1257,10 +1257,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_1) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); // output->printShapeInfo("Output shape"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1277,10 +1277,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_2) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1297,10 +1297,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_3) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1317,10 +1317,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_4) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1355,13 +1355,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1375,13 +1375,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1395,13 +1395,13 @@ 
TEST_F(DeclarableOpsTests8, Test_Reduce_Max_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1419,7 +1419,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1436,10 +1436,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_2) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1456,10 +1456,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_3) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1476,10 +1476,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_4) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1514,13 +1514,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1534,13 +1534,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1554,13 +1554,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {300.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// 
x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1578,7 +1578,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1595,10 +1595,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_2) { nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1615,10 +1615,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_3) { nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1635,10 +1635,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_4) { nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1673,13 +1673,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(70.f); x.linspace(1); - + nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1693,13 +1693,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(70.f); x.linspace(1); - + nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1713,13 +1713,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {70.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - 
ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1738,7 +1738,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1757,7 +1757,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_2) { auto result = op.execute({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1776,7 +1776,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_3) { auto result = op.execute({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1795,7 +1795,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_4) { auto result = op.execute({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1829,13 +1829,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1849,13 +1849,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {}, {0, 1, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1869,13 +1869,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {1.f}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1894,7 +1894,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ 
-1913,7 +1913,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_2) { auto result = op.execute({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1932,7 +1932,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_3) { auto result = op.execute({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1951,7 +1951,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_4) { auto result = op.execute({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1985,13 +1985,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2005,13 +2005,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {}, {0, 1, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2025,13 +2025,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {4900.f}); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {1.f}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2041,8 +2041,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_7) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create(0.5f); auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// @@ -2051,7 +2051,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { auto result = op.execute({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // 
z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2060,11 +2060,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {1, 1}, {0.5f}); - auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, + auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// @@ -2072,7 +2072,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { auto result = op.execute({&input, &eps}, {1.f}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2081,11 +2081,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {4}, {1.f, 2.f, 3.f, 4.f}); - auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, - 1.f, 2.f, 3.f, 4.f, + auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, + 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); //************************************// @@ -2093,7 +2093,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { auto result = op.execute({&input, &eps}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2102,11 +2102,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_4) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {1, 4}, {1.f, 2.f, 3.f, 4.f}); - auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, - 1.f, 2.f, 3.f, 4.f, + auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, + 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); //************************************// @@ -2114,7 +2114,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_4) { auto result = op.execute({&input, &eps}, {1.f}, {0}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2146,23 +2146,23 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_04) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_BP_1) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 
7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}); auto eps = NDArrayFactory::create(1307674368000.f); //************************************// // auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// - auto exp = NDArrayFactory::create('c', {3, 5}, {1710012166826558903812096.f, 855006083413279451906048.f, 570004067618451974258688.f, - 427503041706639725953024.f, 342002454982589992140800.f, 285002033809225987129344.f, - 244287457550765131825152.f, 213751520853319862976512.f, 190001355872817324752896.f, - 171001227491294996070400.f, 155455648254341989531648.f, 142501016904612993564672.f, - 131539399526781282156544.f, 122143728775382565912576.f, 114000815325130245799936.f}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1710012166826558903812096.f, 855006083413279451906048.f, 570004067618451974258688.f, + 427503041706639725953024.f, 342002454982589992140800.f, 285002033809225987129344.f, + 244287457550765131825152.f, 213751520853319862976512.f, 190001355872817324752896.f, + 171001227491294996070400.f, 155455648254341989531648.f, 142501016904612993564672.f, + 131539399526781282156544.f, 122143728775382565912576.f, 114000815325130245799936.f}); nd4j::ops::reduce_prod_bp op; auto result = op.execute({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2175,13 +2175,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test1) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {4}, {11.f, 12.f, 13.f, 14.f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2195,13 +2195,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test2) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,4}, {11.f, 12.f, 13.f, 14.f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2215,13 +2215,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test3) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {3}, {8.5f, 12.5f, 16.5f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2235,13 +2235,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test4) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,3,1}, {8.5f, 12.5f, 16.5f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.f}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - 
ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2255,13 +2255,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test5) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(12.5f); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2275,12 +2275,12 @@ TEST_F(DeclarableOpsTests8, reduceMean_test6) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(12.5f); x.linspace(1); - + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2294,12 +2294,12 @@ TEST_F(DeclarableOpsTests8, reduceMean_test7) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,1}, {12.5f}); x.linspace(1); - + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2336,11 +2336,11 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test1) { auto exp = NDArrayFactory::create('c', {3,4}, {1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); // output->printShapeInfo("o"); @@ -2350,7 +2350,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test1) { delete result; result = op.execute({&x, &gradO2}, {1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2367,18 +2367,18 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test2) { auto exp = NDArrayFactory::create('c', {3,4}, {1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f, 1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f, 1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2422,18 +2422,18 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test3) { auto exp = NDArrayFactory::create('c', {3,4}, {0.25f, 0.25f, 0.25f, 0.25f, 0.5f, 0.5f, 0.5f, 0.5f, 0.75f, 0.75f, 0.75f, 0.75f}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = 
result->at(0); ASSERT_TRUE(exp.isSameShape(output)); - ASSERT_TRUE(exp.equalsTo(output)); + ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2444,14 +2444,14 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test3) { TEST_F(DeclarableOpsTests8, reduceStDevBP_test4) { auto x = NDArrayFactory::create('c', {3}, {2.f, 3.f, 4.f}); - auto gradO = NDArrayFactory::create(0.5f); - auto exp = NDArrayFactory::create('c', {3}, {-0.25f, 0.f, 0.25f}); - + auto gradO = NDArrayFactory::create(0.5f); + auto exp = NDArrayFactory::create('c', {3}, {-0.25f, 0.f, 0.25f}); + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2481,7 +2481,7 @@ TEST_F(DeclarableOpsTests8, avgpool2d_test13) { nd4j::ops::avgpool2d op; auto results = op.execute({&input}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 0, dataFormat}); - auto output = results->at(0); + auto output = results->at(0); ASSERT_EQ(Status::OK(), results->status()); @@ -2489,19 +2489,19 @@ TEST_F(DeclarableOpsTests8, avgpool2d_test13) { //expected.printIndexedBuffer("expected"); ASSERT_TRUE(expected.isSameShape(output)); - ASSERT_TRUE(expected.equalsTo(output)); - + ASSERT_TRUE(expected.equalsTo(output)); + delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { - + auto labels = NDArrayFactory::create('c', {2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {2,3}, {2.78507, 1.34254, 4.12761, 2.88507, 2.78507, 2.88507}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2509,7 +2509,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2519,11 +2519,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { - + auto labels = NDArrayFactory::create('c', {2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {3,4}, {0.26328, 1.46328, 1.72656, 0. , 0.26328, 0. , 1.46328, 0.26328, 1.72656, 0. 
, 1.72656, 1.46328}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2531,7 +2531,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2541,11 +2541,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { - + auto labels = NDArrayFactory::create('c', {2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {2,4}, {0.75125, 1.55125, 3.45375, 0.75125, 3.45375, 0. , 2.3025 , 1.15125}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2553,7 +2553,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2563,11 +2563,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test4) { - + auto labels = NDArrayFactory::create('c', {2,3},{0,1,1,0,0,1}); auto logits = NDArrayFactory::create('c', {2,3}); auto expected = NDArrayFactory::create('c', {2}, {2.10389, 1.00194}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2585,11 +2585,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test5) { - + auto labels = NDArrayFactory::create('c', {2,3},{0,1,1,0,0,1}); auto logits = NDArrayFactory::create('c', {2,3}); auto expected = NDArrayFactory::create('c', {3}, {0., 0.85436, 1.40871}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2607,11 +2607,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test6) { - + auto labels = NDArrayFactory::create('c', {2,1}, {0,1}); auto logits = NDArrayFactory::create('c', {2,1}); auto expected = NDArrayFactory::create('c', {1}, {0.6444}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2629,11 +2629,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test7) { - + auto labels = NDArrayFactory::create('c', {2,1}, {0,1}); auto logits = NDArrayFactory::create('c', {2,1}); auto expected = NDArrayFactory::create('c', {2}, {0., 0.}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2651,11 +2651,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test7) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, 
softmax_cross_entropy_loss_with_logits_test8) { - + auto labels = NDArrayFactory::create('c', {2}, {0,1}); auto logits = NDArrayFactory::create('c', {2}); auto expected = NDArrayFactory::create(0.6444); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2663,7 +2663,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test8) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2673,11 +2673,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test9) { - + auto labels = NDArrayFactory::create('c', {1}, {0.}); auto logits = NDArrayFactory::create('c', {1}, {0.2}); auto expected = NDArrayFactory::create(0.); - + nd4j::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.execute({&logits, &labels}, {}, {}); @@ -2693,11 +2693,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test10) { - + auto labels = NDArrayFactory::create('c', {1,2}, {0,1}); auto logits = NDArrayFactory::create('c', {1,2}); auto expected = NDArrayFactory::create('c', {2}, {0., 0.}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2715,14 +2715,14 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test10) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test4) { - + auto x = NDArrayFactory::create('c', {3, 5}, {0.7044955, 0.55606544, 0.15833677, 0.001874401, 0.61595726, 0.3924779, 0.7414847, 0.4127324, 0.24026828, 0.26093036, 0.46741188, 0.01863421, 0.08528871, 0.529365, 0.5510694}); - auto exp = NDArrayFactory::create('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 0.317105}); + auto exp = NDArrayFactory::create('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 0.317105}); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {1.f}, {}, {}, false, nd4j::DataType::DOUBLE); auto output = result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2731,16 +2731,18 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test4) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test5) { - + + // auto x = NDArrayFactory::create('c', {3, 5}, {1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5}); auto x = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 2.89271, 3.50524, 4.00892, 6., 7., 7.71389, 7.88678, 8.01784, 11., 12., 12.53507, 12.26833, 12.02676}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 2.89271, 3.50524, 4.00892, 6., 7., 7.71389, 7.88678, 8.01784, 11., 12., 12.53507, 12.26833, 12.02676}); + // auto exp = NDArrayFactory::create('c', {3, 5}, {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}); x.linspace(1); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {15.f}, {0}, {}, false, nd4j::DataType::DOUBLE); auto output = 
result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2749,25 +2751,25 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test5) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test6) { - + auto x = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 4.95434, 5.78006, 6.60578, 7.43151, 8.25723, 5.64288, 6.15587, 6.66886, 7.18185, 7.69484}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 4.95434, 5.78006, 6.60578, 7.43151, 8.25723, 5.64288, 6.15587, 6.66886, 7.18185, 7.69484}); x.linspace(1); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {15.f}, {1}, {}, false, nd4j::DataType::DOUBLE); auto output = result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test7) { - + auto x = NDArrayFactory::create('c', {3, 5}); auto exp = NDArrayFactory::create('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957}); @@ -2782,10 +2784,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test7) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test8) { - + auto x = NDArrayFactory::create('c', {3, 5}); auto exp = NDArrayFactory::create('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957}); @@ -2800,12 +2802,12 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test8) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test9) { - + auto x = NDArrayFactory::create('c', {2}, {3., 4.}); - auto exp = NDArrayFactory::create('c', {2}, {2.4, 3.2}); + auto exp = NDArrayFactory::create('c', {2}, {2.4, 3.2}); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {4.}, {}, {}, false, nd4j::DataType::DOUBLE); @@ -2816,10 +2818,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test9) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test10) { - + auto x = NDArrayFactory::create(6.); auto exp = NDArrayFactory::create(5.); @@ -2832,10 +2834,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test10) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test11) { - + auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {1., 2., 3., 4., 4.44787, 5.33745, 6.22702, 7.1166 , 6.33046, 7.03384, 7.73723, 8.44061, 13., 14., 15., 16., 15.12277, 16.01235, 16.90192, 17.7915 ,14.77107, 15.47446, 16.17784, 16.88123}); @@ -2872,19 +2874,19 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test4) { auto gradO1 = NDArrayFactory::create('c', {4}, {1., 2., 3., 4.}); auto gradO2 = NDArrayFactory::create('c', {1, 4}, {1., 2., 3., 4.}); auto exp = NDArrayFactory::create('c', {3,4}, {0.333333, 0.666667, 1.000000, 1.333333, 0.333333, 0.666667, 1.000000, 1.333333, 0.333333, 0.666667, 1.000000, 1.333333}); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), 
result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2898,19 +2900,19 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test5) { auto gradO1 = NDArrayFactory::create('c', {3}, {1., 2., 3.}); auto gradO2 = NDArrayFactory::create('c', {3, 1}, {1., 2., 3.}); auto exp = NDArrayFactory::create('c', {3,4}, {0.2500,0.2500,0.2500,0.2500, 0.5000,0.5000,0.5000,0.5000, 0.7500,0.7500,0.7500,0.7500}); - + nd4j::ops::reduce_mean_bp op; - + auto result = op.execute({&x, &gradO1}, {0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2924,19 +2926,19 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test5) { auto gradO1 = NDArrayFactory::create('c', {4}, {1., 2., 3., 4.}); auto gradO2 = NDArrayFactory::create('c', {1, 4}, {1., 2., 3., 4.}); auto exp = NDArrayFactory::create('c', {3,4}, {-0.408248, -0.816497, -1.224745, -1.632993, 0.000000, 0.000000, 0.000000, 0.000000, 0.408248, 0.816497, 1.224745, 1.632993}); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2948,12 +2950,12 @@ TEST_F(DeclarableOpsTests8, zeros_as_test1) { auto x = NDArrayFactory::create(10.f); auto y = NDArrayFactory::create(100.f); auto exp = NDArrayFactory::create(0.f); - + nd4j::ops::zeros_as op; Nd4jStatus status = op.execute({&x}, {&y}, {}, {}, {}); - ASSERT_EQ(Status::OK(), status); - + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(y.isSameShape(exp)); ASSERT_TRUE(y.equalsTo(exp)); @@ -2987,11 +2989,11 @@ TEST_F(DeclarableOpsTests8, ones_as_test1) { nd4j::ops::ones_as op; Nd4jStatus status = op.execute({&x}, {&y}, {}, {}, {}, false, nd4j::DataType::DOUBLE); - ASSERT_EQ(Status::OK(), status); - + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(y.isSameShape(exp)); ASSERT_TRUE(y.equalsTo(exp)); - + } //////////////////////////////////////////////////////////////////////////////// @@ -3017,7 +3019,7 @@ TEST_F(DeclarableOpsTests8, NormalizeMoments_SGO_1) { auto data = NDArrayFactory::create('c', {10, 10}); data.linspace(1); - + auto means = data.reduceAlongDimension(reduce::Sum, {0}); auto deviance = NDArrayFactory::create('c', {10}, {825., 825. , 825., 825., 825., 825., 825., 825., 825., 825. 
}); // data.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); // = NDArrayFactory::create('c', {10, 10}); @@ -3040,24 +3042,24 @@ TEST_F(DeclarableOpsTests8, NormalizeMoments_SGO_1) { ASSERT_EQ(Status::OK(), results->status()); ASSERT_EQ(results->size(), 2); - auto outputMeans = results->at(0); - auto outputDeviance = results->at(1); + auto outputMeans = results->at(0); + auto outputDeviance = results->at(1); // outputMeans->printIndexedBuffer("Means"); // outputDeviance->printIndexedBuffer("Variance"); // deviance.printIndexedBuffer("Expected"); // means->printIndexedBuffer("Expected means"); ASSERT_TRUE(means->isSameShape(outputMeans)); - ASSERT_TRUE(means->equalsTo(outputMeans)); + ASSERT_TRUE(means->equalsTo(outputMeans)); ASSERT_TRUE(deviance.isSameShape(outputDeviance)); ASSERT_TRUE(deviance.equalsTo(outputDeviance)); delete means; //delete deviance; delete ssSquared; // ASSERT_TRUE(expMeans.isSameShape(outputMeans)); -// ASSERT_TRUE(expMeans.equalsTo(outputMeans)); +// ASSERT_TRUE(expMeans.equalsTo(outputMeans)); // ASSERT_TRUE(expMeans.isSameShape(outputDeviance)); -// ASSERT_TRUE(expDeviance.equalsTo(outputDeviance)); +// ASSERT_TRUE(expDeviance.equalsTo(outputDeviance)); delete results; } @@ -3073,10 +3075,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_1) { nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0, 1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3103,10 +3105,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_2) { nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0, 1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3132,10 +3134,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_3) { nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0, 2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3161,10 +3163,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_4) { nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0, 2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3187,13 +3189,13 @@ TEST_F(DeclarableOpsTests8, Test_Moments_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); x.linspace(1); - + nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0,1,2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto 
outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3216,13 +3218,13 @@ TEST_F(DeclarableOpsTests8, Test_Moments_7) { auto expVariance = NDArrayFactory::create('c', {1,1,1}, {47.916668f}); x.linspace(1); - // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); + // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0,1,2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3319,13 +3321,13 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_1) { nd4j::ops::lrn op; auto results = op.execute({&x}, {1.0, 1.0, 0.5}, {2}, {}, false, nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(exp.isSameShape(out)); // out->printIndexedBuffer("LRN out"); // exp.printIndexedBuffer("LRN exp"); - ASSERT_TRUE(exp.equalsTo(out)); - + ASSERT_TRUE(exp.equalsTo(out)); + delete results; } @@ -3334,75 +3336,75 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_2) { auto x = NDArrayFactory::create('c', {3, 3, 5, 5}); x.linspace(1); - + auto exp = NDArrayFactory::create('c', {3, 3, 5, 5}, { - 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, - 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, - 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, - 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, - 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, + 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, + 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, + 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, + 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, + 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, - 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, - 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, - 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, - 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, - 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, + 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, + 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, + 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, + 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, + 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, - 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, + 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, 0.567134f, 0.49553978f, 0.4470674f, 0.504163f, 0.5870515f, - 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, - 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, - 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, + 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, + 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, + 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, - 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, - 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, - 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, + 0.56980413f, 0.49671215f, 
0.44713274f, 0.50312346f, 0.58461165f, + 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, + 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, 0.571041f, 0.4972537f, 0.44715673f, 0.50263065f, 0.58345926f, - 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, + 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, - 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, - 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, - 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, - 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, - 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, + 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, + 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, + 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, + 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, + 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, - 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, + 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, 0.57296f, 0.49809194f, 0.44718578f, 0.5018515f, 0.5816426f, - 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, - 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, + 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, + 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, 0.57340944f, 0.49828786f, 0.44719115f, 0.5016664f, 0.581212f, - 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, - 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, + 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, + 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, 0.5737754f, 0.4984474f, 0.44719502f, 0.501515f, 0.58085984f, - 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, - 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, + 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, + 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, - 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, - 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, - 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, - 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, - 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, + 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, + 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, + 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, + 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, + 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, - 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, - 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, - 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, - 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, + 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, + 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, + 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, + 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, 0.57474375f, 0.49886885f, 0.44720373f, 0.50111103f, 0.5799219f } ); // nd4j::ops::lrn op; auto results = op.execute({&x}, {1.0, 1.0, 0.5}, {2}, {}, false, nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); // 
out->printIndexedBuffer("LRN out"); // exp.printIndexedBuffer("LRN exp"); - ASSERT_TRUE(exp.equalsTo(out)); - + ASSERT_TRUE(exp.equalsTo(out)); + delete results; } @@ -3413,60 +3415,60 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_3) { x.linspace(1); auto exp = NDArrayFactory::create('c', {3, 3, 5, 5}, { - 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, - 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, - 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, - 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, - 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, + 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, + 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, + 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, + 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, + 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, - 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, - 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, - 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, - 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, - 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, + 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, + 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, + 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, + 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, + 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, - 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, + 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, 0.567134f, 0.49553978f, 0.4470674f, 0.504163f, 0.5870515f, - 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, - 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, - 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, + 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, + 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, + 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, - 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, - 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, - 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, + 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, + 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, + 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, 0.571041f, 0.4972537f, 0.44715673f, 0.50263065f, 0.58345926f, - 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, + 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, - 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, - 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, - 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, - 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, - 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, + 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, + 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, + 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, + 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, + 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, - 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, + 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, 0.57296f, 0.49809194f, 
0.44718578f, 0.5018515f, 0.5816426f, - 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, - 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, + 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, + 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, 0.57340944f, 0.49828786f, 0.44719115f, 0.5016664f, 0.581212f, - 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, - 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, + 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, + 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, 0.5737754f, 0.4984474f, 0.44719502f, 0.501515f, 0.58085984f, - 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, - 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, + 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, + 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, - 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, - 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, - 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, - 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, - 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, + 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, + 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, + 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, + 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, + 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, - 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, - 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, - 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, - 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, + 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, + 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, + 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, + 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, 0.57474375f, 0.49886885f, 0.44720373f, 0.50111103f, 0.5799219f } ); // @@ -3526,13 +3528,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_4_119) { auto ttlTime = std::chrono::duration_cast ((timeEnd - timeStart)).count(); - //ASSERT_EQ(Status::OK(), results); - - nd4j_printf("avg time: %lld ms\n", spanTime); - // ASSERT_TRUE(exp.isSameShape(out)); -// out->printIndexedBuffer("LRN out"); -// exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); } @@ -3548,8 +3544,6 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_5) { ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); -// out->printIndexedBuffer("LRN out"); -// exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); delete results; @@ -3626,13 +3620,13 @@ auto exp = NDArrayFactory::create('c', {3,3,5,5}, { nd4j::ops::lrn_bp op; auto results = op.execute({&x, &eps}, {1.0, 1.0, 0.5}, {2}, {}, false, typeid(TypeParam) == typeid(float) ? 
nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); // out->printBuffer("LRN BP out"); // exp.printBuffer("LRN BP exp"); //ASSERT_TRUE(exp.equalsTo(out)); - + delete results; } @@ -3641,7 +3635,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_2) { auto x = NDArrayFactory::create( 'c', {3, 3, 5, 5}); x.linspace(1); - + auto eps = NDArrayFactory::create('c', {3, 3, 5, 5}, { 0.2581989 ,0.3592106 , 0.40089184, 0.53935987, 0.70014, 0.4898979 ,0.46056613, 0.43971977, 0.5240002 , 0.6375767, 0.5274096 ,0.47771242, 0.4443308 , 0.5163977 , 0.61701745, @@ -3706,13 +3700,13 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_2) { nd4j::ops::lrn_bp op; auto results = op.execute({&x, &eps}, {1.0, 1.0, 0.5}, {2}, {}, false, typeid(TypeParam) == typeid(float) ? nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(exp.isSameShape(out)); //out->printBuffer("LRN BP out"); // exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); - + delete results; } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 4871c12e4..f88d6e930 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -924,8 +924,6 @@ TEST_F(DeclarableOpsTests9, tile_test1) { auto reps = NDArrayFactory::create('c', {1, 2}, {2, 1}); auto expOut = NDArrayFactory::create('c', {2, 6,}, {1.,2.,3.,4.,5.,6., 1.,2.,3.,4.,5.,6.}); - expOut.printIndexedBuffer("expOut"); - nd4j::ops::tile op; auto results = op.execute({&input, &reps}, {}, {}); auto out = results->at(0); @@ -1660,8 +1658,6 @@ TEST_F(DeclarableOpsTests9, test_range_int_1) { auto z = result->at(0); - z->printIndexedBuffer("z"); - delete result; } @@ -2901,31 +2897,29 @@ TEST_F(DeclarableOpsTests9, Floormod_BP_Test_4) { delete result; } -//////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test1) { NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.1, 1.2, 1.3, 1.4}, nd4j::DataType::FLOAT32); NDArray variance('c', {4}, nd4j::DataType::FLOAT32); NDArray gamma ('c', {4}, nd4j::DataType::FLOAT32); NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.272779, -1.018224, -0.763668,-0.509112, -0.254556, 0., 0.254556,0.509112, 0.763668, 1.018224, 1.272779, - 1.527335, 1.781891, 2.036447, 2.291003,2.545559, 2.800115, 3.054671, 3.309227,3.563783, 3.818338, 4.072894, 4.32745}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {6.448749, 7.212417, 8.230641, 9.50342 }, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,3,4}, {-0.000056, -0.000056, -0.000056, -0.000056, -0.000034, -0.000034, -0.000034, -0.000034, -0.000011, -0.000011, -0.000011, -0.000011, 0.000011, 0.000011, 0.000011, 0.000011, 0.000034, 0.000034, 0.000034, 0.000034, 0.000056, 0.000056, 0.000056, 0.000056}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {6.148104, 6.148104, 6.148105, 6.148105}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {3.6, 4.5, 5.4, 6.3}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); - mean.assign(1.); - 
variance.assign(0.5); + variance.assign(0.46666667); gamma.assign(1.2); - // beta.assign(1.); // has no effect on gradient calculations + beta.assign(1.); // has no effect on gradient calculations gradO.linspace(-0.9, 0.15); nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2945,20 +2939,22 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test1) { delete results; } + //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { - NDArray input ('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray mean ('c', {3}, {1.05, 1.1, 1.15}); - NDArray variance('c', {3}, {0.5, 0.6, 0.7}); - NDArray gamma ('c', {3}, {1.2, 1.3, 1.4}); - NDArray beta ('c', {3}, nd4j::DataType::DOUBLE); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {3}, {1.05, 1.1, 1.15}, nd4j::DataType::FLOAT32); + NDArray variance('c', {3}, {0.5, 0.6, 0.7}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {3}, {1.2, 1.3, 1.4}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {3}, nd4j::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.272779, -1.018224, -0.763668,-0.503484, -0.251742, 0., 0.251742,0.501992, 0.752989, 1.003985, 1.254981, - 1.527335, 1.781891, 2.036447, 2.291003,2.517418, 2.76916 , 3.020902, 3.272644,3.513947, 3.764943, 4.015939, 4.266936}); - NDArray expdLdG('c', {3}, {5.81236 , 7.048771, 12.155388}); - NDArray expdLdB('c', {3}, {1.8, 6.6, 11.4}); + NDArray expdLdI('c', {2,3,4}, {-0.601415, -0.521226, -0.441037, -0.360849, -0.456306, -0.395465, -0.334624, -0.273784, 0.396631, 0.343747, + 0.290863, 0.237978, 0.360849, 0.441037, 0.521226, 0.601415, 0.273784, 0.334625, 0.395465, 0.456306, -0.237978, + -0.290863, -0.343746, -0.396631}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {3}, {5.81236 , 7.048771, 12.155388}, nd4j::DataType::FLOAT32); + NDArray expdLdB('c', {3}, {1.8, 6.6, 11.4}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); // beta.assign(1.); // has no effect on gradient calculations @@ -2966,7 +2962,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2989,17 +2985,18 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test3) { - NDArray input ('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray mean ('c', {2,1,4}, {1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4}); - NDArray variance('c', {2,1,4}, {0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1, 1.2}); - NDArray gamma ('c', {2,1,4}, {1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9}); - NDArray beta ('c', {2,1,4}, nd4j::DataType::DOUBLE); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {2,1,4}, {1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4}, nd4j::DataType::FLOAT32); + NDArray variance('c', {2,1,4}, {0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1, 1.2}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {2,1,4}, {1.2, 1.3, 
1.4, 1.5, 1.6, 1.7, 1.8, 1.9}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {2,1,4}, nd4j::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.258709, -1.003985, -0.754668,-0.509112, -0.251742, 0., 0.251556,0.509112, 0.755225, 1.003985, 1.25778 , - 1.517885, 1.784991, 2.05947 , 2.341504,2.529808, 2.804986, 3.089205, 3.382173,3.541731, 3.824981, 4.11894 , 4.422841}); - NDArray expdLdG('c', {2,1,4}, {1.378844, 0.910144, 0.573706, 0.335408, 2.640487, 2.954985, 3.289431, 3.64234 }); - NDArray expdLdB('c', {2,1,4}, {-0.9 , -0.45, 0. , 0.45, 4.5 , 4.95, 5.4 , 5.85}); + NDArray expdLdI('c', {2,3,4}, {-0.577002, -0.744041, -0.850999, -0.922373, -0.000000, -0.000000, -0.000000, -0.000000, 0.577002, + 0.744041, 0.850999, 0.922373, -0.386037, -0.350205, -0.312047, -0.271737, -0.000000, -0.000000, + -0.000000, -0.000000, 0.386037, 0.350205, 0.312047, 0.271736}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {2,1,4}, {1.378844, 0.910144, 0.573706, 0.335408, 2.640487, 2.954985, 3.289431, 3.64234 }, nd4j::DataType::FLOAT32); + NDArray expdLdB('c', {2,1,4}, {-0.9 , -0.45, 0. , 0.45, 4.5 , 4.95, 5.4 , 5.85}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); // beta.assign(1.); // has no effect on gradient calculations @@ -3007,7 +3004,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test3) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,0,2}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3037,8 +3034,8 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test4) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4}, {1.527335, -1.16534 , 0.885433, -0.643584, 0.509112, -0.233068, -0., 0.214528}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {1.442483, 0.9502 , 0.569207, 0.314641}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4}, {0.162923, -0.289673, 0.354174, -0.386151, -0.162923, 0.289673, -0.354174, 0.386151}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {1.442483, 0.950200, 0.569207, 0.314641}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {-1.2, -0.9, -0.6, -0.3}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); @@ -3046,7 +3043,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test4) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3076,8 +3073,9 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test5) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4,2,2}, {1.527335, 1.272779,1.018224, 0.763668,-0.466136, -0.233068,0., 0.233068,-0.442716, -0.664075,-0.885433, -1.106791,1.287169, 1.501697,1.716225, 1.930753, - -2.545559, -2.800115,-3.054671, -3.309227,3.262951, 3.496019,3.729087, 3.962155,-3.984448, -4.205806,-4.427164, -4.648522,4.719618, 4.934146,5.148675, 5.363203}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4,2,2}, {-0.737512, -0.659880, -0.582247, -0.504614, 0.561404, 0.502309, 0.443214, 0.384118, -1.168243, + -1.045270, -0.922297, -0.799324, 1.899026, 1.699128, 1.499231, 1.299333, 0.504614, 0.582247, 0.659880, 0.737512, -0.384118, + -0.443214, 
-0.502308, -0.561404, 0.799324, 0.922297, 1.045270, 1.168243, -1.299334, -1.499231, -1.699129, -1.899026}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {11.073181, 12.585667, 17.708657, 24.313186}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {4.2, 9. , 13.8, 18.6}, nd4j::DataType::FLOAT32); @@ -3086,7 +3084,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test5) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3116,8 +3114,9 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test6) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,2,2,4}, {1.527335, -1.16534 , 0.885433, -0.643584, 0.509112, -0.233068, -0., 0.214528, -0.509112, 0.699204, -0.885433, 1.072641, -1.527335, 1.631475, -1.770866, 1.930753, - -2.545559, 2.563747, -2.656298, 2.788865, -3.563783, 3.496019, -3.541731, 3.646978, -4.582006, 4.42829 , -4.427164, 4.50509 , -5.60023 , 5.360562, -5.312597, 5.363203}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,2,2,4}, {-4.989124, 2.540357, -1.515022, 0.791769, -3.563660, 1.814540, -1.082159, 0.565549, -2.138196, 1.088724, -0.649295, + 0.339329, -0.712732, 0.362908, -0.216432, 0.113110, 0.712732, -0.362908, 0.216432, -0.113110, 2.138195, -1.088724, 0.649295, + -0.339330, 3.563660,-1.814540, 1.082159, -0.565549, 4.989125, -2.540356, 1.515022, -0.791770}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {20.364472, 17.856588, 16.949714, 15.903684}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {9.6, 10.8, 12. , 13.2}, nd4j::DataType::FLOAT32); @@ -3126,7 +3125,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test6) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,3}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3156,20 +3155,21 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test7) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,2,2,2,4}, {1.527335, -1.16534 , 0.885433, -0.643584,0.509112, -0.233068, -0., 0.214528,-0.509112, 0.699204, -0.885433, 1.072641,-1.527335, 1.631475, -1.770866, - 1.930753,-2.545559, 2.563747, -2.656298, 2.788865,-3.563783, 3.496019, -3.541731, 3.646978,-4.582006, 4.42829 , -4.427164, - 4.50509 ,-5.60023 , 5.360562, -5.312597, 5.363203, -6.618453, 6.292834, -6.19803 , 6.221315,-7.636677, 7.225105, -7.083463, - 7.079428,-8.6549 , 8.157377, -7.968895, 7.93754 ,-9.673124, 9.089649, -8.854328, 8.795652, -10.691348, 10.02192 , -9.739761, - 9.653765,-11.709571, 10.954192, -10.625194, 10.511877,-12.727795, 11.886464, -11.510627, 11.36999 ,-13.746018, 12.818735, -12.39606 , 12.228102}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,2,2,2,4}, {-119.435059, 78.159744, -58.732986, 46.630123, -103.510391, 67.738441, -50.901920, 40.412773, -87.585716, 57.317142, + -43.070854, 34.195419, -71.661041, 46.895844, -35.239792, 27.978071, -55.736359, 36.474548, -27.408726, 21.760721, -39.811687, 26.053242, -19.577662, + 15.543370, -23.887009, 15.631950, -11.746595, 9.326023, -7.962326, 5.210644, -3.915531, 3.108671, 7.962341, -5.210655, 3.915535, -3.108677, 23.887032, + -15.631958, 11.746601, -9.326031, 
39.811691, -26.053246, 19.577671, -15.543377, 55.736382, -36.474548, 27.408726, -21.760731, 71.661064, -46.895851, 35.239788, + -27.978077, 87.585732, -57.317154, 43.070866, -34.195431, 103.510384, -67.738464, 50.901920, -40.412777, 119.435097, -78.159744, 58.732998, -46.630131}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {282.38734 , 244.542027, 224.140995, 207.548793}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {57.6, 60. , 62.4, 64.8}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); gradO.linspace(-0.9, 0.15); + nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,4}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,4}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3201,10 +3201,11 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test8) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4,2,2,2}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4,2,2,2}, {1.527335, 1.272779, 1.018224, 0.763668, 0.509112, 0.254556, -0. , -0.254556, 0.466136, 0.699204, 0.932272, 1.16534 , 1.398407, 1.631475, 1.864543, 2.097611, - -2.213582, -2.43494 , -2.656298, -2.877657, -3.099015, -3.320373, -3.541731, -3.76309 , 3.861506, 4.076034, 4.290562, 4.50509 , 4.719618, 4.934146, 5.148675, 5.363203, - -6.618453, -6.873009, -7.127565, -7.382121, -7.636677, -7.891233, -8.145789, -8.400345, 7.924309, 8.157377, 8.390445, 8.623513, 8.856581, 9.089649, 9.322717, 9.555784, - -9.297045, -9.518403, -9.739761, -9.961119, -10.182477, -10.403836, -10.625194, -10.846552, 10.726405, 10.940933, 11.155462, 11.36999 , 11.584518, 11.799046, 12.013574, 12.228102}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4,2,2,2}, {-34.373802, -32.611046, -30.848286, -29.085529, -27.322769, -25.560009, -23.797251, -22.034491, 36.146996, 34.293301, + 32.439610, 30.585917, 28.732227, 26.878534, 25.024841, 23.171150, -42.876553, -40.677757, -38.478958, -36.280159, -34.081367, -31.882565, -29.683767, + -27.484968, 50.674446, 48.075760, 45.477066, 42.878380, 40.279686, 37.681000, 35.082310, 32.483616, 22.034489, 23.797249, 25.560009, 27.322765, 29.085526, + 30.848286, 32.611046, 34.373802, -23.171146, -25.024837, -26.878536, -28.732231, -30.585918, -32.439613, -34.293297, -36.146996, 27.484982, 29.683773, + 31.882572, 34.081364, 36.280178, 38.478970, 40.677776, 42.876560, -32.483627, -35.082329, -37.681023, -40.279701, -42.878403, -45.477081, -48.075775, -50.674484}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {134.490365, 179.785003, 248.933114, 330.087248}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {32.4, 51.6, 70.8, 90.}, nd4j::DataType::FLOAT32); @@ -3213,7 +3214,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test8) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3338,8 +3339,8 @@ TEST_F(DeclarableOpsTests9, Cholesky_Test_3) { auto result = op.execute({&x}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto res = result->at(0); -// res->printIndexedBuffer("Output for Cholesky 3"); - ASSERT_TRUE(exp.equalsTo(res)); + // res->printIndexedBuffer("Output for Cholesky 3"); + ASSERT_TRUE(exp.equalsTo(res, 1e-4)); delete result; } diff --git a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp 
index baba901bf..8ae123260 100644 --- a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp @@ -121,7 +121,6 @@ TEST_F(EmptyTests, Test_Concat_3) { auto z = result->at(0); - z->printIndexedBuffer("z"); ASSERT_EQ(exp, *z); delete result; @@ -141,7 +140,6 @@ TEST_F(EmptyTests, Test_Concat_4) { auto z = result->at(0); - z->printIndexedBuffer("z"); ASSERT_EQ(exp, *z); delete result; @@ -282,7 +280,6 @@ TEST_F(EmptyTests, test_shaped_empty_3) { TEST_F(EmptyTests, test_shaped_empty_4) { auto shape = ConstantShapeHelper::getInstance()->vectorShapeInfo(0, nd4j::DataType::FLOAT32); - shape::printShapeInfoLinear("shape", shape); NDArray array(shape, true, nd4j::LaunchContext::defaultContext()); std::vector shapeOf({0}); diff --git a/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp b/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp index 2ed43d08a..1dc2c8e48 100644 --- a/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -46,14 +47,14 @@ public: #ifndef __CUDABLAS__ TEST_F(HelpersTests1, test_binary_search_1) { - std::array array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + std::array array = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; auto idx = nd4j::ops::helpers::binarySearch(array.data(), 2, 10); ASSERT_EQ(2, idx); } TEST_F(HelpersTests1, test_binary_search_2) { - std::array array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + std::array array = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; auto idx = nd4j::ops::helpers::binarySearch(array.data(), 18, 10); ASSERT_EQ(-1, idx); diff --git a/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp b/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp index 8097aab33..96c480fd9 100644 --- a/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp @@ -58,7 +58,6 @@ TEST_F(IndexingTests, StridedSlice_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Output"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -379,8 +378,6 @@ TEST_F(IndexingTests, Test_StridedSlice_1) { auto z = result->at(0); - z->printIndexedBuffer("Z"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -424,8 +421,6 @@ TEST_F(IndexingTests, Test_StridedSlice_3) { auto z = result->at(0); - z->printIndexedBuffer("Z"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu b/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu index 294e03c12..f442c0bb9 100644 --- a/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu @@ -50,7 +50,6 @@ TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_1) { context.setOutputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); - nd4j_printf("Starting execution...\n",""); PointersManager pm(LaunchContext::defaultContext(), "test_DeclarableOp_execution_1"); execCustomOp2(nullptr, op.getOpHash(), &context); @@ -78,7 +77,6 @@ TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_2) { context.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); - nd4j_printf("Starting execution...\n",""); PointersManager pm(LaunchContext::defaultContext(), "test_DeclarableOp_execution_2"); execCustomOp2(nullptr, op.getOpHash(), &context); diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp 
b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp index 21af8e380..aa75ea1ab 100644 --- a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp @@ -426,6 +426,24 @@ TEST_F(JavaInteropTests, Test_FastPath_Validation_2) { ASSERT_NE(Status::OK(), status); } +TEST_F(JavaInteropTests, Test_empty_cast_1) { + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto z = NDArrayFactory::create('c', {1, 0, 2}); + auto e = NDArrayFactory::create('c', {1, 0, 2}); + + Nd4jLong iArgs[] = {10}; + + Context ctx(1); + ctx.setInputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); + ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); + ctx.setIArguments(iArgs, 1); + + nd4j::ops::cast op; + auto result = op.execute(&ctx); + ASSERT_EQ(Status::OK(), result); + ASSERT_EQ(e, z); +} + /* TEST_F(JavaInteropTests, test_avgpooling_edge_1) { int inOutH = 35; @@ -1183,7 +1201,9 @@ TEST_F(JavaInteropTests, test_bfloat16_rng) { RandomGenerator rng(119, 323841120L); bfloat16 args[2] = {(bfloat16) 0.0f, (bfloat16) 1.0f}; execRandom(nullptr, nd4j::random::Ops::UniformDistribution, &rng, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), args); - z.printIndexedBuffer("z"); + + //z.printIndexedBuffer("z"); + ASSERT_TRUE(z.sumNumber().e(0) > 0); } @@ -1192,7 +1212,7 @@ TEST_F(JavaInteropTests, test_ismax_view) { auto v = original.subarray({NDIndex::all(), NDIndex::all(), NDIndex::interval(0, 40, 2)}); v->assign(1.0); - auto e = v->ulike(); + auto e = v->like(); auto t = e.tensorAlongDimension(0, {0, 1}); t->assign(1.0); @@ -1208,7 +1228,6 @@ TEST_F(JavaInteropTests, test_ismax_view) { nd4j::ops::ismax op; op.execute(&ctx); - z.printIndexedBuffer("z"); ASSERT_EQ(e, z); delete v; diff --git a/libnd4j/tests_cpu/layers_tests/LambdaTests.cu b/libnd4j/tests_cpu/layers_tests/LambdaTests.cu index c1dc1acfe..30244b7dc 100644 --- a/libnd4j/tests_cpu/layers_tests/LambdaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/LambdaTests.cu @@ -68,8 +68,6 @@ TEST_F(LambdaTests, test_basic_1) { ASSERT_EQ(0, res); ASSERT_EQ(e, x); - - x.printIndexedBuffer("x"); } void test(NDArray &x) { @@ -127,7 +125,6 @@ TEST_F(LambdaTests, test_basic_2) { test(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -137,7 +134,6 @@ TEST_F(LambdaTests, test_basic_3) { test(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -147,7 +143,6 @@ TEST_F(LambdaTests, test_basic_4) { test2(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -158,7 +153,6 @@ TEST_F(LambdaTests, test_basic_5) { testPairwise(x, y); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -168,7 +162,6 @@ TEST_F(LambdaTests, test_basic_6) { testIndexed(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -180,7 +173,6 @@ TEST_F(LambdaTests, test_basic_7) { testTriplewise(w, x, y); - w.printIndexedBuffer("w"); ASSERT_EQ(e, w); } @@ -191,7 +183,6 @@ TEST_F(LambdaTests, test_basic_8) { testIndexedPairwise(x, y); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } diff --git a/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp index 5308ee99d..f48ee54f6 100644 --- a/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp @@ -442,11 +442,11 @@ TEST_F(LegacyOpsTests, reduce3_1) { //int *tadShapeBuffer = shape::computeResultShape(shapeBuffer,dimension,dimensionLength); auto tadShapeBuffer = nd4j::ShapeUtils::evalReduceShapeInfo('c', dim, shapeBuffer, false, true, 
nullptr); - functions::reduce3::Reduce3::exec(opNum, x, xShapeBuffer, extraVals, y, shapeBuffer, result, tadShapeBuffer, dimension, dimensionLength); + functions::reduce3::Reduce3::exec(opNum, x, xShapeBuffer, extraVals, y, shapeBuffer, result, tadShapeBuffer, dimension, dimensionLength, 0, 4); float distancesAssertion[4] = {0.0,8.0,16.0,24.0}; for(int i = 0; i < 4; i++) - ASSERT_EQ(distancesAssertion[i],result[i]); + ASSERT_NEAR(distancesAssertion[i],result[i], 1e-5); delete[] shapeBuffer; delete[] xShapeBuffer; @@ -726,6 +726,26 @@ TEST_F(LegacyOpsTests, test_legacy_reduce_empty_3) { ASSERT_EQ(e, z); } +TEST_F(LegacyOpsTests, test_legacy_reduce_empty_4) { + if (!Environment::getInstance()->isCPU()) + return; + int a = 0; + + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto d = NDArrayFactory::create('c', {1}, {a}); + auto z = NDArrayFactory::create('c', {0, 2}); + auto e = NDArrayFactory::create('c', {0, 2}); + + + + ::execReduceSame2(nullptr, reduce::SameOps::Sum, + x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(), + nullptr, + z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), + d.buffer(), d.shapeInfo(), d.specialBuffer(), d.specialShapeInfo()); + +} + TEST_F(LegacyOpsTests, test_legacy_transform_float_1) { auto x = NDArrayFactory::create('c', {1, 0, 4}); diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu index 4ab884d28..71ad6929b 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu +++ b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu @@ -152,7 +152,6 @@ TEST_F(NDArrayCudaBasicsTests, Test_Cosine_1) { //ASSERT_TRUE(y->isActualOnDeviceSide()); //ASSERT_TRUE(y->isActualOnHostSide()); //y->syncToHost(); - y->printBuffer("Cosine"); delete x; delete y; } @@ -251,9 +250,6 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_3) { cudaMemcpy(z.buffer(), z.specialBuffer(), z.lengthOf() * z.sizeOfT(), cudaMemcpyDeviceToHost); res = cudaStreamSynchronize(*stream); ASSERT_EQ(0, res); - x.printBuffer("3X = "); - y.printBuffer("3Y = "); - z.printBuffer("3Result out"); // // cudaFree(devBufferPtrX); @@ -347,11 +343,7 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_6) { x += y; //x.applyPairwiseTransform(pairwise::Add, &y, &z, nullptr); x.syncToHost(); - x.printBuffer("6X = "); - //y.printBuffer("3Y = "); - //z.printBuffer("3Result out"); - // // cudaFree(devBufferPtrX); //cudaFree(devBufferPtrZ); //cudaFree(devShapePtrX); @@ -381,11 +373,7 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_7) { x += 2.; //x.applyPairwiseTransform(pairwise::Add, &y, &z, nullptr); x.syncToHost(); - x.printBuffer("7X = "); - //y.printBuffer("3Y = "); - //z.printBuffer("3Result out"); - // // cudaFree(devBufferPtrX); //cudaFree(devBufferPtrZ); //cudaFree(devShapePtrX); @@ -445,9 +433,6 @@ TEST_F(NDArrayCudaBasicsTests, TestMultiply_2) { //res = cudaMalloc(reinterpret_cast(&devShapePtrX), shape::shapeInfoByteLength(x.shapeInfo())); //ASSERT_EQ(0, res); x.applyPairwiseTransform(pairwise::Multiply, &y, &z, nullptr); - x.printBuffer("3X = "); - y.printBuffer("3Y = "); - z.printBuffer("3Result out"); // // cudaFree(devBufferPtrX); @@ -744,8 +729,7 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_2) { cudaResult = cudaStreamSynchronize(stream); ASSERT_EQ(0, cudaResult); z.tickWriteDevice(); - z.printBuffer("Result with Broadcast2 (multiply)"); - exp.printBuffer("Expect with Broadcast2 (multiply)"); + // verify results for (int e = 0; e < z.lengthOf(); e++) ASSERT_NEAR(exp.e(e), z.e(e), 1e-5); @@ -811,7 
+795,6 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_3) { //cudaResult = cudaStreamSynchronize(stream); ASSERT_EQ(0, cudaResult); //z.syncToHost(); - z.printBuffer("Result with Broadcast3 (multiply)"); // verify results for (int e = 0; e < z.lengthOf(); e++) ASSERT_NEAR(exp.e(e), z.e(e), 1e-5); @@ -842,11 +825,8 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_1) { //res = cudaMalloc(reinterpret_cast(&devShapePtrX), shape::shapeInfoByteLength(x.shapeInfo())); //ASSERT_EQ(0, res); //x.applyPairwiseTransform(pairwise::Multiply, &y, &z, nullptr); - //x.printBuffer("23X = "); - //y.printBuffer("23Y = "); x *= y; //x.syncToHost(); - x.printBuffer("54Result out"); // // cudaFree(devBufferPtrX); @@ -995,7 +975,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastRaw_1) { // allocate required amount of global device memory and copy host data to it //cudaResult = allocateDeviceMem(*pLc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); for(size_t i = 0; i < devicePtrs.size(); ++i) { - nd4j_printf("Allocation of %i bytes with device\n", hostData[i].second) cudaResult = cudaMalloc(&devicePtrs[i], hostData[i].second); //if(cudaResult != 0) return cudaResult; ASSERT_EQ(cudaResult, 0); cudaMemcpy(devicePtrs[i], hostData[i].first, hostData[i].second, cudaMemcpyHostToDevice); @@ -1047,7 +1026,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply) { //x.printBuffer("23X = "); //y.printBuffer("23Y = "); x *= y; - x.printBuffer("55Result out"); // // cudaFree(devBufferPtrX); @@ -1082,7 +1060,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_2) { //y.printBuffer("23Y = "); //void NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray* other, NDArray* target, const bool checkTargetShape, ExtraArguments *extraArgs) x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), &y, &exp); - exp.printBuffer("56Result out"); // // cudaFree(devBufferPtrX); @@ -1111,8 +1088,6 @@ TEST_F(NDArrayCudaBasicsTests, TestReduceSum_1) { ASSERT_EQ(0, res); y.syncToHost(); - x.printBuffer("X = "); - y.printBuffer("Y = "); ASSERT_NEAR(y.e(0), 15, 1e-5); } @@ -1120,7 +1095,6 @@ TEST_F(NDArrayCudaBasicsTests, TestReduceSum_1) { TEST_F(NDArrayCudaBasicsTests, TestDup1) { NDArray array('c', {2,3}, {1,2,3,4,5,6}); - array.printBuffer("Array at start"); auto arrC = array.dup('c'); auto arrF = array.dup('f'); // arrC->printBuffer("arrC"); @@ -1498,22 +1472,18 @@ TEST_F(NDArrayCudaBasicsTests, EqualityTest1) { arrayA->p(i, k, (float) i); } } - arrayA->printBuffer("arrayA is "); + for (int i = 0; i < arrayB->rows(); i++) { for (int k = 0; k < arrayB->columns(); k++) { arrayB->p(i, k, (float) i); } } - arrayB->printBuffer("arrayB is "); for (int i = 0; i < arrayC->rows(); i++) { for (int k = 0; k < arrayC->columns(); k++) { arrayC->p(i, k, (float) i+1); } } - arrayC->printBuffer("arrayC is "); - - ASSERT_TRUE(arrayA->equalsTo(arrayB, 1e-5)); @@ -1920,8 +1890,6 @@ TEST_F(NDArrayCudaBasicsTests, Tile_Test_2_2) auto y = x.tile({1,2,1}); auto exp = NDArrayFactory::create('f', {2, 2, 2}); exp = 10.; - y.printShapeInfo("Output SHAPE"); - y.printBuffer("Output TILE"); ASSERT_TRUE(exp.equalsTo(y)); } @@ -1945,17 +1913,13 @@ TEST_F(NDArrayCudaBasicsTests, Operator_Plus_Test_2) { double expBuff[] = {2., 3, 3., 4., 4., 5, 5., 6., 6., 7, 7., 8.}; NDArray a('c', {4,4}, {1.,2,3,4,5,6,7,8,9,2,3,2,1,0,4,7.}, nd4j::DataType::FLOAT32); - a.printBuffer(); auto x = NDArrayFactory::create('c', {3, 2, 1}); auto y = NDArrayFactory::create('c', {1, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {3, 2, 2}); x.linspace(1); 
y.linspace(1); - x.printBuffer("X="); - y.printBuffer("Y="); auto result = x + y; - result.printIndexedBuffer("Result"); ASSERT_TRUE(expected.isSameShape(&result)); ASSERT_TRUE(expected.equalsTo(&result)); @@ -2133,7 +2097,7 @@ TEST_F(NDArrayCudaBasicsTests, Test_diagonal_1) { for (Nd4jLong e = 0; e < exp.lengthOf(); ++e) { printf("VAL[%ld] = %f\n", e, diag->e(e)); //, exp.e(e), 1.e-5); } - diag->printIndexedBuffer("DIAGONAL"); + for (Nd4jLong e = 0; e < exp.lengthOf(); ++e) { ASSERT_NEAR(diag->e(e), exp.e(e), 1.e-5); } diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp index 75608f2bc..747ecc183 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp @@ -174,8 +174,6 @@ TEST_F(NDArrayTest, EqualityTest1) { arrayC->p(i, k, (float) i+1); } } - arrayB->printBuffer("B ="); - arrayC->printBuffer("C ="); //nd4j_printf("A B\n",""); ASSERT_TRUE(arrayA->equalsTo(arrayB, 1e-5)); @@ -1699,7 +1697,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension2) { NDArray exp(expBuff, expShapeInfo); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {1}); - result->printIndexedBuffer("VARIANCE2"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); @@ -1714,7 +1711,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension3) { x.linspace(1); // 1, 2, 3, ..., 100 exp.assign(825.f); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); - result->printIndexedBuffer("VARIANCE3"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); @@ -1729,7 +1725,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension4) { x.linspace(1); // 1, 2, 3, ..., 100 exp.assign(1716.); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); - result->printIndexedBuffer("VARIANCE4"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp index 9f9937368..a497cd9e6 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp @@ -184,7 +184,6 @@ TEST_F(NDArrayTest2, SetIdentity_test_8) { auto x = NDArrayFactory::create('c', {3, 3, 3}); auto xExp = NDArrayFactory::create('c', {3, 3, 3}, {1.,0.,0. ,0.,0.,0., 0.,0.,0., 0.,0.,0. ,0.,1.,0., 0.,0.,0., 0.,0.,0. 
,0.,0.,0., 0.,0.,1.}); - xExp.printIndexedBuffer("Identity8"); x.setIdentity(); ASSERT_TRUE(x.equalsTo(&xExp)); @@ -921,8 +920,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_1) { NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(5, subArr1->ews()); delete subArr1; } @@ -933,8 +930,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_2) { NDArray x('f', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(1, subArr1->ews()); delete subArr1; } @@ -945,8 +940,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_3) { NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::point(2), NDIndex::all()}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(1, subArr1->ews()); delete subArr1; } @@ -957,8 +950,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_4) { NDArray x('f', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::point(2), NDIndex::all()}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->ews()); delete subArr1; } @@ -1074,8 +1065,6 @@ TEST_F(NDArrayTest2, test_subarray_interval_1) { NDArray x('f', {10, 10}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::interval(0,9)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->sizeAt(0)); ASSERT_EQ(9, subArr1->sizeAt(1)); delete subArr1; @@ -1086,8 +1075,6 @@ TEST_F(NDArrayTest2, test_subarray_interval_2) { NDArray x('c', {10, 10}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::interval(0,9)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->sizeAt(0)); ASSERT_EQ(9, subArr1->sizeAt(1)); delete subArr1; @@ -1098,10 +1085,8 @@ TEST_F(NDArrayTest2, test_subarray_3d_cf) { NDArray c('c', {10, 20, 30}, nd4j::DataType::FLOAT32); auto subarrayF = f({0,0, 0,0, 2,3}, true); - subarrayF.printShapeInfo("F subarray shapeInfo"); auto subarrayC = c({2,3, 0,0, 0,0}, true); - subarrayC.printShapeInfo("C subarray shapeInfo"); } TEST_F(NDArrayTest2, test_broadcast_row_1) { @@ -1133,8 +1118,6 @@ TEST_F(NDArrayTest2, test_broadcast_column_2) { e.assign(1.0f); x.applyTrueBroadcast(BroadcastOpsTuple::Add(), &y, &x, false); - x.printShapeInfo(); - x.printIndexedBuffer(); ASSERT_EQ(e, x); } @@ -1189,8 +1172,6 @@ TEST_F(NDArrayTest2, test_long_sum_1) { auto x = NDArrayFactory::create('c', {2, 2}, {1, 2, 3, 4}); auto z = x.reduceAlongDims(reduce::Sum, {0}); - - z.printIndexedBuffer("z long"); } ////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp index 9aac42ddf..95b3027cc 100644 --- a/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp @@ -191,7 +191,8 @@ TEST_F(NativeOpsTests, ExecBroadcast_2) { #ifdef __CUDABLAS__ printf("Unsupported for cuda now.\n"); #else - auto dimension = NDArrayFactory::create('c', {1}, {(int)0}); + int dimd = 0; + auto dimension = NDArrayFactory::create('c', {1}, {dimd}); ::execBroadcastBool(nullptr, broadcast::EqualTo, @@ -525,8 +526,8 @@ TEST_F(NativeOpsTests, Reduce3Test_1) { y.specialBuffer(), y.specialShapeInfo(), exp.buffer(), exp.shapeInfo(), exp.specialBuffer(), exp.specialShapeInfo()); -// x.printIndexedBuffer("Input"); -// exp.printIndexedBuffer("Reduce3 Dot"); + //z.printIndexedBuffer("Z"); + 
//exp.printIndexedBuffer("Reduce3 Dot"); ASSERT_TRUE(exp.equalsTo(z)); } diff --git a/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp index d8174f000..0d879748d 100644 --- a/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp @@ -81,34 +81,6 @@ TEST_F(OmpLaunchHelperTests, Test_BetterThreads_3) { ASSERT_EQ(1, n); } -////////////////////////////////////////////////////////////////////// -TEST_F(OmpLaunchHelperTests, loop_test1) { - - const Nd4jLong N = 20010; - Nd4jLong desiredNumThreads = 2; - int x[N] = {0}; - - OmpLaunchHelper info(N, desiredNumThreads); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto xi = x + info.getThreadOffset(threadNum); - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - xi[i] = xi[i] + 1; - } - - #ifdef _OPENMP - ASSERT_EQ(desiredNumThreads, info._numThreads); - #else - ASSERT_EQ(1, info._numThreads); - #endif - -} - TEST_F(OmpLaunchHelperTests, test_tad_threads_1) { Nd4jLong numTads = 16; Nd4jLong tadLength = 16; diff --git a/libnd4j/tests_cpu/layers_tests/OpsArena.cpp b/libnd4j/tests_cpu/layers_tests/OpsArena.cpp deleted file mode 100644 index b09a4e043..000000000 --- a/libnd4j/tests_cpu/layers_tests/OpsArena.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by raver119 on 11.10.2017. -// -// This "set of tests" is special one - we don't check ops results here. 
we just check for memory equality BEFORE op launch and AFTER op launch -// -// -#include "testlayers.h" -#include -#include -#include -#include -#include -#include -#include - -using namespace nd4j; -using namespace nd4j::ops; - -class OpsArena : public testing::Test { -public: - const int numIterations = 0; - std::vector tuples; - - - OpsArena() { - // nd4j_printf("\nStarting memory tests...\n",""); - - - // conv2d_bp - tuples.push_back((new OpTuple("conv2d_bp")) - ->addInput(NDArrayFactory::create_('c', {2, 1, 4, 4})) - ->addInput(NDArrayFactory::create_('c', {3, 3, 1, 2})) - //->addInput(new NDArray('c', {2, 1})) - ->addInput(NDArrayFactory::create_('c', {2, 2, 4, 4})) - ->setIArgs({3, 3, 1, 1, 0, 0, 1, 1, 1})); - - - // mergeavg - tuples.emplace_back((new OpTuple("mergeavg")) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100}))); - - // mergemax - auto mergeMax_X0 = NDArrayFactory::create_('c', {100, 100}); - auto mergeMax_X1 = NDArrayFactory::create_('c', {100, 100}); - auto mergeMax_X2 = NDArrayFactory::create_('c', {100, 100}); - tuples.push_back(new OpTuple("mergemax", {mergeMax_X0, mergeMax_X1, mergeMax_X2}, {}, {})); - - // conv2d - auto conv2d_Input = NDArrayFactory::create_('c', {1, 2, 5, 4}); - auto conv2d_Weights = NDArrayFactory::create_('c', {2, 2, 2, 3}); - auto conv2d_Bias = NDArrayFactory::create_('c', {3, 1}); - tuples.push_back(new OpTuple("conv2d", {conv2d_Input, conv2d_Weights, conv2d_Bias}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1, 0})); - - // test custom op - tuples.emplace_back((new OpTuple("testcustom")) - ->setIArgs({1, 2}) - ->addInput(NDArrayFactory::create_('c', {100, 100}))); - - - // deconv2d - tuples.emplace_back((new OpTuple("deconv2d")) - ->addInput(NDArrayFactory::create_('c', {2, 3, 4, 4})) - ->addInput(NDArrayFactory::create_('c', {5, 5, 3, 3})) - ->setIArgs({5, 5, 1, 1, 0, 0, 1, 1, 0, 0})); - - // maxpool2d - tuples.emplace_back((new OpTuple("maxpool2d")) - ->addInput(NDArrayFactory::create_('c', {2, 1, 28, 28})) - ->setIArgs({5, 5, 1, 1, 0, 0, 2, 2, 0})); - } - - - ~OpsArena() { - for (auto v: tuples) - delete v; - } - -}; - - -TEST_F(OpsArena, TestFeedForward) { - nd4j::ops::mergeavg op0; - nd4j::ops::mergemax op1; - -#ifdef _WIN32 - if (1 > 0) - return; -#endif - - for (auto tuple: tuples) { - auto op = OpRegistrator::getInstance()->getOperation(tuple->_opName); - if (op == nullptr) { - // nd4j_printf("Can't find Op by name: [%s]\n", tuple->_opName); - ASSERT_TRUE(false); - } - - // nd4j_printf("Testing op [%s]\n", tuple->_opName); - nd4j::memory::MemoryReport before, after; - - // warmup - auto tmp1 = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - auto tmp2 = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - delete tmp1; - delete tmp2; - - auto b = nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - if (!b) - ASSERT_TRUE(false); - - for (int e = 0; e < numIterations; e++) { - auto result = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - - // we just want to be sure op was executed successfully - ASSERT_TRUE(result->size() > 0); - - delete result; - } - - - auto a = nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (!a) - ASSERT_TRUE(false); - - - // this is our main assertion. memory footprint after op run should NOT be higher then before - if (after > before) { - // nd4j_printf("WARNING!!! 
OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", tuple->_opName, before.getRSS(), after.getRSS()) - // ASSERT_TRUE(after <= before); - } - } -} - - - -TEST_F(OpsArena, TestMmulHelper1) { - auto a = NDArrayFactory::create('c', {100, 100}); - auto b = NDArrayFactory::create('c', {100, 100}); - auto c = NDArrayFactory::create('c', {100, 100}); - - nd4j::MmulHelper::mmul(&a, &b, &c); - - nd4j::memory::MemoryReport before, after; - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - for (int e = 0; e < numIterations; e++) { - nd4j::MmulHelper::mmul(&a, &b, &c); - } - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (after > before) { - // nd4j_printf("WARNING!!! OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", "mmulHelper", before.getRSS(), after.getRSS()) - //ASSERT_TRUE(after <= before); - } -} - - -TEST_F(OpsArena, TestMmulHelper2) { - auto a = NDArrayFactory::create('c', {100, 100}); - auto b = NDArrayFactory::create('c', {100, 100}); - - auto c = nd4j::MmulHelper::mmul(&a, &b); - delete c; - - nd4j::memory::MemoryReport before, after; - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - for (int e = 0; e < numIterations; e++) { - c = nd4j::MmulHelper::mmul(&a, &b); - delete c; - } - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (after > before) { - // nd4j_printf("WARNING!!! OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", "mmulHelper", before.getRSS(), after.getRSS()) - ASSERT_TRUE(after <= before); - } -} - diff --git a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp index 0254d1877..d5880d689 100644 --- a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp @@ -419,9 +419,6 @@ TEST_F(ParityOpsTests, Test_Shape_1) { auto z = result->at(0); - z->printShapeInfo("z shape"); - z->printIndexedBuffer(" z buffr"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1362,7 +1359,8 @@ TEST_F(ParityOpsTests, scatterND_sub_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - // z->printIndexedBuffer(); + //exp.printIndexedBuffer("e"); + //z->printIndexedBuffer("z"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp new file mode 100644 index 000000000..998b8164b --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace nd4j; +using namespace nd4j::graph; + +class PerformanceTests : public testing::Test { +public: + int numIterations = 100; + + PerformanceTests() { + samediff::ThreadPool::getInstance(); + } +}; + +#ifdef RELEASE_BUILD + +TEST_F(PerformanceTests, test_maxpooling2d_1) { + std::vector valuesX; + auto x = NDArrayFactory::create('c', {32, 3, 224, 224}); + auto z = NDArrayFactory::create('c', {32, 3, 224, 224}); + x.linspace(1.0f); + Nd4jLong k = 5; + + + Nd4jLong iArgs[] {k,k, 1,1, 0,0, 1,1, 1}; + Context ctx(1); + ctx.setInputArray(0, &x); + ctx.setOutputArray(0, &z); + ctx.setIArguments(iArgs, 9); + + nd4j::ops::maxpool2d op; + + for (int i = 0; i < numIterations; i++) { + auto timeStart = std::chrono::system_clock::now(); + + op.execute(&ctx); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesX.emplace_back(outerTime); + + if ((i + 1) % 1000 == 0) + nd4j_printf("Iteration %i finished...\n", i + 1); + } + + std::sort(valuesX.begin(), valuesX.end()); + nd4j_printf("Execution time: %lld; Min: %lld; Max: %lld;\n", valuesX[valuesX.size() / 2], valuesX[0], valuesX[valuesX.size() - 1]); +} + +#endif \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index e95c6eca6..dfb685e22 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -57,12 +57,201 @@ public: fflush(stdout); } }; - +/* TEST_F(PlaygroundTests, test_s_1) { auto t = ::runLightBenchmarkSuit(true); delete[] t; } +TEST_F(PlaygroundTests, test_s_2) { + std::atomic s; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + std::vector values; + + for (int e = 0; e < 100000; e++) { + s = 0; + + auto timeStart = std::chrono::system_clock::now(); + //samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + PRAGMA_OMP_PARALLEL_THREADS(4) { + s++; + } + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast (timeEnd - timeStart).count(); + values.emplace_back(outerTime); + }; + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld;\n", values[values.size() / 2]); +} + */ +/* +TEST_F(PlaygroundTests, test_s_4) { + std::atomic f; + std::atomic s; + std::vector valuesX, valuesY; + int iterations = 1000; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + + //////// + + auto x = NDArrayFactory::create('c', {32, 3, 256, 256}); + auto z = NDArrayFactory::create('c', {32, 3, 256, 256}); + x.linspace(1.0); + + auto xs0 = x.sizeAt(0); + auto xs1 = x.sizeAt(1); + auto xs2 = x.sizeAt(2); + auto xs3 = x.sizeAt(3); + + auto buffer = x.bufferAsT(); + auto zbuffer = z.bufferAsT(); + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) + for (int i = 0; i < xs0; i++) { + for (int j = 0; j < xs1; j++) { + auto thread_id = 
omp_get_thread_num(); + for (int k = 0; k < xs2; k++) { + for (int l = 0; l < xs3; l++) { + zbuffer[thread_id] += buffer[i * j + (k*l)] * 2.5f; + } + } + } + } + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesX.emplace_back(outerTime); + } + + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + auto f2d = PRAGMA_THREADS_FOR_2D { + for (auto i = start_x; i < stop_x; i++) { + for (auto j = start_y; j < stop_y; j++) { + + for (auto k = 0; k < xs2; k++) { + for (auto l = 0; l < xs3; l++) { + zbuffer[thread_id] += buffer[i * j + (k * l)] * 2.5f; + } + } + } + } + }; + samediff::Threads::parallel_for(f2d, 0, xs0, 1, 0, xs1, 1); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesY.emplace_back(outerTime); + } + + if (valuesX.size() > 0) { + std::sort(valuesX.begin(), valuesX.end()); + nd4j_printf("OpenMP time: %lld; Min: %lld; Max: %lld;\n", valuesX[valuesX.size() / 2], valuesX[0], valuesX[valuesX.size() - 1]); + } + + if (valuesY.size() > 0) { + std::sort(valuesY.begin(), valuesY.end()); + nd4j_printf("Threads time: %lld; Min: %lld; Max: %lld;\n", valuesY[valuesY.size() / 2], valuesY[0], valuesY[valuesY.size() - 1]); + } + + nd4j_printf("Sum: %f\n", z.sumNumber().e(0)); +} + + +TEST_F(PlaygroundTests, test_s_5) { + auto x = NDArrayFactory::create('c', {32, 1, 28, 28}); + + std::vector values; + auto iterations = 100; + + auto startX = 0; + auto stopX = x.sizeAt(0); + auto incX = 1; + auto startY = 0; + auto stopY = x.sizeAt(1); + auto incY = 1; + auto numThreads = 4; + + // number of elements per loop + auto delta_x = (stopX - startX); + auto delta_y = (stopY - startY); + + // number of iterations per loop + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + + // picking best fit here + auto splitLoop = samediff::ThreadsHelper::pickLoop2d(numThreads, itersX, itersY); + auto span = samediff::Span2::build(splitLoop, 0, numThreads, startX, stopX, incX, startY, stopY, incY); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]); +} + + +TEST_F(PlaygroundTests, test_s_6) { + auto x = NDArrayFactory::create('c', {1024 * 1024 * 64}); + auto buffer = x.bufferAsT(); + auto len = x.lengthOf(); + std::vector values; + auto iterations = 1000; + + for (int i = 0; i < iterations; i++) { + auto timeStart = std::chrono::system_clock::now(); + + // picking best fit here + for (int e = 0; e < len; e++) { + buffer[e] = (buffer[e] + 1.72f) * 3.17f - 0.0012f; + } + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]); +} + + +TEST_F(PlaygroundTests, test_s_3) { + std::atomic s; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + for (int e = 0; e < 10000; e++) { + + samediff::Threads::parallel_for(func, 
0, 8192, 1, 4); + } +} + */ + /* TEST_F(PlaygroundTests, test_relubp_1) { auto x = NDArrayFactory::create('c', {128, 64, 224, 224}); diff --git a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp index bc4db6e63..5c3ca340b 100644 --- a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp @@ -868,7 +868,6 @@ TEST_F(RNGTests, Test_UniformDistribution_04) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Uniform int distribution"); ASSERT_TRUE(exp0.isSameShape(z)); ASSERT_FALSE(exp0.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp b/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp index 4df0f3dc8..8bf12f58b 100644 --- a/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp @@ -77,7 +77,7 @@ TEST_F(EuclideanDistanceTest,Test1) { result, tadShapeBuffer, dimension, - dimensionLength); + dimensionLength, 0, 2); ASSERT_EQ(result[1],result[0]); } @@ -107,7 +107,7 @@ TEST_F(StdTest,MultiDimTest) { dimensionsForStd, dimensionLength, tad->tadOnlyShapeInfo, - tad->tadOffsets); + tad->tadOffsets, 0, shape::length(resultShapeInfo)); // for(int i = 0; i < shape::length(resultShapeInfo); i++) // printf("%f\n",result[i]); @@ -145,7 +145,7 @@ TEST_F(ReduceTest,MatrixTest) { dimension, dimensionLength, tad->tadOnlyShapeInfo, - tad->tadOffsets); + tad->tadOffsets, 0, tad->numTads); // for(int i = 0; i < shape::length(resultShapeInfo); i++) // printf("%f\n",result[i]); diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp index ecc91779e..a8f430fe3 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp @@ -234,7 +234,6 @@ TEST_F(NormalThreeFourFive,DimensionTest) { tad->init(inputShapeBuffer,dimension,dimensionLength); tad->createTadOnlyShapeInfo(); tad->createOffsets(); - shape::printShapeInfoLinear(tad->tadOnlyShapeInfo); ASSERT_TRUE(arrsEquals(8,assertionBuffer,tad->tadOnlyShapeInfo)); delete tad; diff --git a/libnd4j/tests_cpu/layers_tests/TadTests.cpp b/libnd4j/tests_cpu/layers_tests/TadTests.cpp index aabef927f..b4a631a8c 100644 --- a/libnd4j/tests_cpu/layers_tests/TadTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/TadTests.cpp @@ -206,8 +206,6 @@ TEST_F(TadTests, test_TAD_empty_dims_1) { xTad.init(xShape, reinterpret_cast(112L), 0); xTad.createTadOnlyShapeInfo(); xTad.createOffsets(); - nd4j_printf("numTads: %i\n", (int) xTad.numTads); - shape::printShapeInfoLinear("TAD shape", xTad.tadOnlyShapeInfo); } TEST_F(TadTests, test_tad_order_1) { @@ -218,7 +216,6 @@ TEST_F(TadTests, test_tad_order_1) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -230,7 +227,6 @@ TEST_F(TadTests, test_tad_order_2) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -243,7 +239,6 @@ TEST_F(TadTests, test_tad_order_3) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -256,7 +251,6 @@ TEST_F(TadTests, test_tad_order_4) { xTad.init(xShape, dim, 2); xTad.createTadOnlyShapeInfo(); - 
shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -264,7 +258,6 @@ TEST_F(TadTests, test_column_1) { auto x = NDArrayFactory::create('c', {5, 2}); auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), 0); - shape::printShapeInfoLinear("column view", tadPack.primaryShapeInfo()); ASSERT_EQ(1, shape::rank(tadPack.primaryShapeInfo())); ASSERT_EQ(5, shape::length(tadPack.primaryShapeInfo())); ASSERT_TRUE(shape::isVector(tadPack.primaryShapeInfo())); diff --git a/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp b/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp new file mode 100644 index 000000000..1139d6076 --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp @@ -0,0 +1,233 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include + +using namespace samediff; +using namespace nd4j; +using namespace nd4j::ops; +using namespace nd4j::graph; + +class ThreadsTests : public testing::Test { +public: + +}; + +TEST_F(ThreadsTests, th_test_1) { + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1023)); + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1024)); + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1026)); + + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 2043)); + ASSERT_EQ(2, ThreadsHelper::numberOfThreads(6, 2048)); +} + + +TEST_F(ThreadsTests, th_test_2) { + // in this case we'll get better split over second loop - exactly 32 elements per thread + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(32, 48, 1024)); + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(6, 4, 16384)); + + // in this case we'll get better split over first loop - 2 loops/2048 elements per thread + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(32, 64, 1024)); + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(6, 6, 16384)); + + // in this case none of loops are good enough, but second loop is too small for split + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(6, 64, 32)); + + // all loops are good enough, but we go with bigger one, since small + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(2, 64, 32)); + + // obviously split goes into second loop, to give 1024 elements per thread + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(2, 1, 2048)); +} + +TEST_F(ThreadsTests, th_test_3) { + // typical conv cases + ASSERT_EQ(1, ThreadsHelper::pickLoop3d(4, 32, 3, 128)); + ASSERT_EQ(2, ThreadsHelper::pickLoop3d(4, 1, 128, 64)); + ASSERT_EQ(3, ThreadsHelper::pickLoop3d(4, 1, 3, 128)); + + // checking for optimal threads for conv inference + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 1, 3, 128)); + ASSERT_EQ(4, ThreadsHelper::numberOfThreads3d(4, 1, 3, 128)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads3d(8, 1, 3, 128)); + + // 
checking for optimal threads for conv training + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 16, 3, 128)); + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 128)); + + + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 64)); + ASSERT_EQ(1, ThreadsHelper::pickLoop3d(6, 8, 3, 64)); +} + +TEST_F(ThreadsTests, th_test_4) { + // typical conv cases + ASSERT_EQ(2, ThreadsHelper::numberOfThreads2d(2, 32, 3)); + ASSERT_EQ(4, ThreadsHelper::numberOfThreads2d(4, 32, 3)); + ASSERT_EQ(6, ThreadsHelper::numberOfThreads2d(6, 32, 1)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads2d(8, 16, 64)); + + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(4, 32, 1)); + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17)); + + // primes edge cases + ASSERT_EQ(6, ThreadsHelper::numberOfThreads2d(6, 19, 17)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads2d(8, 19, 17)); + + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17)); + + for (auto e = 0; e < 6; e++) { + auto span = Span2::build(1, e, 6, 0, 19, 1, 0, 17, 1); + + nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX()); + } + + nd4j_printf("-----------------------\n",""); + for (auto e = 0; e < 6; e++) { + auto span = Span2::build(1, e, 6, 0, 32, 1, 0, 3, 1); + + nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX()); + } +} + + +TEST_F(ThreadsTests, test_span_converage_1) { + for (int b = 1; b <= 128; b++) { + for (int c = 1; c <= 64; c++) { + for (int t = 1; t <= 64; t++) { + + auto threads = ThreadsHelper::numberOfThreads2d(t, b, c); + auto loop = ThreadsHelper::pickLoop2d(threads, b, c); + + if (t > 1 && threads == 1 && (b > 1 && c > 1)) { + nd4j_printf("Got 1 thread for [%i, %i] loop; initial max threads: %i\n", b, c, t) + } + + auto sum = 0; + for (auto a = 0; a < threads; a++) { + auto span = Span2::build(loop, a,threads, 0, b, 1, 0, c, 1); + + if (loop == 1) + sum += span.stopX() - span.startX(); + else if (loop == 2) + sum += span.stopY() - span.startY(); + else + throw std::runtime_error("Bad loop!"); + } + + if (loop == 1) + ASSERT_EQ(b, sum); + else + ASSERT_EQ(c, sum); + } + } + } +} + +TEST_F(ThreadsTests, validation_test_2d_1) { + if (1 > 0) + return; + + std::vector threads({1, 2, 4, 6, 8, 12, 16, 20, 32, 48, 64}); + + for (int e = 1; e < 1024; e++) { + for (int i = 1; i <= 1024; i++ ) { + for (auto t:threads) { + std::atomic sum; + sum.store(0); + + auto func = PRAGMA_THREADS_FOR_2D { + for (auto x = start_x; x < stop_x; x += inc_x) { + for (auto y = start_y; y < stop_y; y += inc_y) { + sum++; + } + } + }; + + samediff::Threads::parallel_for(func, 0, e, 1, 0, i, 1, t, true); + + ASSERT_EQ(e * i, sum.load()); + } + } + + nd4j_printf("Finished iteration %i\n", e); + } +} + +TEST_F(ThreadsTests, reduction_test_1) { + + auto func = PRAGMA_REDUCE_LONG { + int64_t sum = 0; + + for (auto e = start; e < stop; e++) { + sum++; + }; + + return sum; + }; + + auto sum = samediff::Threads::parallel_long(func, LAMBDA_AL {return _old + _new;}, 0, 8192, 1, 4); + ASSERT_EQ(8192, sum); +} + +/* +TEST_F(ThreadsTests, basic_test_1) { + if (!Environment::getInstance()->isCPU()) + return; + + auto instance = samediff::ThreadPool::getInstance(); + + auto array = NDArrayFactory::create('c', {512, 768}); + auto like = array.like(); + auto buffer = array.bufferAsT(); + auto lbuffer = like.bufferAsT(); + + auto func = PRAGMA_THREADS_FOR { + PRAGMA_OMP_SIMD + for (uint64_t e = start; e < stop; e += increment) { + buffer[e] += 1.0f; + } + }; + + auto timeStartThreads = std::chrono::system_clock::now(); + 
samediff::Threads::parallel_for(func, 0, array.lengthOf()); + auto timeEndThreads = std::chrono::system_clock::now(); + auto outerTimeThreads = std::chrono::duration_cast (timeEndThreads - timeStartThreads).count(); + + auto timeStartOmp = std::chrono::system_clock::now(); + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (uint64_t e = 0; e < array.lengthOf(); e ++) { + lbuffer[e] += 1.0f; + } + auto timeEndOmp = std::chrono::system_clock::now(); + auto outerTimeOmp = std::chrono::duration_cast (timeEndOmp - timeStartOmp).count(); + + ASSERT_NEAR((float) array.lengthOf(), array.sumNumber().e(0), 1e-5f); + + nd4j_printf("Threads time: %lld us; OMP time: %lld us; %p\n", outerTimeThreads, outerTimeOmp, instance) +} + */ \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp index 72ca854f8..fd277b971 100644 --- a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp @@ -55,7 +55,6 @@ TEST_F(WorkspaceTests, BasicInitialization2) { auto v = array.reduceNumber(reduce::Sum); auto f = v.e(0); - v.printShapeInfo("v shape"); ASSERT_NEAR(2.0f, f, 1e-5); @@ -77,7 +76,6 @@ TEST_F(WorkspaceTests, BasicInitialization3) { auto v = array.reduceNumber(reduce::Sum); auto f = v.e(0); - v.printShapeInfo("v shape"); ASSERT_NEAR(2.0f, array.reduceNumber(reduce::Sum).e(0), 1e-5); diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index 218035421..315839dba 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -109,15 +109,17 @@ endif() # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS " -O0 -g -fPIC -std=c++11 -D__APPLE_OS__=true") + set(CMAKE_CXX_FLAGS " -O0 -g -fPIC -std=c++11 -D__APPLE_OS__=true -DAPPLE_BUILD=true") elseif(WIN32) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(CMAKE_CXX_FLAGS " -g -fPIC -std=c++11 -Wa,-mbig-obj") endif() else() + set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -DLINUX_BUILD=true") + if ("${_RELEASE}" OR CMAKE_BUILD_TYPE STREQUAL "Release") message("Release build for tests") - set(CMAKE_CXX_FLAGS "-O3 -fPIC -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -std=c++11 -D_RELEASE=true") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") else() diff --git a/libnd4j/tests_cpu/run_tests.sh b/libnd4j/tests_cpu/run_tests.sh index e5cbd4106..2932827d4 100755 --- a/libnd4j/tests_cpu/run_tests.sh +++ b/libnd4j/tests_cpu/run_tests.sh @@ -16,9 +16,30 @@ # SPDX-License-Identifier: Apache-2.0 ################################################################################ - set -exo pipefail +while [[ $# > 0 ]] +do + key="$1" + value="${2:-}" + + case $key in + -c|--chip) + CHIP="${value}" + shift # past argument + ;; + *) + # unknown option + ;; + esac + + if [[ $# > 0 ]]; then + shift # past argument or value + fi +done + +CHIP="${CHIP:-cpu}" + # On Mac, make sure it can find libraries for GCC export DYLD_LIBRARY_PATH=/usr/local/lib/gcc/8/:/usr/local/lib/gcc/7/:/usr/local/lib/gcc/6/:/usr/local/lib/gcc/5/ @@ -30,4 +51,4 @@ if [ -n "$BUILD_PATH" ]; then export PATH="$PATH:$BUILD_PATH" fi -../blasbuild/cpu/tests_cpu/layers_tests/runtests --gtest_output="xml:../target/surefire-reports/TEST-results.xml" +../blasbuild/${CHIP}/tests_cpu/layers_tests/runtests --gtest_output="xml:../target/surefire-reports/TEST-${CHIP}-results.xml" diff --git 
a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java index 32df3e69d..8c80e3bb4 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java @@ -509,7 +509,7 @@ public abstract class DifferentialFunction { * @return the arguments for a given function */ public SDVariable[] args() { - return sameDiff.getInputVariablesForOp(this); + return sameDiff == null ? null : sameDiff.getInputVariablesForOp(this); } /** diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java new file mode 100644 index 000000000..103b0f960 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java @@ -0,0 +1,189 @@ +package org.nd4j.autodiff.listeners.debugging; + +import lombok.*; +import org.nd4j.autodiff.listeners.At; +import org.nd4j.autodiff.listeners.BaseListener; +import org.nd4j.autodiff.listeners.Operation; +import org.nd4j.autodiff.samediff.SameDiff; +import org.nd4j.autodiff.samediff.internal.SameDiffOp; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.DynamicCustomOp; +import org.nd4j.linalg.api.ops.Op; +import org.nd4j.linalg.dataset.api.MultiDataSet; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.util.ArrayUtil; + +import java.text.DecimalFormat; +import java.util.*; + +/** + * A simple listener for benchmarking single operations in SameDiff
+ * Supports 2 modes:
+ * - SINGLE_ITER_PRINT: Print the runtime of the first iteration
+ * - AGGREGATE: Collect statistics for multiple runs, that can be accessed (by op name) via {@link #getAggregateModeMap()} + * + * @author Alex Black + */ +@Getter +public class OpBenchmarkListener extends BaseListener { + + public enum Mode {SINGLE_ITER_PRINT, AGGREGATE} + + private final Operation operation; + private final Mode mode; + private final long minRuntime; + private Map aggregateModeMap; + + @Getter(AccessLevel.PRIVATE) + private long start; + @Getter(AccessLevel.PRIVATE) + private boolean printActive; + private boolean printDone; + + public OpBenchmarkListener(Operation operation, @NonNull Mode mode) { + this(operation, mode, 0); + } + + /** + * @param operation Operation to collect stats for + * @param mode Mode - see {@link OpBenchmarkListener} + * @param minRuntime Minimum runtime - only applies to Mode.SINGLE_ITER_PRINT. If op runtime below this: don't print + */ + public OpBenchmarkListener(Operation operation, @NonNull Mode mode, long minRuntime) { + this.operation = operation; + this.mode = mode; + this.minRuntime = minRuntime; + } + + @Override + public boolean isActive(Operation operation) { + return this.operation == null || this.operation == operation; + } + + @Override + public void operationStart(SameDiff sd, Operation op) { + if(printDone) + return; + if(this.operation == null || this.operation == op) + printActive = true; + } + + @Override + public void operationEnd(SameDiff sd, Operation op) { + if(printDone) + return; + if(this.operation == null || this.operation == op) { + printActive = false; + printDone = true; + } + } + + @Override + public void preOpExecution(SameDiff sd, At at, SameDiffOp op) { + start = System.currentTimeMillis(); + } + + @Override + public void opExecution(SameDiff sd, At at, MultiDataSet batch, SameDiffOp op, INDArray[] outputs) { + long now = System.currentTimeMillis(); + + if (mode == Mode.SINGLE_ITER_PRINT && printActive && (now-start) > this.minRuntime) { + System.out.println(getOpString(op, now)); + } else if (mode == Mode.AGGREGATE) { + if(aggregateModeMap == null) + aggregateModeMap = new LinkedHashMap<>(); + + if(!aggregateModeMap.containsKey(op.getName())){ + String s = getOpString(op, null); + OpExec oe = new OpExec(op.getName(), op.getOp().opName(), op.getOp().getClass(), + new ArrayList(), s); + aggregateModeMap.put(op.getName(), oe); + } + + aggregateModeMap.get(op.getName()).getRuntimeMs().add(now-start); + } + } + + private String getOpString(SameDiffOp op, Long now){ + StringBuilder sb = new StringBuilder(); + sb.append(op.getName()).append(" - ").append(op.getOp().getClass().getSimpleName()) + .append("(").append(op.getOp().opName()).append(") - "); + if(now != null) { + sb.append(now - start).append(" ms\n"); + } + + if (op.getOp() instanceof DynamicCustomOp) { + DynamicCustomOp dco = (DynamicCustomOp) op.getOp(); + int x = 0; + + for (INDArray i : dco.inputArguments()) { + sb.append(" in ").append(x++).append(": ").append(i.shapeInfoToString()).append("\n"); + } + x = 0; + for (INDArray o : dco.outputArguments()) { + sb.append(" out ").append(x++).append(": ").append(o.shapeInfoToString()).append("\n"); + } + long[] iargs = dco.iArgs(); + boolean[] bargs = dco.bArgs(); + double[] targs = dco.tArgs(); + if (iargs != null && iargs.length > 0) { + sb.append(" iargs: ").append(Arrays.toString(iargs)).append("\n"); + } + if (bargs != null && bargs.length > 0) { + sb.append(" bargs: ").append(Arrays.toString(bargs)).append("\n"); + } + if (targs != null && targs.length > 0) { + sb.append(" targs: 
").append(Arrays.toString(targs)).append("\n"); + } + } else { + Op o = (Op) op.getOp(); + if (o.x() != null) + sb.append(" x: ").append(o.x().shapeInfoToString()); + if (o.y() != null) + sb.append(" y: ").append(o.y().shapeInfoToString()); + if (o.z() != null) + sb.append(" z: ").append(o.z().shapeInfoToString()); + } + return sb.toString(); + } + + + @AllArgsConstructor + @Data + public static class OpExec { + private final String opOwnName; + private final String opName; + private final Class opClass; + private List runtimeMs; + private String firstIter; + + @Override + public String toString(){ + DecimalFormat df = new DecimalFormat("0.000"); + + return opOwnName + " - op class: " + opClass.getSimpleName() + " (op name: " + opName + ")\n" + + "count: " + runtimeMs.size() + ", mean: " + df.format(avgMs()) + "ms, std: " + df.format(stdMs()) + "ms, min: " + minMs() + "ms, max: " + maxMs() + "ms\n" + + firstIter; + } + + public double avgMs() { + long sum = 0; + for (Long l : runtimeMs) { + sum += l; + } + return sum / (double) runtimeMs.size(); + } + + public double stdMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).stdNumber().doubleValue(); + } + + public long minMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).minNumber().longValue(); + } + + public long maxMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).maxNumber().longValue(); + } + } +} diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java index 55165b530..32a1cc362 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java @@ -24,7 +24,7 @@ import org.nd4j.autodiff.listeners.Listener; import org.nd4j.autodiff.samediff.SDVariable; import org.nd4j.autodiff.samediff.SameDiff; import org.nd4j.autodiff.samediff.VariableType; -import org.nd4j.autodiff.samediff.internal.memory.ArrayCloseMemoryMgr; +import org.nd4j.autodiff.samediff.internal.memory.ArrayCacheMemoryMgr; import org.nd4j.base.Preconditions; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.memory.MemoryWorkspace; @@ -84,8 +84,7 @@ public class InferenceSession extends AbstractSession { public InferenceSession(@NonNull SameDiff sameDiff) { super(sameDiff); - - mmgr = new ArrayCloseMemoryMgr(); //TODO replace this with new (planned) array reuse memory manager + mmgr = new ArrayCacheMemoryMgr(); } @Override @@ -215,7 +214,6 @@ public class InferenceSession extends AbstractSession { } INDArray[] out = doExec(op.getOp(), outputFrameIter, opInputs, allIterInputs, constAndPhInputs); - op.getOp().clearArrays(); if (log.isTraceEnabled()) { StringBuilder sb = new StringBuilder(); @@ -254,6 +252,7 @@ public class InferenceSession extends AbstractSession { } } } + op.getOp().clearArrays(); //Record array uses for memory management/deallocation @@ -842,11 +841,10 @@ public class InferenceSession extends AbstractSession { reqShape = reqShape.asDataType(dt); } - if (currOutput == null || currOutput.wasClosed() || !currOutput.shapeDescriptor().equals(reqShape) || currOutput.isEmpty() != reqShape.isEmpty() || isLoop) { - boolean isOutput = allReqVariables.contains(outNames[i]); - INDArray out = mmgr.allocate(isOutput, 
reqShape); - customOp.setOutputArgument(i, out); - } + //Always allocate new output array, rely on memory manager for efficient memory management and array reuse etc + boolean isOutput = allReqVariables.contains(outNames[i]); + INDArray out = mmgr.allocate(isOutput, reqShape); + customOp.setOutputArgument(i, out); } } else if (df instanceof Op) { @@ -893,29 +891,17 @@ public class InferenceSession extends AbstractSession { //Check output shape; allocate a new Z if required //For example, if minibatch size has changed since last op execution + boolean isOutput = allReqVariables.contains(((BaseOp) op).outputVariablesNames()[0]); if (emptyReduce) { - INDArray z = op.z(); - if (z == null || !op.x().equalShapes(z) || isLoop) { - //Note: edge case: [x,y].sum(empty) = [x,y] for TF import compatibility. - z = mmgr.allocate(false, op.x().dataType(), op.x().shape()); - op.setZ(z); - } + //Always allocate new output array, rely on memory manager for efficient memory management and array reuse etc + INDArray z = mmgr.allocate(false, op.x().dataType(), op.x().shape()); + op.setZ(z); } else { List outputShape = ((BaseOp) op).calculateOutputShape(); Preconditions.checkState(outputShape != null && outputShape.size() == 1, "Could not calculate output shape for op: %s", op.getClass()); - INDArray z = op.z(); - if (z == null || z.wasClosed() || !outputShape.get(0).equals(z.shapeDescriptor()) || isLoop) { - if (log.isTraceEnabled()) { - log.trace("Existing op result (z) array shape for op {} was {}, allocating new array of shape {}", - op.getClass().getSimpleName(), (z == null ? null : Arrays.toString(z.shape())), outputShape.get(0).toString()); - } - - LongShapeDescriptor lsd = outputShape.get(0); - - boolean isOutput = allReqVariables.contains(((BaseOp) op).outputVariablesNames()[0]); - z = mmgr.allocate(isOutput, lsd); - op.setZ(z); - } + LongShapeDescriptor lsd = outputShape.get(0); + INDArray z = mmgr.allocate(isOutput, lsd); + op.setZ(z); } } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java new file mode 100644 index 000000000..c802dd4e2 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java @@ -0,0 +1,292 @@ +package org.nd4j.autodiff.samediff.internal.memory; + +import lombok.*; +import org.bytedeco.javacpp.Pointer; +import org.nd4j.base.Preconditions; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.shape.LongShapeDescriptor; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.util.ArrayUtil; + +import java.util.*; + +/** + * ArrayCacheMemoryMgr reuses arrays to reduce the number of memory allocations and deallocations.
+ * Memory allocations and deallocations can be quite expensive, especially on GPUs.
+ * Note that when arrays are reused, they are reused for the same datatype only.
+ * If caching a released array would result in the maximum cache size being exceeded, the oldest arrays will
+ * be deallocated first, until the new array can fit in the cache.
+ *
+ * By default, the following parameters are used for the cache:
+ * <ul>
+ *     <li>Maximum cache size: 0.25 x max memory, where:</li>
+ *     <ul>
+ *         <li>CPU: max memory is determined using {@link Pointer#maxBytes()}</li>
+ *         <li>GPU: max memory is determined using GPU 0 total memory</li>
+ *     </ul>
+ *     <li>Larger array max multiple: 2.0</li>
+ *     <ul>
+ *         <li>This means: if an exact array size can't be provided from the cache, use the next smallest array with a buffer up to 2.0x larger than requested</li>
+ *         <li>If no cached arrays of size < 2x requested exist, allocate a new array</li>
+ *     </ul>
+ *     <li>Small array threshold: 1024 elements</li>
+ *     <ul>
+ *         <li>This means: the "larger array max multiple" doesn't apply below this level. For example, we might return a size 1 array backed by a size 1023 buffer</li>
+ *     </ul>
+ * </ul>
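+ * <p>
+ * A minimal usage sketch (illustration only, not part of this change; it uses only the default constructor and the
+ * {@code allocate}/{@code release}/{@code close} methods declared in this class - the dtype and shape values are arbitrary):
+ * <pre>{@code
+ * ArrayCacheMemoryMgr mmgr = new ArrayCacheMemoryMgr();          //Defaults: 0.25 x max memory, 1024 element threshold, 2.0 max multiple
+ * INDArray out = mmgr.allocate(false, DataType.FLOAT, 32, 128);  //Reuses a cached buffer if a suitable one exists, otherwise allocates
+ * // ... use "out" as an op output array ...
+ * mmgr.release(out);                                             //Cached for reuse, or closed if it cannot fit within the cache budget
+ * mmgr.close();                                                  //Closes any arrays still held in the cache
+ * }</pre>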
+ * + * @author Alex Black + */ +@Getter +public class ArrayCacheMemoryMgr extends AbstractMemoryMgr { + + private final double maxMemFrac; + private final long smallArrayThreshold; + private final double largerArrayMaxMultiple; + + private final long maxCacheBytes; + private final long totalMemBytes; + + private long currentCacheSize = 0; + private Map arrayStores = new HashMap<>(); + + private LinkedHashSet lruCache = new LinkedHashSet<>(); + private Map lruCacheValues = new HashMap<>(); + + /** + * Create an ArrayCacheMemoryMgr with default settings as per {@link ArrayCacheMemoryMgr} + */ + public ArrayCacheMemoryMgr() { + this(0.25, 1024, 2.0); + } + + /** + * @param maxMemFrac Maximum memory fraciton to use as cache + * @param smallArrayThreshold Below this size (elements), don't apply the "largerArrayMaxMultiple" rule + * @param largerArrayMaxMultiple Maximum multiple of the requested size to return from the cache. If an array of size + * 1024 is requested, and largerArrayMaxMultiple is 2.0, then we'll return from the cache + * the array with the smallest data buffer up to 2.0*1024 elements; otherwise we'll return + * a new array + */ + public ArrayCacheMemoryMgr(double maxMemFrac, long smallArrayThreshold, double largerArrayMaxMultiple) { + Preconditions.checkArgument(maxMemFrac > 0 && maxMemFrac < 1, "Maximum memory fraction for cache must be between 0.0 and 1.0, got %s", maxMemFrac); + Preconditions.checkArgument(smallArrayThreshold >= 0, "Small array threshould must be >= 0, got %s", smallArrayThreshold); + Preconditions.checkArgument(largerArrayMaxMultiple >= 1.0, "Larger array max multiple must be >= 1.0, got %s", largerArrayMaxMultiple); + this.maxMemFrac = maxMemFrac; + this.smallArrayThreshold = smallArrayThreshold; + this.largerArrayMaxMultiple = largerArrayMaxMultiple; + + if(isCpu()){ + totalMemBytes = Pointer.maxBytes(); + } else { + Properties p = Nd4j.getExecutioner().getEnvironmentInformation(); + List devList = (List) p.get("cuda.devicesInformation"); + Map m = (Map) devList.get(0); + totalMemBytes = (Long)m.get("cuda.totalMemory"); + } + maxCacheBytes = (long)(maxMemFrac * totalMemBytes); + } + + private boolean isCpu(){ + String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend"); + return !"CUDA".equalsIgnoreCase(backend); + } + + @Override + public INDArray allocate(boolean detached, DataType dataType, long... 
shape) { + if (arrayStores.containsKey(dataType)) { + INDArray arr = arrayStores.get(dataType).get(shape); + if (arr != null) { + //Decrement cache size + currentCacheSize -= dataType.width() * arr.data().length(); + + return arr; //Allocated from cache + } + } + + //Allocation failed, allocate new array + return Nd4j.createUninitializedDetached(dataType, shape); + } + + @Override + public INDArray allocate(boolean detached, LongShapeDescriptor descriptor) { + return allocate(detached, descriptor.dataType(), descriptor.getShape()); + } + + @Override + public void release(@NonNull INDArray array) { + //Check for multiple releases of the array + long id = array.getId(); + Preconditions.checkState(!lruCache.contains(id), "Array was released multiple times: id=%s, shape=%ndShape", id, array); + + + DataType dt = array.dataType(); + long thisBytes = array.data().length() * dt.width(); + if(array.dataType() == DataType.UTF8) { + //Don't cache string arrays due to variable length buffers + if(array.closeable()) + array.close(); + } else if (currentCacheSize + thisBytes > maxCacheBytes) { + if(thisBytes > maxCacheBytes){ + //Can't store even if we clear everything - too large + if(array.closeable()) + array.close(); + return; + } + + //Need to deallocate some arrays to stay under limit - do in "oldest first" order + Iterator iter = lruCache.iterator(); + while(currentCacheSize + thisBytes > maxCacheBytes){ + long next = iter.next(); + iter.remove(); + INDArray nextOldest = lruCacheValues.remove(next); + DataType ndt = nextOldest.dataType(); + long nextBytes = ndt.width() * nextOldest.data().length(); + arrayStores.get(ndt).removeObject(nextOldest); + currentCacheSize -= nextBytes; + + if(nextOldest.closeable()) + nextOldest.close(); + } + + //After clearing space - can now cache + cacheArray(array); + } else { + //OK to cache + cacheArray(array); + } + + //Store in LRU cache for "last used" removal if we exceed cache size + lruCache.add(array.getId()); + lruCacheValues.put(array.getId(), array); + } + + private void cacheArray(INDArray array){ + DataType dt = array.dataType(); + if (!arrayStores.containsKey(dt)) + arrayStores.put(dt, new ArrayStore()); + arrayStores.get(dt).add(array); + currentCacheSize += array.data().length() * dt.width(); + + lruCache.add(array.getId()); + lruCacheValues.put(array.getId(), array); + } + + @Override + public void close() { + for (ArrayStore as : arrayStores.values()) { + as.close(); + } + } + + + @Getter + public class ArrayStore { + private INDArray[] sorted = new INDArray[1000]; //TODO resizing, don't hardcode + private long[] lengths = new long[1000]; + private long lengthSum; + private long bytesSum; + private int size; + + private void add(@NonNull INDArray array) { + //Resize arrays + if(size == sorted.length){ + sorted = Arrays.copyOf(sorted, 2*sorted.length); + lengths = Arrays.copyOf(lengths, 2*lengths.length); + } + + long length = array.data().length(); + int idx = Arrays.binarySearch(lengths, 0, size, length); + if (idx < 0) { + idx = -idx - 1; //See binarySearch javadoc + } + for (int i = size - 1; i >= idx; i--) { + sorted[i + 1] = sorted[i]; + lengths[i + 1] = lengths[i]; + } + sorted[idx] = array; + lengths[idx] = length; + size++; + lengthSum += length; + bytesSum += length * array.dataType().width(); + } + + private INDArray get(long[] shape) { + if (size == 0) + return null; + + long length = shape.length == 0 ? 
1 : ArrayUtil.prod(shape); + + int idx = Arrays.binarySearch(lengths, 0, size, length); + if (idx < 0) { + idx = -idx - 1; + if (idx >= size) { + //Largest array is smaller than required -> can't return from cache + return null; + } + INDArray nextSmallest = sorted[idx]; + long nextSmallestLength = nextSmallest.data().length(); + long nextSmallestLengthBytes = nextSmallestLength * nextSmallest.dataType().width(); + + boolean tooLarge = (length > (long) (nextSmallestLength * largerArrayMaxMultiple)); + + if (nextSmallestLengthBytes > smallArrayThreshold && tooLarge) { + return null; + } // If less than smallArrayThreshold, ok, return as is + } + + //Remove + INDArray arr = removeIdx(idx); + + lruCache.remove(arr.getId()); + lruCacheValues.remove(arr.getId()); + + //Create a new array with the specified buffer. This is for 2 reasons: + //(a) the cached array and requested array sizes may differ (though this is easy to check for) + //(b) Some SameDiff array use tracking uses *object identity* - so we want different objects when reusing arrays + // to avoid issues there + return Nd4j.create(arr.data(), shape); + } + + private void removeObject(INDArray array){ + long length = array.data().length(); + int idx = Arrays.binarySearch(lengths, 0, size, length); + Preconditions.checkState(idx > 0, "Cannot remove array from ArrayStore: no array with this length exists in the cache"); + boolean found = false; + int i = 0; + while(!found && i <= size && lengths[i] == length){ + found = sorted[i++] == array; //Object equality + } + Preconditions.checkState(found, "Cannot remove array: not found in ArrayCache"); + removeIdx(i - 1); + } + + private INDArray removeIdx(int idx){ + INDArray arr = sorted[idx]; + for (int i = idx; i < size; i++) { + sorted[i] = sorted[i + 1]; + lengths[i] = lengths[i + 1]; + } + sorted[size] = null; + lengths[size] = 0; + size--; + + bytesSum -= (arr.data().length() * arr.dataType().width()); + lengthSum -= arr.data().length(); + + return arr; + } + + private void close() { + for (int i = 0; i < size; i++) { + if (sorted[i].closeable()) + sorted[i].close(); + lengths[i] = 0; + } + lengthSum = 0; + bytesSum = 0; + size = 0; + } + } +} diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java index 3bf3105f8..0f8f48d86 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java @@ -49,7 +49,7 @@ public class EqualsWithEps extends BaseReduce3Op { public EqualsWithEps(INDArray x, INDArray y, INDArray z, double eps, int... dimensions) { super(x, y, z, false, dimensions); - this.extraArgs = new Object[] {eps}; + this.extraArgs = new Object[] {0.0, 0.0, eps}; } public EqualsWithEps(INDArray x, INDArray y, double eps, int... 
dimensions) { diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index efa70d691..fecb64012 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -731,7 +731,6 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #define ND4J_EXPORT // #endif // #include -// #include /* int tad_threshold = 1; @@ -3604,6 +3603,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include // #include // #include // #include diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java index 8af56286d..58ad965a6 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java @@ -67,7 +67,7 @@ public class CpuMemoryManager extends BasicMemoryManager { */ @Override public void release(@NonNull Pointer pointer, MemoryKind kind) { - Pointer.free(pointer); + NativeOpsHolder.getInstance().getDeviceNativeOps().freeHost(pointer); pointer.setNull(); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index f915c8152..06c061fad 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -1,4 +1,4 @@ -// Targeted by JavaCPP version 1.5.1-1: DO NOT EDIT THIS FILE +// Targeted by JavaCPP version 1.5.2: DO NOT EDIT THIS FILE package org.nd4j.nativeblas; @@ -731,7 +731,6 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #define ND4J_EXPORT // #endif // #include -// #include /* int tad_threshold = 1; @@ -3604,6 +3603,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include // #include // #include // #include @@ -5454,6 +5454,10 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { + + + + @@ -21232,6 +21236,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -21290,6 +21295,18 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif + /* + * random_uniform distribution for types int32,int64, float16, float and double + * by default dtype is float32 + * + * input: + * 0 - shape of output (1D int tensor) + * 1 - min val (0D of output type) - optional (0 as default) + * 2 - max val (0D of output type) - optional (inf as default) + * + * output: + * 0 - uniformly distributed values of given type (between min and max) + */ // #if NOT_EXCLUDED(OP_randomuniform) @Namespace("nd4j::ops") public static class randomuniform extends DeclarableCustomOp { static { Loader.load(); } @@ -21362,6 +21379,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif +// #if NOT_EXCLUDED(OP_random_crop) @Namespace("nd4j::ops") public static class random_crop extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -21377,6 +21395,50 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } +// #endif + + /** + * random_gamma op. + */ +// #if NOT_EXCLUDED(OP_random_gamma) + @Namespace("nd4j::ops") public static class random_gamma extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public random_gamma(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public random_gamma(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public random_gamma position(long position) { + return (random_gamma)super.position(position); + } + + public random_gamma() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif + + /** + * random_poisson op. + */ +// #if NOT_EXCLUDED(OP_random_poisson) + @Namespace("nd4j::ops") public static class random_poisson extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public random_poisson(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ + public random_poisson(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public random_poisson position(long position) { + return (random_poisson)super.position(position); + } + + public random_poisson() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif + diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java new file mode 100644 index 000000000..6505bee20 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java @@ -0,0 +1,119 @@ +package org.nd4j.autodiff.samediff; + +import org.junit.Test; +import org.nd4j.autodiff.samediff.internal.memory.ArrayCacheMemoryMgr; +import org.nd4j.linalg.BaseNd4jTest; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.factory.Nd4jBackend; + +import java.lang.reflect.Field; + +import static org.junit.Assert.*; + +public class MemoryMgrTest extends BaseNd4jTest { + + public MemoryMgrTest(Nd4jBackend b){ + super(b); + } + + @Override + public char ordering(){ + return 'c'; + } + + @Test + public void testArrayReuseTooLarge() throws Exception { + + ArrayCacheMemoryMgr mmgr = new ArrayCacheMemoryMgr(); + Field f = ArrayCacheMemoryMgr.class.getDeclaredField("maxCacheBytes"); + f.setAccessible(true); + f.set(mmgr, 1000); + + assertEquals(1000, mmgr.getMaxCacheBytes()); + + INDArray[] arrays = new INDArray[100]; + for( int i=0; i